//It's really unfortunate that benchmarking is still unstable.
//#![feature(test)]
//extern crate test;

extern crate regex;
#[macro_use] extern crate lazy_static;

use std::fmt::Write;
use regex::Regex;
// Formatting state of a single output character, as decoded from grotty's
// overstrike (BACKSPACE) sequences.
#[derive(Clone,Copy,PartialEq,Eq)]
enum FmtChar {
    // No special formatting.
    Regular,
    Italic,
    Bold,
    // Bold and italic combined.
    Both,
}
impl FmtChar {
|
||
fn add(self, b: Self) -> Self {
|
||
match (self, b) {
|
||
(FmtChar::Regular, x) |
|
||
(x, FmtChar::Regular) => x,
|
||
(FmtChar::Italic, FmtChar::Bold) |
|
||
(FmtChar::Bold, FmtChar::Italic) => FmtChar::Both,
|
||
_ => self
|
||
}
|
||
}
|
||
|
||
fn open(self) -> &'static str {
|
||
match self {
|
||
FmtChar::Regular => "",
|
||
FmtChar::Italic => "<i>",
|
||
FmtChar::Bold => "<b>",
|
||
FmtChar::Both => "<em>",
|
||
}
|
||
}
|
||
|
||
fn close(self) -> &'static str {
|
||
match self {
|
||
FmtChar::Regular => "",
|
||
FmtChar::Italic => "</i>",
|
||
FmtChar::Bold => "</b>",
|
||
FmtChar::Both => "</em>",
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
/* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is
 * described as "old behaviour" in grotty(1). Roughly:
 *
 * '_' BACKSPACE 'x' -> 'x' is italic
 * 'x' BACKSPACE 'x' -> 'x' is bold
 * '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic
 *
 * And other combinations are possible. The BACKSPACE character basically says "combine the
 * following character with previous token". Where "combining" means:
 *
 * a == b -> bold
 * a == _ -> b is italic
 * b == _ -> a is italic
 *
 * See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to
 * handle the (limited) output that grotty generates, we don't have to be fully compatible with
 * ul(1).
 */
enum CharParse {
    // Nothing buffered yet (initial state).
    Start,
    // One character with its accumulated formatting, buffered until we know
    // whether a BACKSPACE follows it.
    Token(char, FmtChar),
    // A BACKSPACE was just seen; the next character combines with the buffered one.
    Escape(char, FmtChar),
}
impl CharParse {
|
||
fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
|
||
match *self {
|
||
CharParse::Start => {
|
||
*self = CharParse::Token(chr, FmtChar::Regular);
|
||
None
|
||
},
|
||
|
||
CharParse::Token(c, f) =>
|
||
if chr == 8 as char {
|
||
*self = CharParse::Escape(c, f);
|
||
None
|
||
} else {
|
||
*self = CharParse::Token(chr, FmtChar::Regular);
|
||
Some((c, f))
|
||
},
|
||
|
||
CharParse::Escape(c, f) => {
|
||
*self = if c == '_' {
|
||
CharParse::Token(chr, f.add(FmtChar::Italic))
|
||
} else if chr == '_' {
|
||
CharParse::Token(c, f.add(FmtChar::Italic))
|
||
} else {
|
||
CharParse::Token(chr, f.add(FmtChar::Bold))
|
||
};
|
||
None
|
||
},
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
fn pushfmt(out: &mut String, old: FmtChar, new: FmtChar) {
|
||
if new != old {
|
||
out.push_str(old.close());
|
||
out.push_str(new.open());
|
||
}
|
||
}
|
||
|
||
|
||
// Intermediate text buffer. This buffer contains the entire HTML-escaped man page and a list of
// indices where text formatting changes are performed.
struct FmtBuf {
    buf: String,
    // List of formatting chunks. The number indicates the character index where the formatting
    // ends. E.g. [(5,Regular),(10,Bold),(15,Italic)] means:
    // [0..5] is Regular
    // [5..10] is Bold
    // [10..15] is Italic
    fmt: Vec<(usize,FmtChar)>,
    // Formatting of the most recently pushed character; used by push() to
    // detect where a new chunk starts.
    lastfmt: FmtChar,
}
// Output state, threaded through the flush_* methods while converting a
// FmtBuf into the final HTML string.
struct Flush<'a, 'b> {
    out: &'a mut String,
    idx: usize, // Last byte in the buffer that has been processed
    fmt: std::iter::Peekable<std::slice::Iter<'b, (usize,FmtChar)>>, // Iterator over FmtBuf.fmt
}
impl FmtBuf {
|
||
fn push(&mut self, chr: char, fmt: FmtChar) {
|
||
// Consider whitespace and underscore to have the same
|
||
// formatting as the previous character; This generates smaller
|
||
// HTML, and you can't see the difference anyway.
|
||
if self.lastfmt != fmt && !(chr == ' ' || chr == '_') {
|
||
self.fmt.push((self.buf.len(), self.lastfmt));
|
||
self.lastfmt = fmt;
|
||
}
|
||
// WARNING: The '"' character is not escaped, so care must be taken when copying a slice
|
||
// into an attribute value! (In the current implementation, " is simply never part of an
|
||
// attribute value)
|
||
match chr {
|
||
'<' => self.buf.push_str("<"),
|
||
'&' => self.buf.push_str("&"),
|
||
_ => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
|
||
}
|
||
}
|
||
|
||
// Flush all unprocessed bytes until 'end' to the output
|
||
fn flush_to(&self, st: &mut Flush, end: usize) {
|
||
let mut lastfmt = FmtChar::Regular;
|
||
while st.idx < end {
|
||
let &&(chunk, fmt) = st.fmt.peek().unwrap();
|
||
let chunk = if chunk > end {
|
||
end
|
||
} else {
|
||
st.fmt.next();
|
||
chunk
|
||
};
|
||
pushfmt(st.out, lastfmt, fmt);
|
||
st.out.push_str(&self.buf[st.idx..chunk]);
|
||
st.idx = chunk;
|
||
lastfmt = fmt;
|
||
}
|
||
st.out.push_str(lastfmt.close());
|
||
}
|
||
|
||
// Consume the input buffer until 'end' without generating output
|
||
fn flush_skip(&self, st: &mut Flush, end: usize) {
|
||
st.idx = end;
|
||
while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx {
|
||
st.fmt.next();
|
||
}
|
||
}
|
||
|
||
fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
|
||
lazy_static!(
|
||
static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap();
|
||
);
|
||
let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };
|
||
|
||
self.flush_to(st, start);
|
||
st.out.push_str(">> Included manual page: <a href=\"/");
|
||
// Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an Unicode dash
|
||
// when passed through groff, which we need to revert in order to get the link working.
|
||
// (Apparently it recognizes man page references and URLs, as it doesn't do this
|
||
// replacement in those situations.)
|
||
for c in m[2].chars() {
|
||
st.out.push(if c == '‐' { '-' } else { c });
|
||
}
|
||
st.out.push_str("\">");
|
||
st.out.push_str(&m[1]);
|
||
st.out.push_str("</a>");
|
||
self.flush_skip(st, end + m[0].len());
|
||
}
|
||
|
||
fn flush_url(&self, st: &mut Flush, start: usize) {
|
||
lazy_static!(
|
||
// Some characters considered to never be part of a URL.
|
||
// (Note that we can't match literal '<' because of the HTML escaping done previously)
|
||
static ref URLEND: Regex = Regex::new("(?:\"|<|>|\\s)").unwrap();
|
||
);
|
||
let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
|
||
let url = &self.buf[start..(start + urlend.start())];
|
||
|
||
// Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
|
||
// - https://manned.org/troff/c4467840
|
||
// - https://manned.org/pass/78413b49
|
||
// - https://manned.org/empathy-accounts/8c05b2c1
|
||
// - https://manned.org/urn/8cb83e85
|
||
// TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
|
||
// inside the URL.
|
||
let url = url.trim_end_matches(|c|
|
||
match c { '.' | ',' | ';' | ')' | '⟩' | '\'' | ':' | ']' | '}' => true, _ => false }
|
||
);
|
||
if url.len() < 10 {
|
||
return;
|
||
}
|
||
|
||
self.flush_to(st, start);
|
||
write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
|
||
self.flush_skip(st, start + url.len());
|
||
}
|
||
|
||
fn flush_ref(&self, st: &mut Flush, end: usize) {
|
||
// We know where the closing bracket is in the string, so this regex is used to search
|
||
// backwards from there and find the start of the reference.
|
||
// There are a lot of 'special' multi-character section names, so it might not make sense
|
||
// to parse all of them. Here's an estimate of a few 'special' section references, in
|
||
// number of man pages using the reference (using ~ '%(3pm)%' on the 2017-01-14 database):
|
||
// - 3pm 17810
|
||
// - 3w 8729 (just a few packages)
|
||
// - 3tcl 2000
|
||
// - 3tk 758
|
||
// - 3p 309
|
||
// - 3perl 268
|
||
// - 3ssl 198
|
||
lazy_static!(
|
||
// XXX: Make sure to keep this regex in sync with the one in flush()
|
||
static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl]|3tcl|3pm|3tk)\)$").unwrap();
|
||
);
|
||
|
||
// Disallow some characters following a reference
|
||
if self.buf.len() > end {
|
||
let ch = self.buf[end..].chars().next().unwrap();
|
||
if ch == '_' || ch.is_alphanumeric() {
|
||
return;
|
||
}
|
||
}
|
||
|
||
let m = REF.captures(&self.buf[..end]).unwrap();
|
||
self.flush_to(st, end - m[0].len());
|
||
self.flush_skip(st, end);
|
||
write!(st.out, "<a href=\"/{}.{}\">{}</a>", &m[1], &m[2], &m[0]).unwrap();
|
||
}
|
||
|
||
fn flush(&mut self, out: &mut String) {
|
||
self.fmt.push((self.buf.len(), FmtChar::Regular));
|
||
|
||
// Find the indices where the first line ends, and the last line starts. These are used to
|
||
// efficiently disable reference formatting on the first and last line.
|
||
let firstlineend = self.buf.find('\n').unwrap_or(self.buf.len());
|
||
let lastlinestart = self.buf.trim_end_matches('\n').rfind('\n').unwrap_or(0);
|
||
|
||
// This regex is used to quickly *find* interesting patterns, any further validation
|
||
// and processing is done afterwards by the (slower) specialized flush_ methods.
|
||
lazy_static!(
|
||
static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\(([1-9nl]|3tcl|3pm|3tk)\))").unwrap();
|
||
);
|
||
|
||
let mut st = Flush{
|
||
out: out,
|
||
idx: 0,
|
||
fmt: self.fmt.iter().peekable(),
|
||
};
|
||
|
||
for i in SEARCH.find_iter(&self.buf) {
|
||
// This can happen with overlapping detections, e.g. when something inside a URL looks
|
||
// like a man page reference.
|
||
if st.idx > i.start() {
|
||
continue;
|
||
}
|
||
let allowref = i.start() > firstlineend && i.start() < lastlinestart;
|
||
match self.buf.as_bytes()[i.end()-1] {
|
||
0x45 /* E */ => self.flush_include(&mut st, i.start(), i.end()),
|
||
0x2F /* / */ if allowref => self.flush_url(&mut st, i.start()),
|
||
_ if allowref => self.flush_ref(&mut st, i.end()),
|
||
_ => {}
|
||
}
|
||
}
|
||
self.flush_to(&mut st, self.buf.len());
|
||
}
|
||
}
|
||
|
||
|
||
pub fn grotty2html(input: &str) -> String {
|
||
let mut state = CharParse::Start;
|
||
|
||
let mut buf = FmtBuf{
|
||
buf: String::with_capacity(128),
|
||
fmt: Vec::with_capacity(128),
|
||
lastfmt: FmtChar::Regular,
|
||
};
|
||
|
||
for chr in input.chars() {
|
||
if let Some((chr, fmt)) = state.update(chr) {
|
||
buf.push(chr, fmt);
|
||
}
|
||
}
|
||
if let CharParse::Token(chr, fmt) = state {
|
||
buf.push(chr, fmt);
|
||
}
|
||
|
||
let mut out = String::with_capacity(input.len());
|
||
buf.flush(&mut out);
|
||
out
|
||
}
|
||
|
||
|
||
|
||
// C-compatible unsigned long long, used for lengths in the FFI wrapper below.
use std::os::raw::c_ulonglong;
// C-compatible handle to an owned Rust string: raw pointer + length + capacity.
// Returned by grotty2html_wrap() and must be released with grotty2html_free().
#[repr(C)]
pub struct StringWrap {
    buf: *mut u8,
    len: c_ulonglong,
    cap: c_ulonglong,
}
// FFI entry point: convert a grotty-formatted byte buffer to HTML.
// The returned StringWrap owns the result; the caller must pass it back to
// grotty2html_free() to release the memory.
#[no_mangle]
pub extern fn grotty2html_wrap(in_buf: *const u8, in_len: c_ulonglong) -> StringWrap {
    // SAFETY: the caller must supply a valid, readable (in_buf, in_len) region.
    // from_utf8_unchecked performs no validation, so the bytes must also be
    // valid UTF-8 — NOTE(review): non-UTF-8 input is undefined behavior here;
    // confirm the C caller guarantees UTF-8.
    let input = unsafe { std::str::from_utf8_unchecked( std::slice::from_raw_parts(in_buf, in_len as usize) ) };
    let mut out = grotty2html(input).into_bytes();
    let r = StringWrap {
        buf: out.as_mut_ptr(),
        len: out.len() as c_ulonglong,
        cap: out.capacity() as c_ulonglong,
    };
    // Ownership of the allocation is transferred to the caller via 'r';
    // forget() prevents the Vec's destructor from freeing it here.
    std::mem::forget(out);
    r
}
// FFI entry point: release a string previously returned by grotty2html_wrap().
#[no_mangle]
pub extern fn grotty2html_free(buf: StringWrap) {
    // SAFETY: buf must come from grotty2html_wrap() and be freed exactly once;
    // rebuilding the Vec and dropping it returns the allocation to Rust's allocator.
    unsafe { Vec::from_raw_parts(buf.buf, buf.len as usize, buf.cap as usize) };
}
/*
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Read;
    use test::Bencher;

    fn bench_file(b: &mut Bencher, f: &str) {
        let mut f = std::fs::File::open(f).unwrap();
        let mut buf = String::new();
        f.read_to_string(&mut buf).unwrap();

        b.iter(|| {
            test::black_box(grotty2html(&buf));
        });
    }

    #[bench]
    fn bench_rsync(b: &mut test::Bencher) {
        bench_file(b, "t/rsync.1.output");
    }

    #[bench]
    fn bench_ncdu(b: &mut test::Bencher) {
        bench_file(b, "t/ncdu.1.output");
    }

    #[bench]
    fn bench_javadoc(b: &mut test::Bencher) {
        bench_file(b, "t/javadoc.1.output");
    }

    /*
    #[bench]
    fn bench_wfilter(b: &mut test::Bencher) {
        bench_file(b, "t/wfilter.4.output");
    }
    */
}*/