//It's really unfortunate that benchmarking is still unstable. //#![feature(test)] //extern crate test; extern crate regex; #[macro_use] extern crate lazy_static; use std::fmt::Write; use regex::Regex; #[derive(Clone,Copy,PartialEq,Eq)] enum FmtChar { Regular, Italic, Bold, Both, } impl FmtChar { fn add(self, b: Self) -> Self { match (self, b) { (FmtChar::Regular, x) | (x, FmtChar::Regular) => x, (FmtChar::Italic, FmtChar::Bold) | (FmtChar::Bold, FmtChar::Italic) => FmtChar::Both, _ => self } } fn open(self) -> &'static str { match self { FmtChar::Regular => "", FmtChar::Italic => "", FmtChar::Bold => "", FmtChar::Both => "", } } fn close(self) -> &'static str { match self { FmtChar::Regular => "", FmtChar::Italic => "", FmtChar::Bold => "", FmtChar::Both => "", } } } /* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is * described as "old behaviour" in grotty(1). Roughly: * * '_' BACKSPACE 'x' -> 'x' is italic * 'x' BACKSPACE 'x' -> 'x' is bold * '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic * * And other combinations are possible. The BACKSPACE character basically says "combine the * following character with previous token". Where "combining" means: * * a == b -> bold * a == _ -> b is italic * b == _ -> a is italic * * See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to * handle the (limited) output that grotty generates, we don't have to be fully compatible with * ul(1). */ enum CharParse { Start, Token(char, FmtChar), Escape(char, FmtChar), } impl CharParse { fn update(&mut self, chr: char) -> Option<(char, FmtChar)> { match *self { CharParse::Start => { *self = CharParse::Token(chr, FmtChar::Regular); None }, CharParse::Token(c, f) => if chr == 8 as char { *self = CharParse::Escape(c, f); None } else { *self = CharParse::Token(chr, FmtChar::Regular); Some((c, f)) }, CharParse::Escape(c, f) => { *self = if c == '_' { CharParse::Token(chr, f.add(FmtChar::Italic)) } else if chr == '_' { CharParse::Token(c, f.add(FmtChar::Italic)) } else { CharParse::Token(chr, f.add(FmtChar::Bold)) }; None }, } } } fn pushfmt(out: &mut String, old: FmtChar, new: FmtChar) { if new != old { out.push_str(old.close()); out.push_str(new.open()); } } // Intermediate text buffer. This buffer contains the entire HTML-escaped man page and a list of // indices where text formatting changes are performed. struct FmtBuf { buf: String, // List of formatting chunks. The number indicates the character index where the formatting // ends. E.g. [(5,Regular),(10,Bold),(15,Italic)] means: // [0..5] is Regular // [5..10] is Bold // [10..15] is Italic fmt: Vec<(usize,FmtChar)>, lastfmt: FmtChar, } // Output state struct Flush<'a, 'b> { out: &'a mut String, idx: usize, // Last byte in the buffer that has been processed fmt: std::iter::Peekable>, // Iterator over FmtBuf.fmt } impl FmtBuf { fn push(&mut self, chr: char, fmt: FmtChar) { // Consider whitespace and underscore to have the same // formatting as the previous character; This generates smaller // HTML, and you can't see the difference anyway. if self.lastfmt != fmt && !(chr == ' ' || chr == '_') { self.fmt.push((self.buf.len(), self.lastfmt)); self.lastfmt = fmt; } // WARNING: The '"' character is not escaped, so care must be taken when copying a slice // into an attribute value! (In the current implementation, " is simply never part of an // attribute value) match chr { '<' => self.buf.push_str("<"), '&' => self.buf.push_str("&"), _ => self.buf.push(chr), // <- 30% of the entire processing time is spent here. } } // Flush all unprocessed bytes until 'end' to the output fn flush_to(&self, st: &mut Flush, end: usize) { let mut lastfmt = FmtChar::Regular; while st.idx < end { let &&(chunk, fmt) = st.fmt.peek().unwrap(); let chunk = if chunk > end { end } else { st.fmt.next(); chunk }; pushfmt(st.out, lastfmt, fmt); st.out.push_str(&self.buf[st.idx..chunk]); st.idx = chunk; lastfmt = fmt; } st.out.push_str(lastfmt.close()); } // Consume the input buffer until 'end' without generating output fn flush_skip(&self, st: &mut Flush, end: usize) { st.idx = end; while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx { st.fmt.next(); } } fn flush_include(&self, st: &mut Flush, start: usize, end: usize) { lazy_static!( static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap(); ); let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return }; self.flush_to(st, start); st.out.push_str(">> Included manual page: "); st.out.push_str(&m[1]); st.out.push_str(""); self.flush_skip(st, end + m[0].len()); } fn flush_url(&self, st: &mut Flush, start: usize) { lazy_static!( // Some characters considered to never be part of a URL. // (Note that we can't match literal '<' because of the HTML escaping done previously) static ref URLEND: Regex = Regex::new("(?:\"|<|>|\\s)").unwrap(); ); let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return }; let url = &self.buf[start..(start + urlend.start())]; // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.: // - https://manned.org/troff/c4467840 // - https://manned.org/pass/78413b49 // - https://manned.org/empathy-accounts/8c05b2c1 // - https://manned.org/urn/8cb83e85 // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching ( // inside the URL. let url = url.trim_end_matches(|c| match c { '.' | ',' | ';' | ')' | '⟩' | '\'' | ':' | ']' | '}' => true, _ => false } ); if url.len() < 10 { return; } self.flush_to(st, start); write!(st.out, "{0}", url).unwrap(); self.flush_skip(st, start + url.len()); } fn flush_ref(&self, st: &mut Flush, end: usize) { // We know where the closing bracket is in the string, so this regex is used to search // backwards from there and find the start of the reference. // There are a lot of 'special' multi-character section names, so it might not make sense // to parse all of them. Here's an estimate of a few 'special' section references, in // number of man pages using the reference (using ~ '%(3pm)%' on the 2017-01-14 database): // - 3pm 17810 // - 3w 8729 (just a few packages) // - 3tcl 2000 // - 3tk 758 // - 3p 309 // - 3perl 268 // - 3ssl 198 lazy_static!( // XXX: Make sure to keep this regex in sync with the one in flush() static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)$([1-9nl]|3tcl|3pm|3tk)$$").unwrap(); ); // Disallow some characters following a reference if self.buf.len() > end { let ch = self.buf[end..].chars().next().unwrap(); if ch == '_' || ch.is_alphanumeric() { return; } } let m = REF.captures(&self.buf[..end]).unwrap(); self.flush_to(st, end - m[0].len()); self.flush_skip(st, end); write!(st.out, "{}", &m[1], &m[2], &m[0]).unwrap(); } fn flush(&mut self, out: &mut String) { self.fmt.push((self.buf.len(), FmtChar::Regular)); // Find the indices where the first line ends, and the last line starts. These are used to // efficiently disable reference formatting on the first and last line. let firstlineend = self.buf.find('\n').unwrap_or(self.buf.len()); let lastlinestart = self.buf.trim_end_matches('\n').rfind('\n').unwrap_or(0); // This regex is used to quickly *find* interesting patterns, any further validation // and processing is done afterwards by the (slower) specialized flush_ methods. lazy_static!( static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+$([1-9nl]|3tcl|3pm|3tk)$)").unwrap(); ); let mut st = Flush{ out: out, idx: 0, fmt: self.fmt.iter().peekable(), }; for i in SEARCH.find_iter(&self.buf) { // This can happen with overlapping detections, e.g. when something inside a URL looks // like a man page reference. if st.idx > i.start() { continue; } let allowref = i.start() > firstlineend && i.start() < lastlinestart; match self.buf.as_bytes()[i.end()-1] { 0x45 /* E */ => self.flush_include(&mut st, i.start(), i.end()), 0x2F /* / */ if allowref => self.flush_url(&mut st, i.start()), _ if allowref => self.flush_ref(&mut st, i.end()), _ => {} } } self.flush_to(&mut st, self.buf.len()); } } pub fn grotty2html(input: &str) -> String { let mut state = CharParse::Start; let mut buf = FmtBuf{ buf: String::with_capacity(128), fmt: Vec::with_capacity(128), lastfmt: FmtChar::Regular, }; for chr in input.chars() { if let Some((chr, fmt)) = state.update(chr) { buf.push(chr, fmt); } } if let CharParse::Token(chr, fmt) = state { buf.push(chr, fmt); } let mut out = String::with_capacity(input.len()); buf.flush(&mut out); out } use std::os::raw::c_ulonglong; #[repr(C)] pub struct StringWrap { buf: *mut u8, len: c_ulonglong, cap: c_ulonglong, } #[no_mangle] pub extern fn grotty2html_wrap(in_buf: *const u8, in_len: c_ulonglong) -> StringWrap { let input = unsafe { std::str::from_utf8_unchecked( std::slice::from_raw_parts(in_buf, in_len as usize) ) }; let mut out = grotty2html(input).into_bytes(); let r = StringWrap { buf: out.as_mut_ptr(), len: out.len() as c_ulonglong, cap: out.capacity() as c_ulonglong, }; std::mem::forget(out); r } #[no_mangle] pub extern fn grotty2html_free(buf: StringWrap) { unsafe { Vec::from_raw_parts(buf.buf, buf.len as usize, buf.cap as usize) }; } /* #[cfg(test)] mod tests { use super::*; use std::io::Read; use test::Bencher; fn bench_file(b: &mut Bencher, f: &str) { let mut f = std::fs::File::open(f).unwrap(); let mut buf = String::new(); f.read_to_string(&mut buf).unwrap(); b.iter(|| { test::black_box(grotty2html(&buf)); }); } #[bench] fn bench_rsync(b: &mut test::Bencher) { bench_file(b, "t/rsync.1.output"); } #[bench] fn bench_ncdu(b: &mut test::Bencher) { bench_file(b, "t/ncdu.1.output"); } #[bench] fn bench_javadoc(b: &mut test::Bencher) { bench_file(b, "t/javadoc.1.output"); } /* #[bench] fn bench_wfilter(b: &mut test::Bencher) { bench_file(b, "t/wfilter.4.output"); } */ }*/