Whole bunch of HTML conversion improvements

- Grotty escape sequences are now interpreted better. I feel rather
  stupid for not realizing earlier how those codes are supposed to
  work. It finally hit me when I read the BSD ul(1) source code.
- URL end detection is slightly better (much better than the old C code)
- Man page references containing ':' are now recognized (common in Perl modules).
- More efficient HTML escaping: '>' and '"' no longer need to be escaped.

There are still a bunch of improvements to make, but I already have much
more confidence in the current implementation.
This commit is contained in:
Yorhel 2017-01-15 17:07:03 +01:00
parent 6114b17389
commit 1ccc86ce86

View file

@ -15,51 +15,51 @@ enum FmtChar {
} }
/* Simple state machine to parse the following grammar: /* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is
* described as "old behaviour" in grotty(1). Roughly:
* *
* fmtchar = escape | double-escape | char * '_' BACKSPACE 'x' -> 'x' is italic
* escape = tag ESC char * 'x' BACKSPACE 'x' -> 'x' is bold
* double-escape = ESC tag ESC char * '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic
* tag = "_" # italic
* | char # bold
* *
* This format is described as "old behaviour" in grotty(1). The double-escape * And other combinations are possible. The BACKSPACE character basically says "combine the
* seems to be a weird glitch, and can be interpreted as * following character with previous token". Where "combining" means:
* "(tag ESC char) ESC (tag ESC char)". This parser simply skips over any such *
* sequence starting with ESC. */ * a == b -> bold
* a == _ -> b is italic
* b == _ -> a is italic
*
* See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to
* handle the (limited) output that grotty generates, we don't have to be fully compatible with
* ul(1).
*/
enum CharParse { enum CharParse {
Start, Start,
One(char), // Seen a single character (either 'char' or 'escape') Token(char, FmtChar),
Escape(char), // Seen a single character + escape Escape(char, FmtChar),
DoubleEsc(u32), // Inside a double-escape, indicates number of characters left to skip
} }
impl CharParse { impl CharParse {
fn update(&mut self, chr: char) -> Option<(char, FmtChar)> { fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
match *self { match *self {
CharParse::Start => { CharParse::Start => {
*self = if chr == 8 as char { CharParse::DoubleEsc(2) } else { CharParse::One(chr) }; *self = CharParse::Token(chr, FmtChar::Regular);
None None
}, },
CharParse::One(c) => CharParse::Token(c, f) =>
if chr == 8 as char { if chr == 8 as char {
*self = CharParse::Escape(c); *self = CharParse::Escape(c, f);
None None
} else { } else {
*self = CharParse::One(chr); *self = CharParse::Token(chr, FmtChar::Regular);
Some((c, FmtChar::Regular)) Some((c, f))
}, },
CharParse::Escape(c) => { CharParse::Escape(c, _) => {
*self = CharParse::Start; // TODO: Handle combination of bold & italic
Some((chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold })) *self = CharParse::Token(chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold });
},
CharParse::DoubleEsc(n) => {
*self = if n == 0 { CharParse::Start } else { CharParse::DoubleEsc(n-1) };
None None
}, },
} }
@ -107,11 +107,12 @@ impl FmtBuf {
self.fmt.push((self.buf.len(), self.lastfmt)); self.fmt.push((self.buf.len(), self.lastfmt));
self.lastfmt = fmt; self.lastfmt = fmt;
} }
// WARNING: The '"' character is not escaped, so care must be taken when copying a slice
// into an attribute value! (In the current implementation, " is simply never part of an
// attribute value)
match chr { match chr {
'>' => self.buf.push_str("&gt;"),
'<' => self.buf.push_str("&lt;"), '<' => self.buf.push_str("&lt;"),
'&' => self.buf.push_str("&amp;"), '&' => self.buf.push_str("&amp;"),
// '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code
_ => self.buf.push(chr), // <- 30% of the entire processing time is spent here. _ => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
} }
} }
@ -138,19 +139,19 @@ impl FmtBuf {
// Consume the input buffer until 'end' without generating output // Consume the input buffer until 'end' without generating output
fn flush_skip(&self, st: &mut Flush, end: usize) { fn flush_skip(&self, st: &mut Flush, end: usize) {
st.idx = end; st.idx = end;
while st.fmt.peek().unwrap().0 <= st.idx { while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx {
st.fmt.next(); st.fmt.next();
} }
} }
fn flush_include(&self, st: &mut Flush, start: usize, end: usize) { fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
lazy_static!( lazy_static!(
static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap(); static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap();
); );
let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return }; let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };
self.flush_to(st, start); self.flush_to(st, start);
st.out.push_str("\n&gt;&gt; Included man page: <a href=\"/"); st.out.push_str(">> Included manual page: <a href=\"/");
// Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with a Unicode dash // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with a Unicode dash
// when passed through groff, which we need to revert in order to get the link working. // when passed through groff, which we need to revert in order to get the link working.
// (Apparently it recognizes man page references and URLs, as it doesn't do this // (Apparently it recognizes man page references and URLs, as it doesn't do this
@ -167,12 +168,10 @@ impl FmtBuf {
fn flush_url(&self, st: &mut Flush, start: usize) { fn flush_url(&self, st: &mut Flush, start: usize) {
lazy_static!( lazy_static!(
// Some characters considered to never be part of a URL. // Some characters considered to never be part of a URL.
// (Note that we can't match literal ><" because of the HTML escaping done previously) // (Note that we can't match literal '<' because of the HTML escaping done previously)
static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap(); static ref URLEND: Regex = Regex::new("(?:\"|&lt;|>|\\s)").unwrap();
); );
let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return }; let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
self.flush_to(st, start);
let url = &self.buf[start..(start + urlend.start())]; let url = &self.buf[start..(start + urlend.start())];
// Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.: // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
@ -180,10 +179,14 @@ impl FmtBuf {
// - https://manned.org/pass/78413b49 // - https://manned.org/pass/78413b49
// - https://manned.org/empathy-accounts/8c05b2c1 // - https://manned.org/empathy-accounts/8c05b2c1
// - https://manned.org/urn/8cb83e85 // - https://manned.org/urn/8cb83e85
// TODO: Check the character before the start of the URL, and only remove ) if there is a // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
// starting ( before it. // inside the URL.
let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩'); let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\'');
if url.len() < 10 {
return;
}
self.flush_to(st, start);
write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap(); write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
self.flush_skip(st, start + url.len()); self.flush_skip(st, start + url.len());
} }
@ -192,13 +195,13 @@ impl FmtBuf {
// We know where the closing bracket is in the string, so this regex is used to search // We know where the closing bracket is in the string, so this regex is used to search
// backwards from there and find the start of the reference. // backwards from there and find the start of the reference.
lazy_static!( lazy_static!(
static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap(); static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap();
); );
// Disallow some characters following a reference // Disallow some characters following a reference
if self.buf.len() > end { if self.buf.len() > end {
let ch = self.buf[end..].chars().next().unwrap(); let ch = self.buf[end..].chars().next().unwrap();
if ch == '-' || ch == '_' || ch.is_alphanumeric() { if ch == '_' || ch.is_alphanumeric() {
return; return;
} }
} }
@ -220,7 +223,7 @@ impl FmtBuf {
// This regex is used to quickly *find* interesting patterns, any further validation // This regex is used to quickly *find* interesting patterns, any further validation
// and processing is done afterwards by the (slower) specialized flush_ methods. // and processing is done afterwards by the (slower) specialized flush_ methods.
lazy_static!( lazy_static!(
static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap(); static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap();
); );
let mut st = Flush{ let mut st = Flush{
@ -269,8 +272,8 @@ pub fn grotty2html(input: &str) -> String {
//} //}
} }
} }
if let CharParse::One(chr) = state { if let CharParse::Token(chr, fmt) = state {
buf.push(chr, FmtChar::Regular); buf.push(chr, fmt);
} }
let mut out = String::with_capacity(input.len()); let mut out = String::with_capacity(input.len());