Whole bunch of HTML conversion improvements

- Grotty escape sequences are now better interpreted. I feel rather stupid for not realizing earlier how those codes are supposed to work; it finally hit me when I read the BSD ul(1) source code.
- URL end detection is slightly better (much better than the old C code).
- Man page references containing ':' are now recognized (common in Perl modules).
- More efficient HTML escaping: there is no need to escape > and ".

There's still a bunch of improvements to make, but I already have much more confidence in the current implementation.
2017-01-15 17:07:03 +01:00 · 2017-01-15 17:07:03 +01:00 · 1ccc86ce86
commit 1ccc86ce86
parent 6114b17389
1 changed file with 46 additions and 43 deletions
--- a/web/src/lib.rs
+++ b/web/src/lib.rs
@@ -15,51 +15,51 @@ enum FmtChar {
 }


-/* Simple state machine to parse the following grammar:
+/* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is
+ * described as "old behaviour" in grotty(1). Roughly:
 *
- * fmtchar       = escape | double-escape | char
- * escape        = tag ESC char
- * double-escape = ESC tag ESC char
- * tag           = "_"  # italic
- *               | char # bold
+ *   '_' BACKSPACE 'x'               -> 'x' is italic
+ *   'x' BACKSPACE 'x'               -> 'x' is bold
+ *   '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic
 *
- * This format is described as "old behaviour" in grotty(1).  The double-escape
- * seems to be a weird glitch, and can be interpreted as
- * "(tag ESC char) ESC (tag ESC char)".  This parser simply skips over any such
- * sequence starting with ESC. */
+ * And other combinations are possible. The BACKSPACE character basically says "combine the
+ * following character with previous token". Where "combining" means:
+ *
+ *   a == b   -> bold
+ *   a == _   -> b is italic
+ *   b == _   -> a is italic
+ *
+ * See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to
+ * handle the (limited) output that grotty generates, we don't have to be fully compatible with
+ * ul(1).
+ */
 enum CharParse {
    Start,
-    One(char),      // Seen a single character (either 'char' or 'escape')
-    Escape(char),   // Seen a single character + escape
-    DoubleEsc(u32), // Inside a double-escape, indicates number of characters left to skip
+    Token(char, FmtChar),
+    Escape(char, FmtChar),
 }


 impl CharParse {
    fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
        match *self {
-
            CharParse::Start => {
-                *self = if chr == 8 as char { CharParse::DoubleEsc(2) } else { CharParse::One(chr) };
+                *self = CharParse::Token(chr, FmtChar::Regular);
                None
            },

-            CharParse::One(c) =>
+            CharParse::Token(c, f) =>
                if chr == 8 as char {
-                    *self = CharParse::Escape(c);
+                    *self = CharParse::Escape(c, f);
                    None
                } else {
-                    *self = CharParse::One(chr);
-                    Some((c, FmtChar::Regular))
+                    *self = CharParse::Token(chr, FmtChar::Regular);
+                    Some((c, f))
                },

-            CharParse::Escape(c) => {
-                *self = CharParse::Start;
-                Some((chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold }))
-            },
-
-            CharParse::DoubleEsc(n) => {
-                *self = if n == 0 { CharParse::Start } else { CharParse::DoubleEsc(n-1) };
+            CharParse::Escape(c, _) => {
+                // TODO: Handle combination of bold & italic
+                *self = CharParse::Token(chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold });
                None
            },
        }
@@ -107,11 +107,12 @@ impl FmtBuf {
            self.fmt.push((self.buf.len(), self.lastfmt));
            self.lastfmt = fmt;
        }
+        // WARNING: The '"' character is not escaped, so care must be taken when copying a slice
+        // into an attribute value! (In the current implementation, " is simply never part of an
+        // attribute value)
        match chr {
-            '>' => self.buf.push_str("&gt;"),
            '<' => self.buf.push_str("&lt;"),
            '&' => self.buf.push_str("&amp;"),
-            // '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code
            _   => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
        }
    }
@@ -138,19 +139,19 @@ impl FmtBuf {
    // Consume the input buffer until 'end' without generating output
    fn flush_skip(&self, st: &mut Flush, end: usize) {
        st.idx = end;
-        while st.fmt.peek().unwrap().0 <= st.idx {
+        while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx {
            st.fmt.next();
        }
    }

    fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
        lazy_static!(
-            static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap();
+            static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap();
        );
        let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };

        self.flush_to(st, start);
-        st.out.push_str("\n&gt;&gt; Included man page: <a href=\"/");
+        st.out.push_str(">> Included manual page: <a href=\"/");
        // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an Unicode dash
        // when passed through groff, which we need to revert in order to get the link working.
        // (Apparently it recognizes man page references and URLs, as it doesn't do this
@@ -167,12 +168,10 @@ impl FmtBuf {
    fn flush_url(&self, st: &mut Flush, start: usize) {
        lazy_static!(
            // Some characters considered to never be part of a URL.
-            // (Note that we can't match literal ><" because of the HTML escaping done previously)
-            static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap();
+            // (Note that we can't match literal '<' because of the HTML escaping done previously)
+            static ref URLEND: Regex = Regex::new("(?:\"|&lt;|>|\\s)").unwrap();
        );
        let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
-
-        self.flush_to(st, start);
        let url = &self.buf[start..(start + urlend.start())];

        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
@@ -180,10 +179,14 @@ impl FmtBuf {
        // - https://manned.org/pass/78413b49
        // - https://manned.org/empathy-accounts/8c05b2c1
        // - https://manned.org/urn/8cb83e85
-        // TODO: Check the character before the start of the URL, and only remove ) if there is a
-        // starting ( before it.
-        let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩');
+        // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
+        // inside the URL.
+        let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\'');
+        if url.len() < 10 {
+            return;
+        }

+        self.flush_to(st, start);
        write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
        self.flush_skip(st, start + url.len());
    }
@@ -192,13 +195,13 @@ impl FmtBuf {
        // We know where the closing bracket is in the string, so this regex is used to search
        // backwards from there and find the start of the reference.
        lazy_static!(
-            static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap();
+            static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap();
        );

        // Disallow some characters following a reference
        if self.buf.len() > end {
            let ch = self.buf[end..].chars().next().unwrap();
-            if ch == '-' || ch == '_' || ch.is_alphanumeric() {
+            if ch == '_' || ch.is_alphanumeric() {
                return;
            }
        }
@@ -220,7 +223,7 @@ impl FmtBuf {
        // This regex is used to quickly *find* interesting patterns, any further validation
        // and processing is done afterwards by the (slower) specialized flush_ methods.
        lazy_static!(
-            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap();
+            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap();
        );

        let mut st = Flush{
@@ -269,8 +272,8 @@ pub fn grotty2html(input: &str) -> String {
            //}
        }
    }
-    if let CharParse::One(chr) = state {
-        buf.push(chr, FmtChar::Regular);
+    if let CharParse::Token(chr, fmt) = state {
+        buf.push(chr, fmt);
    }

    let mut out = String::with_capacity(input.len());