Experimental rewrite of grotty to html conversion in Rust

The previous C code was troublesome: (1) it didn't handle long lines; (2) I couldn't convince myself that it was free of memory safety issues; (3) it needed improving anyway, as there are some formatting bugs that are hard to fix in the current code. I mostly replicated the formatting bugs of the old C implementation in Rust, and possibly added a few new bugs as well. It's not a significant improvement right now; more testing and fixing will be needed. The performance of both implementations is comparable, with the Rust version being slightly faster in many cases (and slower in some others). I did spend more time trying to optimize this Rust version than I did with the old C code. I initially tried a naive-ish conversion of the C code to Rust, but that turned out to be much slower and I had to resort to using regexes and different data structures to fix that.
2017-01-15 10:54:48 +01:00 · 2017-01-15 10:54:48 +01:00 · 6114b17389
commit 6114b17389
parent 8a3af4aee2
10 changed files with 534 additions and 300 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,4 +3,4 @@
 !/lib/ManUtils/ManUtils.pm
 !/lib/ManUtils/ManUtils.xs
 indexer/target
-
+web/target
--- a/17
+++ b/17
@ -2,18 +2,27 @@
 all: ManUtils indexer
 ManUtils: lib/ManUtils/Build
 	cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
-lib/ManUtils/Build: lib/ManUtils/Build.PL
+ManUtils: lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
-	cd lib/ManUtils && perl Build.PL
+
 lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm: lib/ManUtils/Build.PL lib/ManUtils/ManUtils.pm lib/ManUtils/ManUtils.xs web/target/release/libweb.a
 	test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs
 	cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
 	touch lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
 web/target/release/libweb.a: web/Cargo.toml web/src/*.rs
 	cd web && cargo build --release
 	#strip --strip-unneeded web/target/release/libweb.a
 indexer: indexer/target/release/indexer
 indexer/target/release/indexer: indexer/Cargo.toml indexer/src/*.rs
 	cd indexer && cargo build --release
 clean:
 	cd lib/ManUtils && ./Build distclean
 	rm -rf lib/ManUtils/inst
 	cd indexer && cargo clean
 	cd web && cargo clean
--- a/6
+++ b/6
@ -13,10 +13,11 @@ Requirements
  General:
    perl: A somewhat recent version (no idea which, due to my XS usage)
    postgresql: Also a somewhat recent version
    rust + cargo (1.13+)
  www/ & lib/ & webs/: (Website)
    DBI
    DBD::Pg
  www/: (Website)
    TUWF
    JSON::XS
    AnyEvent
@ -24,7 +25,6 @@ Requirements
  util/ & indexer/: (DB updating and package synchronisation stuff)
    curl
    psql
    cargo + rust (1.13+)
 Contact
--- a/lib/ManUtils/Build.PL
+++ b/lib/ManUtils/Build.PL
@ -6,6 +6,8 @@ Module::Build->new(
  dist_name => 'ManUtils',
  dist_version_from => 'ManUtils.pm',
  dist_abstract => 'Utils for manned.org',
  license => 'MIT',
  extra_linker_flags => '../../web/target/release/libweb.a',
  pm_files => {
    'ManUtils.pm' => 'lib/ManUtils.pm',
  },
--- a/lib/ManUtils/ManUtils.pm
+++ b/lib/ManUtils/ManUtils.pm
@ -32,11 +32,12 @@ sub fmt {
  # Other .so's should be handled by html()
  $input =~ s/^\.so (.+)$/.in -10\n.sp\n\[\[\[MANNEDINCLUDE$1\]\]\]/mg;
-  # Disable hyphenation, since that screws up man page references. :-(
+  $input =
-  $input = ".hy 0\n.de hy\n..\n$input";
+    # Disable hyphenation, since that screws up man page references. :-(
-
+     ".hy 0\n.de hy\n..\n"
-  # Emulate man-db's --nj option
+    # Emulate man-db's --nj option
-  $input = ".na\n.de ad\n..\n$input";
+    .".na\n.de ad\n..\n"
    .$input;
  $input = encode_utf8($input);
@ -100,6 +101,4 @@ sub fmt_block {
  $out;
 }
 1;
--- a/lib/ManUtils/ManUtils.xs
+++ b/lib/ManUtils/ManUtils.xs
@ -2,278 +2,13 @@
 #include "perl.h"
 #include "XSUB.h"
 // Buffer descriptor exchanged with the Rust static library. The layout has to
 // stay in sync with the #[repr(C)] StringWrap struct in web/src/lib.rs.
 struct StringWrap {
  // Start of the buffer.
  char *buf;
  // Number of bytes in use, and number of bytes allocated.
  unsigned long long len, cap;
 };
-// Convert grotty output to HTML for use in a <pre> tag.
+struct StringWrap grotty2html_wrap(const char *, unsigned long long);
-// It is assumed that the given input string is valid UTF-8, either represented
+void grotty2html_free(struct StringWrap);
 // as a Perl Unicode string, or as a UTF-8 encoded byte string. The data may
 // not contain the 0 character.
 // The formatted HTML is returned as a Perl Unicode string.
 // It is also assumed that hyphenation has been disabled when generating the
 // grotty output.
 // This implementation really is fast enough for "real-time" use in the website
 // code, very much unlike my experiments with Perl. My previous Perl
 // implementation took about 1.5s for rsync(1), whereas I've not seen this
 // implementation take more than 15ms.
 // TODO: Unicode characters aren't truncated correctly when a line exceeds
 // MAXLINE bytes. I've only seen this happening on man pages that grotty
 // couldn't wrap, e.g. some Japanese and Chinese mans.
 // (Ideally, I'd tell grotty how to wrap those correctly)
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #define MAXLINE 1024
 #define LB 1
 #define LI 2
 // Conversion state shared by all helpers below: the remaining input, the
 // output string, and the line that is currently being assembled.
 typedef struct ctx_t {
  const char *src; // Pointer to the source data, or what's left of it.
  SV *dest; // Destination string to write to.
  // Current line
  char line[MAXLINE];
  char flags[MAXLINE]; // 0 = no fmt, LB = bold, LI = italic. (No combinations allowed)
  int linelen; // Number of bytes currently stored in line[] and flags[].
  int noref; // 1 if the current line shouldn't be checked for references. (Used for first and last line)
 } ctx_t;
 // Appends one displayed byte to the output string, substituting the HTML
 // entity for '>', '<' and '&'.
 static inline void flushescape(ctx_t *x, char c) {
  // The double quote is intentionally passed through unchanged: the output is
  // never placed inside an XML attribute, so &quot; is not needed here.
  if(c == '>')
    sv_catpvn(x->dest, "&gt;", 4);
  else if(c == '<')
    sv_catpvn(x->dest, "&lt;", 4);
  else if(c == '&')
    sv_catpvn(x->dest, "&amp;", 5);
  else
    sv_catpvn(x->dest, &c, 1);
 }
 // Writes the bytes in [s, e) to the output string, HTML-escaped and wrapped in
 // <b>/<i> tags according to the parallel flags array f. The chunk is
 // self-contained: no formatting is assumed to be open when it starts, and any
 // tag that is still open when e is reached gets closed.
 // e points one past the last byte of the chunk.
 static void flushchunk(ctx_t *x, const char *s, const char *f, const char *e) {
  int cur = 0; // Formatting that is currently open: 0, LB or LI.
  for(; s != e; s++, f++) {
    // Underscores and spaces inherit the formatting of the preceding byte.
    // The grotty escape sequences don't work well for the underscore, and the
    // difference is invisible anyway.
    if(cur != *f && *s != '_' && *s != ' ') {
      if(cur)
        sv_catpvn(x->dest, cur == LB ? "</b>" : "</i>", 4);
      cur = *f;
      if(cur)
        sv_catpvn(x->dest, cur == LB ? "<b>" : "<i>", 3);
    }
    flushescape(x, *s);
  }
  if(cur)
    sv_catpvn(x->dest, cur == LB ? "</b>" : "</i>", 4);
 }
 #define ismanchar(x) (isalnum(x) || x == '_' || x == '-' || x == '.')
 // Emits the "Included manual page" link for a [[[MANNEDINCLUDE...]]] line.
 // The caller (flushline) has already verified the prefix and the "]]]"
 // suffix. The line buffer is modified in place.
 // NOTE(review): neither the href nor the link text goes through
 // flushescape(), so the path is written to the output unescaped.
 static void flushinclude(ctx_t *x) {
  char buf[8] = {};
  char *s = x->line;
  // Cut off the trailing "]]]" and skip the 16-byte "[[[MANNEDINCLUDE" prefix,
  // leaving s pointing at the included path.
  s[x->linelen-3] = 0;
  s += 16;
  // The link target is the last path component only.
  char *fn = strrchr(s, '/');
  fn = fn ? fn+1 : s;
  sv_catpv(x->dest, "&gt;&gt; Included manual page: <a href=\"/");
  // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an
  // Unicode dash when passed through groff, which we need to revert in order
  // to get the link working. (Apparently it recognizes man page references and
  // URLs, as it doesn't do this replacement in those situations.)
  while(*fn) {
    // 0xe2 0x80 0x90 is the UTF-8 encoding of U+2010.
    if(*fn == (char)0xe2 && fn[1] == (char)0x80 && fn[2] == (char)0x90) {
      buf[0] = '-';
      fn += 3;
    } else {
      buf[0] = *fn;
      fn++;
    }
    sv_catpvn(x->dest, buf, 1);
  }
  sv_catpv(x->dest, "\">");
  // The link text is the full path as it appeared in the directive.
  sv_catpv(x->dest, s);
  sv_catpv(x->dest, "</a>");
 }
 // HTML-escapes and "Flushes" the current line to the output string, followed
 // by a newline. Man page references and URLs are converted into links unless
 // x->noref is set. A [[[MANNEDINCLUDE..]]] line is handed to flushinclude().
 // The line buffer must be NUL-terminated; it is temporarily modified in place
 // while links are written.
 static void flushline(ctx_t *x) {
  static const char eol[] = "\n";
  // s is the scan position, es is the start of the text that has not been
  // written to the output yet.
  char *s = x->line, *es = x->line;
  // Special-case [[[MANNEDINCLUDE ..]]] directive
  if(x->linelen > 20 && *s == '[' && strncmp(s, "[[[MANNEDINCLUDE", 16) == 0 && strcmp("]]]", s+x->linelen-3) == 0) {
    flushinclude(x);
    goto end;
  }
  if(x->noref) {
    flushchunk(x, x->line, x->flags, x->line+x->linelen);
    goto end;
  }
  // flush(end): write everything from es up to (not including) end as a
  // formatted chunk, then advance es to end.
 #define flush(end) do {\
    flushchunk(x, es, x->flags+(es-x->line), end);\
    es = end;\
  } while(0)
  while(*s) {
    // Man page reference.
    // Detected by the "(x)", but then checked backwards in the buffer to find
    // the start of the reference. This is pretty fast. Fails on:
    // - JSON.3pm: JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
    if(*s == '(' && (('1' <= s[1] && s[1] <= '9') || s[1] == 'n') && s[2] == ')' && !isalnum(s[3])) {
      // Walk backwards over the name, but never past the text that has already
      // been flushed.
      char *n = s-1;
      while(n >= es && ismanchar(*n))
        n--;
      if(++n < s) {
        flush(n);
        // Overwrite the '(' with NUL so that n can be printed as a C string.
        *s = 0;
        sv_catpvf(x->dest, "<a href=\"/%s.%c\">%s(%c)</a>", n, s[1], n, s[1]);
        s += 3;
        es = s;
        continue;
      }
    }
    // HTTP(s) URL.
    // This is just a simple q{https?://[^ ][.,;"\)>]?( |$)} match, doesn't
    // always work right, e.g.:
    // - https://manned.org/spu_run/414316a1 -> URL wrapped to new line
    // Note: Don't use strncmp() before manually checking for 'http'. The parse
    // time is otherwise increased by a factor 2.
    if(s[0] == 'h' && s[1] == 't' && s[2] == 't' && s[3] == 'p' && (strncmp(s, "http://", 7) == 0 || strncmp(s, "https://", 8) == 0)) {
      // Find the end of the URL (space or some other weird character).
      char *sep = s;
      while(*sep && *sep != '>' && *sep != '<' && *sep != ' ' && *sep != '"')
        sep++;
      char *sp = sep;
      // Only candidates longer than 10 bytes are turned into links.
      if(sp > s+10) {
        flush(s);
        // Terminate the URL in place; endchr remembers the byte that was
        // overwritten so it can be restored after the link has been written.
        char endchr = *sp;
        *(sp--) = 0;
        // A single trailing punctuation character is not part of the URL.
        if(*sp == '.' || *sp == ',' || *sp == ';' || *sp == ')') {
          sp[1] = endchr;
          endchr = *sp;
          *(sp--) = 0;
        }
        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
        // - https://manned.org/troff/c4467840
        // - https://manned.org/pass/78413b49
        // - https://manned.org/empathy-accounts/8c05b2c1
        // - https://manned.org/urn/8cb83e85
        // - https://manned.org/wine/4a699a22
        if(*sp == '\xa9' && *(sp-1) == '\x9f' && *(sp-2) == '\xe2') {
          sp[1] = endchr;
          sp -= 3;
          endchr = sp[1];
          sp[1] = 0;
        }
        sv_catpvf(x->dest, "<a href=\"%s\" rel=\"nofollow\">%s</a>", s, s);
        // Restore the overwritten byte and continue scanning right after the URL.
        *(++sp) = endchr;
        es = s = sp;
        continue;
      }
    }
    s++;
  }
  // Write whatever is left after the last link.
  flush(s);
 #undef flush
 end:
  sv_catpvn(x->dest, eol, sizeof(eol)-1);
 }
 // Adds a byte (with its formatting flag f) to the current line. The line is
 // handed to flushline() when a '\n' is seen or when the line buffer is full;
 // '\r' is dropped.
 // TODO: Convert \t into spaces? The rest of the code is written with the
 // assumption that \t does not occur in the string. I've not seen grotty output
 // tabs yet, but it's still a good idea to define what *we* do with tabs.
 static void appendline(ctx_t *x, char c, char f) {
  if(c == '\r')
    return;
  // line[] and flags[] hold MAXLINE bytes and flushline() needs a terminating
  // NUL, so at most MAXLINE-1 bytes may be buffered. The previous bound
  // (linelen > MAXLINE+1) allowed writes past the end of both arrays.
  if(c == '\n' || x->linelen >= MAXLINE-1) {
    x->line[x->linelen] = 0;
    flushline(x);
    x->linelen = 0;
    x->noref = 0;
    if(c == '\n')
      return;
  }
  x->line[x->linelen] = c;
  x->flags[x->linelen] = f;
  x->linelen++;
 }
 // Parses the grotty escapes and calls appendline() for each character.
 // Walks x->src until the terminating NUL, decoding the "tag, byte 8, char"
 // overstrike sequences into formatting flags, and finally forces the last
 // line out. The "escape character" mentioned below is the byte with value 8.
 // NOTE(review): UTF8SKIP comes from the Perl headers; it is used here as the
 // byte length of the UTF-8 sequence starting at the given pointer.
 static void parselines(ctx_t *x) {
  // NOTE(review): ini and inb are never used in this function.
  int i, ini = 0, inb = 0;
  const char *buf = x->src;
  while(*buf) {
    int c1 = UTF8SKIP(buf);
    // Escape character right after a formatting code? Ignore the escape
    // character and formatting code after that. Grotty sometimes
    // double-formats a character, so you get "f ESC c ESC f ESC c", which you
    // should read as "(f ESC c) ESC (f ESC c)".
    if(*buf == 8 && buf[1] && buf[1+UTF8SKIP(buf+1)] == 8 && buf[2+UTF8SKIP(buf+1)]) {
      int c2 = UTF8SKIP(buf+1);
      buf += 2 + c2 + UTF8SKIP(buf+1+c2);
      continue;
    }
    // Formatting code
    if(buf[c1] == 8 && buf[c1+1]) {
      int c2 = UTF8SKIP(buf+c1+1);
      // Every byte of the displayed character gets the same flag: italic when
      // the tag is an underscore, bold otherwise.
      for(i=0; i<c2; i++)
        appendline(x, buf[c1+i+1], *buf == '_' ? LI : LB);
      buf += c1+c2+1;
      continue;
    }
    // Regular character
    // A newline that is the very last byte of the input marks the final line,
    // which is not checked for references.
    if(*buf == '\n' && !buf[1])
      x->noref = 1;
    appendline(x, *buf, 0);
    buf++;
  }
  // Flush whatever is still buffered, again without reference detection.
  x->noref = 1;
  appendline(x, '\n', 0);
 }
 MODULE = ManUtils	 PACKAGE = ManUtils
@ -281,20 +16,13 @@ MODULE = ManUtils	 PACKAGE = ManUtils
 SV *
 html(str)
  SV *str
  INIT:
    ctx_t *x = malloc(sizeof(ctx_t));
  CODE:
-    x->src = SvPV_nolen(str);
+    STRLEN len;
-    x->dest = newSVpv("", 0);
+    char *inbuf = SvPV(str, len);
-    x->linelen = 0;
+    struct StringWrap buf = grotty2html_wrap(inbuf, len);
-    x->noref = 1;
+    SV *dest = newSVpv(buf.buf, buf.len);
-    parselines(x);
+    grotty2html_free(buf);
-    // Set the UTF8 flag *after* generating the result string. For some reason
+    SvUTF8_on(dest);
-    // that prevents sv_catpvf() from interpreting our C strings as something
+    RETVAL = dest;
    // other than UTF-8.
    SvUTF8_on(x->dest);
    RETVAL = x->dest;
    free(x);
  OUTPUT:
    RETVAL
--- a/web/Cargo.lock
+++ b/web/Cargo.lock
@ -0,0 +1,121 @@
 [root]
 name = "web"
 version = "0.1.0"
 dependencies = [
 "lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "aho-corasick"
 version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "kernel32-sys"
 version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
 "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "lazy_static"
 version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "libc"
 version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "memchr"
 version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "regex"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "regex-syntax"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "thread-id"
 version = "3.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "thread_local"
 version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "unreachable"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
 "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 [[package]]
 name = "utf8-ranges"
 version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "void"
 version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "winapi"
 version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "winapi-build"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 [metadata]
 "checksum aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4f660b942762979b56c9f07b4b36bb559776fbad102f05d6771e1b629e8fd5bf"
 "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
 "checksum lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6abe0ee2e758cd6bc8a2cd56726359007748fbf4128da998b65d0b70f881e19b"
 "checksum libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)" = "9e030dc72013ed68994d1b2cbf36a94dd0e58418ba949c4b0db7eeb70a7a6352"
 "checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4"
 "checksum regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4278c17d0f6d62dfef0ab00028feb45bd7d2102843f80763474eeb1be8a10c01"
 "checksum regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9191b1f57603095f105d317e375d19b1c9c5c3185ea9633a99a6dcbed04457"
 "checksum thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4437c97558c70d129e40629a5b385b3fb1ffac301e63941335e4d354081ec14a"
 "checksum thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7793b722f0f77ce716e7f1acf416359ca32ff24d04ffbac4269f44a4a83be05d"
 "checksum unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1f2ae5ddb18e1c92664717616dd9549dde73f539f01bd7b77c2edb2446bdff91"
 "checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122"
 "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
 "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
 "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
--- a/web/Cargo.toml
+++ b/web/Cargo.toml
@ -0,0 +1,16 @@
 [package]
 name = "web"
 version = "0.1.0"
 authors = ["yorhel"]
 [lib]
 name = "web"
 crate-type = ["lib", "staticlib"]
 [dependencies]
 regex = "0.2.1"
 lazy_static = "0.2.2"
 # Add debugging symbols even in release mode, in order to help with profiling.
 [profile.release]
 debug = true
--- a/web/src/lib.rs
+++ b/web/src/lib.rs
@ -0,0 +1,348 @@
 #![feature(test)]
 extern crate test;
 extern crate regex;
 #[macro_use] extern crate lazy_static;
 use std::fmt::Write;
 use regex::Regex;
 /// Text style attached to a single output character.
 ///
 /// `Debug` is derived so that values can be used in `assert_eq!` and printed while debugging.
 #[derive(Debug,Clone,Copy,PartialEq,Eq)]
 enum FmtChar {
     /// Plain text, no markup.
     Regular,
     /// Italic text, emitted inside `<i>` tags.
     Italic,
     /// Bold text, emitted inside `<b>` tags.
     Bold,
 }
 /* Simple state machine to parse the following grammar:
 *
 * fmtchar       = escape | double-escape | char
 * escape        = tag ESC char
 * double-escape = ESC tag ESC char
 * tag           = "_"  # italic
 *               | char # bold
 *
 * "ESC" in this grammar is the backspace character (U+0008), which is what the
 * parser compares against.
 *
 * This format is described as "old behaviour" in grotty(1).  The double-escape
 * seems to be a weird glitch, and can be interpreted as
 * "(tag ESC char) ESC (tag ESC char)".  This parser simply skips over any such
 * sequence starting with ESC. */
 enum CharParse {
     // Nothing is pending; the next character starts a new fmtchar.
     Start,
     One(char),      // Seen a single character (either 'char' or the 'tag' of an 'escape')
     Escape(char),   // Seen a single character + escape; the next character is the one to display
     DoubleEsc(u32), // Inside a double-escape; the next n+1 characters are skipped
 }
 impl CharParse {
     /// Feeds one input character into the state machine.
     ///
     /// Returns a finished `(character, style)` pair when one becomes available. The parser
     /// runs one character behind the input, so a plain character is only reported once its
     /// successor has been seen.
     fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
         // grotty's overstrike marker is the backspace character.
         const ESC: char = '\u{8}';
         let (next, emitted) = match *self {
             // A backspace with nothing pending starts a double-escape, which is skipped.
             CharParse::Start if chr == ESC => (CharParse::DoubleEsc(2), None),
             CharParse::Start => (CharParse::One(chr), None),
             // The pending character turns out to be a formatting tag.
             CharParse::One(prev) if chr == ESC => (CharParse::Escape(prev), None),
             // The pending character was plain text; the new one becomes pending.
             CharParse::One(prev) => (CharParse::One(chr), Some((prev, FmtChar::Regular))),
             // An underscore tag means italic, any other tag means bold.
             CharParse::Escape('_') => (CharParse::Start, Some((chr, FmtChar::Italic))),
             CharParse::Escape(_) => (CharParse::Start, Some((chr, FmtChar::Bold))),
             CharParse::DoubleEsc(0) => (CharParse::Start, None),
             CharParse::DoubleEsc(left) => (CharParse::DoubleEsc(left - 1), None),
         };
         *self = next;
         emitted
     }
 }
 /// Writes the tags needed to switch the output from style `old` to style `new`: the closing
 /// tag of `old` followed by the opening tag of `new`. Nothing is written when both are equal.
 fn pushfmt(out: &mut String, old: FmtChar, new: FmtChar) {
     if new == old {
         return;
     }
     match old {
         FmtChar::Regular => {},
         FmtChar::Italic => out.push_str("</i>"),
         FmtChar::Bold => out.push_str("</b>"),
     }
     match new {
         FmtChar::Regular => {},
         FmtChar::Italic => out.push_str("<i>"),
         FmtChar::Bold => out.push_str("<b>"),
     }
 }
 // Intermediate text buffer. This buffer contains the entire HTML-escaped man page and a list of
 // indices where text formatting changes are performed.
 struct FmtBuf {
     // HTML-escaped text of the page, without any formatting tags.
     buf: String,
     // List of formatting chunks. The number is the byte offset into `buf` where the formatting
     // ends. E.g. [(5,Regular),(10,Bold),(15,Italic)] means:
     //   [0..5] is Regular
     //   [5..10] is Bold
     //   [10..15] is Italic
     fmt: Vec<(usize,FmtChar)>,
     // Formatting of the chunk that is currently being appended to, i.e. the text after the
     // last entry in `fmt`.
     lastfmt: FmtChar,
 }
 // Output state: a cursor into a FmtBuf plus the string that receives the generated HTML.
 struct Flush<'a, 'b> {
     // Destination for the generated HTML.
     out: &'a mut String,
     idx: usize, // Byte offset of the first byte in the buffer that has not been processed yet
     fmt: std::iter::Peekable<std::slice::Iter<'b, (usize,FmtChar)>>, // Iterator over FmtBuf.fmt
 }
 impl FmtBuf {
     /// Appends one character with the given style to the buffer, HTML-escaping it and
     /// recording a chunk boundary whenever the style changes.
     fn push(&mut self, chr: char, fmt: FmtChar) {
         // Consider whitespace and underscore to have the same
         // formatting as the previous character; This generates smaller
         // HTML, and you can't see the difference anyway.
         if self.lastfmt != fmt && !(chr == ' ' || chr == '_') {
             self.fmt.push((self.buf.len(), self.lastfmt));
             self.lastfmt = fmt;
         }
         match chr {
             '>' => self.buf.push_str("&gt;"),
             '<' => self.buf.push_str("&lt;"),
             '&' => self.buf.push_str("&amp;"),
             // '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code
             _   => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
         }
     }
     /// Flush all unprocessed bytes until 'end' to the output, wrapped in the formatting tags
     /// of the chunks they belong to. Any tag opened here is closed again before returning.
     fn flush_to(&self, st: &mut Flush, end: usize) {
         let mut lastfmt = FmtChar::Regular;
         while st.idx < end {
             // Should the chunk list ever run out early, treat the remainder as regular text
             // rather than panicking.
             let (chunk, fmt) = st.fmt.peek().map_or((end, FmtChar::Regular), |&&(c, f)| (c, f));
             // A chunk that extends beyond 'end' is only partially written and stays in the
             // iterator for the next call.
             let chunk = if chunk > end {
                 end
             } else {
                 st.fmt.next();
                 chunk
             };
             pushfmt(st.out, lastfmt, fmt);
             st.out.push_str(&self.buf[st.idx..chunk]);
             st.idx = chunk;
             lastfmt = fmt;
         }
         pushfmt(st.out, lastfmt, FmtChar::Regular);
     }
     /// Consume the input buffer until 'end' without generating output.
     fn flush_skip(&self, st: &mut Flush, end: usize) {
         st.idx = end;
         // Drop every chunk that ends at or before the new position. The iterator may run dry
         // here when 'end' is the end of the buffer (e.g. an include directive on a final line
         // without trailing newline); that used to panic on peek().unwrap().
         while st.fmt.peek().map_or(false, |c| c.0 <= end) {
             st.fmt.next();
         }
     }
     /// Handles a `[[[MANNEDINCLUDE<path>]]]` directive. `start` is the offset of the opening
     /// brackets and `end` the offset right after the `MANNEDINCLUDE` keyword. Does nothing if
     /// the keyword is not followed by a path and closing brackets.
     fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
         lazy_static!(
             static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap();
         );
         let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };
         self.flush_to(st, start);
         st.out.push_str("\n&gt;&gt; Included man page: <a href=\"/");
         // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an Unicode dash
         // when passed through groff, which we need to revert in order to get the link working.
         // (Apparently it recognizes man page references and URLs, as it doesn't do this
         // replacement in those situations.)
         for c in m[2].chars() {
             match c {
                 '\u{2010}' => st.out.push('-'),
                 // The buffer does not escape double quotes, but this text ends up inside an
                 // attribute value, so a quote must not be allowed to terminate it.
                 '"' => st.out.push_str("&quot;"),
                 _ => st.out.push(c),
             }
         }
         st.out.push_str("\">");
         st.out.push_str(&m[1]);
         st.out.push_str("</a>");
         self.flush_skip(st, end + m[0].len());
     }
     /// Turns the URL starting at offset `start` into a link. Does nothing if no terminating
     /// character is found after the URL.
     fn flush_url(&self, st: &mut Flush, start: usize) {
         lazy_static!(
             // Some characters considered to never be part of a URL.
             // (Note that we can't match literal ><" because of the HTML escaping done previously)
             static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap();
         );
         let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
         self.flush_to(st, start);
         let url = &self.buf[start..(start + urlend.start())];
         // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
         // - https://manned.org/troff/c4467840
         // - https://manned.org/pass/78413b49
         // - https://manned.org/empathy-accounts/8c05b2c1
         // - https://manned.org/urn/8cb83e85
         // TODO: Check the character before the start of the URL, and only remove ) if there is a
         // starting ( before it.
         let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('\u{27e9}');
         write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
         self.flush_skip(st, start + url.len());
     }
     /// Turns the man page reference ending at offset `end` (right after the closing bracket)
     /// into a link.
     fn flush_ref(&self, st: &mut Flush, end: usize) {
         // We know where the closing bracket is in the string, so this regex is used to search
         // backwards from there and find the start of the reference.
         lazy_static!(
             static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap();
         );
         // Disallow some characters following a reference
         if let Some(ch) = self.buf[end..].chars().next() {
             if ch == '-' || ch == '_' || ch.is_alphanumeric() {
                 return;
             }
         }
         // The search regex in flush() only reports positions where this pattern matches, but
         // bail out instead of panicking if that ever stops being true.
         let m = match REF.captures(&self.buf[..end]) { Some(x) => x, None => return };
         self.flush_to(st, end - m[0].len());
         self.flush_skip(st, end);
         write!(st.out, "<a href=\"/{}.{}\">{}</a>", &m[1], &m[2], &m[0]).unwrap();
     }
     /// Writes the whole buffer to `out` as HTML, adding formatting tags and converting include
     /// directives, URLs and man page references into links.
     fn flush(&mut self, out: &mut String) {
         // Close the chunk that is still being built. It carries the formatting of the last
         // pushed text, not necessarily Regular; recording Regular here dropped the markup of
         // input that ends in bold or italic text.
         self.fmt.push((self.buf.len(), self.lastfmt));
         // Find the indices where the first line ends, and the last line starts. These are used to
         // efficiently disable reference formatting on the first and last line.
         let firstlineend = self.buf.find('\n').unwrap_or(self.buf.len());
         let lastlinestart = self.buf.trim_right_matches('\n').rfind('\n').unwrap_or(0);
         // This regex is used to quickly *find* interesting patterns, any further validation
         // and processing is done afterwards by the (slower) specialized flush_ methods.
         lazy_static!(
             static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap();
         );
         let mut st = Flush{
             out: out,
             idx: 0,
             fmt: self.fmt.iter().peekable(),
         };
         for i in SEARCH.find_iter(&self.buf) {
             // This can happen with overlapping detections, e.g. when something inside a URL looks
             // like a man page reference.
             if st.idx > i.start() {
                 continue;
             }
             let allowref = i.start() > firstlineend && i.start() < lastlinestart;
             // The last byte of the match tells which of the three alternatives was found.
             match self.buf.as_bytes()[i.end()-1] {
                 0x45 /* E */ => self.flush_include(&mut st, i.start(), i.end()),
                 0x2F /* / */ if allowref => self.flush_url(&mut st, i.start()),
                 _            if allowref => self.flush_ref(&mut st, i.end()),
                 _ => {}
             }
         }
         self.flush_to(&mut st, self.buf.len());
     }
 }
 /// Converts grotty output (using the backspace-overstrike formatting) into HTML with `<b>`
 /// and `<i>` tags and links for man page references, URLs and include directives.
 pub fn grotty2html(input: &str) -> String {
     let mut page = FmtBuf{
         buf: String::with_capacity(128),
         fmt: Vec::with_capacity(128),
         lastfmt: FmtChar::Regular,
     };
     let mut parser = CharParse::Start;
     // The whole page is collected first and flushed in one go. Line-based flushing is also
     // possible, but not as fast.
     for (chr, fmt) in input.chars().filter_map(|c| parser.update(c)) {
         page.push(chr, fmt);
     }
     // The parser runs one character behind; pick up a plain character that is still pending.
     match parser {
         CharParse::One(last) => page.push(last, FmtChar::Regular),
         _ => {},
     }
     let mut html = String::with_capacity(input.len());
     page.flush(&mut html);
     html
 }
 use std::os::raw::c_ulonglong;
 /// Raw parts (pointer, length, capacity) of a byte vector, in a C-compatible layout so that
 /// it can be returned from `grotty2html_wrap` and handed back to `grotty2html_free`.
 #[repr(C)]
 pub struct StringWrap {
     buf: *mut u8,
     len: c_ulonglong,
     cap: c_ulonglong,
 }
 /// C entry point for `grotty2html`.
 ///
 /// `in_buf` must point to `in_len` readable bytes of valid UTF-8; neither is checked. The
 /// returned buffer stays allocated until it is passed to `grotty2html_free`.
 #[no_mangle]
 pub extern fn grotty2html_wrap(in_buf: *const u8, in_len: c_ulonglong) -> StringWrap {
     // SAFETY: relies on the caller upholding the contract above.
     let bytes = unsafe { std::slice::from_raw_parts(in_buf, in_len as usize) };
     let input = unsafe { std::str::from_utf8_unchecked(bytes) };
     let mut html = grotty2html(input).into_bytes();
     let ptr = html.as_mut_ptr();
     let len = html.len() as c_ulonglong;
     let cap = html.capacity() as c_ulonglong;
     // Ownership of the allocation moves to the caller; do not free it here.
     std::mem::forget(html);
     StringWrap { buf: ptr, len: len, cap: cap }
 }
 #[no_mangle]
 pub extern fn grotty2html_free(buf: StringWrap) {
    unsafe { Vec::from_raw_parts(buf.buf, buf.len as usize, buf.cap as usize) };
 }
 #[cfg(test)]
 mod tests {
     use super::*;
     use std::io::Read;
     use test::Bencher;
     // Reads the file at `path` into memory once and benchmarks grotty2html() on its contents.
     fn bench_file(b: &mut Bencher, path: &str) {
         let mut contents = String::new();
         std::fs::File::open(path).unwrap().read_to_string(&mut contents).unwrap();
         b.iter(|| {
             test::black_box(grotty2html(&contents));
         });
     }
     #[bench]
     fn bench_rsync(b: &mut Bencher) {
         bench_file(b, "t/rsync.1.output");
     }
     #[bench]
     fn bench_ncdu(b: &mut Bencher) {
         bench_file(b, "t/ncdu.1.output");
     }
     #[bench]
     fn bench_javadoc(b: &mut Bencher) {
         bench_file(b, "t/javadoc.1.output");
     }
     // Currently disabled benchmark:
     /*
     #[bench]
     fn bench_wfilter(b: &mut Bencher) {
         bench_file(b, "t/wfilter.4.output");
     }
     */
 }
--- a/web/src/main.rs
+++ b/web/src/main.rs
@ -0,0 +1,11 @@
 extern crate web;
 use std::io::{stdin,Read};
 // Command-line front end: reads grotty output from standard input and prints the converted
 // HTML to standard output.
 fn main() {
     let mut text = String::new();
     let input = stdin();
     let mut handle = input.lock();
     handle.read_to_string(&mut text).unwrap();
     let html = web::grotty2html(&text);
     println!("{}", html);
 }