diff --git a/.gitignore b/.gitignore index a80d19e..a69b956 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ !/lib/ManUtils/ManUtils.pm !/lib/ManUtils/ManUtils.xs indexer/target - +web/target diff --git a/Makefile b/Makefile index 259b15e..79f2a7c 100644 --- a/Makefile +++ b/Makefile @@ -2,18 +2,27 @@ all: ManUtils indexer -ManUtils: lib/ManUtils/Build - cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst -lib/ManUtils/Build: lib/ManUtils/Build.PL - cd lib/ManUtils && perl Build.PL +ManUtils: lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm + +lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm: lib/ManUtils/Build.PL lib/ManUtils/ManUtils.pm lib/ManUtils/ManUtils.xs web/target/release/libweb.a + test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs + cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst + touch lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm + +web/target/release/libweb.a: web/Cargo.toml web/src/*.rs + cd web && cargo build --release + #strip --strip-unneeded web/target/release/libweb.a + indexer: indexer/target/release/indexer indexer/target/release/indexer: indexer/Cargo.toml indexer/src/*.rs cd indexer && cargo build --release + clean: cd lib/ManUtils && ./Build distclean rm -rf lib/ManUtils/inst cd indexer && cargo clean + cd web && cargo clean diff --git a/README b/README index 2072d41..ea05f9f 100644 --- a/README +++ b/README @@ -13,10 +13,11 @@ Requirements General: perl: A somewhat recent version (no idea which, due to my XS usage) postgresql: Also a somewhat recent version + rust + cargo (1.13+) + + www/ & lib/ & webs/: (Website) DBI DBD::Pg - - www/: (Website) TUWF JSON::XS AnyEvent @@ -24,7 +25,6 @@ Requirements util/ & indexer/: (DB updating and package synchronisation stuff) curl psql - cargo + rust (1.13+) Contact diff --git a/lib/ManUtils/Build.PL b/lib/ManUtils/Build.PL index 35c453c..72a4f69 100644 --- 
a/lib/ManUtils/Build.PL +++ b/lib/ManUtils/Build.PL @@ -6,6 +6,8 @@ Module::Build->new( dist_name => 'ManUtils', dist_version_from => 'ManUtils.pm', dist_abstract => 'Utils for manned.org', + license => 'MIT', + extra_linker_flags => '../../web/target/release/libweb.a', pm_files => { 'ManUtils.pm' => 'lib/ManUtils.pm', }, diff --git a/lib/ManUtils/ManUtils.pm b/lib/ManUtils/ManUtils.pm index c8f11f5..0b4dd08 100644 --- a/lib/ManUtils/ManUtils.pm +++ b/lib/ManUtils/ManUtils.pm @@ -32,11 +32,12 @@ sub fmt { # Other .so's should be handled by html() $input =~ s/^\.so (.+)$/.in -10\n.sp\n\[\[\[MANNEDINCLUDE$1\]\]\]/mg; - # Disable hyphenation, since that screws up man page references. :-( - $input = ".hy 0\n.de hy\n..\n$input"; - - # Emulate man-db's --nj option - $input = ".na\n.de ad\n..\n$input"; + $input = + # Disable hyphenation, since that screws up man page references. :-( + ".hy 0\n.de hy\n..\n" + # Emulate man-db's --nj option + .".na\n.de ad\n..\n" + .$input; $input = encode_utf8($input); @@ -100,6 +101,4 @@ sub fmt_block { $out; } - 1; - diff --git a/lib/ManUtils/ManUtils.xs b/lib/ManUtils/ManUtils.xs index 630f969..bb70e0a 100644 --- a/lib/ManUtils/ManUtils.xs +++ b/lib/ManUtils/ManUtils.xs @@ -2,278 +2,13 @@ #include "perl.h" #include "XSUB.h" +struct StringWrap { + char *buf; + unsigned long long len, cap; +}; -// Convert grotty output to HTML for use in a
 tag.
-// It is assumed that the given input string is valid UTF-8, either represented
-// as a Perl Unicode string, or as a UTF-8 encoded byte string. The data may
-// not contain the 0 character.
-// The formatted HTML is returned as a Perl Unicode string.
-// It is also assumed that hyphenation has been disabled when generating the
-// grotty output.
-
-
-// This implementation really is fast enough for "real-time" use in the website
-// code, very much unlike my experiments with Perl. My previous Perl
-// implementation took about 1.5s for rsync(1), whereas I've not seen this
-// implementation take more than 15ms.
-
-// TODO: Unicode characters aren't truncated correctly when a line exceeds
-// MAXLINE bytes. I've only seen this happening on man pages that grotty
-// couldn't wrap, e.g. some Japanese and Chinese mans.
-// (Ideally, I'd tell grotty how to wrap those correctly)
-
-#include 
-#include 
-#include 
-#include 
-
-#define MAXLINE 1024
-
-#define LB 1
-#define LI 2
-
-typedef struct ctx_t {
-  const char *src; // Pointer to the source data, or what's left of it.
-  SV *dest; // Destination string to write to.
-
-  // Current line
-  char line[MAXLINE];
-  char flags[MAXLINE]; // 0 = no fmt, LB = bold, LI = italic. (No combinations allowed)
-  int linelen;
-  int noref; // 1 if the current line shouldn't be checked for references. (Used for first and last line)
-} ctx_t;
-
-
-
-// Escapes and appends a displayed character to the output string.
-static inline void flushescape(ctx_t *x, char c) {
-  static char str[2] = {};
-  // Most HTML-escape functions also escape " to ", but since we aren't
-  // going to put a man page in an XML attribute, we don't really have to worry
-  // about that one.
-  switch(c) {
-    case '>': sv_catpvn(x->dest, ">", 4); break;
-    case '<': sv_catpvn(x->dest, "<", 4); break;
-    case '&': sv_catpvn(x->dest, "&", 5); break;
-    default:
-      str[0] = c;
-      sv_catpvn(x->dest, str, 1);
-  }
-}
-
-
-// HTML-escapes and adds formatting tags to a certain chunk of data and appends
-// it to the output string. The chunk is considered as an individual part,
-// assuming that any formatting is disabled at the start of the chunk, and
-// making sure it is disabled again at the end.
-// e points to the last character in s that is not considered part of the chunk.
-static void flushchunk(ctx_t *x, const char *s, const char *f, const char *e) {
-  int fmt = 0;
-
-#define EFMT if(fmt) sv_catpvn(x->dest, fmt == LB ? "" : "", 4)
-
-  while(s != e) {
-    // Consider underscore and whitespace to have the same formatting as the
-    // previous character.  The grotty escape sequences don't work well for the
-    // underscore character, and you can't see the difference either way.
-    if(fmt != *f && *s != '_' && *s != ' ') {
-      EFMT;
-      fmt = *f;
-      if(fmt)
-        sv_catpvn(x->dest, fmt == LB ? "" : "", 3);
-    }
-    flushescape(x, *s);
-    s++;
-    f++;
-  }
-  EFMT;
-
-#undef EFMT
-}
-
-
-#define ismanchar(x) (isalnum(x) || x == '_' || x == '-' || x == '.')
-
-
-static void flushinclude(ctx_t *x) {
-  char buf[8] = {};
-  char *s = x->line;
-
-  s[x->linelen-3] = 0;
-  s += 16;
-  char *fn = strrchr(s, '/');
-  fn = fn ? fn+1 : s;
-  sv_catpv(x->dest, ">> Included manual page: dest, buf, 1);
-  }
-
-  sv_catpv(x->dest, "\">");
-  sv_catpv(x->dest, s);
-  sv_catpv(x->dest, "");
-}
-
-
-// HTML-escapes and "Flushes" the current line to the output string. Tries to
-// convert man references and URLs into links if format is true.
-static void flushline(ctx_t *x) {
-  static const char eol[] = "\n";
-  char *s = x->line, *es = x->line;
-
-  // Special-case [[[MANNEDINCLUDE ..]]] directive
-  if(x->linelen > 20 && *s == '[' && strncmp(s, "[[[MANNEDINCLUDE", 16) == 0 && strcmp("]]]", s+x->linelen-3) == 0) {
-    flushinclude(x);
-    goto end;
-  }
-
-  if(x->noref) {
-    flushchunk(x, x->line, x->flags, x->line+x->linelen);
-    goto end;
-  }
-
-#define flush(end) do {\
-    flushchunk(x, es, x->flags+(es-x->line), end);\
-    es = end;\
-  } while(0)
-
-  while(*s) {
-    // Man page reference.
-    // Detected by the "(x)", but then checked backwards in the buffer to find
-    // the start of the reference. This is pretty fast. Fails on:
-    // - JSON.3pm: JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
-    if(*s == '(' && (('1' <= s[1] && s[1] <= '9') || s[1] == 'n') && s[2] == ')' && !isalnum(s[3])) {
-      char *n = s-1;
-      while(n >= es && ismanchar(*n))
-        n--;
-      if(++n < s) {
-        flush(n);
-        *s = 0;
-        sv_catpvf(x->dest, "%s(%c)", n, s[1], n, s[1]);
-        s += 3;
-        es = s;
-        continue;
-      }
-    }
-
-    // HTTP(s) URL.
-    // This is just a simple q{https?://[^ ][.,;"\)>]?( |$)} match, doesn't
-    // always work right, e.g.:
-    // - https://manned.org/spu_run/414316a1 -> URL wrapped to new line
-    // Note: Don't use strncmp() before manually checking for 'http'. The parse
-    // time is otherwise increased by a factor 2.
-    if(s[0] == 'h' && s[1] == 't' && s[2] == 't' && s[3] == 'p' && (strncmp(s, "http://", 7) == 0 || strncmp(s, "https://", 8) == 0)) {
-      // Find the end of the URL (space or some other weird character).
-      char *sep = s;
-      while(*sep && *sep != '>' && *sep != '<' && *sep != ' ' && *sep != '"')
-        sep++;
-      char *sp = sep;
-      if(sp > s+10) {
-        flush(s);
-        char endchr = *sp;
-        *(sp--) = 0;
-        if(*sp == '.' || *sp == ',' || *sp == ';' || *sp == ')') {
-          sp[1] = endchr;
-          endchr = *sp;
-          *(sp--) = 0;
-        }
-        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
-        // - https://manned.org/troff/c4467840
-        // - https://manned.org/pass/78413b49
-        // - https://manned.org/empathy-accounts/8c05b2c1
-        // - https://manned.org/urn/8cb83e85
-        // - https://manned.org/wine/4a699a22
-        if(*sp == '\xa9' && *(sp-1) == '\x9f' && *(sp-2) == '\xe2') {
-          sp[1] = endchr;
-          sp -= 3;
-          endchr = sp[1];
-          sp[1] = 0;
-        }
-        sv_catpvf(x->dest, "%s", s, s);
-        *(++sp) = endchr;
-        es = s = sp;
-        continue;
-      }
-    }
-    s++;
-  }
-
-  flush(s);
-#undef flush
-
-end:
-  sv_catpvn(x->dest, eol, sizeof(eol)-1);
-}
-
-
-// Adds a character to the current line, calls flushline() when a new line is done.
-// TODO: Convert \t into spaces? The rest of the code is written with the
-// assumption that \t does not occur in the string. I've not seen grotty output
-// tabs yet, but it's still a good idea to define what *we* do with tabs.
-static void appendline(ctx_t *x, char c, char f) {
-  if(c == '\r')
-    return;
-
-  if(c == '\n' || x->linelen > MAXLINE+1) {
-    x->line[x->linelen] = 0;
-    flushline(x);
-    x->linelen = 0;
-    x->noref = 0;
-    if(c == '\n')
-      return;
-  }
-
-  x->line[x->linelen] = c;
-  x->flags[x->linelen] = f;
-  x->linelen++;
-}
-
-
-// Parses the grotty escapes and calls appendline() for each character.
-static void parselines(ctx_t *x) {
-  int i, ini = 0, inb = 0;
-  const char *buf = x->src;
-
-  while(*buf) {
-    int c1 = UTF8SKIP(buf);
-    // Escape character right after a formatting code? Ignore the escape
-    // character and formatting code after that. Grotty sometimes
-    // double-formats a character, so you get "f ESC c ESC f ESC c", which you
-    // should read as "(f ESC c) ESC (f ESC c)".
-    if(*buf == 8 && buf[1] && buf[1+UTF8SKIP(buf+1)] == 8 && buf[2+UTF8SKIP(buf+1)]) {
-      int c2 = UTF8SKIP(buf+1);
-      buf += 2 + c2 + UTF8SKIP(buf+1+c2);
-      continue;
-    }
-    // Formatting code
-    if(buf[c1] == 8 && buf[c1+1]) {
-      int c2 = UTF8SKIP(buf+c1+1);
-      for(i=0; inoref = 1;
-    appendline(x, *buf, 0);
-    buf++;
-  }
-  x->noref = 1;
-  appendline(x, '\n', 0);
-}
-
+struct StringWrap grotty2html_wrap(const char *, unsigned long long);
+void grotty2html_free(struct StringWrap);
 
 
 MODULE = ManUtils	 PACKAGE = ManUtils
@@ -281,20 +16,13 @@ MODULE = ManUtils	 PACKAGE = ManUtils
 SV *
 html(str)
   SV *str
-  INIT:
-    ctx_t *x = malloc(sizeof(ctx_t));
   CODE:
-    x->src = SvPV_nolen(str);
-    x->dest = newSVpv("", 0);
-    x->linelen = 0;
-    x->noref = 1;
-    parselines(x);
-    // Set the UTF8 flag *after* generating the result string. For some reason
-    // that prevents sv_catpvf() from interpreting our C strings as something
-    // other than UTF-8.
-    SvUTF8_on(x->dest);
-    RETVAL = x->dest;
-    free(x);
+    /* Hand the raw bytes of the argument to the Rust converter and copy its
+     * result into a new Perl string flagged as UTF-8. */
+    STRLEN len;
+    char *inbuf = SvPV(str, len);
+    struct StringWrap buf = grotty2html_wrap(inbuf, len);
+    /* The result carries an explicit length and no NUL terminator, so use
+     * newSVpvn(): newSVpv() falls back to strlen() when the length is 0,
+     * which would read past an empty result buffer. */
+    SV *dest = newSVpvn(buf.buf, buf.len);
+    grotty2html_free(buf);
+    SvUTF8_on(dest);
+    RETVAL = dest;
   OUTPUT:
     RETVAL
-
diff --git a/web/Cargo.lock b/web/Cargo.lock
new file mode 100644
index 0000000..b51547c
--- /dev/null
+++ b/web/Cargo.lock
@@ -0,0 +1,121 @@
+[root]
+name = "web"
+version = "0.1.0"
+dependencies = [
+ "lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "aho-corasick"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "kernel32-sys"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "lazy_static"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "libc"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "memchr"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "regex"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "thread-id"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "thread_local"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "unreachable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "utf8-ranges"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "void"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "winapi"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "winapi-build"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[metadata]
+"checksum aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4f660b942762979b56c9f07b4b36bb559776fbad102f05d6771e1b629e8fd5bf"
+"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
+"checksum lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6abe0ee2e758cd6bc8a2cd56726359007748fbf4128da998b65d0b70f881e19b"
+"checksum libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)" = "9e030dc72013ed68994d1b2cbf36a94dd0e58418ba949c4b0db7eeb70a7a6352"
+"checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4"
+"checksum regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4278c17d0f6d62dfef0ab00028feb45bd7d2102843f80763474eeb1be8a10c01"
+"checksum regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9191b1f57603095f105d317e375d19b1c9c5c3185ea9633a99a6dcbed04457"
+"checksum thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4437c97558c70d129e40629a5b385b3fb1ffac301e63941335e4d354081ec14a"
+"checksum thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7793b722f0f77ce716e7f1acf416359ca32ff24d04ffbac4269f44a4a83be05d"
+"checksum unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1f2ae5ddb18e1c92664717616dd9549dde73f539f01bd7b77c2edb2446bdff91"
+"checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122"
+"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
+"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
+"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
diff --git a/web/Cargo.toml b/web/Cargo.toml
new file mode 100644
index 0000000..9fa753a
--- /dev/null
+++ b/web/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "web"
+version = "0.1.0"
+authors = ["yorhel"]
+
+[lib]
+name = "web"
+crate-type = ["lib", "staticlib"]
+
+[dependencies]
+regex = "0.2.1"
+lazy_static = "0.2.2"
+
+# Add debugging symbols even in release mode, in order to help with profiling.
+[profile.release]
+debug = true
diff --git a/web/src/lib.rs b/web/src/lib.rs
new file mode 100644
index 0000000..fc0d13b
--- /dev/null
+++ b/web/src/lib.rs
@@ -0,0 +1,348 @@
+// The `test` crate is unstable and only needed by the benchmarks in `mod tests`, so it is
+// enabled for test/bench builds only.  A plain `cargo build` then needs no feature gate.
+#![cfg_attr(test, feature(test))]
+#[cfg(test)]
+extern crate test;
+extern crate regex;
+#[macro_use] extern crate lazy_static;
+
+use std::fmt::Write;
+use regex::Regex;
+
+
+/// Text style attached to a parsed character.
+#[derive(Clone,Copy,PartialEq,Eq)]
+enum FmtChar {
+    /// No formatting; used for characters that were not part of an escape.
+    Regular,
+    /// Used when the tag character of an escape is `_`.
+    Italic,
+    /// Used when the tag character of an escape is anything other than `_`.
+    Bold,
+}
+
+
+/* Simple state machine to parse the following grammar:
+ *
+ * fmtchar       = escape | double-escape | char
+ * escape        = tag ESC char
+ * double-escape = ESC tag ESC char
+ * tag           = "_"  # italic
+ *               | char # bold
+ *
+ * "ESC" in this grammar is the character with code 8; that is the value the
+ * parser compares against.
+ *
+ * This format is described as "old behaviour" in grotty(1).  The double-escape
+ * seems to be a weird glitch, and can be interpreted as
+ * "(tag ESC char) ESC (tag ESC char)".  This parser simply skips over any such
+ * sequence starting with ESC. */
+enum CharParse {
+    // Nothing pending; the next character starts a new fmtchar.
+    Start,
+    One(char),      // Seen a single character (either 'char' or 'escape')
+    Escape(char),   // Seen a single character + escape
+    DoubleEsc(u32), // Inside a double-escape, indicates number of characters left to skip
+}
+
+
+impl CharParse {
+    /// Feeds one input character into the state machine.
+    ///
+    /// Returns `Some((char, format))` when a complete formatted character is available, and
+    /// `None` when more input is needed or the character was skipped.  A plain character is
+    /// only reported once its successor has been seen, so the caller has to handle a state
+    /// that is still `One(_)` after the last input character.
+    fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
+        // The separator used by the escape sequences (character code 8).
+        const ESC: char = '\u{8}';
+        let is_esc = chr == ESC;
+
+        // Work out the successor state and the output together, then commit the state.
+        let (next, emitted) = match *self {
+            CharParse::Start if is_esc => (CharParse::DoubleEsc(2), None),
+            CharParse::Start => (CharParse::One(chr), None),
+
+            CharParse::One(prev) if is_esc => (CharParse::Escape(prev), None),
+            CharParse::One(prev) => (CharParse::One(chr), Some((prev, FmtChar::Regular))),
+
+            CharParse::Escape(tag) => {
+                let fmt = if tag == '_' { FmtChar::Italic } else { FmtChar::Bold };
+                (CharParse::Start, Some((chr, fmt)))
+            },
+
+            CharParse::DoubleEsc(0) => (CharParse::Start, None),
+            CharParse::DoubleEsc(left) => (CharParse::DoubleEsc(left - 1), None),
+        };
+        *self = next;
+        emitted
+    }
+}
+
+
+/// Appends the HTML tags needed to switch the output from style `old` to style `new`.
+///
+/// The tag for `old` is closed first (unless it was `Regular`), then the tag for `new` is
+/// opened (unless it is `Regular`).  Nothing is written when both styles are equal.
+// NOTE(review): the four tag literals were empty strings in this copy, which made the
+// function a no-op.  They are restored as <i>/<b> tags - confirm against the original file.
+fn pushfmt(out: &mut String, old: FmtChar, new: FmtChar) {
+    if new == old {
+        return;
+    }
+    if old != FmtChar::Regular {
+        out.push_str(if old == FmtChar::Italic { "</i>" } else { "</b>" });
+    }
+    if new != FmtChar::Regular {
+        out.push_str(if new == FmtChar::Italic { "<i>" } else { "<b>" });
+    }
+}
+
+
+// Intermediate text buffer. This buffer contains the entire HTML-escaped man page and a list of
+// indices where text formatting changes are performed.
+struct FmtBuf {
+    // The escaped text collected so far.
+    buf: String,
+    // List of formatting chunks. The number is the byte offset into `buf` (it is taken from
+    // `buf.len()`) where the formatting ends. E.g. [(5,Regular),(10,Bold),(15,Italic)] means:
+    //   [0..5] is Regular
+    //   [5..10] is Bold
+    //   [10..15] is Italic
+    fmt: Vec<(usize,FmtChar)>,
+    // Formatting of the chunk that is currently being collected (not yet recorded in `fmt`).
+    lastfmt: FmtChar,
+}
+
+// Output state used while FmtBuf writes its contents to the final HTML string.
+struct Flush<'a, 'b> {
+    out: &'a mut String, // Destination of the generated HTML
+    idx: usize, // Byte offset in FmtBuf.buf up to which the input has been processed
+    // Iterator over FmtBuf.fmt, i.e. the formatting chunks that have not been consumed yet.
+    fmt: std::iter::Peekable<std::slice::Iter<'b, (usize, FmtChar)>>,
+}
+
+
+impl FmtBuf {
+    fn push(&mut self, chr: char, fmt: FmtChar) {
+        // Consider whitespace and underscore to have the same
+        // formatting as the previous character; This generates smaller
+        // HTML, and you can't see the difference anyway.
+        if self.lastfmt != fmt && !(chr == ' ' || chr == '_') {
+            self.fmt.push((self.buf.len(), self.lastfmt));
+            self.lastfmt = fmt;
+        }
+        match chr {
+            '>' => self.buf.push_str(">"),
+            '<' => self.buf.push_str("<"),
+            '&' => self.buf.push_str("&"),
+            // '"' => self.buf.push_str("""), // TEMPORARILY disabled for comparison with old code
+            _   => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
+        }
+    }
+
+    // Flush all unprocessed bytes until 'end' to the output
+    fn flush_to(&self, st: &mut Flush, end: usize) {
+        let mut lastfmt = FmtChar::Regular;
+        while st.idx < end {
+            let &&(chunk, fmt) = st.fmt.peek().unwrap();
+            let chunk = if chunk > end {
+                end
+            } else {
+                st.fmt.next();
+                chunk
+            };
+            pushfmt(st.out, lastfmt, fmt);
+            st.out.push_str(&self.buf[st.idx..chunk]);
+            st.idx = chunk;
+            lastfmt = fmt;
+        }
+        pushfmt(st.out, lastfmt, FmtChar::Regular);
+    }
+
+    // Consume the input buffer until 'end' without generating output
+    fn flush_skip(&self, st: &mut Flush, end: usize) {
+        st.idx = end;
+        while st.fmt.peek().unwrap().0 <= st.idx {
+            st.fmt.next();
+        }
+    }
+
+    fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
+        lazy_static!(
+            static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap();
+        );
+        let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };
+
+        self.flush_to(st, start);
+        st.out.push_str("\n>> Included man page: ");
+        st.out.push_str(&m[1]);
+        st.out.push_str("");
+        self.flush_skip(st, end + m[0].len());
+    }
+
+    fn flush_url(&self, st: &mut Flush, start: usize) {
+        lazy_static!(
+            // Some characters considered to never be part of a URL.
+            // (Note that we can't match literal ><" because of the HTML escaping done previously)
+            static ref URLEND: Regex = Regex::new("(?:\"|"|>|<|\\s)").unwrap();
+        );
+        let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
+
+        self.flush_to(st, start);
+        let url = &self.buf[start..(start + urlend.start())];
+
+        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
+        // - https://manned.org/troff/c4467840
+        // - https://manned.org/pass/78413b49
+        // - https://manned.org/empathy-accounts/8c05b2c1
+        // - https://manned.org/urn/8cb83e85
+        // TODO: Check the character before the start of the URL, and only remove ) if there is a
+        // starting ( before it.
+        let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩');
+
+        write!(st.out, "{0}", url).unwrap();
+        self.flush_skip(st, start + url.len());
+    }
+
+    fn flush_ref(&self, st: &mut Flush, end: usize) {
+        // We know where the closing bracket is in the string, so this regex is used to search
+        // backwards from there and find the start of the reference.
+        lazy_static!(
+            static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap();
+        );
+
+        // Disallow some characters following a reference
+        if self.buf.len() > end {
+            let ch = self.buf[end..].chars().next().unwrap();
+            if ch == '-' || ch == '_' || ch.is_alphanumeric() {
+                return;
+            }
+        }
+
+        let m = REF.captures(&self.buf[..end]).unwrap();
+        self.flush_to(st, end - m[0].len());
+        self.flush_skip(st, end);
+        write!(st.out, "{}", &m[1], &m[2], &m[0]).unwrap();
+    }
+
+    fn flush(&mut self, out: &mut String) {
+        self.fmt.push((self.buf.len(), FmtChar::Regular));
+
+        // Find the indices where the first line ends, and the last line starts. These are used to
+        // efficiently disable reference formatting on the first and last line.
+        let firstlineend = self.buf.find('\n').unwrap_or(self.buf.len());
+        let lastlinestart = self.buf.trim_right_matches('\n').rfind('\n').unwrap_or(0);
+
+        // This regex is used to quickly *find* interesting patterns, any further validation
+        // and processing is done afterwards by the (slower) specialized flush_ methods.
+        lazy_static!(
+            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap();
+        );
+
+        let mut st = Flush{
+            out: out,
+            idx: 0,
+            fmt: self.fmt.iter().peekable(),
+        };
+
+        for i in SEARCH.find_iter(&self.buf) {
+            // This can happen with overlapping detections, e.g. when something inside a URL looks
+            // like a man page reference.
+            if st.idx > i.start() {
+                continue;
+            }
+            let allowref = i.start() > firstlineend && i.start() < lastlinestart;
+            match self.buf.as_bytes()[i.end()-1] {
+                0x45 /* E */ => self.flush_include(&mut st, i.start(), i.end()),
+                0x2F /* / */ if allowref => self.flush_url(&mut st, i.start()),
+                _            if allowref => self.flush_ref(&mut st, i.end()),
+                _ => {}
+            }
+        }
+        self.flush_to(&mut st, self.buf.len());
+    }
+}
+
+
+/// Converts grotty output to HTML.
+///
+/// `input` is run through the `CharParse` state machine, every resulting character is
+/// collected (escaped, with its formatting) in a `FmtBuf`, and the buffer is then written
+/// out in one go.  The generated HTML is returned as a new `String`.
+pub fn grotty2html(input: &str) -> String {
+    let mut parsed = FmtBuf{
+        buf: String::with_capacity(128),
+        fmt: Vec::with_capacity(128),
+        lastfmt: FmtChar::Regular,
+    };
+
+    let mut parser = CharParse::Start;
+    for (chr, fmt) in input.chars().filter_map(|c| parser.update(c)) {
+        parsed.push(chr, fmt);
+    }
+    // A plain character is only emitted once its successor is seen, so the very last one may
+    // still be pending in the parser.
+    if let CharParse::One(chr) = parser {
+        parsed.push(chr, FmtChar::Regular);
+    }
+
+    // The buffer is flushed once for the whole page; flushing after every line is also
+    // possible, but not as fast.
+    let mut html = String::with_capacity(input.len());
+    parsed.flush(&mut html);
+    html
+}
+
+
+
+use std::os::raw::c_ulonglong;
+
+/// Raw parts of a byte buffer, used to pass the result of `grotty2html_wrap()` by value
+/// through the C interface and back into `grotty2html_free()`.
+#[repr(C)]
+pub struct StringWrap {
+    // Pointer to the first byte of the buffer.
+    buf: *mut u8,
+    // Number of bytes in use.
+    len: c_ulonglong,
+    // Capacity of the buffer; needed to reconstruct the Vec when freeing.
+    cap: c_ulonglong,
+}
+
+/// C entry point: converts the `in_len` bytes of grotty output at `in_buf` to HTML.
+///
+/// The result is handed out as the raw parts of the output buffer and has to be released
+/// with `grotty2html_free()`.
+///
+/// The input is expected to be UTF-8.  Since that cannot be relied upon for data coming from
+/// C, it is validated here: invalid sequences are replaced with U+FFFD instead of being
+/// passed on unchecked.  A null `in_buf` is treated as empty input.
+#[no_mangle]
+pub extern fn grotty2html_wrap(in_buf: *const u8, in_len: c_ulonglong) -> StringWrap {
+    let bytes: &[u8] = if in_buf.is_null() {
+        &[]
+    } else {
+        // SAFETY: the caller guarantees that in_buf points to in_len readable bytes that are
+        // not modified for the duration of this call.
+        unsafe { std::slice::from_raw_parts(in_buf, in_len as usize) }
+    };
+    let input = String::from_utf8_lossy(bytes);
+    let mut out = grotty2html(&input).into_bytes();
+    let r = StringWrap {
+        buf: out.as_mut_ptr(),
+        len: out.len() as c_ulonglong,
+        cap: out.capacity() as c_ulonglong,
+    };
+    // Ownership of the allocation moves to the caller; it is rebuilt and dropped in
+    // grotty2html_free().
+    std::mem::forget(out);
+    r
+}
+
+/// C entry point: releases a buffer that was returned by `grotty2html_wrap()`.
+#[no_mangle]
+pub extern fn grotty2html_free(buf: StringWrap) {
+    let StringWrap { buf: ptr, len, cap } = buf;
+    // SAFETY: relies on the caller passing back, unmodified and only once, the parts that
+    // grotty2html_wrap() produced from a Vec<u8>.
+    let owned = unsafe { Vec::from_raw_parts(ptr, len as usize, cap as usize) };
+    drop(owned);
+}
+
+
+// Benchmarks for grotty2html(), each running on a sample file below t/.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::Read;
+    use test::Bencher;
+
+    // Reads the file at `path` into memory once, then benchmarks the conversion of its
+    // contents.
+    fn bench_file(b: &mut Bencher, path: &str) {
+        let mut contents = String::new();
+        std::fs::File::open(path)
+            .unwrap()
+            .read_to_string(&mut contents)
+            .unwrap();
+
+        b.iter(|| {
+            test::black_box(grotty2html(&contents));
+        });
+    }
+
+    #[bench]
+    fn bench_rsync(b: &mut Bencher) {
+        bench_file(b, "t/rsync.1.output");
+    }
+
+    #[bench]
+    fn bench_ncdu(b: &mut Bencher) {
+        bench_file(b, "t/ncdu.1.output");
+    }
+
+    #[bench]
+    fn bench_javadoc(b: &mut Bencher) {
+        bench_file(b, "t/javadoc.1.output");
+    }
+
+    // Currently disabled:
+    /*
+    #[bench]
+    fn bench_wfilter(b: &mut test::Bencher) {
+        bench_file(b, "t/wfilter.4.output");
+    }
+    */
+}
diff --git a/web/src/main.rs b/web/src/main.rs
new file mode 100644
index 0000000..b9a8ccd
--- /dev/null
+++ b/web/src/main.rs
@@ -0,0 +1,11 @@
+extern crate web;
+
+use std::io::{stdin,Read};
+
+// Reads grotty output from standard input and prints the generated HTML to standard output.
+fn main() {
+    let mut input = String::new();
+    let source = stdin();
+    source.lock().read_to_string(&mut input).unwrap();
+    println!("{}", web::grotty2html(&input));
+}
+