Experimental rewrite of grotty to html conversion in Rust

The previous C code was troublesome: - It didn't handle long lines. - I couldn't convince myself that it was free of memory safety issues. - It needed improving anyway; there are some formatting bugs, and these are hard to fix in the current code. I mostly replicated the formatting bugs of the old C implementation in Rust, and possibly added a few new bugs as well. It's not a significant improvement right now; more testing and fixing will be needed. The performance of both implementations is comparable, with the Rust version being slightly faster in many cases (and slower in some others). I did spend more time trying to optimize this Rust version than I did with the old C code. I initially tried a naive-ish conversion of the C code to Rust, but that turned out to be much slower, and I had to resort to using regexes and different data structures to fix that.
2017-01-15 10:54:48 +01:00 · 2017-01-15 10:54:48 +01:00 · 6114b17389
commit 6114b17389
parent 8a3af4aee2
10 changed files with 534 additions and 300 deletions
--- a/lib/ManUtils/Build.PL
+++ b/lib/ManUtils/Build.PL
@ -6,6 +6,8 @@ Module::Build->new(
  dist_name => 'ManUtils',
  dist_version_from => 'ManUtils.pm',
  dist_abstract => 'Utils for manned.org',
+  license => 'MIT',
+  extra_linker_flags => '../../web/target/release/libweb.a',
  pm_files => {
    'ManUtils.pm' => 'lib/ManUtils.pm',
  },
--- a/lib/ManUtils/ManUtils.pm
+++ b/lib/ManUtils/ManUtils.pm
@ -32,11 +32,12 @@ sub fmt {
  # Other .so's should be handled by html()
  $input =~ s/^\.so (.+)$/.in -10\n.sp\n\[\[\[MANNEDINCLUDE$1\]\]\]/mg;

-  # Disable hyphenation, since that screws up man page references. :-(
-  $input = ".hy 0\n.de hy\n..\n$input";
-
-  # Emulate man-db's --nj option
-  $input = ".na\n.de ad\n..\n$input";
+  $input =
+    # Disable hyphenation, since that screws up man page references. :-(
+     ".hy 0\n.de hy\n..\n"
+    # Emulate man-db's --nj option
+    .".na\n.de ad\n..\n"
+    .$input;

  $input = encode_utf8($input);

@ -100,6 +101,4 @@ sub fmt_block {
  $out;
 }

-
 1;
-
--- a/lib/ManUtils/ManUtils.xs
+++ b/lib/ManUtils/ManUtils.xs
@ -2,278 +2,13 @@
 #include "perl.h"
 #include "XSUB.h"

+struct StringWrap { // Buffer descriptor returned by value from grotty2html_wrap() and handed back to grotty2html_free().
+  char *buf; // Start of the output bytes; html() copies buf.len bytes from here into a new SV.
+  unsigned long long len, cap; // len = number of bytes html() copies; cap is not read in this file (presumably the allocation capacity -- confirm against the implementation linked from libweb.a).
+};

-// Convert grotty output to HTML for use in a <pre> tag.
-// It is assumed that the given input string is valid UTF-8, either represented
-// as a Perl Unicode string, or as a UTF-8 encoded byte string. The data may
-// not contain the 0 character.
-// The formatted HTML is returned as a Perl Unicode string.
-// It is also assumed that hyphenation has been disabled when generating the
-// grotty output.
-
-
-// This implementation really is fast enough for "real-time" use in the website
-// code, very much unlike my experiments with Perl. My previous Perl
-// implementation took about 1.5s for rsync(1), whereas I've not seen this
-// implementation take more than 15ms.
-
-// TODO: Unicode characters aren't truncated correctly when a line exceeds
-// MAXLINE bytes. I've only seen this happening on man pages that grotty
-// couldn't wrap, e.g. some Japanese and Chinese mans.
-// (Ideally, I'd tell grotty how to wrap those correctly)
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-
-#define MAXLINE 1024
-
-#define LB 1
-#define LI 2
-
-typedef struct ctx_t {
-  const char *src; // Pointer to the source data, or what's left of it.
-  SV *dest; // Destination string to write to.
-
-  // Current line
-  char line[MAXLINE];
-  char flags[MAXLINE]; // 0 = no fmt, LB = bold, LI = italic. (No combinations allowed)
-  int linelen;
-  int noref; // 1 if the current line shouldn't be checked for references. (Used for first and last line)
-} ctx_t;
-
-
-
-// Escapes and appends a displayed character to the output string.
-static inline void flushescape(ctx_t *x, char c) {
-  static char str[2] = {};
-  // Most HTML-escape functions also escape " to &quot;, but since we aren't
-  // going to put a man page in an XML attribute, we don't really have to worry
-  // about that one.
-  switch(c) {
-    case '>': sv_catpvn(x->dest, "&gt;", 4); break;
-    case '<': sv_catpvn(x->dest, "&lt;", 4); break;
-    case '&': sv_catpvn(x->dest, "&amp;", 5); break;
-    default:
-      str[0] = c;
-      sv_catpvn(x->dest, str, 1);
-  }
-}
-
-
-// HTML-escapes and adds formatting tags to a certain chunk of data and appends
-// it to the output string. The chunk is considered as an individual part,
-// assuming that any formatting is disabled at the start of the chunk, and
-// making sure it is disabled again at the end.
-// e points to the last character in s that is not considered part of the chunk.
-static void flushchunk(ctx_t *x, const char *s, const char *f, const char *e) {
-  int fmt = 0;
-
-#define EFMT if(fmt) sv_catpvn(x->dest, fmt == LB ? "</b>" : "</i>", 4)
-
-  while(s != e) {
-    // Consider underscore and whitespace to have the same formatting as the
-    // previous character.  The grotty escape sequences don't work well for the
-    // underscore character, and you can't see the difference either way.
-    if(fmt != *f && *s != '_' && *s != ' ') {
-      EFMT;
-      fmt = *f;
-      if(fmt)
-        sv_catpvn(x->dest, fmt == LB ? "<b>" : "<i>", 3);
-    }
-    flushescape(x, *s);
-    s++;
-    f++;
-  }
-  EFMT;
-
-#undef EFMT
-}
-
-
-#define ismanchar(x) (isalnum(x) || x == '_' || x == '-' || x == '.')
-
-
-static void flushinclude(ctx_t *x) {
-  char buf[8] = {};
-  char *s = x->line;
-
-  s[x->linelen-3] = 0;
-  s += 16;
-  char *fn = strrchr(s, '/');
-  fn = fn ? fn+1 : s;
-  sv_catpv(x->dest, "&gt;&gt; Included manual page: <a href=\"/");
-
-  // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an
-  // Unicode dash when passed through groff, which we need to revert in order
-  // to get the link working. (Apparently it recognizes man page references and
-  // URLs, as it doesn't do this replacement in those situations.)
-  while(*fn) {
-    if(*fn == (char)0xe2 && fn[1] == (char)0x80 && fn[2] == (char)0x90) {
-      buf[0] = '-';
-      fn += 3;
-    } else {
-      buf[0] = *fn;
-      fn++;
-    }
-    sv_catpvn(x->dest, buf, 1);
-  }
-
-  sv_catpv(x->dest, "\">");
-  sv_catpv(x->dest, s);
-  sv_catpv(x->dest, "</a>");
-}
-
-
-// HTML-escapes and "Flushes" the current line to the output string. Tries to
-// convert man references and URLs into links if format is true.
-static void flushline(ctx_t *x) {
-  static const char eol[] = "\n";
-  char *s = x->line, *es = x->line;
-
-  // Special-case [[[MANNEDINCLUDE ..]]] directive
-  if(x->linelen > 20 && *s == '[' && strncmp(s, "[[[MANNEDINCLUDE", 16) == 0 && strcmp("]]]", s+x->linelen-3) == 0) {
-    flushinclude(x);
-    goto end;
-  }
-
-  if(x->noref) {
-    flushchunk(x, x->line, x->flags, x->line+x->linelen);
-    goto end;
-  }
-
-#define flush(end) do {\
-    flushchunk(x, es, x->flags+(es-x->line), end);\
-    es = end;\
-  } while(0)
-
-  while(*s) {
-    // Man page reference.
-    // Detected by the "(x)", but then checked backwards in the buffer to find
-    // the start of the reference. This is pretty fast. Fails on:
-    // - JSON.3pm: JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
-    if(*s == '(' && (('1' <= s[1] && s[1] <= '9') || s[1] == 'n') && s[2] == ')' && !isalnum(s[3])) {
-      char *n = s-1;
-      while(n >= es && ismanchar(*n))
-        n--;
-      if(++n < s) {
-        flush(n);
-        *s = 0;
-        sv_catpvf(x->dest, "<a href=\"/%s.%c\">%s(%c)</a>", n, s[1], n, s[1]);
-        s += 3;
-        es = s;
-        continue;
-      }
-    }
-
-    // HTTP(s) URL.
-    // This is just a simple q{https?://[^ ][.,;"\)>]?( |$)} match, doesn't
-    // always work right, e.g.:
-    // - https://manned.org/spu_run/414316a1 -> URL wrapped to new line
-    // Note: Don't use strncmp() before manually checking for 'http'. The parse
-    // time is otherwise increased by a factor 2.
-    if(s[0] == 'h' && s[1] == 't' && s[2] == 't' && s[3] == 'p' && (strncmp(s, "http://", 7) == 0 || strncmp(s, "https://", 8) == 0)) {
-      // Find the end of the URL (space or some other weird character).
-      char *sep = s;
-      while(*sep && *sep != '>' && *sep != '<' && *sep != ' ' && *sep != '"')
-        sep++;
-      char *sp = sep;
-      if(sp > s+10) {
-        flush(s);
-        char endchr = *sp;
-        *(sp--) = 0;
-        if(*sp == '.' || *sp == ',' || *sp == ';' || *sp == ')') {
-          sp[1] = endchr;
-          endchr = *sp;
-          *(sp--) = 0;
-        }
-        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
-        // - https://manned.org/troff/c4467840
-        // - https://manned.org/pass/78413b49
-        // - https://manned.org/empathy-accounts/8c05b2c1
-        // - https://manned.org/urn/8cb83e85
-        // - https://manned.org/wine/4a699a22
-        if(*sp == '\xa9' && *(sp-1) == '\x9f' && *(sp-2) == '\xe2') {
-          sp[1] = endchr;
-          sp -= 3;
-          endchr = sp[1];
-          sp[1] = 0;
-        }
-        sv_catpvf(x->dest, "<a href=\"%s\" rel=\"nofollow\">%s</a>", s, s);
-        *(++sp) = endchr;
-        es = s = sp;
-        continue;
-      }
-    }
-    s++;
-  }
-
-  flush(s);
-#undef flush
-
-end:
-  sv_catpvn(x->dest, eol, sizeof(eol)-1);
-}
-
-
-// Adds a character to the current line, calls flushline() when a new line is done.
-// TODO: Convert \t into spaces? The rest of the code is written with the
-// assumption that \t does not occur in the string. I've not seen grotty output
-// tabs yet, but it's still a good idea to define what *we* do with tabs.
-static void appendline(ctx_t *x, char c, char f) {
-  if(c == '\r')
-    return;
-
-  if(c == '\n' || x->linelen > MAXLINE+1) {
-    x->line[x->linelen] = 0;
-    flushline(x);
-    x->linelen = 0;
-    x->noref = 0;
-    if(c == '\n')
-      return;
-  }
-
-  x->line[x->linelen] = c;
-  x->flags[x->linelen] = f;
-  x->linelen++;
-}
-
-
-// Parses the grotty escapes and calls appendline() for each character.
-static void parselines(ctx_t *x) {
-  int i, ini = 0, inb = 0;
-  const char *buf = x->src;
-
-  while(*buf) {
-    int c1 = UTF8SKIP(buf);
-    // Escape character right after a formatting code? Ignore the escape
-    // character and formatting code after that. Grotty sometimes
-    // double-formats a character, so you get "f ESC c ESC f ESC c", which you
-    // should read as "(f ESC c) ESC (f ESC c)".
-    if(*buf == 8 && buf[1] && buf[1+UTF8SKIP(buf+1)] == 8 && buf[2+UTF8SKIP(buf+1)]) {
-      int c2 = UTF8SKIP(buf+1);
-      buf += 2 + c2 + UTF8SKIP(buf+1+c2);
-      continue;
-    }
-    // Formatting code
-    if(buf[c1] == 8 && buf[c1+1]) {
-      int c2 = UTF8SKIP(buf+c1+1);
-      for(i=0; i<c2; i++)
-        appendline(x, buf[c1+i+1], *buf == '_' ? LI : LB);
-      buf += c1+c2+1;
-      continue;
-    }
-    // Regular character
-    if(*buf == '\n' && !buf[1])
-      x->noref = 1;
-    appendline(x, *buf, 0);
-    buf++;
-  }
-  x->noref = 1;
-  appendline(x, '\n', 0);
-}
-
+struct StringWrap grotty2html_wrap(const char *, unsigned long long); // Called by html() with the input bytes and their length as reported by SvPV; the result describes the output buffer.
+void grotty2html_free(struct StringWrap); // Called by html() after the output has been copied into an SV; presumably releases the buffer -- defined outside this file, confirm there.


 MODULE = ManUtils	 PACKAGE = ManUtils
@ -281,20 +16,13 @@ MODULE = ManUtils	 PACKAGE = ManUtils
 SV *
 html(str)
  SV *str
-  INIT:
-    ctx_t *x = malloc(sizeof(ctx_t));
  CODE:
-    x->src = SvPV_nolen(str);
-    x->dest = newSVpv("", 0);
-    x->linelen = 0;
-    x->noref = 1;
-    parselines(x);
-    // Set the UTF8 flag *after* generating the result string. For some reason
-    // that prevents sv_catpvf() from interpreting our C strings as something
-    // other than UTF-8.
-    SvUTF8_on(x->dest);
-    RETVAL = x->dest;
-    free(x);
+    STRLEN len;
+    char *inbuf = SvPV(str, len); // SvPV also stores the byte length in len, so the input is passed with an explicit length.
+    struct StringWrap buf = grotty2html_wrap(inbuf, len);
+    SV *dest = newSVpvn(buf.buf, buf.len); // newSVpvn always copies exactly buf.len bytes; newSVpv would fall back to strlen() when buf.len is 0.
+    grotty2html_free(buf); // dest holds its own copy of the bytes, so the wrapper buffer can be handed back now.
+    SvUTF8_on(dest); // Flag the copied bytes as UTF-8 so the caller receives a Perl Unicode string.
+    RETVAL = dest;
  OUTPUT:
    RETVAL
-