Experimental rewrite of grotty to html conversion in Rust

The previous C code was troublesome.
- Didn't handle long lines
- I couldn't convince myself that it was free of memory safety issues
- Needed improving anyway, there are some formatting bugs. These are
  hard to fix in the current code.

I mostly replicated the formatting bugs of the old C implementation in
Rust, and possibly added a few new bugs as well. It's not a significant
improvement right now, more testing and fixing will be needed.

The performance of both implementations is comparable, with the Rust
version being slightly faster in many cases (and slower in some others).
I did spend more time trying to optimize this Rust version than I did
with the old C code. I initially tried a naive-ish conversion of the C
code to Rust, but that turned out to be much slower and I had to resort
to using regexes and different data structures to fix that.
This commit is contained in:
Yorhel 2017-01-15 10:54:48 +01:00
parent 8a3af4aee2
commit 6114b17389
10 changed files with 534 additions and 300 deletions

2
.gitignore vendored
View file

@ -3,4 +3,4 @@
!/lib/ManUtils/ManUtils.pm
!/lib/ManUtils/ManUtils.xs
indexer/target
web/target

View file

@ -2,18 +2,27 @@
all: ManUtils indexer
ManUtils: lib/ManUtils/Build
cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
lib/ManUtils/Build: lib/ManUtils/Build.PL
cd lib/ManUtils && perl Build.PL
ManUtils: lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm: lib/ManUtils/Build.PL lib/ManUtils/ManUtils.pm lib/ManUtils/ManUtils.xs web/target/release/libweb.a
test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs
cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
touch lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
web/target/release/libweb.a: web/Cargo.toml web/src/*.rs
cd web && cargo build --release
#strip --strip-unneeded web/target/release/libweb.a
indexer: indexer/target/release/indexer
indexer/target/release/indexer: indexer/Cargo.toml indexer/src/*.rs
cd indexer && cargo build --release
clean:
cd lib/ManUtils && ./Build distclean
rm -rf lib/ManUtils/inst
cd indexer && cargo clean
cd web && cargo clean

6
README
View file

@ -13,10 +13,11 @@ Requirements
General:
perl: A somewhat recent version (no idea which, due to my XS usage)
postgresql: Also a somewhat recent version
rust + cargo (1.13+)
www/ & lib/ & webs/: (Website)
DBI
DBD::Pg
www/: (Website)
TUWF
JSON::XS
AnyEvent
@ -24,7 +25,6 @@ Requirements
util/ & indexer/: (DB updating and package synchronisation stuff)
curl
psql
cargo + rust (1.13+)
Contact

View file

@ -6,6 +6,8 @@ Module::Build->new(
dist_name => 'ManUtils',
dist_version_from => 'ManUtils.pm',
dist_abstract => 'Utils for manned.org',
license => 'MIT',
extra_linker_flags => '../../web/target/release/libweb.a',
pm_files => {
'ManUtils.pm' => 'lib/ManUtils.pm',
},

View file

@ -32,11 +32,12 @@ sub fmt {
# Other .so's should be handled by html()
$input =~ s/^\.so (.+)$/.in -10\n.sp\n\[\[\[MANNEDINCLUDE$1\]\]\]/mg;
$input =
# Disable hyphenation, since that screws up man page references. :-(
$input = ".hy 0\n.de hy\n..\n$input";
".hy 0\n.de hy\n..\n"
# Emulate man-db's --nj option
$input = ".na\n.de ad\n..\n$input";
.".na\n.de ad\n..\n"
.$input;
$input = encode_utf8($input);
@ -100,6 +101,4 @@ sub fmt_block {
$out;
}
1;

View file

@ -2,278 +2,13 @@
#include "perl.h"
#include "XSUB.h"
// Buffer descriptor returned by grotty2html_wrap() and handed back unchanged
// to grotty2html_free(). The field order and types mirror the #[repr(C)]
// StringWrap struct on the Rust side.
struct StringWrap {
char *buf; // Start of the output bytes
unsigned long long len, cap; // Bytes in use / bytes allocated
};
// Convert grotty output to HTML for use in a <pre> tag.
// It is assumed that the given input string is valid UTF-8, either represented
// as a Perl Unicode string, or as a UTF-8 encoded byte string. The data may
// not contain the 0 character.
// The formatted HTML is returned as a Perl Unicode string.
// It is also assumed that hyphenation has been disabled when generating the
// grotty output.
// This implementation really is fast enough for "real-time" use in the website
// code, very much unlike my experiments with Perl. My previous Perl
// implementation took about 1.5s for rsync(1), whereas I've not seen this
// implementation take more than 15ms.
// TODO: Unicode characters aren't truncated correctly when a line exceeds
// MAXLINE bytes. I've only seen this happening on man pages that grotty
// couldn't wrap, e.g. some Japanese and Chinese mans.
// (Ideally, I'd tell grotty how to wrap those correctly)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// Size in bytes of the per-line buffers in ctx_t.
#define MAXLINE 1024
// Per-byte formatting flags stored in ctx_t.flags.
#define LB 1
#define LI 2
// Conversion state shared by all functions below: the input, the output
// string and the line that is currently being collected.
typedef struct ctx_t {
const char *src; // Pointer to the source data, or what's left of it.
SV *dest; // Destination string to write to.
// Current line
char line[MAXLINE];
char flags[MAXLINE]; // 0 = no fmt, LB = bold, LI = italic. (No combinations allowed)
int linelen; // Number of bytes currently stored in line[] and flags[]
int noref; // 1 if the current line shouldn't be checked for references. (Used for first and last line)
} ctx_t;
// Appends a single displayed byte to the output string, replacing the HTML
// special characters '>', '<' and '&' by their entities.
// The double quote is deliberately left alone: the output is never placed
// inside an attribute value by this function, so escaping it is not needed.
static inline void flushescape(ctx_t *x, char c) {
  if(c == '>')
    sv_catpvn(x->dest, "&gt;", 4);
  else if(c == '<')
    sv_catpvn(x->dest, "&lt;", 4);
  else if(c == '&')
    sv_catpvn(x->dest, "&amp;", 5);
  else
    sv_catpvn(x->dest, &c, 1); // Any other byte is copied through as-is
}
// HTML-escapes a chunk of the current line, wraps it in <b>/<i> tags according
// to the formatting flags and appends the result to the output string.
// The chunk is self-contained: no formatting is assumed to be open when it
// starts, and whatever tag is still open at its end is closed again.
//   s: first byte of the chunk
//   f: formatting flag belonging to *s (advanced in lockstep with s)
//   e: one past the last byte of the chunk
static void flushchunk(ctx_t *x, const char *s, const char *f, const char *e) {
  int cur = 0; // Formatting that is currently open: 0, LB or LI

  for(; s != e; s++, f++) {
    // Underscores and spaces inherit the formatting of the previous
    // character. The grotty escape sequences don't work well for the
    // underscore, and the difference is invisible anyway.
    if(cur != *f && *s != '_' && *s != ' ') {
      if(cur)
        sv_catpvn(x->dest, cur == LB ? "</b>" : "</i>", 4);
      cur = *f;
      if(cur)
        sv_catpvn(x->dest, cur == LB ? "<b>" : "<i>", 3);
    }
    flushescape(x, *s);
  }

  // Never leave a tag open past the end of the chunk.
  if(cur)
    sv_catpvn(x->dest, cur == LB ? "</b>" : "</i>", 4);
}
// True for bytes that may appear in a man page name: alphanumerics, '_', '-'
// and '.'. The argument is evaluated more than once, so it must not have side
// effects. The cast is required because isalnum() is only defined for values
// representable as unsigned char (or EOF); plain char may be negative for
// UTF-8 bytes.
#define ismanchar(x) (isalnum((unsigned char)(x)) || (x) == '_' || (x) == '-' || (x) == '.')
// Writes the "Included manual page" link for a [[[MANNEDINCLUDE<path>]]] line.
// The caller (flushline) has already checked that the current line is longer
// than 20 bytes, starts with "[[[MANNEDINCLUDE" and ends with "]]]".
// The link target is the last path component, the link text is the full path.
// Both are taken from the man page itself, so they are HTML-escaped here
// instead of being copied verbatim into the output.
static void flushinclude(ctx_t *x) {
  char *s = x->line;
  char *p;
  s[x->linelen-3] = 0; // Cut off the trailing "]]]"
  s += 16;             // Skip the leading "[[[MANNEDINCLUDE"
  char *fn = strrchr(s, '/');
  fn = fn ? fn+1 : s;

  sv_catpv(x->dest, "&gt;&gt; Included manual page: <a href=\"/");
  // Replace U+2010 (bytes e2 80 90) with - (U+2d). ASCII dashes are replaced
  // with an Unicode dash when passed through groff, which we need to revert in
  // order to get the link working. (Apparently it recognizes man page
  // references and URLs, as it doesn't do this replacement in those situations.)
  while(*fn) {
    if(*fn == (char)0xe2 && fn[1] == (char)0x80 && fn[2] == (char)0x90) {
      sv_catpvn(x->dest, "-", 1);
      fn += 3;
    } else if(*fn == '"') {
      // flushescape() leaves quotes alone, but here we are inside an attribute.
      sv_catpvn(x->dest, "&quot;", 6);
      fn++;
    } else {
      flushescape(x, *fn);
      fn++;
    }
  }
  sv_catpv(x->dest, "\">");
  for(p = s; *p; p++)
    flushescape(x, *p);
  sv_catpv(x->dest, "</a>");
}
// HTML-escapes and "flushes" the current line (x->line, NUL-terminated by the
// caller) to the output string, followed by a newline.
// - A [[[MANNEDINCLUDE..]]] line is turned into an include link.
// - If x->noref is set, the line is written with formatting only.
// - Otherwise man page references and http(s) URLs are turned into links.
// The line buffer is temporarily modified (NUL bytes are written into it)
// while links are being generated.
static void flushline(ctx_t *x) {
  static const char eol[] = "\n";
  // s scans the line, es marks the start of the part not yet written out.
  char *s = x->line, *es = x->line;

  // Special-case [[[MANNEDINCLUDE ..]]] directive
  if(x->linelen > 20 && *s == '[' && strncmp(s, "[[[MANNEDINCLUDE", 16) == 0 && strcmp("]]]", s+x->linelen-3) == 0) {
    flushinclude(x);
    goto end;
  }

  if(x->noref) {
    flushchunk(x, x->line, x->flags, x->line+x->linelen);
    goto end;
  }

  // Writes everything between es and the given position, then moves es there.
#define flush(end) do {\
    flushchunk(x, es, x->flags+(es-x->line), end);\
    es = end;\
  } while(0)

  while(*s) {
    // Man page reference.
    // Detected by the "(x)", but then checked backwards in the buffer to find
    // the start of the reference. This is pretty fast. Fails on:
    // - JSON.3pm: JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
    // The cast on s[3] is needed because isalnum() is undefined for negative
    // values, which plain char can hold for UTF-8 bytes.
    if(*s == '(' && (('1' <= s[1] && s[1] <= '9') || s[1] == 'n') && s[2] == ')' && !isalnum((unsigned char)s[3])) {
      char *n = s-1;
      while(n >= es && ismanchar(*n))
        n--;
      if(++n < s) {
        flush(n);
        *s = 0; // Terminate the name so it can be printed with %s
        sv_catpvf(x->dest, "<a href=\"/%s.%c\">%s(%c)</a>", n, s[1], n, s[1]);
        s += 3;
        es = s;
        continue;
      }
    }

    // HTTP(s) URL.
    // This is just a simple q{https?://[^ ][.,;"\)>]?( |$)} match, doesn't
    // always work right, e.g.:
    // - https://manned.org/spu_run/414316a1 -> URL wrapped to new line
    // Note: Don't use strncmp() before manually checking for 'http'. The parse
    // time is otherwise increased by a factor 2.
    if(s[0] == 'h' && s[1] == 't' && s[2] == 't' && s[3] == 'p' && (strncmp(s, "http://", 7) == 0 || strncmp(s, "https://", 8) == 0)) {
      // Find the end of the URL (space or some other weird character).
      char *sep = s;
      while(*sep && *sep != '>' && *sep != '<' && *sep != ' ' && *sep != '"')
        sep++;
      char *sp = sep;
      if(sp > s+10) {
        flush(s);
        // Temporarily NUL-terminate the URL; endchr remembers the byte that
        // was overwritten so that it can be restored afterwards.
        char endchr = *sp;
        *(sp--) = 0;
        // A single trailing punctuation character is not part of the URL.
        if(*sp == '.' || *sp == ',' || *sp == ';' || *sp == ')') {
          sp[1] = endchr;
          endchr = *sp;
          *(sp--) = 0;
        }
        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
        // - https://manned.org/troff/c4467840
        // - https://manned.org/pass/78413b49
        // - https://manned.org/empathy-accounts/8c05b2c1
        // - https://manned.org/urn/8cb83e85
        // - https://manned.org/wine/4a699a22
        if(*sp == '\xa9' && *(sp-1) == '\x9f' && *(sp-2) == '\xe2') {
          sp[1] = endchr;
          sp -= 3;
          endchr = sp[1];
          sp[1] = 0;
        }
        sv_catpvf(x->dest, "<a href=\"%s\" rel=\"nofollow\">%s</a>", s, s);
        *(++sp) = endchr;
        es = s = sp;
        continue;
      }
    }

    s++;
  }
  flush(s);
#undef flush

end:
  sv_catpvn(x->dest, eol, sizeof(eol)-1);
}
// Adds a byte and its formatting flag to the current line, and calls
// flushline() when the line is complete. A line is complete when a '\n' is
// seen or when the line buffer is full; in the latter case the byte starts a
// new line. Carriage returns are dropped.
// TODO: Convert \t into spaces? The rest of the code is written with the
// assumption that \t does not occur in the string. I've not seen grotty output
// tabs yet, but it's still a good idea to define what *we* do with tabs.
static void appendline(ctx_t *x, char c, char f) {
  if(c == '\r')
    return;
  // line[] and flags[] hold MAXLINE bytes and line[] also needs room for the
  // terminating NUL, so at most MAXLINE-1 bytes may be stored. (The previous
  // check, linelen > MAXLINE+1, allowed writes past the end of both arrays.)
  if(c == '\n' || x->linelen >= MAXLINE-1) {
    x->line[x->linelen] = 0;
    flushline(x);
    x->linelen = 0;
    x->noref = 0;
    if(c == '\n')
      return;
  }
  x->line[x->linelen] = c;
  x->flags[x->linelen] = f;
  x->linelen++;
}
// Parses the grotty escapes in x->src and calls appendline() for each byte of
// displayed text, together with its formatting flag. Byte 8 is the escape
// character; "f ESC c" displays c in italic when f is '_' and in bold
// otherwise. After the input has been consumed, one more newline is appended
// with noref set.
static void parselines(ctx_t *x) {
  int i;
  const char *buf = x->src;

  while(*buf) {
    int c1 = UTF8SKIP(buf); // Byte length of the character at buf

    // Escape character right after a formatting code? Ignore the escape
    // character and formatting code after that. Grotty sometimes
    // double-formats a character, so you get "f ESC c ESC f ESC c", which you
    // should read as "(f ESC c) ESC (f ESC c)".
    if(*buf == 8 && buf[1] && buf[1+UTF8SKIP(buf+1)] == 8 && buf[2+UTF8SKIP(buf+1)]) {
      int c2 = UTF8SKIP(buf+1);
      buf += 2 + c2 + UTF8SKIP(buf+1+c2);
      continue;
    }

    // Formatting code: every byte of the formatted character gets the flag.
    if(buf[c1] == 8 && buf[c1+1]) {
      int c2 = UTF8SKIP(buf+c1+1);
      for(i=0; i<c2; i++)
        appendline(x, buf[c1+i+1], *buf == '_' ? LI : LB);
      buf += c1+c2+1;
      continue;
    }

    // Regular character. A newline that is the very last input byte ends the
    // last line, which must not be checked for references.
    if(*buf == '\n' && !buf[1])
      x->noref = 1;
    appendline(x, *buf, 0);
    buf++;
  }

  x->noref = 1;
  appendline(x, '\n', 0);
}
struct StringWrap grotty2html_wrap(const char *, unsigned long long);
void grotty2html_free(struct StringWrap);
MODULE = ManUtils PACKAGE = ManUtils
@ -281,20 +16,13 @@ MODULE = ManUtils PACKAGE = ManUtils
SV *
html(str)
SV *str
INIT:
ctx_t *x = malloc(sizeof(ctx_t));
CODE:
x->src = SvPV_nolen(str);
x->dest = newSVpv("", 0);
x->linelen = 0;
x->noref = 1;
parselines(x);
// Set the UTF8 flag *after* generating the result string. For some reason
// that prevents sv_catpvf() from interpreting our C strings as something
// other than UTF-8.
SvUTF8_on(x->dest);
RETVAL = x->dest;
free(x);
/* Pass the raw bytes of the argument to the Rust converter and copy its
 * result into a new Perl scalar that is flagged as UTF-8. */
STRLEN len;
char *inbuf = SvPV(str, len);
struct StringWrap buf = grotty2html_wrap(inbuf, len);
/* newSVpvn() and not newSVpv(): the latter runs strlen() on the buffer when
 * the length is 0, but the Rust buffer is not NUL-terminated and may not
 * point to readable memory when the result is empty. */
SV *dest = newSVpvn(buf.buf, buf.len);
/* The scalar owns a copy now, so the Rust allocation can be released. */
grotty2html_free(buf);
SvUTF8_on(dest);
RETVAL = dest;
OUTPUT:
RETVAL

121
web/Cargo.lock generated Normal file
View file

@ -0,0 +1,121 @@
[root]
name = "web"
version = "0.1.0"
dependencies = [
"lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "aho-corasick"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "kernel32-sys"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "lazy_static"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libc"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "memchr"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "thread-id"
version = "3.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "thread_local"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "unreachable"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "utf8-ranges"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "void"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi-build"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata]
"checksum aho-corasick 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4f660b942762979b56c9f07b4b36bb559776fbad102f05d6771e1b629e8fd5bf"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum lazy_static 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6abe0ee2e758cd6bc8a2cd56726359007748fbf4128da998b65d0b70f881e19b"
"checksum libc 0.2.19 (registry+https://github.com/rust-lang/crates.io-index)" = "9e030dc72013ed68994d1b2cbf36a94dd0e58418ba949c4b0db7eeb70a7a6352"
"checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4"
"checksum regex 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4278c17d0f6d62dfef0ab00028feb45bd7d2102843f80763474eeb1be8a10c01"
"checksum regex-syntax 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9191b1f57603095f105d317e375d19b1c9c5c3185ea9633a99a6dcbed04457"
"checksum thread-id 3.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4437c97558c70d129e40629a5b385b3fb1ffac301e63941335e4d354081ec14a"
"checksum thread_local 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7793b722f0f77ce716e7f1acf416359ca32ff24d04ffbac4269f44a4a83be05d"
"checksum unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1f2ae5ddb18e1c92664717616dd9549dde73f539f01bd7b77c2edb2446bdff91"
"checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122"
"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"

16
web/Cargo.toml Normal file
View file

@ -0,0 +1,16 @@
[package]
name = "web"
version = "0.1.0"
authors = ["yorhel"]
[lib]
name = "web"
crate-type = ["lib", "staticlib"]
[dependencies]
regex = "0.2.1"
lazy_static = "0.2.2"
# Add debugging symbols even in release mode, in order to help with profiling.
[profile.release]
debug = true

348
web/src/lib.rs Normal file
View file

@ -0,0 +1,348 @@
#![feature(test)]
extern crate test;
extern crate regex;
#[macro_use] extern crate lazy_static;
use std::fmt::Write;
use regex::Regex;
/// Text style of a single output character.
#[derive(Clone,Copy,PartialEq,Eq)]
enum FmtChar {
    Regular,
    Italic,
    Bold,
}

/* Simple state machine to parse the following grammar, where ESC is the
 * character with code 8:
 *
 *   fmtchar       = escape | double-escape | char
 *   escape        = tag ESC char
 *   double-escape = ESC tag ESC char
 *   tag           = "_"   # italic
 *                 | char  # bold
 *
 * This format is described as "old behaviour" in grotty(1). The double-escape
 * seems to be a weird glitch, and can be interpreted as
 * "(tag ESC char) ESC (tag ESC char)". This parser simply skips over any such
 * sequence starting with ESC. */
enum CharParse {
    /// Nothing is pending.
    Start,
    /// One character is pending; it is either a plain 'char' or the tag of an 'escape'.
    One(char),
    /// A tag followed by ESC has been seen; the next character is the formatted one.
    Escape(char),
    /// Inside a double-escape; the value is the number of characters left to skip after the
    /// current one.
    DoubleEsc(u32),
}

impl CharParse {
    /// Feeds one input character into the state machine.
    ///
    /// Returns the next displayable character together with its formatting as soon as one is
    /// known, and `None` while more input is needed. A character pending in the `One` state at
    /// the end of the input is not emitted by this method; the caller has to handle it.
    fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
        const ESC: char = '\u{8}';

        // Work out the successor state and the emitted character in one go.
        let (next, emitted) = match *self {
            CharParse::Start if chr == ESC => (CharParse::DoubleEsc(2), None),
            CharParse::Start => (CharParse::One(chr), None),
            CharParse::One(pending) if chr == ESC => (CharParse::Escape(pending), None),
            CharParse::One(pending) => (CharParse::One(chr), Some((pending, FmtChar::Regular))),
            CharParse::Escape(tag) => {
                let style = if tag == '_' { FmtChar::Italic } else { FmtChar::Bold };
                (CharParse::Start, Some((chr, style)))
            },
            CharParse::DoubleEsc(0) => (CharParse::Start, None),
            CharParse::DoubleEsc(left) => (CharParse::DoubleEsc(left - 1), None),
        };
        *self = next;
        emitted
    }
}
/// Writes the HTML tags needed to switch from formatting `old` to formatting `new`: the closing
/// tag of `old` followed by the opening tag of `new`. `Regular` has no tags, and nothing is
/// written when both are equal.
fn pushfmt(out: &mut String, old: FmtChar, new: FmtChar) {
    if new == old {
        return;
    }
    match old {
        FmtChar::Regular => {},
        FmtChar::Italic => out.push_str("</i>"),
        FmtChar::Bold => out.push_str("</b>"),
    }
    match new {
        FmtChar::Regular => {},
        FmtChar::Italic => out.push_str("<i>"),
        FmtChar::Bold => out.push_str("<b>"),
    }
}
// Intermediate text buffer. This buffer contains the entire HTML-escaped man page and a list of
// byte offsets where text formatting changes are performed.
struct FmtBuf {
// The HTML-escaped text; the escaping is done by push().
buf: String,
// List of formatting chunks. The number is the byte offset into 'buf' where the formatting
// ends. E.g. [(5,Regular),(10,Bold),(15,Italic)] means:
// [0..5] is Regular
// [5..10] is Bold
// [10..15] is Italic
fmt: Vec<(usize,FmtChar)>,
// Formatting of the text appended since the last entry in 'fmt'.
lastfmt: FmtChar,
}
// Output state, shared by the FmtBuf::flush* methods while a buffer is written out.
struct Flush<'a, 'b> {
// Destination of the generated HTML.
out: &'a mut String,
idx: usize, // Byte offset into FmtBuf.buf up to which the buffer has been processed
fmt: std::iter::Peekable<std::slice::Iter<'b, (usize,FmtChar)>>, // Iterator over FmtBuf.fmt
}
impl FmtBuf {
    /// Appends one displayed character with the given formatting to the buffer, HTML-escaping
    /// it on the way and recording a chunk boundary in `fmt` when the formatting changes.
    fn push(&mut self, chr: char, fmt: FmtChar) {
        // Consider space and underscore to have the same formatting as the previous character;
        // this generates smaller HTML, and you can't see the difference anyway.
        if self.lastfmt != fmt && !(chr == ' ' || chr == '_') {
            self.fmt.push((self.buf.len(), self.lastfmt));
            self.lastfmt = fmt;
        }
        match chr {
            '>' => self.buf.push_str("&gt;"),
            '<' => self.buf.push_str("&lt;"),
            '&' => self.buf.push_str("&amp;"),
            // '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code
            _ => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
        }
    }

    /// Flushes all unprocessed bytes until `end` to the output, wrapped in the formatting tags
    /// recorded in `fmt`. Every call starts without open tags and closes whatever tag is open
    /// when `end` is reached, so links can be emitted between two calls.
    fn flush_to(&self, st: &mut Flush, end: usize) {
        let mut lastfmt = FmtChar::Regular;
        while st.idx < end {
            let &&(chunk, fmt) = st.fmt.peek()
                .expect("flush() appends a final chunk that ends at the end of the buffer");
            // Only consume the chunk if it is written completely; otherwise the remainder is
            // needed by the next call.
            let chunk = if chunk > end {
                end
            } else {
                st.fmt.next();
                chunk
            };
            pushfmt(st.out, lastfmt, fmt);
            st.out.push_str(&self.buf[st.idx..chunk]);
            st.idx = chunk;
            lastfmt = fmt;
        }
        pushfmt(st.out, lastfmt, FmtChar::Regular);
    }

    /// Consumes the input buffer until `end` without generating output.
    fn flush_skip(&self, st: &mut Flush, end: usize) {
        st.idx = end;
        // Drop all chunks that end at or before the new position. The iterator runs dry when
        // `end` is the end of the buffer, which must not be treated as an error.
        while st.fmt.peek().map_or(false, |chunk| chunk.0 <= end) {
            st.fmt.next();
        }
    }

    /// Handles a `[[[MANNEDINCLUDE<path>]]]` directive. `start` and `end` are the byte offsets of
    /// the start and the end of the `[[[MANNEDINCLUDE` prefix. If the text that follows is not a
    /// valid directive, nothing is written and the text is left to be flushed as-is.
    fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
        lazy_static!(
            // Group 1 is the full path, group 2 its last component.
            static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap();
        );
        let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };

        self.flush_to(st, start);
        st.out.push_str("\n&gt;&gt; Included man page: <a href=\"/");
        for c in m[2].chars() {
            match c {
                // Replace U+2010 with - (U+2d). ASCII dashes are replaced with an Unicode dash
                // when passed through groff, which we need to revert in order to get the link
                // working. (Apparently it recognizes man page references and URLs, as it doesn't
                // do this replacement in those situations.)
                '\u{2010}' => st.out.push('-'),
                // push() does not escape quotes, but this text ends up inside an attribute.
                '"' => st.out.push_str("&quot;"),
                _ => st.out.push(c),
            }
        }
        st.out.push_str("\">");
        st.out.push_str(&m[1]);
        st.out.push_str("</a>");
        self.flush_skip(st, end + m[0].len());
    }

    /// Turns the URL starting at byte offset `start` into a link. Nothing is written if no
    /// character that ends a URL is found in the remainder of the buffer.
    fn flush_url(&self, st: &mut Flush, start: usize) {
        lazy_static!(
            // Some characters considered to never be part of a URL.
            // (Note that we can't match literal ><" because of the HTML escaping done previously)
            static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap();
        );
        let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
        self.flush_to(st, start);
        let url = &self.buf[start..(start + urlend.start())];
        // Trailing punctuation is not considered part of the URL.
        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
        // - https://manned.org/troff/c4467840
        // - https://manned.org/pass/78413b49
        // - https://manned.org/empathy-accounts/8c05b2c1
        // - https://manned.org/urn/8cb83e85
        // TODO: Check the character before the start of the URL, and only remove ) if there is a
        // starting ( before it.
        let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩');
        write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
        // Whatever was trimmed off stays in the buffer and is flushed as regular text.
        self.flush_skip(st, start + url.len());
    }

    /// Turns the man page reference that ends at byte offset `end` (just after the closing
    /// bracket) into a link. Nothing is written if the character after the reference shows that
    /// it is part of a longer word.
    fn flush_ref(&self, st: &mut Flush, end: usize) {
        // We know where the closing bracket is in the string, so this regex is used to search
        // backwards from there and find the start of the reference.
        lazy_static!(
            static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap();
        );
        // Disallow some characters following a reference
        if self.buf.len() > end {
            let ch = self.buf[end..].chars().next().unwrap();
            if ch == '-' || ch == '_' || ch.is_alphanumeric() {
                return;
            }
        }
        // The search regex in flush() has already matched a reference ending at 'end', and
        // everything it matches is also matched by REF.
        let m = REF.captures(&self.buf[..end]).unwrap();
        self.flush_to(st, end - m[0].len());
        self.flush_skip(st, end);
        write!(st.out, "<a href=\"/{}.{}\">{}</a>", &m[1], &m[2], &m[0]).unwrap();
    }

    /// Writes the whole buffer to `out` as HTML: formatting tags are inserted, and include
    /// directives, URLs and man page references are converted into links.
    fn flush(&mut self, out: &mut String) {
        // Terminate the last chunk, so that the chunk list covers the entire buffer.
        self.fmt.push((self.buf.len(), FmtChar::Regular));

        // Find the indices where the first line ends, and the last line starts. These are used to
        // efficiently disable reference formatting on the first and last line.
        let firstlineend = self.buf.find('\n').unwrap_or(self.buf.len());
        let lastlinestart = self.buf.trim_right_matches('\n').rfind('\n').unwrap_or(0);

        // This regex is used to quickly *find* interesting patterns, any further validation
        // and processing is done afterwards by the (slower) specialized flush_ methods.
        lazy_static!(
            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap();
        );

        let mut st = Flush{
            out: out,
            idx: 0,
            fmt: self.fmt.iter().peekable(),
        };

        for i in SEARCH.find_iter(&self.buf) {
            // This can happen with overlapping detections, e.g. when something inside a URL looks
            // like a man page reference.
            if st.idx > i.start() {
                continue;
            }

            let allowref = i.start() > firstlineend && i.start() < lastlinestart;
            // The last byte of the match tells which of the three alternatives was found.
            match self.buf.as_bytes()[i.end()-1] {
                0x45 /* E */ => self.flush_include(&mut st, i.start(), i.end()),
                0x2F /* / */ if allowref => self.flush_url(&mut st, i.start()),
                _ if allowref => self.flush_ref(&mut st, i.end()),
                _ => {}
            }
        }
        self.flush_to(&mut st, self.buf.len());
    }
}
/// Converts grotty output (using the overstrike escapes parsed by `CharParse`) into HTML:
/// special characters are escaped, bold and italic text is wrapped in `<b>`/`<i>` tags, and
/// include directives, URLs and man page references are converted into links.
pub fn grotty2html(input: &str) -> String {
    let mut parser = CharParse::Start;
    let mut page = FmtBuf{
        buf: String::with_capacity(128),
        fmt: Vec::with_capacity(128),
        lastfmt: FmtChar::Regular,
    };

    // The whole page is collected first and flushed once at the end. Flushing (and resetting
    // the buffer) after every '\n' is also possible, but not as fast.
    for (chr, fmt) in input.chars().filter_map(|c| parser.update(c)) {
        page.push(chr, fmt);
    }

    // The parser emits plain characters one step late, so the last one may still be pending.
    if let CharParse::One(chr) = parser {
        page.push(chr, FmtChar::Regular);
    }

    let mut html = String::with_capacity(input.len());
    page.flush(&mut html);
    html
}
use std::os::raw::c_ulonglong;
/// The raw parts of a byte vector, in a layout that can be passed by value to and from C.
/// Created by `grotty2html_wrap` and consumed by `grotty2html_free`.
#[repr(C)]
pub struct StringWrap {
// Pointer to the first byte of the allocation.
buf: *mut u8,
// Number of bytes in use.
len: c_ulonglong,
// Capacity of the allocation in bytes.
cap: c_ulonglong,
}
/// C entry point for `grotty2html`.
///
/// `in_buf` must point to `in_len` readable bytes (it may be null when `in_len` is 0). The input
/// is expected to be UTF-8; invalid sequences are replaced by U+FFFD instead of being trusted,
/// because treating arbitrary bytes as `str` is undefined behaviour.
///
/// The returned buffer holds the generated HTML as UTF-8, is not NUL-terminated, and must be
/// released with `grotty2html_free`.
#[no_mangle]
pub extern fn grotty2html_wrap(in_buf: *const u8, in_len: c_ulonglong) -> StringWrap {
    let bytes: &[u8] = if in_buf.is_null() || in_len == 0 {
        // from_raw_parts() requires a non-null pointer, even for an empty slice.
        &[]
    } else {
        // SAFETY: the caller guarantees that in_buf points to in_len readable bytes, and the
        // slice is not used after this function returns.
        unsafe { std::slice::from_raw_parts(in_buf, in_len as usize) }
    };
    let input = String::from_utf8_lossy(bytes);

    let mut out = grotty2html(&input).into_bytes();
    let r = StringWrap {
        buf: out.as_mut_ptr(),
        len: out.len() as c_ulonglong,
        cap: out.capacity() as c_ulonglong,
    };
    // Ownership of the allocation moves to the caller; grotty2html_free() reclaims it.
    std::mem::forget(out);
    r
}
/// Releases a buffer returned by `grotty2html_wrap`.
///
/// `buf` must be passed back exactly as it was returned, and must not be used or freed again
/// afterwards. A buffer with a null pointer is ignored.
#[no_mangle]
pub extern fn grotty2html_free(buf: StringWrap) {
    if buf.buf.is_null() {
        return;
    }
    // SAFETY: the caller guarantees that the three fields are the unmodified raw parts of the
    // Vec<u8> that grotty2html_wrap() leaked, so rebuilding the Vec and dropping it frees that
    // allocation exactly once.
    drop(unsafe { Vec::from_raw_parts(buf.buf, buf.len as usize, buf.cap as usize) });
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Read;
    use test::Bencher;

    /// Benchmarks `grotty2html` on the contents of the file at path `f`. The file is read once,
    /// outside of the measured loop.
    fn bench_file(b: &mut Bencher, f: &str) {
        let mut contents = String::new();
        std::fs::File::open(f).unwrap().read_to_string(&mut contents).unwrap();
        b.iter(|| {
            test::black_box(grotty2html(&contents));
        });
    }

    #[bench]
    fn bench_rsync(b: &mut Bencher) {
        bench_file(b, "t/rsync.1.output");
    }

    #[bench]
    fn bench_ncdu(b: &mut Bencher) {
        bench_file(b, "t/ncdu.1.output");
    }

    #[bench]
    fn bench_javadoc(b: &mut Bencher) {
        bench_file(b, "t/javadoc.1.output");
    }

    /*
    #[bench]
    fn bench_wfilter(b: &mut test::Bencher) {
        bench_file(b, "t/wfilter.4.output");
    }
    */
}

11
web/src/main.rs Normal file
View file

@ -0,0 +1,11 @@
extern crate web;
use std::io::{stdin,Read};
/// Reads grotty output from standard input, converts it to HTML and prints the result
/// (followed by a newline) to standard output. Panics if the input cannot be read as UTF-8.
fn main() {
    let input_stream = stdin();
    let mut page = String::new();
    input_stream.lock().read_to_string(&mut page).unwrap();

    let html = web::grotty2html(&page);
    println!("{}", html);
}