From 746889851c3e32acb596533f4782ff42acad6c4a Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 15 Jan 2017 20:27:16 +0100 Subject: [PATCH] A few more HTML conversion improvements - Fix segfault on empty output (bug was in XS code) - Still better end-of-URL detection - Recognize a few common multicharacter sections in man references --- Makefile | 2 +- lib/ManUtils/ManUtils.xs | 2 +- web/src/lib.rs | 19 ++++++++++++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 79f2a7c..3ffb83b 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ all: ManUtils indexer ManUtils: lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm: lib/ManUtils/Build.PL lib/ManUtils/ManUtils.pm lib/ManUtils/ManUtils.xs web/target/release/libweb.a - test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs + -test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst touch lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm diff --git a/lib/ManUtils/ManUtils.xs b/lib/ManUtils/ManUtils.xs index bb70e0a..e162c68 100644 --- a/lib/ManUtils/ManUtils.xs +++ b/lib/ManUtils/ManUtils.xs @@ -20,7 +20,7 @@ html(str) STRLEN len; char *inbuf = SvPV(str, len); struct StringWrap buf = grotty2html_wrap(inbuf, len); - SV *dest = newSVpv(buf.buf, buf.len); + SV *dest = buf.len ? newSVpv(buf.buf, buf.len) : newSVpv("", 0); grotty2html_free(buf); SvUTF8_on(dest); RETVAL = dest; diff --git a/web/src/lib.rs b/web/src/lib.rs index 367b69e..5f1fa97 100644 --- a/web/src/lib.rs +++ b/web/src/lib.rs @@ -181,7 +181,9 @@ impl FmtBuf { // - https://manned.org/urn/8cb83e85 // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching ( // inside the URL. - let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\''); + let url = url.trim_right_matches(|c| + match c { '.' | ',' | ';' | ')' | '⟩' | '\'' | ':' | ']' | '}' => true, _ => false } + ); if url.len() < 10 { return; } @@ -194,8 +196,19 @@ impl FmtBuf { fn flush_ref(&self, st: &mut Flush, end: usize) { // We know where the closing bracket is in the string, so this regex is used to search // backwards from there and find the start of the reference. + // There are a lot of 'special' multi-character section names, so it might not make sense + // to parse all of them. Here's an estimate of a few 'special' section references, in + // number of man pages using the reference (using ~ '%(3pm)%' on the 2017-01-14 database): + // - 3pm 17810 + // - 3w 8729 (just a few packages) + // - 3tcl 2000 + // - 3tk 758 + // - 3p 309 + // - 3perl 268 + // - 3ssl 198 lazy_static!( - static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap(); + // XXX: Make sure to keep this regex in sync with the one in flush() + static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl]|3tcl|3pm|3tk)\)$").unwrap(); ); // Disallow some characters following a reference @@ -223,7 +236,7 @@ impl FmtBuf { // This regex is used to quickly *find* interesting patterns, any further validation // and processing is done afterwards by the (slower) specialized flush_ methods. lazy_static!( - static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap(); + static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\(([1-9nl]|3tcl|3pm|3tk)\))").unwrap(); ); let mut st = Flush{