A few more HTML conversion improvements

- Fix segfault on empty output (bug was in XS code)
- Further improve end-of-URL detection
- Recognize a few common multi-character section names in man page references
This commit is contained in:
Yorhel 2017-01-15 20:27:16 +01:00
parent 1ccc86ce86
commit 746889851c
3 changed files with 18 additions and 5 deletions

View file

@ -6,7 +6,7 @@ all: ManUtils indexer
ManUtils: lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm
lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm: lib/ManUtils/Build.PL lib/ManUtils/ManUtils.pm lib/ManUtils/ManUtils.xs web/target/release/libweb.a
test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs
-test lib/ManUtils/ManUtils.xs -ot web/target/release/libweb.a && touch -r web/target/release/libweb.a lib/ManUtils/ManUtils.xs
cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
touch lib/ManUtils/inst/lib/perl5/x86_64-linux/ManUtils.pm

View file

@ -20,7 +20,7 @@ html(str)
STRLEN len;
char *inbuf = SvPV(str, len);
struct StringWrap buf = grotty2html_wrap(inbuf, len);
SV *dest = newSVpv(buf.buf, buf.len);
SV *dest = buf.len ? newSVpv(buf.buf, buf.len) : newSVpv("", 0);
grotty2html_free(buf);
SvUTF8_on(dest);
RETVAL = dest;

View file

@ -181,7 +181,9 @@ impl FmtBuf {
// - https://manned.org/urn/8cb83e85
// TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
// inside the URL.
let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\'');
let url = url.trim_right_matches(|c|
match c { '.' | ',' | ';' | ')' | '⟩' | '\'' | ':' | ']' | '}' => true, _ => false }
);
if url.len() < 10 {
return;
}
@ -194,8 +196,19 @@ impl FmtBuf {
fn flush_ref(&self, st: &mut Flush, end: usize) {
// We know where the closing bracket is in the string, so this regex is used to search
// backwards from there and find the start of the reference.
// There are a lot of 'special' multi-character section names, so it might not make sense
// to parse all of them. Here's an estimate of a few 'special' section references, in
// number of man pages using the reference (using ~ '%(3pm)%' on the 2017-01-14 database):
// - 3pm 17810
// - 3w 8729 (just a few packages)
// - 3tcl 2000
// - 3tk 758
// - 3p 309
// - 3perl 268
// - 3ssl 198
lazy_static!(
static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap();
// XXX: Make sure to keep this regex in sync with the one in flush()
static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl]|3tcl|3pm|3tk)\)$").unwrap();
);
// Disallow some characters following a reference
@ -223,7 +236,7 @@ impl FmtBuf {
// This regex is used to quickly *find* interesting patterns, any further validation
// and processing is done afterwards by the (slower) specialized flush_ methods.
lazy_static!(
static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap();
static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\(([1-9nl]|3tcl|3pm|3tk)\))").unwrap();
);
let mut st = Flush{