Whole bunch of HTML conversion improvements
- Grotty escape sequences are now better interpreted. I feel rather stupid for not realizing earlier how those codes are supposed to work; it finally hit me when I read the BSD ul(1) source code.
- URL end detection is slightly better (much better than the old C code).
- Man page references containing ':' are now recognized (common in Perl modules).
- More efficient HTML escaping: there is no need to escape > and ".

There are still a bunch of improvements to make, but I already have much more confidence in the current implementation.
This commit is contained in:
parent
6114b17389
commit
1ccc86ce86
1 changed file with 46 additions and 43 deletions
|
|
@ -15,51 +15,51 @@ enum FmtChar {
|
|||
}
|
||||
|
||||
|
||||
/* Simple state machine to parse the following grammar:
|
||||
/* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is
|
||||
* described as "old behaviour" in grotty(1). Roughly:
|
||||
*
|
||||
* fmtchar = escape | double-escape | char
|
||||
* escape = tag ESC char
|
||||
* double-escape = ESC tag ESC char
|
||||
* tag = "_" # italic
|
||||
* | char # bold
|
||||
* '_' BACKSPACE 'x' -> 'x' is italic
|
||||
* 'x' BACKSPACE 'x' -> 'x' is bold
|
||||
* '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic
|
||||
*
|
||||
* This format is described as "old behaviour" in grotty(1). The double-escape
|
||||
* seems to be a weird glitch, and can be interpreted as
|
||||
* "(tag ESC char) ESC (tag ESC char)". This parser simply skips over any such
|
||||
* sequence starting with ESC. */
|
||||
* And other combinations are possible. The BACKSPACE character basically says "combine the
|
||||
* following character with the previous token", where "combining" means:
|
||||
*
|
||||
* a == b -> bold
|
||||
* a == _ -> b is italic
|
||||
* b == _ -> a is italic
|
||||
*
|
||||
* See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to
|
||||
* handle the (limited) output that grotty generates; we don't have to be fully compatible with
|
||||
* ul(1).
|
||||
*/
|
||||
enum CharParse {
|
||||
Start,
|
||||
One(char), // Seen a single character (either 'char' or 'escape')
|
||||
Escape(char), // Seen a single character + escape
|
||||
DoubleEsc(u32), // Inside a double-escape, indicates number of characters left to skip
|
||||
Token(char, FmtChar),
|
||||
Escape(char, FmtChar),
|
||||
}
|
||||
|
||||
|
||||
impl CharParse {
|
||||
fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
|
||||
match *self {
|
||||
|
||||
CharParse::Start => {
|
||||
*self = if chr == 8 as char { CharParse::DoubleEsc(2) } else { CharParse::One(chr) };
|
||||
*self = CharParse::Token(chr, FmtChar::Regular);
|
||||
None
|
||||
},
|
||||
|
||||
CharParse::One(c) =>
|
||||
CharParse::Token(c, f) =>
|
||||
if chr == 8 as char {
|
||||
*self = CharParse::Escape(c);
|
||||
*self = CharParse::Escape(c, f);
|
||||
None
|
||||
} else {
|
||||
*self = CharParse::One(chr);
|
||||
Some((c, FmtChar::Regular))
|
||||
*self = CharParse::Token(chr, FmtChar::Regular);
|
||||
Some((c, f))
|
||||
},
|
||||
|
||||
CharParse::Escape(c) => {
|
||||
*self = CharParse::Start;
|
||||
Some((chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold }))
|
||||
},
|
||||
|
||||
CharParse::DoubleEsc(n) => {
|
||||
*self = if n == 0 { CharParse::Start } else { CharParse::DoubleEsc(n-1) };
|
||||
CharParse::Escape(c, _) => {
|
||||
// TODO: Handle combination of bold & italic
|
||||
*self = CharParse::Token(chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold });
|
||||
None
|
||||
},
|
||||
}
|
||||
|
|
@ -107,11 +107,12 @@ impl FmtBuf {
|
|||
self.fmt.push((self.buf.len(), self.lastfmt));
|
||||
self.lastfmt = fmt;
|
||||
}
|
||||
// WARNING: The '"' character is not escaped, so care must be taken when copying a slice
|
||||
// into an attribute value! (In the current implementation, " is simply never part of an
|
||||
// attribute value)
|
||||
match chr {
|
||||
'>' => self.buf.push_str("&gt;"),
|
||||
'<' => self.buf.push_str("&lt;"),
|
||||
'&' => self.buf.push_str("&amp;"),
|
||||
// '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code
|
||||
_ => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
|
||||
}
|
||||
}
|
||||
|
|
@ -138,19 +139,19 @@ impl FmtBuf {
|
|||
// Consume the input buffer until 'end' without generating output
|
||||
fn flush_skip(&self, st: &mut Flush, end: usize) {
|
||||
st.idx = end;
|
||||
while st.fmt.peek().unwrap().0 <= st.idx {
|
||||
while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx {
|
||||
st.fmt.next();
|
||||
}
|
||||
}
|
||||
|
||||
fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
|
||||
lazy_static!(
|
||||
static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap();
|
||||
static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap();
|
||||
);
|
||||
let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };
|
||||
|
||||
self.flush_to(st, start);
|
||||
st.out.push_str("\n>> Included man page: <a href=\"/");
|
||||
st.out.push_str(">> Included manual page: <a href=\"/");
|
||||
// Replace ‐ (U+2010) with - (U+002D). ASCII dashes are replaced with a Unicode dash
|
||||
// when passed through groff, which we need to revert in order to get the link working.
|
||||
// (Apparently it recognizes man page references and URLs, as it doesn't do this
|
||||
|
|
@ -167,12 +168,10 @@ impl FmtBuf {
|
|||
fn flush_url(&self, st: &mut Flush, start: usize) {
|
||||
lazy_static!(
|
||||
// Some characters considered to never be part of a URL.
|
||||
// (Note that we can't match literal ><" because of the HTML escaping done previously)
|
||||
static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap();
|
||||
// (Note that we can't match literal '<' because of the HTML escaping done previously)
|
||||
static ref URLEND: Regex = Regex::new("(?:\"|&lt;|>|\\s)").unwrap();
|
||||
);
|
||||
let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
|
||||
|
||||
self.flush_to(st, start);
|
||||
let url = &self.buf[start..(start + urlend.start())];
|
||||
|
||||
// Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
|
||||
|
|
@ -180,10 +179,14 @@ impl FmtBuf {
|
|||
// - https://manned.org/pass/78413b49
|
||||
// - https://manned.org/empathy-accounts/8c05b2c1
|
||||
// - https://manned.org/urn/8cb83e85
|
||||
// TODO: Check the character before the start of the URL, and only remove ) if there is a
|
||||
// starting ( before it.
|
||||
let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩');
|
||||
// TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
|
||||
// inside the URL.
|
||||
let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\'');
|
||||
if url.len() < 10 {
|
||||
return;
|
||||
}
|
||||
|
||||
self.flush_to(st, start);
|
||||
write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
|
||||
self.flush_skip(st, start + url.len());
|
||||
}
|
||||
|
|
@ -192,13 +195,13 @@ impl FmtBuf {
|
|||
// We know where the closing bracket is in the string, so this regex is used to search
|
||||
// backwards from there and find the start of the reference.
|
||||
lazy_static!(
|
||||
static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap();
|
||||
static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap();
|
||||
);
|
||||
|
||||
// Disallow some characters following a reference
|
||||
if self.buf.len() > end {
|
||||
let ch = self.buf[end..].chars().next().unwrap();
|
||||
if ch == '-' || ch == '_' || ch.is_alphanumeric() {
|
||||
if ch == '_' || ch.is_alphanumeric() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
@ -220,7 +223,7 @@ impl FmtBuf {
|
|||
// This regex is used to quickly *find* interesting patterns, any further validation
|
||||
// and processing is done afterwards by the (slower) specialized flush_ methods.
|
||||
lazy_static!(
|
||||
static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap();
|
||||
static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap();
|
||||
);
|
||||
|
||||
let mut st = Flush{
|
||||
|
|
@ -269,8 +272,8 @@ pub fn grotty2html(input: &str) -> String {
|
|||
//}
|
||||
}
|
||||
}
|
||||
if let CharParse::One(chr) = state {
|
||||
buf.push(chr, FmtChar::Regular);
|
||||
if let CharParse::Token(chr, fmt) = state {
|
||||
buf.push(chr, fmt);
|
||||
}
|
||||
|
||||
let mut out = String::with_capacity(input.len());
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue