#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

// Convert grotty output to HTML for use in a <pre> tag.
// It is assumed that the given input string is valid UTF-8, either represented
// as a Perl Unicode string, or as a UTF-8 encoded byte string. The data may
// not contain the 0 character.
// The formatted HTML is returned as a Perl Unicode string.
// It is also assumed that hyphenation has been disabled when generating the
// grotty output.


// This implementation really is fast enough for "real-time" use in the website
// code, very much unlike my experiments with Perl. My previous Perl
// implementation took about 1.5s for rsync(1), whereas I've not seen this
// implementation take more than 15ms.

// TODO: Unicode characters aren't truncated correctly when a line exceeds
// MAXLINE bytes. I've only seen this happening on man pages that grotty
// couldn't wrap, e.g. some Japanese and Chinese mans.
// (Ideally, I'd tell grotty how to wrap those correctly)

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Size in bytes of the per-line buffers in ctx_t (line and flags).
#define MAXLINE 1024

// Per-character formatting flags stored in ctx_t.flags: LB = bold, LI = italic.
#define LB 1
#define LI 2

typedef struct ctx_t ctx_t;

// Conversion state shared by all helpers in this file.
struct ctx_t {
  // Remaining input: points at the source data, or what is left of it.
  const char *src;
  // Output string that the generated HTML is appended to.
  SV *dest;

  // Line currently being assembled.
  char line[MAXLINE];
  // One formatting flag per byte of 'line':
  // 0 = no fmt, LB = bold, LI = italic. (No combinations allowed)
  char flags[MAXLINE];
  // Number of bytes currently stored in 'line' / 'flags'.
  int linelen;
  // 1 if the current line shouldn't be checked for references.
  // (Used for first and last line)
  int noref;
};



// HTML-escapes a single displayed byte and appends it to the output string.
//   x: conversion context; the result is appended to x->dest.
//   c: the byte to emit.
// '>', '<' and '&' are replaced by their entities, every other byte is
// appended unchanged.
static inline void flushescape(ctx_t *x, char c) {
  // Most HTML-escape functions also escape " to &quot;, but since we aren't
  // going to put a man page in an XML attribute, we don't really have to worry
  // about that one.
  switch(c) {
    // The length argument must match the entity literal exactly.
    case '>': sv_catpvn(x->dest, "&gt;", 4); break;
    case '<': sv_catpvn(x->dest, "&lt;", 4); break;
    case '&': sv_catpvn(x->dest, "&amp;", 5); break;
    default:
      // Append the byte straight from the parameter; no scratch buffer needed.
      sv_catpvn(x->dest, &c, 1);
      break;
  }
}


// HTML-escapes and adds formatting tags to a certain chunk of data and appends
// it to the output string. The chunk is considered as an individual part,
// assuming that any formatting is disabled at the start of the chunk, and
// making sure it is disabled again at the end.
//   x: conversion context; output goes to x->dest.
//   s: first byte of the chunk.
//   f: formatting flag (0, LB or LI) belonging to *s; advanced in step with s.
//   e: end of the chunk, i.e. the first byte that is NOT part of it.
static void flushchunk(ctx_t *x, const char *s, const char *f, const char *e) {
  int fmt = 0; // Formatting currently open in the output (0 = none).

  for(; s != e; s++, f++) {
    // Consider underscore and whitespace to have the same formatting as the
    // previous character.  The grotty escape sequences don't work well for the
    // underscore character, and you can't see the difference either way.
    if(fmt != *f && *s != '_' && *s != ' ') {
      // Close whatever is open before switching to the new formatting.
      if(fmt)
        sv_catpvn(x->dest, fmt == LB ? "</b>" : "</i>", 4);
      fmt = *f;
      if(fmt)
        sv_catpvn(x->dest, fmt == LB ? "<b>" : "<i>", 3);
    }
    flushescape(x, *s);
  }

  // Never leave a tag open past the end of the chunk.
  if(fmt)
    sv_catpvn(x->dest, fmt == LB ? "</b>" : "</i>", 4);
}


// True if x is a byte that may appear in a man page name: alphanumeric, '_',
// '-' or '.'. The argument is converted to unsigned char before it reaches
// isalnum(), because passing a negative char (e.g. a UTF-8 continuation byte)
// to a <ctype.h> function is undefined behavior.
// Note: x is evaluated more than once, so don't pass an expression with side
// effects.
#define ismanchar(x) (isalnum((unsigned char)(x)) || (x) == '_' || (x) == '-' || (x) == '.')


// Writes the output for a "[[[MANNEDINCLUDE ..]]]" line held in x->line (the
// only visible caller is the special case in flushline()). Modifies x->line
// in place.
// NOTE(review): part of this function's text appears to be missing (see
// below); it cannot compile as it stands and has to be restored from the
// original source.
static void flushinclude(ctx_t *x) {
  // Scratch buffer, zero-initialized. The code that fills it is not visible.
  char buf[8] = {};
  char *s = x->line;

  // Cut off the last three bytes of the line (the caller has verified that
  // these are "]]]").
  s[x->linelen-3] = 0;
  // Skip the first 16 bytes, which is the length of "[[[MANNEDINCLUDE".
  s += 16;
  // fn = the part after the last '/', or the whole string if there is none.
  char *fn = strrchr(s, '/');
  fn = fn ? fn+1 : s;
  // NOTE(review): the next statement is truncated. The string literal is
  // never terminated and the tail "dest, buf, 1);" followed by a lone "}"
  // looks like the end of a loop whose header and body were lost, presumably
  // one that appends fn to x->dest piece by piece through buf. TODO confirm.
  sv_catpv(x->dest, ">> Included manual page: dest, buf, 1);
  }

  // Appends a closing quote and '>', then the text that follows the 16-byte
  // prefix.
  sv_catpv(x->dest, "\">");
  sv_catpv(x->dest, s);
  // NOTE(review): appending an empty string is a no-op; presumably closing
  // markup was stripped from this literal. TODO confirm.
  sv_catpv(x->dest, "");
}


// HTML-escapes and "Flushes" the current line (x->line, which must be
// NUL-terminated at x->linelen) to the output string, followed by a newline.
// Unless x->noref is set, man page references such as "rsync(1)" and
// http(s) URLs are converted into links; the link text itself is emitted
// without bold/italic formatting.
// A "[[[MANNEDINCLUDE ..]]]" line is handed to flushinclude() instead.
// The line buffer is modified in place while scanning.
static void flushline(ctx_t *x) {
  static const char eol[] = "\n";
  // s is the scan position, es the start of the text not yet written out.
  char *s = x->line, *es = x->line;

  // Special-case [[[MANNEDINCLUDE ..]]] directive
  if(x->linelen > 20 && *s == '[' && strncmp(s, "[[[MANNEDINCLUDE", 16) == 0 && strcmp("]]]", s+x->linelen-3) == 0) {
    flushinclude(x);
    goto end;
  }

  if(x->noref) {
    flushchunk(x, x->line, x->flags, x->line+x->linelen);
    goto end;
  }

// Writes out everything between es and 'end' and moves es forward.
#define flush(end) do {\
    flushchunk(x, es, x->flags+(es-x->line), end);\
    es = end;\
  } while(0)

  while(*s) {
    // Man page reference.
    // Detected by the "(x)", but then checked backwards in the buffer to find
    // the start of the reference. This is pretty fast. Fails on:
    // - JSON.3pm: JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
    // The && chain stops at the first NUL, so s[1..3] are never read past the
    // terminator. The cast keeps isalnum() defined for bytes >= 0x80.
    if(*s == '(' && (('1' <= s[1] && s[1] <= '9') || s[1] == 'n') && s[2] == ')' && !isalnum((unsigned char)s[3])) {
      // Walk back over name characters, looking at the byte *before* n so
      // that no pointer in front of the buffer is ever formed.
      char *n = s;
      while(n > es && ismanchar((unsigned char)n[-1]))
        n--;
      if(n < s) {
        flush(n);
        // Terminate the name at the '(' so it can be printed with %s.
        *s = 0;
        sv_catpvf(x->dest, "<a href=\"/%s.%c\">%s(%c)</a>", n, s[1], n, s[1]);
        s += 3;
        es = s;
        continue;
      }
    }

    // HTTP(s) URL.
    // This is just a simple q{https?://[^ ][.,;"\)>]?( |$)} match, doesn't
    // always work right, e.g.:
    // - https://manned.org/spu_run/414316a1 -> URL wrapped to new line
    // Note: Don't use strncmp() before manually checking for 'http'. The parse
    // time is otherwise increased by a factor 2.
    if(s[0] == 'h' && s[1] == 't' && s[2] == 't' && s[3] == 'p' && (strncmp(s, "http://", 7) == 0 || strncmp(s, "https://", 8) == 0)) {
      // Find the end of the URL (space or some other weird character).
      char *sep = s;
      while(*sep && *sep != '>' && *sep != '<' && *sep != ' ' && *sep != '"')
        sep++;
      char *sp = sep;
      if(sp > s+10) {
        flush(s);
        // Temporarily NUL-terminate the URL; endchr remembers the byte that
        // has to be put back afterwards.
        char endchr = *sp;
        *(sp--) = 0;
        // Trailing punctuation is not considered part of the URL.
        if(*sp == '.' || *sp == ',' || *sp == ';' || *sp == ')') {
          sp[1] = endchr;
          endchr = *sp;
          *(sp--) = 0;
        }
        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
        // - https://manned.org/troff/c4467840
        // - https://manned.org/pass/78413b49
        // - https://manned.org/empathy-accounts/8c05b2c1
        // - https://manned.org/urn/8cb83e85
        // - https://manned.org/wine/4a699a22
        if(*sp == '\xa9' && *(sp-1) == '\x9f' && *(sp-2) == '\xe2') {
          sp[1] = endchr;
          sp -= 3;
          endchr = sp[1];
          sp[1] = 0;
        }
        sv_catpvf(x->dest, "<a href=\"%s\" rel=\"nofollow\">%s</a>", s, s);
        // Restore the overwritten byte and continue scanning from there.
        *(++sp) = endchr;
        es = s = sp;
        continue;
      }
    }
    s++;
  }

  // Whatever is left after the last link.
  flush(s);
#undef flush

end:
  sv_catpvn(x->dest, eol, sizeof(eol)-1);
}


// Adds a character to the current line, calls flushline() when a new line is done.
//   x: conversion context holding the line buffer.
//   c: the byte to add. '\r' is dropped, '\n' ends the current line.
//   f: formatting flag for c (0, LB or LI).
// A line that would not fit in the buffer is flushed early, which splits it
// over multiple output lines.
// TODO: Convert \t into spaces? The rest of the code is written with the
// assumption that \t does not occur in the string. I've not seen grotty output
// tabs yet, but it's still a good idea to define what *we* do with tabs.
static void appendline(ctx_t *x, char c, char f) {
  if(c == '\r')
    return;

  // line[] and flags[] hold MAXLINE bytes and flushline() needs a terminating
  // NUL, so at most MAXLINE-1 bytes of content may be stored. Flush before
  // that limit is exceeded to keep every write inside the buffers.
  if(c == '\n' || x->linelen >= MAXLINE-1) {
    x->line[x->linelen] = 0;
    flushline(x);
    x->linelen = 0;
    // Only the first line (and the explicitly marked last one) skip the
    // reference check.
    x->noref = 0;
    if(c == '\n')
      return;
  }

  x->line[x->linelen] = c;
  x->flags[x->linelen] = f;
  x->linelen++;
}


// Parses the grotty escapes and calls appendline() for each character.
// Reads x->src up to its terminating NUL byte and finally appends a '\n' with
// x->noref set, so that the last line is flushed without reference checking.
// NOTE(review): part of this function's text appears to be missing (see
// below); it cannot compile as it stands and has to be restored from the
// original source.
static void parselines(ctx_t *x) {
  // NOTE(review): ini and inb are not used in the visible code; presumably
  // the lost formatting branch used them. TODO confirm.
  int i, ini = 0, inb = 0;
  const char *buf = x->src;

  while(*buf) {
    // Length in bytes of the UTF-8 character at buf.
    int c1 = UTF8SKIP(buf);
    // Escape character right after a formatting code? Ignore the escape
    // character and formatting code after that. Grotty sometimes
    // double-formats a character, so you get "f ESC c ESC f ESC c", which you
    // should read as "(f ESC c) ESC (f ESC c)".
    // (The "escape character" tested for here is byte value 8.)
    if(*buf == 8 && buf[1] && buf[1+UTF8SKIP(buf+1)] == 8 && buf[2+UTF8SKIP(buf+1)]) {
      int c2 = UTF8SKIP(buf+1);
      // Skip: byte 8, one character, byte 8, one more character.
      buf += 2 + c2 + UTF8SKIP(buf+1+c2);
      continue;
    }
    // Formatting code
    // (A character, followed by byte 8, followed by at least one more byte.)
    if(buf[c1] == 8 && buf[c1+1]) {
      int c2 = UTF8SKIP(buf+c1+1);
      // NOTE(review): the next line is truncated. The for-loop header is cut
      // off after "i" and runs straight into "noref = 1;", and the closing
      // brace of this if-block is missing. The code that derives the
      // bold/italic flag and appends the formatted character has been lost.
      for(i=0; inoref = 1;
    // Plain byte without formatting.
    appendline(x, *buf, 0);
    buf++;
  }
  // Flush the final line, without checking it for references.
  x->noref = 1;
  appendline(x, '\n', 0);
}



MODULE = ManUtils	 PACKAGE = ManUtils

SV *
html(str)
  SV *str
  INIT:
    // Conversion state. Heap-allocated, it embeds two MAXLINE-sized buffers.
    ctx_t *x = malloc(sizeof *x);
  CODE:
    // Converts the grotty output in 'str' to HTML and returns it as a new
    // Perl string with the UTF8 flag set.
    // malloc() may fail; raise a Perl exception instead of dereferencing NULL.
    if(!x)
      croak("ManUtils::html: out of memory");
    x->src = SvPV_nolen(str);
    x->dest = newSVpv("", 0);
    x->linelen = 0;
    // The first line is never checked for references.
    x->noref = 1;
    parselines(x);
    // Set the UTF8 flag *after* generating the result string. For some reason
    // that prevents sv_catpvf() from interpreting our C strings as something
    // other than UTF-8.
    SvUTF8_on(x->dest);
    RETVAL = x->dest;
    free(x);
  OUTPUT:
    RETVAL