fu/c/xmlwr.c
Yorhel 9014e2900c Add FU::XMLWriter
And remove UTF-8 check in JSON writer. It honestly feels kind of silly
to do that validation there while I've never done similar validations in
any other output routines - including this XML writer.

FU::XMLWriter is a copy of TUWF::XMLXS with a bunch of improvements
applied: now uses refcounts to determine the current output instance,
auto-generates XS functions and has faster escaped string output -
inspired by the JSON writer.

TODO:
- Integrate into FU
- Do something with bool attribute values
- Benchmarks
- Should $content be optional for all tags? The reason they weren't in
  TUWF::XMLXS is because TUWF::XML supports opening tags without closing
  them, but that idea turned out to suck and isn't supported anymore.

This is hopefully the last XS module for the FU framework. The only C
code being written now should be bug fixes and extending FU::Pg with
some planned features. Already ended up with more C than I had
planned...
2025-02-19 12:01:24 +01:00

161 lines
5 KiB
C

typedef struct fuxmlwr fuxmlwr;
struct fuxmlwr {
SV *self;
fuxmlwr *next, *prev;
fustr out;
};
static fuxmlwr *fuxmlwr_tail = NULL;
static SV *fuxmlwr_new(pTHX) {
fuxmlwr *wr = safemalloc(sizeof(*wr));
wr->next = NULL;
wr->prev = fuxmlwr_tail;
if (fuxmlwr_tail) fuxmlwr_tail->next = wr;
fuxmlwr_tail = wr;
fustr_init(&wr->out, NULL, SIZE_MAX);
return fu_selfobj(wr, "FU::XMLWriter");
}
static void fuxmlwr_destroy(pTHX_ fuxmlwr *wr) {
if (fuxmlwr_tail == wr) fuxmlwr_tail = wr->next ? wr->next : wr->prev;
if (wr->next) wr->next->prev = wr->prev;
if (wr->prev) wr->prev->next = wr->next;
if (wr->out.sv) SvREFCNT_dec(wr->out.sv);
safefree(wr);
}
static void fuxmlwr_escape(pTHX_ fuxmlwr *wr, SV *sv) {
STRLEN len;
const unsigned char *str = (unsigned char *)SvPV_const(sv, len);
const unsigned char *tmp, *end = str + len;
unsigned char x = 0;
int utf8 = SvUTF8(sv);
while (str < end) {
tmp = str;
if (utf8) {
while (tmp < end) {
x = *tmp;
if (x == '<' || x == '&' || x == '"') break;
tmp++;
}
} else {
while (tmp < end) {
x = *tmp;
if (x == '<' || x == '&' || x == '"' || x >= 0x80) break;
tmp++;
}
}
fustr_write(&wr->out, (const char *)str, tmp-str);
if (tmp == end) return;
switch (x) {
case '<': fustr_write(&wr->out, "&lt;", 4); break;
case '&': fustr_write(&wr->out, "&amp;", 5); break;
case '"': fustr_write(&wr->out, "&quot;", 6); break;
default:
unsigned char *buf = (unsigned char *)fustr_write_buf(&wr->out, 2);
buf[0] = 0xc0 | (x >> 6);
buf[1] = 0x80 | (x & 0x3f);
break;
}
str = tmp + 1;
}
}
static int fuxmlwr_isnamechar(unsigned int x) {
return (x|32)-'a' < 26 || x-'0' < 10 || x == '_' || x == ':' || x == '-';
}
// Validate a tag or attribute name. Pretty much /^[a-z0-9_:-]+$/i.
// This does not at all match with the XML and HTML standards, but this
// approach is simpler and catches the most important bugs anyway.
static void fuxmlwr_isname(const char *str) {
const char *x = str;
while (fuxmlwr_isnamechar(*x)) x++;
if (*x || x == str) fu_confess("Invalid tag or attribute name: '%s'", str);
}
static void fuxmlwr_tag(pTHX_ fuxmlwr *wr, I32 ax, I32 offset, I32 argc, int selfclose, const char *tagname, int tagnamelen) {
SV *key, *val;
const char *keys, *lastkey = NULL;
int isopen = 0;
dSP;
if (!selfclose && ((argc - offset) & 1) == 0) fu_confess("Invalid number of arguments");
fustr_write_ch(&wr->out, '<');
fustr_write(&wr->out, tagname, tagnamelen);
while (offset < argc-1) {
key = ST(offset);
offset++;
val = ST(offset);
offset++;
// Don't even try to stringify other arguments; non-string keys are always a bug.
if (!SvPOK(key)) fu_confess("Non-string attribute");
keys = SvPVX(key);
SvGETMAGIC(val);
/* TODO: Support boolean values */
if (keys[0] == '+' && keys[1] == 0) {
if (!SvOK(val)) {
// ignore
} else if (isopen) {
fustr_write_ch(&wr->out, ' ');
fuxmlwr_escape(aTHX_ wr, val);
} else if (lastkey) {
fustr_write_ch(&wr->out, ' ');
fustr_write(&wr->out, lastkey, strlen(lastkey));
fustr_write(&wr->out, "=\"", 2);
fuxmlwr_escape(aTHX_ wr, val);
isopen = 1;
} else {
fu_confess("Cannot use '+' as first attribute");
}
} else {
if (isopen) {
fustr_write_ch(&wr->out, '"');
isopen = 0;
}
fuxmlwr_isname(keys);
if (!SvOK(val)) {
lastkey = keys;
} else {
fustr_write_ch(&wr->out, ' ');
fustr_write(&wr->out, keys, SvCUR(key));
fustr_write(&wr->out, "=\"", 2);
fuxmlwr_escape(aTHX_ wr, val);
isopen = 1;
}
}
}
if (isopen) fustr_write_ch(&wr->out, '"');
if (offset < argc) {
val = ST(offset);
SvGETMAGIC(val);
} else
val = &PL_sv_undef;
if (!SvOK(val)) { // undef
fustr_write(&wr->out, " />", 3);
} else if (SvROK(val) && strcmp(sv_reftype(SvRV(val), 0), "CODE") == 0) { // CODE ref
fustr_write_ch(&wr->out, '>');
PUSHMARK(SP);
call_sv(val, G_VOID|G_DISCARD|G_NOARGS);
fustr_write(&wr->out, "</", 2);
fustr_write(&wr->out, tagname, tagnamelen);
fustr_write_ch(&wr->out, '>');
} else {
fustr_write_ch(&wr->out, '>');
fuxmlwr_escape(aTHX_ wr, val);
fustr_write(&wr->out, "</", 2);
fustr_write(&wr->out, tagname, tagnamelen);
fustr_write_ch(&wr->out, '>');
}
}