jsonparse: A bunch of performance improvements

Turns out JSON::XS had some pretty good ideas that I could borrow.
This commit is contained in:
Yorhel 2025-01-31 16:35:47 +01:00
parent ca8d1b72be
commit ebe84167e7
5 changed files with 144 additions and 111 deletions

View file

@ -14,19 +14,19 @@ static void fujson_parse_ws(pTHX_ fujson_parse_ctx *ctx) {
}
}
static int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr *r) {
static inline int fujson_parse_string_escape(fujson_parse_ctx *ctx, fustr *r) {
unsigned int n, s;
ctx->buf++; /* '\\' */
if (ctx->buf == ctx->end) return 1;
if (UNLIKELY(ctx->buf == ctx->end)) return 1;
switch (*(ctx->buf++)) {
case '"': fustr_write(r, "\"", 1); break;
case '\\': fustr_write(r, "\\", 1); break;
case '/': fustr_write(r, "/", 1); break; /* We don't escape this one */
case 'b': fustr_write(r, "\x08", 1); break;
case 't': fustr_write(r, "\x09", 1); break;
case 'n': fustr_write(r, "\x0a", 1); break;
case 'f': fustr_write(r, "\x0c", 1); break;
case 'r': fustr_write(r, "\x0d", 1); break;
case '"': *(r->cur++) = '\"'; break;
case '\\':*(r->cur++) = '\\'; break;
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
case 'b': *(r->cur++) = 0x08; break;
case 't': *(r->cur++) = 0x09; break;
case 'n': *(r->cur++) = 0x0a; break;
case 'f': *(r->cur++) = 0x0c; break;
case 'r': *(r->cur++) = 0x0d; break;
case 'u':
/* (awful code adapted from ncdu) */
#define INV (1<<16)
@ -44,8 +44,8 @@ static int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr *r) {
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
ctx->buf += 6;
}
fustr_reserve(r, 4);
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
if (n >= 0x80) r->setutf8 = 1;
break;
#undef INV
#undef hn
@ -56,45 +56,41 @@ static int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr *r) {
return 0;
}
static SV *fujson_parse_string(pTHX_ fujson_parse_ctx *ctx) {
fustr r;
size_t len, maxlen;
unsigned char x = 0;
fustr_init(&r, newSV(32), SIZE_MAX);
/* Parse a JSON string whose opening '"' is at ctx->buf and append the decoded
 * bytes at r->cur. Returns 0 once the closing '"' has been consumed. Returns 1
 * when the input ends first, when fujson_parse_string_escape() fails, when a
 * byte >= 0x80 does not start a sequence accepted by isC9_STRICT_UTF8_CHAR(),
 * or when an unescaped byte below 0x20 is found. r->setutf8 is set whenever a
 * byte >= 0x80 is copied from the input. */
static int fujson_parse_string_buf(pTHX_ fujson_parse_ctx *ctx, fustr *r) {
size_t len;
unsigned char x;
ctx->buf++; /* '"' */
while (true) {
/* Fast path: ASCII, no unescaping needed */
len = 0;
maxlen = ctx->end - ctx->buf;
while (len < maxlen) {
x = ctx->buf[len];
/* While we always escape 0x7f when formatting, JSON does permit it unescaped */
if (x <= 0x1f || x == '"' || x == '\\' || x >= 0x80) break;
len++;
}
if (len == maxlen) goto err;
fustr_write(&r, (const char *)ctx->buf, len);
ctx->buf += len;
/* Slow path */
if (x == '"') {
/* The writes below go straight through r->cur, so make room first.
 * NOTE(review): presumably 4 bytes covers the longest single write per
 * iteration (one UTF-8 sequence) -- confirm against fustr_reserve(). */
fustr_reserve(aTHX_ r, 4);
/* Input ended before the closing quote */
if (UNLIKELY(ctx->buf == ctx->end)) return 1;
x = *ctx->buf;
if (UNLIKELY(x == '"')) {
ctx->buf++;
SvUTF8_on(fustr_done(&r));
return r.sv;
} else if (x == '\\') {
if (fujson_parse_string_escape(aTHX_ ctx, &r)) goto err;
return 0;
} else if (UNLIKELY(x == '\\')) {
if (fujson_parse_string_escape(ctx, r)) return 1;
} else if (x >= 0x80) {
len = isC9_STRICT_UTF8_CHAR(ctx->buf, ctx->end);
if (len == 0) goto err;
fustr_write(&r, (const char *)ctx->buf, len);
/* A length of 0 is treated as an invalid sequence */
if (UNLIKELY((len = isC9_STRICT_UTF8_CHAR(ctx->buf, ctx->end)) == 0)) return 1;
memcpy(r->cur, ctx->buf, len);
r->cur += len;
ctx->buf += len;
} else {
goto err;
}
/* Record that non-ASCII bytes were copied into the buffer */
r->setutf8 = 1;
} else if (x >= 0x20) {
*(r->cur++) = x;
ctx->buf++;
/* Only bytes below 0x20 reach this point; they are rejected when unescaped */
} else return 1;
}
}
/* Parse a JSON string into a new SV. Initialises a local fustr with no SV,
 * lets fujson_parse_string_buf() fill it, and returns the result of
 * fustr_done() on success. Returns NULL on a parse failure, after dropping the
 * reference to r.sv if one is set. */
static SV *fujson_parse_string(pTHX_ fujson_parse_ctx *ctx) {
fustr r;
fustr_init(&r, NULL, SIZE_MAX);
if (fujson_parse_string_buf(aTHX_ ctx, &r)) {
/* NOTE(review): r.sv is guarded because fustr_init() was given NULL;
 * presumably the SV is only created once something is reserved -- confirm. */
if (r.sv) SvREFCNT_dec(r.sv);
return NULL;
} else {
return fustr_done(&r);
}
err:
SvREFCNT_dec(r.sv);
return NULL;
}
/* Validate JSON grammar of a number, increments ctx->buf to the end of the
@ -180,8 +176,12 @@ err:
static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
HV *hv = newHV();
SV *key = NULL;
SV *val;
char *keystart;
UV keyhash;
fustr key;
fustr_init(&key, NULL, SIZE_MAX);
ctx->buf++; /* '{' */
fujson_parse_ws(aTHX_ ctx);
if (ctx->buf == ctx->end) goto err;
@ -189,9 +189,11 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
while (true) {
/* key */
if (*ctx->buf != '"') goto err;
if (!(key = fujson_parse_string(aTHX_ ctx))) goto err;
/* TODO: Use precomputed hash */
if (hv_exists_ent(hv, key, 0)) goto err;
if (fujson_parse_string_buf(aTHX_ ctx, &key)) goto err;
keystart = fustr_start(&key);
if (key.setutf8) keyhash = 0;
else PERL_HASH(keyhash, keystart, key.cur - keystart);
if (hv_common(hv, NULL, keystart, key.cur - keystart, key.setutf8, HV_FETCH_ISEXISTS, NULL, keyhash)) goto err;
/* ':' */
fujson_parse_ws(aTHX_ ctx);
@ -201,9 +203,9 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
/* value */
if (!(val = fujson_parse(aTHX_ ctx))) goto err;
hv_store_ent(hv, key, val, 0);
SvREFCNT_dec(key); /* TODO: can reuse buffer */
key = NULL;
hv_common(hv, NULL, keystart, key.cur - keystart, key.setutf8, HV_FETCH_ISSTORE|HV_FETCH_JUST_SV, val, keyhash);
key.cur = keystart;
key.setutf8 = 0;
fujson_parse_ws(aTHX_ ctx);
if (ctx->buf == ctx->end) goto err;
@ -213,10 +215,11 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
fujson_parse_ws(aTHX_ ctx);
}
done:
if (key.sv) SvREFCNT_dec(key.sv);
ctx->buf++; /* '}' */
return newRV_noinc((SV *)hv);
err:
if (key) SvREFCNT_dec(key);
if (key.sv) SvREFCNT_dec(key.sv);
SvREFCNT_dec((SV *)hv);
return NULL;
}