typedef struct { const unsigned char *buf; const unsigned char *end; } fujson_parse_ctx; static SV *fujson_parse(pTHX_ fujson_parse_ctx *); static void fujson_parse_ws(pTHX_ fujson_parse_ctx *ctx) { unsigned char x; while (ctx->buf < ctx->end) { x = *ctx->buf; if (!(x == 0x09 || x == 0x0a || x == 0x0d || x == 0x20)) break; ctx->buf++; } } static int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr *r) { unsigned int n, s; ctx->buf++; /* '\\' */ if (ctx->buf == ctx->end) return 1; switch (*(ctx->buf++)) { case '"': fustr_write(r, "\"", 1); break; case '\\': fustr_write(r, "\\", 1); break; case '/': fustr_write(r, "/", 1); break; /* We don't escape this one */ case 'b': fustr_write(r, "\x08", 1); break; case 't': fustr_write(r, "\x09", 1); break; case 'n': fustr_write(r, "\x0a", 1); break; case 'f': fustr_write(r, "\x0c", 1); break; case 'r': fustr_write(r, "\x0d", 1); break; case 'u': /* (awful code adapted from ncdu) */ #define INV (1<<16) #define hn(n) (n >= '0' && n <= '9' ? n-'0' : n >= 'A' && n <= 'F' ? n-'A'+10 : n >= 'a' && n <= 'f' ? n-'a'+10 : INV) #define h4(b) (hn((b)[0])<<12) + (hn((b)[1])<<8) + (hn((b)[2])<<4) + hn((b)[3]) if (ctx->end - ctx->buf < 4) return 1; n = h4(ctx->buf); if (n >= INV || (n & 0xfc00) == 0xdc00) return 1; ctx->buf += 4; if ((n & 0xfc00) == 0xd800) { /* high surrogate */ if (ctx->end - ctx->buf < 6) return 1; if (ctx->buf[0] != '\\' || ctx->buf[1] != 'u') return 1; s = h4(ctx->buf+2); if (s >= INV || (s & 0xfc00) != 0xdc00) return 1; n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff)); ctx->buf += 6; } fustr_reserve(r, 4); r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n); break; #undef INV #undef hn #undef h4 default: return 1; } return 0; } static SV *fujson_parse_string(pTHX_ fujson_parse_ctx *ctx) { fustr r; size_t len, maxlen; unsigned char x = 0; fustr_init(&r, newSV(32), SIZE_MAX); ctx->buf++; /* '"' */ while (true) { /* Fast path: ASCII, no unescaping needed */ len = 0; maxlen = ctx->end - ctx->buf; while (len < maxlen) { x = ctx->buf[len]; /* While we always escape 0x7f when formatting, JSON does permit it unescaped */ if (x <= 0x1f || x == '"' || x == '\\' || x >= 0x80) break; len++; } if (len == maxlen) goto err; fustr_write(&r, (const char *)ctx->buf, len); ctx->buf += len; /* Slow path */ if (x == '"') { ctx->buf++; SvUTF8_on(fustr_done(&r)); return r.sv; } else if (x == '\\') { if (fujson_parse_string_escape(aTHX_ ctx, &r)) goto err; } else if (x >= 0x80) { len = isC9_STRICT_UTF8_CHAR(ctx->buf, ctx->end); if (len == 0) goto err; fustr_write(&r, (const char *)ctx->buf, len); ctx->buf += len; } else { goto err; } } err: SvREFCNT_dec(r.sv); return NULL; } /* Validate JSON grammar of a number, increments ctx->buf to the end of the * number and returns -1 on error, 0 if it's an int, 1 for floats. */ static int fujson_parse_number_grammar(fujson_parse_ctx *ctx) { int ret = 0; if (*ctx->buf == '-') ctx->buf++; if (ctx->buf == ctx->end) return -1; if (*ctx->buf == '0' && (ctx->buf+1 == ctx->end || !(ctx->buf[1] == '.' || ctx->buf[1] == 'e' || ctx->buf[1] == 'E'))) { /* rfc8259 permits "-0", so we'll not check for that */ ctx->buf++; return 0; } #define DIG1 \ if (ctx->buf == ctx->end || *ctx->buf < '0' || *ctx->buf > '9') return -1; \ ctx->buf++; \ while (ctx->buf != ctx->end && *ctx->buf >= '0' && *ctx->buf <= '9') ctx->buf++; /* int part */ DIG1; /* decimal part */ if (ctx->buf != ctx->end && *ctx->buf == '.') { ret = 1; ctx->buf++; DIG1; } /* exponent */ if (ctx->buf != ctx->end && (*ctx->buf == 'e' || *ctx->buf == 'E')) { ret = 1; ctx->buf++; if (ctx->buf == ctx->end) return -1; if (*ctx->buf == '+' || *ctx->buf == '-') ctx->buf++; DIG1; } #undef DIG1 return ret; } static SV *fujson_parse_number(pTHX_ fujson_parse_ctx *ctx) { const unsigned char *start = ctx->buf; int isnum = fujson_parse_number_grammar(ctx); if (isnum == -1) return NULL; UV uv; const char *end = (const char *)ctx->buf; /* grok_atoUV() in this context can only return false on overflow */ if (!isnum && grok_atoUV((const char *)(*start == '-' ? start+1 : start), &uv, &end)) { if (*start != '-') return newSVuv(uv); if (uv <= ((UV)IV_MAX)+1) return newSViv(-uv); } /* floating point or overflowed integer, might lose precision */ NV val; my_atof3((const char *)start, &val, ctx->buf - start); /* this function is not documented to be public... */ return newSVnv(val); } static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) { AV *av = newAV(); SV *r; ctx->buf++; /* '[' */ fujson_parse_ws(aTHX_ ctx); if (ctx->buf == ctx->end) goto err; if (*ctx->buf == ']') goto done; while (true) { if (!(r = fujson_parse(aTHX_ ctx))) goto err; av_push_simple(av, r); fujson_parse_ws(aTHX_ ctx); if (ctx->buf == ctx->end) goto err; if (*ctx->buf == ']') goto done; if (*ctx->buf != ',') goto err; ctx->buf++; } done: ctx->buf++; /* ']' */ return newRV_noinc((SV *)av); err: SvREFCNT_dec((SV *)av); return NULL; } static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) { HV *hv = newHV(); SV *key = NULL; SV *val; ctx->buf++; /* '{' */ fujson_parse_ws(aTHX_ ctx); if (ctx->buf == ctx->end) goto err; if (*ctx->buf == '}') goto done; while (true) { /* key */ if (*ctx->buf != '"') goto err; if (!(key = fujson_parse_string(aTHX_ ctx))) goto err; /* TODO: Use precomputed hash */ if (hv_exists_ent(hv, key, 0)) goto err; /* ':' */ fujson_parse_ws(aTHX_ ctx); if (ctx->buf == ctx->end) goto err; if (*ctx->buf != ':') goto err; ctx->buf++; /* value */ if (!(val = fujson_parse(aTHX_ ctx))) goto err; hv_store_ent(hv, key, val, 0); SvREFCNT_dec(key); /* TODO: can reuse buffer */ key = NULL; fujson_parse_ws(aTHX_ ctx); if (ctx->buf == ctx->end) goto err; if (*ctx->buf == '}') goto done; if (*ctx->buf != ',') goto err; ctx->buf++; fujson_parse_ws(aTHX_ ctx); } done: ctx->buf++; /* '}' */ return newRV_noinc((SV *)hv); err: if (key) SvREFCNT_dec(key); SvREFCNT_dec((SV *)hv); return NULL; } static SV *fujson_parse(pTHX_ fujson_parse_ctx *ctx) { fujson_parse_ws(aTHX_ ctx); if (ctx->buf == ctx->end) return NULL; switch (*ctx->buf) { case '"': return fujson_parse_string(aTHX_ ctx); case '{': return fujson_parse_obj(aTHX_ ctx); case '[': return fujson_parse_array(aTHX_ ctx); case 't': if (ctx->end - ctx->buf < 4) return NULL; if (memcmp(ctx->buf, "true", 4) != 0) return NULL; ctx->buf += 4; return newSV_true(); case 'f': if (ctx->end - ctx->buf < 5) return NULL; if (memcmp(ctx->buf, "false", 5) != 0) return NULL; ctx->buf += 5; return newSV_false(); case 'n': if (ctx->end - ctx->buf < 4) return NULL; if (memcmp(ctx->buf, "null", 4) != 0) return NULL; ctx->buf += 4; return newSV(0); default: if (*ctx->buf == '-' || (*ctx->buf >= '0' && *ctx->buf <= '9')) return fujson_parse_number(aTHX_ ctx); } return NULL; } static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) { I32 i = 1; char *arg; SV *r; int decutf8 = 0; STRLEN buflen; fujson_parse_ctx ctx; while (i < argc) { arg = SvPV_nolen(ST(i)); i++; if (i == argc) croak("Odd name/value argument for json_parse()"); r = ST(i); i++; if (strcmp(arg, "utf8") == 0) decutf8 = SvPVXtrue(r); else croak("Unknown flag: '%s'", arg); } arg = decutf8 ? SvPVbyte(val, buflen) : SvPVutf8(val, buflen); ctx.buf = (const unsigned char *)arg; ctx.end = ctx.buf + buflen; r = fujson_parse(aTHX_ &ctx); if (!r) croak("JSON parsing failed at offset %"UVuf, (UV)((char *)ctx.buf - arg)); fujson_parse_ws(aTHX_ &ctx); if (ctx.buf != ctx.end) { SvREFCNT_dec(r); croak("garbage after JSON value at offset %"UVuf, (UV)((char *)ctx.buf - arg)); } return sv_2mortal(r); } /* TODO: incremental parsing (accept & return a byte offset) */ /* TODO: max_depth & max_size */