diff --git a/FU.xs b/FU.xs index 146d7e7..7b7bb48 100644 --- a/FU.xs +++ b/FU.xs @@ -5,11 +5,17 @@ #include "c/common.c" #include "c/jsonfmt.c" +#include "c/jsonparse.c" -MODULE = FU PACKAGE = FU::XS + +MODULE = FU PACKAGE = FU::Util PROTOTYPES: DISABLE void json_format(SV *val, ...) CODE: ST(0) = fujson_fmt_xs(aTHX_ ax, items, val); + +void json_parse(SV *val, ...) + CODE: + ST(0) = fujson_parse_xs(aTHX_ ax, items, val); diff --git a/FU/Util.pm b/FU/Util.pm index 32533b9..d22a639 100644 --- a/FU/Util.pm +++ b/FU/Util.pm @@ -4,8 +4,6 @@ use v5.36; use FU::XS; use Exporter 'import'; -our @EXPORT_OK = qw/json_format/; - -*json_format = *FU::XS::json_format; +our @EXPORT_OK = qw/json_format json_parse/; 1; diff --git a/FU/Util.pod b/FU/Util.pod index f597576..67bb5a2 100644 --- a/FU/Util.pod +++ b/FU/Util.pod @@ -20,24 +20,47 @@ functions conform strictly to L, non-standard extensions are not supported and never will be. It also happens to be pretty fast, refer to L for some numbers. -JSON booleans are decoded into C and C. When +JSON booleans are parsed into C and C. When formatting, those builtin constants are the I<only> recognized boolean values - alternative representations such as C and C are not recognized and attempting to format such values will croak. +JSON numbers that are too large to fit into a Perl integer are parsed into a +floating point value instead. This obviously loses precision, but is consistent +with C in JavaScript land - except Perl does support the full +range of a 64bit integer. JSON numbers with a fraction or exponent are also +converted into floating point, which may lose precision as well. +L and L are not currently supported. Attempting +to format a floating point C<NaN> or C<Inf> results in an error. + =over +=item json_parse($string, %options) + +Parse a JSON string and return a Perl value. 
With the default options, this +function is roughly similar to: + + JSON::PP->new->allow_nonref->core_bools->decode($string); + +Supported C<%options>: + +=over + +=item utf8 + +Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead +of a Perl Unicode string. + +=back + + =item json_format($scalar, %options) -Format a Perl value as JSON. - -With the default options, this function behaves roughly similar to: +Format a Perl value as JSON. With the default options, this function behaves +roughly similar to: JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar); -This function croaks when attempting to format a floating point C<NaN> or -C<Inf>. - Some modules escape the slash character in encoded strings to prevent a potential XSS vulnerability when embedding JSON inside C<< <script> >> tags. This function does I<not> do that because it might not even diff --git a/c/common.c b/c/common.c index db2b94f..5b63046 100644 --- a/c/common.c +++ b/c/common.c @@ -7,9 +7,9 @@ typedef struct { size_t maxlen; } fustr; -static void fustr_init_(pTHX_ fustr *s, size_t prealloc, size_t maxlen) { - if (prealloc > maxlen) prealloc = maxlen; - s->sv = sv_2mortal(newSV(prealloc)); +/* sv must be a new SV with a preallocated buffer */ +static void fustr_init_(pTHX_ fustr *s, SV *sv, size_t maxlen) { + s->sv = sv; SvPOK_only(s->sv); s->cur = SvPVX(s->sv); s->end = SvEND(s->sv); diff --git a/c/jsonfmt.c b/c/jsonfmt.c index 4c1cf8f..fe91e29 100644 --- a/c/jsonfmt.c +++ b/c/jsonfmt.c @@ -81,7 +81,6 @@ static void fujson_fmt_str(pTHX_ fujson_fmt_ctx *ctx, const char *stri, size_t l fustr_write(&ctx->out, "\"", 1); } -/* All digits between 0 and 100, a trick I borrowed from the Zig stdlib. 
*/ static const char fujson_digits[] = "00010203040506070809" "10111213141516171819" @@ -306,7 +305,7 @@ static SV *fujson_fmt_xs(pTHX_ I32 ax, I32 argc, SV *val) { if (ctx.out.maxlen == 0) ctx.out.maxlen = 1<<30; if (ctx.depth == 0) ctx.depth = 512; - fustr_init(&ctx.out, 128, ctx.out.maxlen); + fustr_init(&ctx.out, sv_2mortal(newSV(128)), ctx.out.maxlen); fujson_fmt(aTHX_ &ctx, val); if (ctx.pretty >= 0) fustr_write(&ctx.out, "\n", 1); r = fustr_done(&ctx.out); diff --git a/c/jsonparse.c b/c/jsonparse.c new file mode 100644 index 0000000..9e1e16c --- /dev/null +++ b/c/jsonparse.c @@ -0,0 +1,289 @@ +typedef struct { /* parser cursor over the input bytes */ + const unsigned char *buf; /* next byte to read */ + const unsigned char *end; /* reading stops once buf reaches this pointer */ +} fujson_parse_ctx; + +static SV *fujson_parse(pTHX_ fujson_parse_ctx *); /* forward declaration; defined after the array and object parsers that call it */ + +static void fujson_parse_ws(pTHX_ fujson_parse_ctx *ctx) { /* Advance ctx->buf past any run of tab, LF, CR and space bytes. */ + unsigned char x; + while (ctx->buf < ctx->end) { + x = *ctx->buf; + if (!(x == 0x09 || x == 0x0a || x == 0x0d || x == 0x20)) break; + ctx->buf++; + } +} + +static int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr *r) { /* Decode the backslash escape at ctx->buf and append the decoded bytes to r. Returns 0 on success, 1 on an invalid or truncated escape. */ + unsigned int n, s; + ctx->buf++; /* '\\' */ + if (ctx->buf == ctx->end) return 1; + switch (*(ctx->buf++)) { + case '"': fustr_write(r, "\"", 1); break; + case '\\': fustr_write(r, "\\", 1); break; + case '/': fustr_write(r, "/", 1); break; /* We don't escape this one */ + case 'b': fustr_write(r, "\x08", 1); break; + case 't': fustr_write(r, "\x09", 1); break; + case 'n': fustr_write(r, "\x0a", 1); break; + case 'f': fustr_write(r, "\x0c", 1); break; + case 'r': fustr_write(r, "\x0d", 1); break; + case 'u': + /* (awful code adapted from ncdu) */ +#define INV (1<<16) +#define hn(n) (n >= '0' && n <= '9' ? n-'0' : n >= 'A' && n <= 'F' ? n-'A'+10 : n >= 'a' && n <= 'f' ? 
n-'a'+10 : INV) +#define h4(b) (hn((b)[0])<<12) + (hn((b)[1])<<8) + (hn((b)[2])<<4) + hn((b)[3]) + if (ctx->end - ctx->buf < 4) return 1; + n = h4(ctx->buf); + if (n >= INV || (n & 0xfc00) == 0xdc00) return 1; /* a non-hex digit pushes n to INV or above; a low surrogate with no high surrogate before it is rejected too */ + ctx->buf += 4; + if ((n & 0xfc00) == 0xd800) { /* high surrogate */ + if (ctx->end - ctx->buf < 6) return 1; + if (ctx->buf[0] != '\\' || ctx->buf[1] != 'u') return 1; + s = h4(ctx->buf+2); + if (s >= INV || (s & 0xfc00) != 0xdc00) return 1; /* the second escape must be a low surrogate */ + n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff)); /* combine the pair into a single code point */ + ctx->buf += 6; + } + fustr_reserve(r, 4); + r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n); + break; +#undef INV +#undef hn +#undef h4 + default: + return 1; + } + return 0; +} + +static SV *fujson_parse_string(pTHX_ fujson_parse_ctx *ctx) { /* Parse the quoted string at ctx->buf. Returns a new SV with the UTF-8 flag set, or NULL on malformed input (the partially built SV is released). */ + fustr r; + size_t len, maxlen; + unsigned char x = 0; + fustr_init(&r, newSV(32), SIZE_MAX); + ctx->buf++; /* '"' */ + while (true) { + /* Fast path: ASCII, no unescaping needed */ + len = 0; + maxlen = ctx->end - ctx->buf; + while (len < maxlen) { + x = ctx->buf[len]; + /* While we always escape 0x7f when formatting, JSON does permit it unescaped */ + if (x <= 0x1f || x == '"' || x == '\\' || x >= 0x80) break; + len++; + } + if (len == maxlen) goto err; /* input ended before the closing quote */ + fustr_write(&r, (const char *)ctx->buf, len); + ctx->buf += len; + + /* Slow path */ + if (x == '"') { + ctx->buf++; + SvUTF8_on(fustr_done(&r)); + return r.sv; + } else if (x == '\\') { + if (fujson_parse_string_escape(aTHX_ ctx, &r)) goto err; + } else if (x >= 0x80) { + len = isC9_STRICT_UTF8_CHAR(ctx->buf, ctx->end); + if (len == 0) goto err; + fustr_write(&r, (const char *)ctx->buf, len); + ctx->buf += len; + } else { /* only remaining case: an unescaped control byte (<= 0x1f) */ + goto err; + } + } +err: + SvREFCNT_dec(r.sv); + return NULL; +} + +/* Validate JSON grammar of a number, increments ctx->buf to the end of the + * number and returns -1 on error, 0 if it's an int, 1 for floats. 
*/ +static int fujson_parse_number_grammar(fujson_parse_ctx *ctx) { + int ret = 0; + if (*ctx->buf == '-') ctx->buf++; + if (ctx->buf == ctx->end) return -1; + if (*ctx->buf == '0' && (ctx->buf+1 == ctx->end || + !(ctx->buf[1] == '.' || ctx->buf[1] == 'e' || ctx->buf[1] == 'E'))) { + /* rfc8259 permits "-0", so we'll not check for that */ + ctx->buf++; + return 0; + } +#define DIG1 \ + if (ctx->buf == ctx->end || *ctx->buf < '0' || *ctx->buf > '9') return -1; \ + ctx->buf++; \ + while (ctx->buf != ctx->end && *ctx->buf >= '0' && *ctx->buf <= '9') ctx->buf++; + + /* int part */ + DIG1; + /* decimal part */ + if (ctx->buf != ctx->end && *ctx->buf == '.') { + ret = 1; + ctx->buf++; + DIG1; + } + /* exponent */ + if (ctx->buf != ctx->end && (*ctx->buf == 'e' || *ctx->buf == 'E')) { + ret = 1; + ctx->buf++; + if (ctx->buf == ctx->end) return -1; + if (*ctx->buf == '+' || *ctx->buf == '-') ctx->buf++; + DIG1; + } + +#undef DIG1 + return ret; +} + +static SV *fujson_parse_number(pTHX_ fujson_parse_ctx *ctx) { /* Parse the number at ctx->buf. Integers that fit are returned as a UV or IV; anything else is converted with my_atof3() into an NV. Returns NULL when the grammar check fails. */ + const unsigned char *start = ctx->buf; + int isnum = fujson_parse_number_grammar(ctx); + if (isnum == -1) return NULL; + + UV uv; + const char *end = (const char *)ctx->buf; + /* grok_atoUV() in this context can only return false on overflow */ + if (!isnum && grok_atoUV((const char *)(*start == '-' ? start+1 : start), &uv, &end)) { + if (*start != '-') return newSVuv(uv); + if (uv <= ((UV)IV_MAX)+1) return newSViv(-uv); /* magnitude is small enough for the negated value to be stored as an IV */ + } + + /* floating point or overflowed integer, might lose precision */ + NV val; + my_atof3((const char *)start, &val, ctx->buf - start); /* this function is not documented to be public... 
*/ + return newSVnv(val); +} + +static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) { + AV *av = newAV(); + SV *r; + ctx->buf++; /* '[' */ + fujson_parse_ws(aTHX_ ctx); + if (ctx->buf == ctx->end) goto err; + if (*ctx->buf == ']') goto done; + while (true) { + if (!(r = fujson_parse(aTHX_ ctx))) goto err; + av_push_simple(av, r); + fujson_parse_ws(aTHX_ ctx); + if (ctx->buf == ctx->end) goto err; + if (*ctx->buf == ']') goto done; + if (*ctx->buf != ',') goto err; + ctx->buf++; + } +done: + ctx->buf++; /* ']' */ + return newRV_noinc((SV *)av); +err: + SvREFCNT_dec((SV *)av); + return NULL; +} + +static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) { + HV *hv = newHV(); + SV *key = NULL; + SV *val; + ctx->buf++; /* '{' */ + fujson_parse_ws(aTHX_ ctx); + if (ctx->buf == ctx->end) goto err; + if (*ctx->buf == '}') goto done; + while (true) { + /* key */ + if (*ctx->buf != '"') goto err; + if (!(key = fujson_parse_string(aTHX_ ctx))) goto err; + /* TODO: Use precomputed hash */ + if (hv_exists_ent(hv, key, 0)) goto err; + + /* ':' */ + fujson_parse_ws(aTHX_ ctx); + if (ctx->buf == ctx->end) goto err; + if (*ctx->buf != ':') goto err; + ctx->buf++; + + /* value */ + if (!(val = fujson_parse(aTHX_ ctx))) goto err; + hv_store_ent(hv, key, val, 0); + SvREFCNT_dec(key); /* TODO: can reuse buffer */ + key = NULL; + + fujson_parse_ws(aTHX_ ctx); + if (ctx->buf == ctx->end) goto err; + if (*ctx->buf == '}') goto done; + if (*ctx->buf != ',') goto err; + ctx->buf++; + fujson_parse_ws(aTHX_ ctx); + } +done: + ctx->buf++; /* '}' */ + return newRV_noinc((SV *)hv); +err: + if (key) SvREFCNT_dec(key); + SvREFCNT_dec((SV *)hv); + return NULL; +} + +static SV *fujson_parse(pTHX_ fujson_parse_ctx *ctx) { + fujson_parse_ws(aTHX_ ctx); + if (ctx->buf == ctx->end) return NULL; + switch (*ctx->buf) { + case '"': return fujson_parse_string(aTHX_ ctx); + case '{': return fujson_parse_obj(aTHX_ ctx); + case '[': return fujson_parse_array(aTHX_ ctx); + case 't': + if (ctx->end - 
ctx->buf < 4) return NULL; + if (memcmp(ctx->buf, "true", 4) != 0) return NULL; + ctx->buf += 4; + return newSV_true(); + case 'f': + if (ctx->end - ctx->buf < 5) return NULL; + if (memcmp(ctx->buf, "false", 5) != 0) return NULL; + ctx->buf += 5; + return newSV_false(); + case 'n': + if (ctx->end - ctx->buf < 4) return NULL; + if (memcmp(ctx->buf, "null", 4) != 0) return NULL; + ctx->buf += 4; + return newSV(0); + default: + if (*ctx->buf == '-' || (*ctx->buf >= '0' && *ctx->buf <= '9')) + return fujson_parse_number(aTHX_ ctx); + } + return NULL; +} + +static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) { /* XS entry point: val holds the JSON text, the remaining stack items are name/value option pairs. Returns a mortal SV; croaks on a bad option list, on a parse failure, or on trailing non-whitespace input. */ + I32 i = 1; + char *arg; + SV *r; + int decutf8 = 0; + STRLEN buflen; + fujson_parse_ctx ctx; + + while (i < argc) { + arg = SvPV_nolen(ST(i)); + i++; + if (i == argc) croak("Odd name/value argument for json_parse()"); + r = ST(i); + i++; + + if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUE(r); /* generic truthiness: callers pass plain numbers such as 1 or 0, which have no string body to inspect */ + else croak("Unknown flag: '%s'", arg); + } + + arg = decutf8 ? SvPVbyte(val, buflen) : SvPVutf8(val, buflen); + ctx.buf = (const unsigned char *)arg; + ctx.end = ctx.buf + buflen; + + r = fujson_parse(aTHX_ &ctx); + if (!r) croak("JSON parsing failed at offset %"UVuf, (UV)((char *)ctx.buf - arg)); + + fujson_parse_ws(aTHX_ &ctx); + if (ctx.buf != ctx.end) { + SvREFCNT_dec(r); + croak("garbage after JSON value at offset %"UVuf, (UV)((char *)ctx.buf - arg)); + } + + return sv_2mortal(r); +} + +/* TODO: incremental parsing (accept & return a byte offset) */ +/* TODO: max_depth & max_size */ diff --git a/t/json_format.t b/t/json_format.t index a958468..d0b5747 100644 --- a/t/json_format.t +++ b/t/json_format.t @@ -31,11 +31,9 @@ my @tests = ( "\x01é\r\n\x1f💩", '"\u0001é\r\n\u001f💩"', )}, - do { use bytes; ( - "\x011\r\n\x8c", "\"\\u00011\\r\\n\x8c\"", - "\xff\xff", "\"\xff\xff\"", - "\x{1f4a9}", do { use utf8; '"💩"' }, - )}, + "\x011\r\n\x8c", "\"\\u00011\\r\\n\x8c\"", + "\xff\xff", "\"\xff\xff\"", + "\x{1f4a9}", do { use utf8; '"💩"' }, [], '[]', [0,1], '[0,1]', @@ 
-63,6 +61,8 @@ my @errors = ( *STDOUT, qr/unable to format unknown value/, 'NaN'+0, qr/unable to format floating point NaN or Inf as JSON/, 'Inf'+0, qr/unable to format floating point NaN or Inf as JSON/, + "\x{D83D}", qr/invalid codepoint encountered in string/, + "\x{DE03}", qr/invalid codepoint encountered in string/, do { no warnings 'portable'; "\x{ffffffff}" }, qr/invalid codepoint encountered in string/, do { my $o = {}; bless $o, 'FU::Whatever' }, qr/unable to format 'FU::Whatever' object as JSON/, do { my $o = {}; bless $o, 'MyToJSONSelf' }, qr/MyToJSONSelf::TO_JSON method returned same object as was passed instead of a new one/, diff --git a/t/json_parse.t b/t/json_parse.t new file mode 100644 index 0000000..da225bf --- /dev/null +++ b/t/json_parse.t @@ -0,0 +1,172 @@ +use v5.36; +use Test::More; +use FU::Util 'json_parse'; +no warnings 'experimental::builtin'; +use builtin 'is_bool', 'created_as_number'; + +my @error = ( + '', + 'tru', + 'nul', + 'fals', + 'true,', + + '"', + "\"\x00\"", + '"\x"', + '"\u', + '" \u123', + '"\\', + '"\ud812"', + '"\u123g"', + '"\udc12"', + '"\udc12\u1234"', + "\"\x{110000}\"", + + '1.', + '01', + '1e', + '1e+', + '1x', + '1e-', + '--1', + '+1', + '0x1', + '1..1', + '1ee1', + '1e1.1', + + ' [ ', + '[,true]', + '[true,]', + '[,]', + + ' { ', + '{1:2}', + '{""}', + '{"":}', + '{"":1', + '{"":1,}', + '{,}', + '{"":1,"":2}', +); +for my $s (@error) { + ok !eval { json_parse($s); 1 }; +} + +my $v; + +ok !defined json_parse " null "; + +$v = json_parse " true \t\r\n "; +ok is_bool $v; +ok $v; + +$v = json_parse " false "; +ok is_bool $v; +ok !$v; + +sub str($in, $exp) { + utf8::encode(my $str = $in); + my $out = json_parse($in); + is $out, $exp, $str; + ok utf8::is_utf8($out); + $out = json_parse($str, utf8 => 1); + is $out, $exp, $str; + ok utf8::is_utf8($out); +} +str '""', ''; +str '"hello, world"', 'hello, world'; +str '"\u0000\u0099\u0234\u1234"', "\x{00}\x{99}\x{234}\x{1234}"; +str 
"\"\x{7f}\x{99}\x{234}\x{1234}\x{12345}\"", "\x{7f}\x{99}\x{234}\x{1234}\x{12345}"; +str '"\/\"\\\\\b\t\n\f\r"', "/\"\\\x{08}\x{09}\x{0a}\x{0c}\x{0d}"; +str '"\uD83D\uDE03"', "\x{1F603}"; + +sub num($in, $exp=$in) { + my $out = json_parse($in); + is $out, $exp; + ok created_as_number $out; +} +num 0; +num ' -0 ', 0; +num '-9223372036854775808'; +num '9223372036854775807'; +num '18446744073709551615'; +num '-9223372036854775809', -9.22337203685478e+18; +num '18446744073709551616', 1.84467440737096e+19; +num '1.234'; +num '1e5', 100000; +num '1e+5', 100000; +num '1e-5', 0.00001; +num '2.5e-5', 0.000025; +num '2.5e5', 250000; +num '2.5E5', 250000; +num '-0.000000000000000000000000000000000000000000000000000000000000000000000000000001', -1e-78; + +$v = json_parse ' [ ] '; +is ref $v, 'ARRAY'; +is scalar @$v, 0; + +$v = json_parse ' [ true , null , false ] '; +is ref $v, 'ARRAY'; +is scalar @$v, 3; +ok $v->[0]; +ok !defined $v->[1]; +ok !$v->[2]; + +$v = json_parse ' [true,null,false] '; +is ref $v, 'ARRAY'; +is scalar @$v, 3; +ok $v->[0]; +ok !defined $v->[1]; +ok !$v->[2]; + +$v = json_parse ' [ [] ] '; +is ref $v, 'ARRAY'; +is scalar @$v, 1; +is ref $v->[0], 'ARRAY'; +is scalar $v->[0]->@*, 0; + +$v = json_parse '{}'; +is ref $v, 'HASH'; +is keys %$v, 0; + +$v = json_parse '{"a":1}'; +is ref $v, 'HASH'; +is keys %$v, 1; +is $v->{a}, 1; + +sub large($s) { + $v = json_parse $s; + is ref $v, 'HASH'; + is keys %$v, 3; + + ok exists $v->{a}; + is ref $v->{a}, 'ARRAY'; + is scalar $v->{a}->@*, 5; + ok created_as_number $v->{a}[0]; + is $v->{a}[0], 1; + ok created_as_number $v->{a}[1]; + is $v->{a}[1], 0.1; + ok is_bool $v->{a}[2]; + ok $v->{a}[2]; + ok !defined $v->{a}[3]; + is ref $v->{a}[4], 'HASH'; + is keys $v->{a}[4]->%*, 0; + + ok exists $v->{''}; + ok created_as_number $v->{''}; + is $v->{''}, 0; + + ok exists $v->{'ë'}; + is ref $v->{'ë'}, 'ARRAY'; + is scalar $v->{'ë'}->@*, 0; +} +large '{"a":[1,0.1,true,null,{}],"":-0,"ë":[]}'; +large ' { + "a" : [ 1 , 0.1 , true 
, null , { } ] , + "" : -0 , + "ë" : [ ] +} '; + +done_testing;