From 13eaeb1d4aafb133ade19804c0c60fa9c21b5247 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 1 Feb 2025 11:01:43 +0100 Subject: [PATCH] jsonparse: Add max_depth, max_size and offset options This completes all the functionality that I wanted from the JSON parser. --- FU/Util.pm | 40 +++++++++++++++++++++++++++++++++++----- c/jsonparse.c | 32 ++++++++++++++++++++++++++++---- t/json_parse.t | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 9 deletions(-) diff --git a/FU/Util.pm b/FU/Util.pm index 5ff032c..23afdf1 100644 --- a/FU/Util.pm +++ b/FU/Util.pm @@ -18,9 +18,9 @@ doesn't believe in the concept of a "batteries included" standard library. =head1 SYNOPSIS - use FU::Util qw/json_format/; + use FU::Util qw/json_format/; - my $data = json_format [1, 2, 3]; + my $data = json_format [1, 2, 3]; =head1 DESCRIPTION @@ -51,7 +51,12 @@ to format a floating point C<NaN> or C<Inf> results in an error. Parse a JSON string and return a Perl value. With the default options, this function is roughly similar to: - JSON::PP->new->allow_nonref->core_bools-decode($string); + JSON::PP->new->allow_nonref->core_bools->decode($string); + +Croaks on invalid JSON, but the error messages are not super useful. This +function also throws an error on JSON objects with duplicate keys, which is +consistent with the default behavior of L<Cpanel::JSON::XS> but inconsistent +with other modules. Supported C<%options>: @@ -62,6 +67,31 @@ Supported C<%options>: Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead of a Perl Unicode string. +=item max_depth + +Maximum permitted nesting depth of arrays and objects. Defaults to 512. + +=item max_size + +Throw an error if the JSON data is larger than the given size in bytes. +Defaults to 1 GiB. + +=item offset + +Takes a reference to a scalar that indicates from which byte offset in +C<$string> to start parsing. 
On success, the offset is updated to point to the +next non-whitespace character or C<undef> if the string has been fully +consumed. + +This option can be used to parse a stream of JSON values: + + my $data = '{"obj":1}{"obj":2}'; + my $offset = 0; + my $obj1 = json_parse($data, offset => \$offset); + # $obj1 = {obj=>1}; $offset = 9; + my $obj2 = json_parse($data, offset => \$offset); + # $obj2 = {obj=>2}; $offset = undef; + =back @@ -70,14 +100,14 @@ of a Perl Unicode string. Format a Perl value as JSON. With the default options, this function behaves roughly similar to: - JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar); + JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar); Some modules escape the slash character in encoded strings to prevent a potential XSS vulnerability when embedding JSON inside C<< <script> >> tags. This function does I<not> do that because it might not even be sufficient. The following is probably an improvement: - json_format($data) =~ s{ are supported: diff --git a/c/jsonparse.c b/c/jsonparse.c index 8a52b5b..3b9d43f 100644 --- a/c/jsonparse.c +++ b/c/jsonparse.c @@ -1,6 +1,7 @@ typedef struct { const unsigned char *buf; const unsigned char *end; + UV depth; } fujson_parse_ctx; static SV *fujson_parse(pTHX_ fujson_parse_ctx *); @@ -153,6 +154,7 @@ static SV *fujson_parse_number(pTHX_ fujson_parse_ctx *ctx) { static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) { AV *av = newAV(); SV *r; + if (--ctx->depth == 0) goto err; /* nesting limit reached: leave through err so av is released instead of leaked */ ctx->buf++; /* '[' */ fujson_parse_ws(aTHX_ ctx); if (ctx->buf == ctx->end) goto err; @@ -168,6 +170,7 @@ static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) { } done: ctx->buf++; /* ']' */ + ctx->depth++; return newRV_noinc((SV *)av); err: SvREFCNT_dec((SV *)av); @@ -182,6 +185,7 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) { fustr key; fustr_init(&key, NULL, SIZE_MAX); + if (--ctx->depth == 0) goto err; /* nesting limit reached: use the same err cleanup path as the end-of-input check just below */ ctx->buf++; /* '{' */ fujson_parse_ws(aTHX_ ctx); if 
(ctx->buf == ctx->end) goto err; @@ -217,6 +221,7 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) { done: if (key.sv) SvREFCNT_dec(key.sv); ctx->buf++; /* '}' */ + ctx->depth++; return newRV_noinc((SV *)hv); err: if (key.sv) SvREFCNT_dec(key.sv); @@ -257,10 +262,13 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) { I32 i = 1; char *arg; SV *r; + SV *offset = NULL; + UV maxlen = 0; int decutf8 = 0; STRLEN buflen; fujson_parse_ctx ctx; + ctx.depth = 0; while (i < argc) { arg = SvPV_nolen(ST(i)); i++; @@ -269,24 +277,40 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) { i++; if (strcmp(arg, "utf8") == 0) decutf8 = SvPVXtrue(r); + else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r); + else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r); + else if (strcmp(arg, "offset") == 0) offset = r; else croak("Unknown flag: '%s'", arg); } + if (maxlen == 0) maxlen = 1<<30; + if (ctx.depth == 0) ctx.depth = 512; arg = decutf8 ? SvPVbyte(val, buflen) : SvPVutf8(val, buflen); ctx.buf = (const unsigned char *)arg; ctx.end = ctx.buf + buflen; + if (offset) { + if (!SvROK(offset)) croak("Offset must be a reference to a scalar"); + offset = SvRV(offset); + if (!looks_like_number(offset) || SvIV(offset) < 0) croak("Offset must be a positive integer"); + if (SvUV(offset) >= buflen) croak("Offset too large"); + ctx.buf += SvUV(offset); + if ((UV)(ctx.end - ctx.buf) > maxlen) ctx.end = ctx.buf + maxlen; + + } else if ((UV)(ctx.end - ctx.buf) > maxlen) + croak("Input string is larger than max_size"); + r = fujson_parse(aTHX_ &ctx); if (!r) croak("JSON parsing failed at offset %"UVuf, (UV)((char *)ctx.buf - arg)); fujson_parse_ws(aTHX_ &ctx); - if (ctx.buf != ctx.end) { + if (offset) { + if (ctx.buf == ctx.end) sv_set_undef(offset); + else sv_setuv(offset, (UV)((char *)ctx.buf - arg)); /* sv_setuv rather than the raw SvUV_set slot write: the caller's scalar may hold a string or float, and its flags must be updated with the value */ + } else if (ctx.buf != ctx.end) { SvREFCNT_dec(r); croak("garbage after JSON value at offset %"UVuf, (UV)((char *)ctx.buf - arg)); } return sv_2mortal(r); } - 
-/* TODO: incremental parsing (accept & return a byte offset) */ -/* TODO: max_depth & max_size */ diff --git a/t/json_parse.t b/t/json_parse.t index 65e0748..d223077 100644 --- a/t/json_parse.t +++ b/t/json_parse.t @@ -50,6 +50,9 @@ my @error = ( '{,}', '{"":1,"":2}', '{"ë":1,"ë":1}', + + '[] x', + '{}x', ); for my $s (@error) { ok !eval { json_parse($s); 1 }; @@ -177,6 +180,10 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) { is json_parse("\"$s\""), $s } +ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 }; +ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 }; + + # 500 depth { $v = json_parse('['x500 . ']'x500); @@ -191,4 +198,39 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) { is $i, 500; } + +# offset / max_size +{ + my $off = 0; + my $str = '0123-5.3e1"x"[]{}truefalse 1 '; # cursed + is json_parse($str, offset => \$off), 0; + is $off, 1; + is json_parse($str, offset => \$off, max_size => 4), 123; + is $off, 4; + is json_parse($str, offset => \$off), -53; + is $off, 10; + is json_parse($str, offset => \$off), 'x'; + is $off, 13; + is ref json_parse($str, offset => \$off), 'ARRAY'; + is $off, 15; + is ref json_parse($str, offset => \$off), 'HASH'; + is $off, 17; + ok json_parse($str, offset => \$off); + is $off, 21; + ok !json_parse($str, offset => \$off); + is $off, 27; + is json_parse($str, offset => \$off), 1; + ok !defined $off; + ok !eval { json_parse $str, offset => \$off; 1 }; + + $off = 100; + ok !eval { json_parse $str, offset => \$off; 1 }; + + $off = 17; + ok !eval { json_parse $str, offset => \$off, max_size => 3; 1 }; + + is json_parse('"string"', max_size => 8), 'string'; + ok !eval { json_parse '"string"', max_size => 7; 1 }; +} + done_testing;