jsonparse: Add max_depth, max_size and offset options

This completes all the functionality that I wanted from the JSON parser.
This commit is contained in:
Yorhel 2025-02-01 11:01:43 +01:00
parent abfbba3c10
commit 13eaeb1d4a
3 changed files with 105 additions and 9 deletions

View file

@ -18,9 +18,9 @@ doesn't believe in the concept of a "batteries included" standard library.
=head1 SYNOPSIS
use FU::Util qw/json_format/;
use FU::Util qw/json_format/;
my $data = json_format [1, 2, 3];
my $data = json_format [1, 2, 3];
=head1 DESCRIPTION
@ -51,7 +51,12 @@ to format a floating point C<NaN> or C<Inf> results in an error.
Parse a JSON string and return a Perl value. With the default options, this
function is roughly similar to:
JSON::PP->new->allow_nonref->core_bools->decode($string);
JSON::PP->new->allow_nonref->core_bools->decode($string);
Croaks on invalid JSON, but the error messages are not super useful. This
function also throws an error on JSON objects with duplicate keys, which is
consistent with the default behavior of L<Cpanel::JSON::XS> but inconsistent
with other modules.
Supported C<%options>:
@ -62,6 +67,31 @@ Supported C<%options>:
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
of a Perl Unicode string.
=item max_depth
Nesting limit for arrays and objects: input that reaches this many nested
levels is rejected. Defaults to 512.
=item max_size
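Throw an error if the JSON data is larger than the given size in bytes.
Defaults to 1 GiB. When combined with C<offset>, the limit instead caps the
number of bytes read starting at the offset, so a value that extends past the
limit fails to parse.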
=item offset
Takes a reference to a scalar that indicates from which byte offset in
C<$string> to start parsing. On success, the offset is updated to point to the
next non-whitespace character or C<undef> if the string has been fully
consumed.
This option can be used to parse a stream of JSON values:
my $data = '{"obj":1}{"obj":2}';
my $offset = 0;
my $obj1 = json_parse($data, offset => \$offset);
# $obj1 = {obj=>1}; $offset = 9;
my $obj2 = json_parse($data, offset => \$offset);
# $obj2 = {obj=>2}; $offset = undef;
=back
@ -70,14 +100,14 @@ of a Perl Unicode string.
Format a Perl value as JSON. With the default options, this function behaves
roughly similar to:
JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar);
JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar);
Some modules escape the slash character in encoded strings to prevent a
potential XSS vulnerability when embedding JSON inside C<< <script> ..
</script> >> tags. This function does I<not> do that because it might not even
be sufficient. The following is probably an improvement:
json_format($data) =~ s{</}{<\\/}rg =~ s/<!--/<\\u0021--/rg;
json_format($data) =~ s{</}{<\\/}rg =~ s/<!--/<\\u0021--/rg;
The following C<%options> are supported:

View file

@ -1,6 +1,7 @@
/* Parser state passed by pointer to the fujson_parse_* functions. */
typedef struct {
const unsigned char *buf; /* current read position; advanced as input is consumed */
const unsigned char *end; /* one past the last byte that may be parsed */
UV depth; /* remaining nesting budget: decremented on entering an array or object, restored on leaving; parsing fails when it reaches zero */
} fujson_parse_ctx;
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
@ -153,6 +154,7 @@ static SV *fujson_parse_number(pTHX_ fujson_parse_ctx *ctx) {
static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) {
AV *av = newAV();
SV *r;
if (--ctx->depth == 0) return NULL;
ctx->buf++; /* '[' */
fujson_parse_ws(aTHX_ ctx);
if (ctx->buf == ctx->end) goto err;
@ -168,6 +170,7 @@ static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) {
}
done:
ctx->buf++; /* ']' */
ctx->depth++;
return newRV_noinc((SV *)av);
err:
SvREFCNT_dec((SV *)av);
@ -182,6 +185,7 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
fustr key;
fustr_init(&key, NULL, SIZE_MAX);
if (--ctx->depth == 0) return NULL;
ctx->buf++; /* '{' */
fujson_parse_ws(aTHX_ ctx);
if (ctx->buf == ctx->end) goto err;
@ -217,6 +221,7 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
done:
if (key.sv) SvREFCNT_dec(key.sv);
ctx->buf++; /* '}' */
ctx->depth++;
return newRV_noinc((SV *)hv);
err:
if (key.sv) SvREFCNT_dec(key.sv);
@ -257,10 +262,13 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
I32 i = 1;
char *arg;
SV *r;
SV *offset = NULL;
UV maxlen = 0;
int decutf8 = 0;
STRLEN buflen;
fujson_parse_ctx ctx;
ctx.depth = 0;
while (i < argc) {
arg = SvPV_nolen(ST(i));
i++;
@ -269,24 +277,40 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
i++;
if (strcmp(arg, "utf8") == 0) decutf8 = SvPVXtrue(r);
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
else if (strcmp(arg, "offset") == 0) offset = r;
else croak("Unknown flag: '%s'", arg);
}
if (maxlen == 0) maxlen = 1<<30;
if (ctx.depth == 0) ctx.depth = 512;
arg = decutf8 ? SvPVbyte(val, buflen) : SvPVutf8(val, buflen);
ctx.buf = (const unsigned char *)arg;
ctx.end = ctx.buf + buflen;
if (offset) {
if (!SvROK(offset)) croak("Offset must be a reference to a scalar");
offset = SvRV(offset);
if (!looks_like_number(offset) || SvIV(offset) < 0) croak("Offset must be a positive integer");
if (SvUV(offset) >= buflen) croak("Offset too large");
ctx.buf += SvUV(offset);
if ((UV)(ctx.end - ctx.buf) > maxlen) ctx.end = ctx.buf + maxlen;
} else if ((UV)(ctx.end - ctx.buf) > maxlen)
croak("Input string is larger than max_size");
r = fujson_parse(aTHX_ &ctx);
if (!r) croak("JSON parsing failed at offset %"UVuf, (UV)((char *)ctx.buf - arg));
fujson_parse_ws(aTHX_ &ctx);
if (ctx.buf != ctx.end) {
if (offset) {
if (ctx.buf == ctx.end) sv_set_undef(offset);
else SvUV_set(offset, (UV)((char *)ctx.buf - arg));
} else if (ctx.buf != ctx.end) {
SvREFCNT_dec(r);
croak("garbage after JSON value at offset %"UVuf, (UV)((char *)ctx.buf - arg));
}
return sv_2mortal(r);
}
/* TODO: incremental parsing (accept & return a byte offset) */
/* TODO: max_depth & max_size */

View file

@ -50,6 +50,9 @@ my @error = (
'{,}',
'{"":1,"":2}',
'{"ë":1,"ë":1}',
'[] x',
'{}x',
);
for my $s (@error) {
ok !eval { json_parse($s); 1 };
@ -177,6 +180,10 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
is json_parse("\"$s\""), $s
}
# With max_depth => 4, input nested four levels deep must be rejected, both
# for arrays and for objects.
ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 };
ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 };
# 500 depth
{
$v = json_parse('['x500 . ']'x500);
@ -191,4 +198,39 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
is $i, 500;
}
# offset / max_size
# Walks one string of back-to-back JSON values with the offset option, checking
# the parsed value and the updated offset after every call, then exercises the
# error cases of offset and max_size.
{
my $off = 0;
# Byte offsets of the values: 0 => 0, 1 => 123, 4 => -5.3e1, 10 => "x",
# 13 => [], 15 => {}, 17 => true, 21 => false, 27 => 1 (followed by a space).
my $str = '0123-5.3e1"x"[]{}truefalse 1 '; # cursed
# The leading "0" is returned as a value of its own; the offset moves to 1.
is json_parse($str, offset => \$off), 0;
is $off, 1;
# max_size => 4 restricts parsing to the four bytes at the offset ("123-").
is json_parse($str, offset => \$off, max_size => 4), 123;
is $off, 4;
is json_parse($str, offset => \$off), -53;
is $off, 10;
is json_parse($str, offset => \$off), 'x';
is $off, 13;
is ref json_parse($str, offset => \$off), 'ARRAY';
is $off, 15;
is ref json_parse($str, offset => \$off), 'HASH';
is $off, 17;
# true
ok json_parse($str, offset => \$off);
is $off, 21;
# false; the whitespace after it is skipped, so the offset lands on the "1".
ok !json_parse($str, offset => \$off);
is $off, 27;
# Last value: only whitespace remains, so the offset becomes undef.
is json_parse($str, offset => \$off), 1;
ok !defined $off;
# An undefined offset is rejected.
ok !eval { json_parse $str, offset => \$off; 1 };
# An offset beyond the end of the string is rejected.
$off = 100;
ok !eval { json_parse $str, offset => \$off; 1 };
# "true" at offset 17 needs four bytes, but max_size only allows three.
$off = 17;
ok !eval { json_parse $str, offset => \$off, max_size => 3; 1 };
# Without offset: an 8-byte input passes with max_size 8 and fails with 7.
is json_parse('"string"', max_size => 8), 'string';
ok !eval { json_parse '"string"', max_size => 7 };
}
done_testing;