jsonparse: Add max_depth, max_size and offset options
This completes all the functionality that I wanted from the JSON parser.
This commit is contained in:
parent
abfbba3c10
commit
13eaeb1d4a
3 changed files with 105 additions and 9 deletions
40
FU/Util.pm
40
FU/Util.pm
|
|
@ -18,9 +18,9 @@ doesn't believe in the concept of a "batteries included" standard library.
|
||||||
|
|
||||||
=head1 SYNOPSIS
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
use FU::Util qw/json_format/;
|
use FU::Util qw/json_format/;
|
||||||
|
|
||||||
my $data = json_format [1, 2, 3];
|
my $data = json_format [1, 2, 3];
|
||||||
|
|
||||||
=head1 DESCRIPTION
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
|
|
@ -51,7 +51,12 @@ to format a floating point C<NaN> or C<Inf> results in an error.
|
||||||
Parse a JSON string and return a Perl value. With the default options, this
|
Parse a JSON string and return a Perl value. With the default options, this
|
||||||
function is roughly similar to:
|
function is roughly similar to:
|
||||||
|
|
||||||
JSON::PP->new->allow_nonref->core_bools->decode($string);
|
JSON::PP->new->allow_nonref->core_bools->decode($string);
|
||||||
|
|
||||||
|
Croaks on invalid JSON, but the error messages are not super useful. This
|
||||||
|
function also throws an error on JSON objects with duplicate keys, which is
|
||||||
|
consistent with the default behavior of L<Cpanel::JSON::XS> but inconsistent
|
||||||
|
with other modules.
|
||||||
|
|
||||||
Supported C<%options>:
|
Supported C<%options>:
|
||||||
|
|
||||||
|
|
@ -62,6 +67,31 @@ Supported C<%options>:
|
||||||
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
|
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
|
||||||
of a Perl Unicode string.
|
of a Perl Unicode string.
|
||||||
|
|
||||||
|
=item max_depth
|
||||||
|
|
||||||
|
Maximum permitted nesting depth of arrays and objects. Defaults to 512.
|
||||||
|
|
||||||
|
=item max_size
|
||||||
|
|
||||||
|
Throw an error if the JSON data is larger than the given size in bytes.
|
||||||
|
Defaults to 1 GiB.
|
||||||
|
|
||||||
|
=item offset
|
||||||
|
|
||||||
|
Takes a reference to a scalar that indicates from which byte offset in
|
||||||
|
C<$string> to start parsing. On success, the offset is updated to point to the
|
||||||
|
next non-whitespace character after the parsed value, or is set to C<undef> if
|
||||||
|
the string has been fully consumed.
|
||||||
|
|
||||||
|
This option can be used to parse a stream of JSON values:
|
||||||
|
|
||||||
|
my $data = '{"obj":1}{"obj":2}';
|
||||||
|
my $offset = 0;
|
||||||
|
my $obj1 = json_parse($data, offset => \$offset);
|
||||||
|
# $obj1 = {obj=>1}; $offset = 9;
|
||||||
|
my $obj2 = json_parse($data, offset => \$offset);
|
||||||
|
# $obj2 = {obj=>2}; $offset = undef;
|
||||||
|
|
||||||
=back
|
=back
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -70,14 +100,14 @@ of a Perl Unicode string.
|
||||||
Format a Perl value as JSON. With the default options, this function behaves
|
Format a Perl value as JSON. With the default options, this function behaves
|
||||||
roughly similar to:
|
roughly similar to:
|
||||||
|
|
||||||
JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar);
|
JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar);
|
||||||
|
|
||||||
Some modules escape the slash character in encoded strings to prevent a
|
Some modules escape the slash character in encoded strings to prevent a
|
||||||
potential XSS vulnerability when embedding JSON inside C<< <script> ..
|
potential XSS vulnerability when embedding JSON inside C<< <script> ..
|
||||||
</script> >> tags. This function does I<not> do that because it might not even
|
</script> >> tags. This function does I<not> do that because it might not even
|
||||||
be sufficient. The following is probably an improvement:
|
be sufficient. The following is probably an improvement:
|
||||||
|
|
||||||
json_format($data) =~ s{</}{<\\/}rg =~ s/<!--/<\\u0021--/rg;
|
json_format($data) =~ s{</}{<\\/}rg =~ s/<!--/<\\u0021--/rg;
|
||||||
|
|
||||||
The following C<%options> are supported:
|
The following C<%options> are supported:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const unsigned char *buf;
|
const unsigned char *buf;
|
||||||
const unsigned char *end;
|
const unsigned char *end;
|
||||||
|
UV depth;
|
||||||
} fujson_parse_ctx;
|
} fujson_parse_ctx;
|
||||||
|
|
||||||
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
|
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
|
||||||
|
|
@ -153,6 +154,7 @@ static SV *fujson_parse_number(pTHX_ fujson_parse_ctx *ctx) {
|
||||||
static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) {
|
static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) {
|
||||||
AV *av = newAV();
|
AV *av = newAV();
|
||||||
SV *r;
|
SV *r;
|
||||||
|
if (--ctx->depth == 0) return NULL;
|
||||||
ctx->buf++; /* '[' */
|
ctx->buf++; /* '[' */
|
||||||
fujson_parse_ws(aTHX_ ctx);
|
fujson_parse_ws(aTHX_ ctx);
|
||||||
if (ctx->buf == ctx->end) goto err;
|
if (ctx->buf == ctx->end) goto err;
|
||||||
|
|
@ -168,6 +170,7 @@ static SV *fujson_parse_array(pTHX_ fujson_parse_ctx *ctx) {
|
||||||
}
|
}
|
||||||
done:
|
done:
|
||||||
ctx->buf++; /* ']' */
|
ctx->buf++; /* ']' */
|
||||||
|
ctx->depth++;
|
||||||
return newRV_noinc((SV *)av);
|
return newRV_noinc((SV *)av);
|
||||||
err:
|
err:
|
||||||
SvREFCNT_dec((SV *)av);
|
SvREFCNT_dec((SV *)av);
|
||||||
|
|
@ -182,6 +185,7 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
|
||||||
fustr key;
|
fustr key;
|
||||||
fustr_init(&key, NULL, SIZE_MAX);
|
fustr_init(&key, NULL, SIZE_MAX);
|
||||||
|
|
||||||
|
if (--ctx->depth == 0) return NULL;
|
||||||
ctx->buf++; /* '{' */
|
ctx->buf++; /* '{' */
|
||||||
fujson_parse_ws(aTHX_ ctx);
|
fujson_parse_ws(aTHX_ ctx);
|
||||||
if (ctx->buf == ctx->end) goto err;
|
if (ctx->buf == ctx->end) goto err;
|
||||||
|
|
@ -217,6 +221,7 @@ static SV *fujson_parse_obj(pTHX_ fujson_parse_ctx *ctx) {
|
||||||
done:
|
done:
|
||||||
if (key.sv) SvREFCNT_dec(key.sv);
|
if (key.sv) SvREFCNT_dec(key.sv);
|
||||||
ctx->buf++; /* '}' */
|
ctx->buf++; /* '}' */
|
||||||
|
ctx->depth++;
|
||||||
return newRV_noinc((SV *)hv);
|
return newRV_noinc((SV *)hv);
|
||||||
err:
|
err:
|
||||||
if (key.sv) SvREFCNT_dec(key.sv);
|
if (key.sv) SvREFCNT_dec(key.sv);
|
||||||
|
|
@ -257,10 +262,13 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
|
||||||
I32 i = 1;
|
I32 i = 1;
|
||||||
char *arg;
|
char *arg;
|
||||||
SV *r;
|
SV *r;
|
||||||
|
SV *offset = NULL;
|
||||||
|
UV maxlen = 0;
|
||||||
int decutf8 = 0;
|
int decutf8 = 0;
|
||||||
STRLEN buflen;
|
STRLEN buflen;
|
||||||
fujson_parse_ctx ctx;
|
fujson_parse_ctx ctx;
|
||||||
|
|
||||||
|
ctx.depth = 0;
|
||||||
while (i < argc) {
|
while (i < argc) {
|
||||||
arg = SvPV_nolen(ST(i));
|
arg = SvPV_nolen(ST(i));
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -269,24 +277,40 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
|
||||||
i++;
|
i++;
|
||||||
|
|
||||||
if (strcmp(arg, "utf8") == 0) decutf8 = SvPVXtrue(r);
|
if (strcmp(arg, "utf8") == 0) decutf8 = SvPVXtrue(r);
|
||||||
|
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
|
||||||
|
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
|
||||||
|
else if (strcmp(arg, "offset") == 0) offset = r;
|
||||||
else croak("Unknown flag: '%s'", arg);
|
else croak("Unknown flag: '%s'", arg);
|
||||||
}
|
}
|
||||||
|
if (maxlen == 0) maxlen = 1<<30;
|
||||||
|
if (ctx.depth == 0) ctx.depth = 512;
|
||||||
|
|
||||||
arg = decutf8 ? SvPVbyte(val, buflen) : SvPVutf8(val, buflen);
|
arg = decutf8 ? SvPVbyte(val, buflen) : SvPVutf8(val, buflen);
|
||||||
ctx.buf = (const unsigned char *)arg;
|
ctx.buf = (const unsigned char *)arg;
|
||||||
ctx.end = ctx.buf + buflen;
|
ctx.end = ctx.buf + buflen;
|
||||||
|
|
||||||
|
if (offset) {
|
||||||
|
if (!SvROK(offset)) croak("Offset must be a reference to a scalar");
|
||||||
|
offset = SvRV(offset);
|
||||||
|
if (!looks_like_number(offset) || SvIV(offset) < 0) croak("Offset must be a positive integer");
|
||||||
|
if (SvUV(offset) >= buflen) croak("Offset too large");
|
||||||
|
ctx.buf += SvUV(offset);
|
||||||
|
if ((UV)(ctx.end - ctx.buf) > maxlen) ctx.end = ctx.buf + maxlen;
|
||||||
|
|
||||||
|
} else if ((UV)(ctx.end - ctx.buf) > maxlen)
|
||||||
|
croak("Input string is larger than max_size");
|
||||||
|
|
||||||
r = fujson_parse(aTHX_ &ctx);
|
r = fujson_parse(aTHX_ &ctx);
|
||||||
if (!r) croak("JSON parsing failed at offset %"UVuf, (UV)((char *)ctx.buf - arg));
|
if (!r) croak("JSON parsing failed at offset %"UVuf, (UV)((char *)ctx.buf - arg));
|
||||||
|
|
||||||
fujson_parse_ws(aTHX_ &ctx);
|
fujson_parse_ws(aTHX_ &ctx);
|
||||||
if (ctx.buf != ctx.end) {
|
if (offset) {
|
||||||
|
if (ctx.buf == ctx.end) sv_set_undef(offset);
|
||||||
|
else SvUV_set(offset, (UV)((char *)ctx.buf - arg));
|
||||||
|
} else if (ctx.buf != ctx.end) {
|
||||||
SvREFCNT_dec(r);
|
SvREFCNT_dec(r);
|
||||||
croak("garbage after JSON value at offset %"UVuf, (UV)((char *)ctx.buf - arg));
|
croak("garbage after JSON value at offset %"UVuf, (UV)((char *)ctx.buf - arg));
|
||||||
}
|
}
|
||||||
|
|
||||||
return sv_2mortal(r);
|
return sv_2mortal(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* TODO: incremental parsing (accept & return a byte offset) */
|
|
||||||
/* TODO: max_depth & max_size */
|
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,9 @@ my @error = (
|
||||||
'{,}',
|
'{,}',
|
||||||
'{"":1,"":2}',
|
'{"":1,"":2}',
|
||||||
'{"ë":1,"ë":1}',
|
'{"ë":1,"ë":1}',
|
||||||
|
|
||||||
|
'[] x',
|
||||||
|
'{}x',
|
||||||
);
|
);
|
||||||
for my $s (@error) {
|
for my $s (@error) {
|
||||||
ok !eval { json_parse($s); 1 };
|
ok !eval { json_parse($s); 1 };
|
||||||
|
|
@ -177,6 +180,10 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
|
||||||
is json_parse("\"$s\""), $s
|
is json_parse("\"$s\""), $s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 };
|
||||||
|
ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 };
|
||||||
|
|
||||||
|
|
||||||
# 500 depth
|
# 500 depth
|
||||||
{
|
{
|
||||||
$v = json_parse('['x500 . ']'x500);
|
$v = json_parse('['x500 . ']'x500);
|
||||||
|
|
@ -191,4 +198,39 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
|
||||||
is $i, 500;
|
is $i, 500;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# offset / max_size
|
||||||
|
{
|
||||||
|
my $off = 0;
|
||||||
|
my $str = '0123-5.3e1"x"[]{}truefalse 1 '; # cursed
|
||||||
|
is json_parse($str, offset => \$off), 0;
|
||||||
|
is $off, 1;
|
||||||
|
is json_parse($str, offset => \$off, max_size => 4), 123;
|
||||||
|
is $off, 4;
|
||||||
|
is json_parse($str, offset => \$off), -53;
|
||||||
|
is $off, 10;
|
||||||
|
is json_parse($str, offset => \$off), 'x';
|
||||||
|
is $off, 13;
|
||||||
|
is ref json_parse($str, offset => \$off), 'ARRAY';
|
||||||
|
is $off, 15;
|
||||||
|
is ref json_parse($str, offset => \$off), 'HASH';
|
||||||
|
is $off, 17;
|
||||||
|
ok json_parse($str, offset => \$off);
|
||||||
|
is $off, 21;
|
||||||
|
ok !json_parse($str, offset => \$off);
|
||||||
|
is $off, 27;
|
||||||
|
is json_parse($str, offset => \$off), 1;
|
||||||
|
ok !defined $off;
|
||||||
|
ok !eval { json_parse $str, offset => \$off; 1 };
|
||||||
|
|
||||||
|
$off = 100;
|
||||||
|
ok !eval { json_parse $str, offset => \$off; 1 };
|
||||||
|
|
||||||
|
$off = 17;
|
||||||
|
ok !eval { json_parse $str, offset => \$off, max_size => 3; 1 };
|
||||||
|
|
||||||
|
is json_parse('"string"', max_size => 8), 'string';
|
||||||
|
ok !eval { json_parse '"string"', max_size => 7 };
|
||||||
|
}
|
||||||
|
|
||||||
done_testing;
|
done_testing;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue