json_parse(): Disallow control characters in strings by default
Deviating from the standard, but more consistent with other FU functions.
This commit is contained in:
parent
a43dc70ff9
commit
55baa6c9a6
3 changed files with 28 additions and 10 deletions
16
FU/Util.pm
16
FU/Util.pm
|
|
@ -137,7 +137,7 @@ value for C<$val>, due to C<\0> and C<\1> being considered booleans.
|
||||||
=head1 JSON Parsing & Formatting
|
=head1 JSON Parsing & Formatting
|
||||||
|
|
||||||
This module comes with a custom C-based JSON parser and formatter. These
|
This module comes with a custom C-based JSON parser and formatter. These
|
||||||
functions conform strictly to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
|
functions conform to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
|
||||||
non-standard extensions are not supported and never will be. It also happens to
|
non-standard extensions are not supported and never will be. It also happens to
|
||||||
be pretty fast, refer to L<FU::Benchmarks> for some numbers.
|
be pretty fast, refer to L<FU::Benchmarks> for some numbers.
|
||||||
|
|
||||||
|
|
@ -171,6 +171,13 @@ Supported C<%options>:
|
||||||
|
|
||||||
=over
|
=over
|
||||||
|
|
||||||
|
=item allow_control
|
||||||
|
|
||||||
|
Boolean, set to true to allow (encoded) ASCII control characters in JSON
|
||||||
|
strings, such as C<\u0000>, C<\b>, C<\u007f>, etc. These characters are
|
||||||
|
permitted per RFC-8259, but disallowed by this parser by default. See
|
||||||
|
C<utf8_decode()> below.
|
||||||
|
|
||||||
=item utf8
|
=item utf8
|
||||||
|
|
||||||
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
|
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
|
||||||
|
|
@ -251,10 +258,9 @@ value. There is no way to do that without violating JSON specs, so you should
|
||||||
use entity escaping instead.
|
use entity escaping instead.
|
||||||
|
|
||||||
Some JSON modules escape the forward slash (C</>) character instead, but that
|
Some JSON modules escape the forward slash (C</>) character instead, but that
|
||||||
is, at best, B<only> sufficient for embedding inside a C<< <script> >> tag (I'm
|
is I<only> sufficient for embedding inside a C<< <script> >> tag. In any other
|
||||||
not sure how C<< <!-- >> and C<< <![CDATA[ >> are treated in that context). In
|
context, you'll need the more thorough escaping provided by this C<html_safe>
|
||||||
any other context, you'll need the more thorough escaping provided by this
|
option.
|
||||||
C<html_safe> option.
|
|
||||||
|
|
||||||
=item max_size
|
=item max_size
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ typedef struct {
|
||||||
const unsigned char *buf;
|
const unsigned char *buf;
|
||||||
const unsigned char *end;
|
const unsigned char *end;
|
||||||
UV depth;
|
UV depth;
|
||||||
|
int allow_control;
|
||||||
} fujson_parse_ctx;
|
} fujson_parse_ctx;
|
||||||
|
|
||||||
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
|
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
|
||||||
|
|
@ -23,10 +24,10 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
|
||||||
case '"': *(r->cur++) = '\"'; break;
|
case '"': *(r->cur++) = '\"'; break;
|
||||||
case '\\':*(r->cur++) = '\\'; break;
|
case '\\':*(r->cur++) = '\\'; break;
|
||||||
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
|
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
|
||||||
case 'b': *(r->cur++) = 0x08; break;
|
case 'b': if (!ctx->allow_control) return 1; *(r->cur++) = 0x08; break;
|
||||||
case 't': *(r->cur++) = 0x09; break;
|
case 't': *(r->cur++) = 0x09; break;
|
||||||
case 'n': *(r->cur++) = 0x0a; break;
|
case 'n': *(r->cur++) = 0x0a; break;
|
||||||
case 'f': *(r->cur++) = 0x0c; break;
|
case 'f': if (!ctx->allow_control) return 1; *(r->cur++) = 0x0c; break;
|
||||||
case 'r': *(r->cur++) = 0x0d; break;
|
case 'r': *(r->cur++) = 0x0d; break;
|
||||||
case 'u':
|
case 'u':
|
||||||
/* (awful code adapted from ncdu) */
|
/* (awful code adapted from ncdu) */
|
||||||
|
|
@ -43,6 +44,9 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
|
||||||
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
|
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
|
||||||
ctx->buf += 6;
|
ctx->buf += 6;
|
||||||
}
|
}
|
||||||
|
if (!ctx->allow_control &&
|
||||||
|
(n <= 8 || n == 0x0b || n == 0x0c || (n >= 0x0e && n <= 0x1f) || n == 0x7f))
|
||||||
|
return 1;
|
||||||
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
|
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
|
||||||
if (n >= 0x80) r->setutf8 = 1;
|
if (n >= 0x80) r->setutf8 = 1;
|
||||||
break;
|
break;
|
||||||
|
|
@ -265,6 +269,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
|
||||||
fujson_parse_ctx ctx;
|
fujson_parse_ctx ctx;
|
||||||
|
|
||||||
ctx.depth = 0;
|
ctx.depth = 0;
|
||||||
|
ctx.allow_control = 0;
|
||||||
while (i < argc) {
|
while (i < argc) {
|
||||||
arg = SvPV_nolen(ST(i));
|
arg = SvPV_nolen(ST(i));
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -275,6 +280,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
|
||||||
if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
|
if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
|
||||||
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
|
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
|
||||||
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
|
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
|
||||||
|
else if (strcmp(arg, "allow_control") == 0) ctx.allow_control = SvTRUE(r);
|
||||||
else if (strcmp(arg, "offset") == 0) offset = r;
|
else if (strcmp(arg, "offset") == 0) offset = r;
|
||||||
else croak("Unknown flag: '%s'", arg);
|
else croak("Unknown flag: '%s'", arg);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,11 @@ my @error = (
|
||||||
'"\udc12\u1234"',
|
'"\udc12\u1234"',
|
||||||
"\"\x{110000}\"",
|
"\"\x{110000}\"",
|
||||||
|
|
||||||
|
'"\u0000"',
|
||||||
|
'"\b"',
|
||||||
|
'"\f"',
|
||||||
|
'"\u007f"',
|
||||||
|
|
||||||
'1.',
|
'1.',
|
||||||
'01',
|
'01',
|
||||||
'1e',
|
'1e',
|
||||||
|
|
@ -82,9 +87,9 @@ sub str($in, $exp) {
|
||||||
}
|
}
|
||||||
str '""', '';
|
str '""', '';
|
||||||
str '"hello, world"', 'hello, world';
|
str '"hello, world"', 'hello, world';
|
||||||
str '"\u0000\u0099\u0234\u1234"', "\x{00}\x{99}\x{234}\x{1234}";
|
str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}";
|
||||||
str "\"\x{7f}\x{99}\x{234}\x{1234}\x{12345}\"", "\x{7f}\x{99}\x{234}\x{1234}\x{12345}";
|
str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}";
|
||||||
str '"\/\"\\\\\b\t\n\f\r"', "/\"\\\x{08}\x{09}\x{0a}\x{0c}\x{0d}";
|
str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}";
|
||||||
str '"\uD83D\uDE03"', "\x{1F603}";
|
str '"\uD83D\uDE03"', "\x{1F603}";
|
||||||
|
|
||||||
sub num($in, $exp=$in) {
|
sub num($in, $exp=$in) {
|
||||||
|
|
@ -186,6 +191,7 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
|
||||||
ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 };
|
ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 };
|
||||||
ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 };
|
ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 };
|
||||||
|
|
||||||
|
is json_parse('"\u0000\b\f\u007f"', allow_control => 1), "\x00\x08\x0c\x7f";
|
||||||
|
|
||||||
# 500 depth
|
# 500 depth
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue