json_parse(): Disallow control characters in strings by default

Deviating from the standard, but more consistent with other FU functions.
This commit is contained in:
Yorhel 2025-06-04 18:48:06 +02:00
parent a43dc70ff9
commit 55baa6c9a6
3 changed files with 28 additions and 10 deletions

View file

@ -137,7 +137,7 @@ value for C<$val>, due to C<\0> and C<\1> being considered booleans.
=head1 JSON Parsing & Formatting
This module comes with a custom C-based JSON parser and formatter. These
functions conform strictly to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
functions conform to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
non-standard extensions are not supported and never will be. It also happens to
be pretty fast, refer to L<FU::Benchmarks> for some numbers.
@ -171,6 +171,13 @@ Supported C<%options>:
=over
=item allow_control
Boolean, set to true to allow (encoded) ASCII control characters in JSON
strings, such as C<\u0000>, C<\b>, C<\u007f>, etc. These characters are
permitted per RFC-8259, but disallowed by this parser by default. See
C<utf8_decode()> below.
=item utf8
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
@ -251,10 +258,9 @@ value. There is no way to do that without violating JSON specs, so you should
use entity escaping instead.
Some JSON modules escape the forward slash (C</>) character instead, but that
is, at best, B<only> sufficient for embedding inside a C<< <script> >> tag (I'm
not sure how C<< <!-- >> and C<< <![CDATA[ >> are treated in that context). In
any other context, you'll need the more thourough escaping provided by this
C<html_safe> option.
is I<only> sufficient for embedding inside a C<< <script> >> tag. In any other
context, you'll need the more thorough escaping provided by this C<html_safe>
option.
=item max_size

View file

@ -2,6 +2,7 @@ typedef struct {
const unsigned char *buf;
const unsigned char *end;
UV depth;
int allow_control;
} fujson_parse_ctx;
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
@ -23,10 +24,10 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
case '"': *(r->cur++) = '\"'; break;
case '\\':*(r->cur++) = '\\'; break;
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
case 'b': *(r->cur++) = 0x08; break;
case 'b': if (!ctx->allow_control) return 1; *(r->cur++) = 0x08; break;
case 't': *(r->cur++) = 0x09; break;
case 'n': *(r->cur++) = 0x0a; break;
case 'f': *(r->cur++) = 0x0c; break;
case 'f': if (!ctx->allow_control) return 1; *(r->cur++) = 0x0c; break;
case 'r': *(r->cur++) = 0x0d; break;
case 'u':
/* (awful code adapted from ncdu) */
@ -43,6 +44,9 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
ctx->buf += 6;
}
if (!ctx->allow_control &&
(n <= 8 || n == 0x0b || n == 0x0c || (n >= 0x0e && n <= 0x1f) || n == 0x7f))
return 1;
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
if (n >= 0x80) r->setutf8 = 1;
break;
@ -265,6 +269,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
fujson_parse_ctx ctx;
ctx.depth = 0;
ctx.allow_control = 0;
while (i < argc) {
arg = SvPV_nolen(ST(i));
i++;
@ -275,6 +280,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
else if (strcmp(arg, "allow_control") == 0) ctx.allow_control = SvTRUE(r);
else if (strcmp(arg, "offset") == 0) offset = r;
else croak("Unknown flag: '%s'", arg);
}

View file

@ -24,6 +24,11 @@ my @error = (
'"\udc12\u1234"',
"\"\x{110000}\"",
'"\u0000"',
'"\b"',
'"\f"',
'"\u007f"',
'1.',
'01',
'1e',
@ -82,9 +87,9 @@ sub str($in, $exp) {
}
str '""', '';
str '"hello, world"', 'hello, world';
str '"\u0000\u0099\u0234\u1234"', "\x{00}\x{99}\x{234}\x{1234}";
str "\"\x{7f}\x{99}\x{234}\x{1234}\x{12345}\"", "\x{7f}\x{99}\x{234}\x{1234}\x{12345}";
str '"\/\"\\\\\b\t\n\f\r"', "/\"\\\x{08}\x{09}\x{0a}\x{0c}\x{0d}";
str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}";
str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}";
str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}";
str '"\uD83D\uDE03"', "\x{1F603}";
sub num($in, $exp=$in) {
@ -186,6 +191,7 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 };
ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 };
is json_parse('"\u0000\b\f\u007f"', allow_control => 1), "\x00\x08\x0c\x7f";
# 500 depth
{