json_parse(): Disallow control characters in strings by default

Deviating from the standard, but more consistent with other FU functions.
This commit is contained in:
Yorhel 2025-06-04 18:48:06 +02:00
parent a43dc70ff9
commit 55baa6c9a6
3 changed files with 28 additions and 10 deletions

View file

@ -137,7 +137,7 @@ value for C<$val>, due to C<\0> and C<\1> being considered booleans.
=head1 JSON Parsing & Formatting =head1 JSON Parsing & Formatting
This module comes with a custom C-based JSON parser and formatter. These This module comes with a custom C-based JSON parser and formatter. These
functions conform strictly to L<RFC-8259|https://tools.ietf.org/html/rfc8259>, functions conform to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
non-standard extensions are not supported and never will be. It also happens to non-standard extensions are not supported and never will be. It also happens to
be pretty fast, refer to L<FU::Benchmarks> for some numbers. be pretty fast, refer to L<FU::Benchmarks> for some numbers.
@ -171,6 +171,13 @@ Supported C<%options>:
=over =over
=item allow_control
Boolean, set to true to allow (encoded) ASCII control characters in JSON
strings, such as C<\u0000>, C<\b>, C<\u007f>, etc. These characters are
permitted per RFC-8259, but disallowed by this parser by default. See
C<utf8_decode()> below.
=item utf8 =item utf8
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
@ -251,10 +258,9 @@ value. There is no way to do that without violating JSON specs, so you should
use entity escaping instead. use entity escaping instead.
Some JSON modules escape the forward slash (C</>) character instead, but that Some JSON modules escape the forward slash (C</>) character instead, but that
is, at best, B<only> sufficient for embedding inside a C<< <script> >> tag (I'm is I<only> sufficient for embedding inside a C<< <script> >> tag. In any other
not sure how C<< <!-- >> and C<< <![CDATA[ >> are treated in that context). In context, you'll need the more thorough escaping provided by this C<html_safe>
any other context, you'll need the more thorough escaping provided by this option.
C<html_safe> option.
=item max_size =item max_size

View file

@ -2,6 +2,7 @@ typedef struct {
const unsigned char *buf; const unsigned char *buf;
const unsigned char *end; const unsigned char *end;
UV depth; UV depth;
int allow_control;
} fujson_parse_ctx; } fujson_parse_ctx;
static SV *fujson_parse(pTHX_ fujson_parse_ctx *); static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
@ -23,10 +24,10 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
case '"': *(r->cur++) = '\"'; break; case '"': *(r->cur++) = '\"'; break;
case '\\':*(r->cur++) = '\\'; break; case '\\':*(r->cur++) = '\\'; break;
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */ case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
case 'b': *(r->cur++) = 0x08; break; case 'b': if (!ctx->allow_control) return 1; *(r->cur++) = 0x08; break;
case 't': *(r->cur++) = 0x09; break; case 't': *(r->cur++) = 0x09; break;
case 'n': *(r->cur++) = 0x0a; break; case 'n': *(r->cur++) = 0x0a; break;
case 'f': *(r->cur++) = 0x0c; break; case 'f': if (!ctx->allow_control) return 1; *(r->cur++) = 0x0c; break;
case 'r': *(r->cur++) = 0x0d; break; case 'r': *(r->cur++) = 0x0d; break;
case 'u': case 'u':
/* (awful code adapted from ncdu) */ /* (awful code adapted from ncdu) */
@ -43,6 +44,9 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff)); n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
ctx->buf += 6; ctx->buf += 6;
} }
if (!ctx->allow_control &&
(n <= 8 || n == 0x0b || n == 0x0c || (n >= 0x0e && n <= 0x1f) || n == 0x7f))
return 1;
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n); r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
if (n >= 0x80) r->setutf8 = 1; if (n >= 0x80) r->setutf8 = 1;
break; break;
@ -265,6 +269,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
fujson_parse_ctx ctx; fujson_parse_ctx ctx;
ctx.depth = 0; ctx.depth = 0;
ctx.allow_control = 0;
while (i < argc) { while (i < argc) {
arg = SvPV_nolen(ST(i)); arg = SvPV_nolen(ST(i));
i++; i++;
@ -275,6 +280,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r); if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r); else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r); else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
else if (strcmp(arg, "allow_control") == 0) ctx.allow_control = SvTRUE(r);
else if (strcmp(arg, "offset") == 0) offset = r; else if (strcmp(arg, "offset") == 0) offset = r;
else croak("Unknown flag: '%s'", arg); else croak("Unknown flag: '%s'", arg);
} }

View file

@ -24,6 +24,11 @@ my @error = (
'"\udc12\u1234"', '"\udc12\u1234"',
"\"\x{110000}\"", "\"\x{110000}\"",
'"\u0000"',
'"\b"',
'"\f"',
'"\u007f"',
'1.', '1.',
'01', '01',
'1e', '1e',
@ -82,9 +87,9 @@ sub str($in, $exp) {
} }
str '""', ''; str '""', '';
str '"hello, world"', 'hello, world'; str '"hello, world"', 'hello, world';
str '"\u0000\u0099\u0234\u1234"', "\x{00}\x{99}\x{234}\x{1234}"; str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}";
str "\"\x{7f}\x{99}\x{234}\x{1234}\x{12345}\"", "\x{7f}\x{99}\x{234}\x{1234}\x{12345}"; str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}";
str '"\/\"\\\\\b\t\n\f\r"', "/\"\\\x{08}\x{09}\x{0a}\x{0c}\x{0d}"; str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}";
str '"\uD83D\uDE03"', "\x{1F603}"; str '"\uD83D\uDE03"', "\x{1F603}";
sub num($in, $exp=$in) { sub num($in, $exp=$in) {
@ -186,6 +191,7 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 }; ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 };
ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 }; ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 };
is json_parse('"\u0000\b\f\u007f"', allow_control => 1), "\x00\x08\x0c\x7f";
# 500 depth # 500 depth
{ {