json_parse(): Disallow control characters in strings by default

Deviating from the standard, but more consistent with other FU functions.
2025-06-04 18:48:06 +02:00 · 2025-06-04 18:48:06 +02:00 · 55baa6c9a6
commit 55baa6c9a6
parent a43dc70ff9
3 changed files with 28 additions and 10 deletions
--- a/FU/Util.pm
+++ b/FU/Util.pm
@ -137,7 +137,7 @@ value for C<$val>, due to C<\0> and C<\1> being considered booleans.
 =head1 JSON Parsing & Formatting
 This module comes with a custom C-based JSON parser and formatter. These
-functions conform strictly to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
+functions conform to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
 non-standard extensions are not supported and never will be. It also happens to
 be pretty fast, refer to L<FU::Benchmarks> for some numbers.
@ -171,6 +171,13 @@ Supported C<%options>:
 =over
 =item allow_control
 Boolean, set to true to allow (encoded) ASCII control characters in JSON
 strings, such as C<\u0000>, C<\b>, C<\u007f>, etc.  These characters are
 permitted per RFC-8259, but disallowed by this parser by default. See
 C<utf8_decode()> below.
 =item utf8
 Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
@ -251,10 +258,9 @@ value. There is no way to do that without violating JSON specs, so you should
 use entity escaping instead.
 Some JSON modules escape the forward slash (C</>) character instead, but that
-is, at best, B<only> sufficient for embedding inside a C<< <script> >> tag (I'm
+is I<only> sufficient for embedding inside a C<< <script> >> tag. In any other
-not sure how C<< <!-- >> and C<< <![CDATA[ >> are treated in that context). In
+context, you'll need the more thorough escaping provided by this C<html_safe>
-any other context, you'll need the more thourough escaping provided by this
+option.
 C<html_safe> option.
 =item max_size
--- a/c/jsonparse.c
+++ b/c/jsonparse.c
@ -2,6 +2,7 @@ typedef struct {
    const unsigned char *buf;
    const unsigned char *end;
    UV depth;
    int allow_control;
 } fujson_parse_ctx;
 static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
@ -23,10 +24,10 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
        case '"': *(r->cur++) = '\"'; break;
        case '\\':*(r->cur++) = '\\'; break;
        case '/': *(r->cur++) = '/';  break; /* We don't escape this one */
-        case 'b': *(r->cur++) = 0x08; break;
+        case 'b': if (!ctx->allow_control) return 1; *(r->cur++) = 0x08; break;
        case 't': *(r->cur++) = 0x09; break;
        case 'n': *(r->cur++) = 0x0a; break;
-        case 'f': *(r->cur++) = 0x0c; break;
+        case 'f': if (!ctx->allow_control) return 1; *(r->cur++) = 0x0c; break;
        case 'r': *(r->cur++) = 0x0d; break;
        case 'u':
            /* (awful code adapted from ncdu) */
@ -43,6 +44,9 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
                n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
                ctx->buf += 6;
            }
            if (!ctx->allow_control &&
                    (n <= 8 || n == 0x0b || n == 0x0c || (n >= 0x0e && n <= 0x1f) || n == 0x7f))
                return 1;
            r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
            if (n >= 0x80) r->setutf8 = 1;
            break;
@ -265,6 +269,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
    fujson_parse_ctx ctx;
    ctx.depth = 0;
    ctx.allow_control = 0;
    while (i < argc) {
        arg = SvPV_nolen(ST(i));
        i++;
@ -275,6 +280,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
        if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
        else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
        else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
        else if (strcmp(arg, "allow_control") == 0) ctx.allow_control = SvTRUE(r);
        else if (strcmp(arg, "offset") == 0) offset = r;
        else croak("Unknown flag: '%s'", arg);
    }
--- a/t/json_parse.t
+++ b/t/json_parse.t
@ -24,6 +24,11 @@ my @error = (
    '"\udc12\u1234"',
    "\"\x{110000}\"",
    '"\u0000"',
    '"\b"',
    '"\f"',
    '"\u007f"',
    '1.',
    '01',
    '1e',
@ -82,9 +87,9 @@ sub str($in, $exp) {
 }
 str '""', '';
 str '"hello, world"', 'hello, world';
-str '"\u0000\u0099\u0234\u1234"', "\x{00}\x{99}\x{234}\x{1234}";
+str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}";
-str "\"\x{7f}\x{99}\x{234}\x{1234}\x{12345}\"", "\x{7f}\x{99}\x{234}\x{1234}\x{12345}";
+str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}";
-str '"\/\"\\\\\b\t\n\f\r"', "/\"\\\x{08}\x{09}\x{0a}\x{0c}\x{0d}";
+str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}";
 str '"\uD83D\uDE03"', "\x{1F603}";
 sub num($in, $exp=$in) {
@ -186,6 +191,7 @@ for (2000..2100, 4000..4200, 8100..8200, 12200..12300, 16300..16400) {
 ok !eval { json_parse '[[[[]]]]', max_depth => 4; 1 };
 ok !eval { json_parse '{"":{"":{"":{"":1}}}}', max_depth => 4; 1 };
 is json_parse('"\u0000\b\f\u007f"', allow_control => 1), "\x00\x08\x0c\x7f";
 # 500 depth
 {