diff --git a/FU.pm b/FU.pm index f43dbaf..2fabf2a 100644 --- a/FU.pm +++ b/FU.pm @@ -312,7 +312,7 @@ sub _read_req($c) { ($REQ->{path}, my $qs) = split /\?/, $REQ->{path}//'', 2; $REQ->{qs} //= $qs; - eval { $REQ->{path} = FU::Util::uri_unescape($REQ->{path}); 1; } || fu->error(400, $@); + eval { $REQ->{path} = FU::Util::uri_unescape($REQ->{path}); FU::Util::check_control($REQ->{path}); 1; } || fu->error(400, $@); fu->error(400, 'Invalid character in path') if $REQ->{path} =~ /[\r\n\t]/; # There are plenty other questionable characters, but newlines and tabs are definitely out } diff --git a/FU/MultipartFormData.pm b/FU/MultipartFormData.pm index e6b44e2..14ce323 100644 --- a/FU/MultipartFormData.pm +++ b/FU/MultipartFormData.pm @@ -175,9 +175,7 @@ this on large fields. =item value -Returns a copy of the field value as a Unicode string. Uses C -from L, so also throws an error if the value contains control -characters. +Returns a copy of the field value as a Unicode string. =item substr($off, $len) diff --git a/FU/Util.pm b/FU/Util.pm index 2074ada..77635a2 100644 --- a/FU/Util.pm +++ b/FU/Util.pm @@ -11,20 +11,26 @@ use experimental 'builtin'; our @EXPORT_OK = qw/ to_bool json_format json_parse - utf8_decode uri_escape uri_unescape + has_control check_control utf8_decode + uri_escape uri_unescape query_decode query_encode httpdate_format httpdate_parse gzip_lib gzip_compress brotli_compress fdpass_send fdpass_recv /; + +# Internal utility function +sub has_control :prototype($) ($s) { defined $s && $s =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/ } +sub check_control :prototype($) ($s) { confess 'Invalid control character' if has_control $s; } + +# Deprecated, call Encode::decode() directly. sub utf8_decode :prototype($) { return if !defined $_[0]; eval { $_[0] = Encode::decode('UTF-8', $_[0], Encode::FB_CROAK); 1 } || confess($@ =~ s/ at .+\n$//r); - confess 'Invalid control character' if $_[0] =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/; $_[0] } @@ -175,13 +181,6 @@ Supported C<%options>: =over -=item allow_control - -Boolean, set to true to allow (encoded) ASCII control characters in JSON -strings, such as C<\u0000>, C<\b>, C<\u007f>, etc. These characters are -permitted per RFC-8259, but disallowed by this parser by default. See -C below. - =item utf8 Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead @@ -296,18 +295,6 @@ inputs, at the cost of flexibility. =over -=item utf8_decode($bytes) - -Convert a (perl-UTF-8 encoded) byte string into a sanitized perl Unicode -string. The conversion is performed in-place, so the C<$bytes> argument is -turned into a Unicode string. Returns the same string for convenience. - -This function throws an error if the input is not valid UTF-8 or if it contains -ASCII control characters - that is, any character between C<0x00> and C<0x1f> -except for tab, newline and carriage return. - -(This is a tiny wrapper around C with some extra checks) - =item uri_escape($string) Takes an Unicode string and returns a percent-encoded ASCII string, suitable @@ -316,8 +303,7 @@ for use in a query parameter. =item uri_unescape($string) Takes an Unicode string potentially containing percent-encoding and returns a -decoded Unicode string. Also checks for ASCII control characters as per -C. +decoded Unicode string. =item query_decode($string) @@ -334,8 +320,7 @@ have a value are decoded as C. Example: # } The input C<$string> is assumed to be a perl Unicode string. An error is thrown -if the resulting data decodes into invalid UTF-8 or contains control -characters, as per C. +if the resulting data decodes into invalid UTF-8. =item query_encode($hashref) diff --git a/FU/Validate.pm b/FU/Validate.pm index adca929..fa3bdcc 100644 --- a/FU/Validate.pm +++ b/FU/Validate.pm @@ -4,7 +4,7 @@ use v5.36; use experimental 'builtin', 'for_list'; use builtin qw/true false blessed trim/; use Carp 'confess'; -use FU::Util 'to_bool'; +use FU::Util 'to_bool', 'has_control'; # Unavailable as custom validation names @@ -12,7 +12,7 @@ my %builtin = map +($_,1), qw/ type default onerror - trim + trim allow_control elems sort unique accept_scalar accept_array keys values unknown missing @@ -296,8 +296,13 @@ sub _validate_input { $_[1] = $_[1]->@* == 0 ? undef : $c->{accept_array} eq 'first' ? $_[1][0] : $_[1][ $#{$_[1]} ] if $c->{accept_array} && ref $_[1] eq 'ARRAY'; - # trim (needs to be done before the 'default' test) - $_[1] = trim $_[1] =~ s/\r//rg if defined $_[1] && !ref $_[1] && $type eq 'scalar' && (!exists $c->{trim} || $c->{trim}); + # early scalar checks + if (defined $_[1] && !ref $_[1] && $type eq 'scalar') { + # trim needs to be done before the 'default' test + $_[1] = trim $_[1] =~ s/\r//rg if !exists $c->{trim} || $c->{trim}; + + return { validation => 'allow_control' } if !$c->{allow_control} && has_control $_[1]; + } # default if (!defined $_[1] || (!ref $_[1] && $_[1] eq '')) { @@ -403,6 +408,7 @@ sub _inval($t,$v) { sprintf 'invalid %s: %s', $t, _fmtval $v } # TODO: document. our %error_format = ( required => sub { 'required value missing' }, + allow_control => sub { 'invalid control character' }, type => sub($e) { "invalid type, expected '$e->{expected}' but got '$e->{got}'" }, unknown => sub($e) { sprintf 'unknown key%s: %s', $e->{keys}->@* == 1 ? '' : 's', join ', ', map _fmtkey($_), $e->{keys}->@* }, minlength => sub($e) { sprintf "input too short, expected minimum of %d but got %d", $e->{expected}, $e->{got} }, @@ -590,6 +596,9 @@ Upon failure, the error object will look something like: got => 'scalar' } +Beware: setting the type to I causes the I and I +validations to be skipped. + =item default => $val If not set, or set to C<\'required'> (note: scalarref), then a value is required @@ -623,6 +632,12 @@ By default, any whitespace around scalar-type input is removed before testing any other validations. Setting I to a false value will disable this behavior. +=item allow_control => 0/1 + +By default, ASCII control characters in the input are not permitted for scalar +values and trigger a validation error. Set this to a positive value to disable +the check. + =item keys => $hashref Implies C<< type => 'hash' >>, this option specifies which keys are permitted, diff --git a/c/jsonparse.c b/c/jsonparse.c index bf46557..136034a 100644 --- a/c/jsonparse.c +++ b/c/jsonparse.c @@ -2,7 +2,6 @@ typedef struct { const unsigned char *buf; const unsigned char *end; UV depth; - int allow_control; } fujson_parse_ctx; static SV *fujson_parse(pTHX_ fujson_parse_ctx *); @@ -24,10 +23,10 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr case '"': *(r->cur++) = '\"'; break; case '\\':*(r->cur++) = '\\'; break; case '/': *(r->cur++) = '/'; break; /* We don't escape this one */ - case 'b': if (!ctx->allow_control) return 1; *(r->cur++) = 0x08; break; + case 'b': *(r->cur++) = 0x08; break; case 't': *(r->cur++) = 0x09; break; case 'n': *(r->cur++) = 0x0a; break; - case 'f': if (!ctx->allow_control) return 1; *(r->cur++) = 0x0c; break; + case 'f': *(r->cur++) = 0x0c; break; case 'r': *(r->cur++) = 0x0d; break; case 'u': /* (awful code adapted from ncdu) */ @@ -44,9 +43,6 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff)); ctx->buf += 6; } - if (!ctx->allow_control && - (n <= 8 || n == 0x0b || n == 0x0c || (n >= 0x0e && n <= 0x1f) || n == 0x7f)) - return 1; r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n); if (n >= 0x80) r->setutf8 = 1; break; @@ -269,7 +265,6 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) { fujson_parse_ctx ctx; ctx.depth = 0; - ctx.allow_control = 0; while (i < argc) { arg = SvPV_nolen(ST(i)); i++; @@ -280,7 +275,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) { if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r); else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r); else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r); - else if (strcmp(arg, "allow_control") == 0) ctx.allow_control = SvTRUE(r); + else if (strcmp(arg, "allow_control") == 0) {} else if (strcmp(arg, "offset") == 0) offset = r; else croak("Unknown flag: '%s'", arg); } diff --git a/t/json_parse.t b/t/json_parse.t index 3ad3838..686a1e6 100644 --- a/t/json_parse.t +++ b/t/json_parse.t @@ -24,11 +24,6 @@ my @error = ( '"\udc12\u1234"', "\"\x{110000}\"", - '"\u0000"', - '"\b"', - '"\f"', - '"\u007f"', - '1.', '01', '1e', @@ -87,6 +82,7 @@ sub str($in, $exp) { } str '""', ''; str '"hello, world"', 'hello, world'; +str '"\u0000\b"', "\x00\b"; str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}"; str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}"; str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}"; diff --git a/t/query.t b/t/query.t index 9d1ca4a..96ae403 100644 --- a/t/query.t +++ b/t/query.t @@ -7,9 +7,6 @@ is_deeply query_decode('a&a&%c3%be=%26%3d%c3%be&a=3'), { a => [ builtin::true, builtin::true, 3 ], "\xfe" => "&=\xfe" }; -ok !eval { query_decode('%10'); 1 }; -like $@, qr/Invalid control character/; - ok !eval { query_decode('a=%fe%83%bf%bf%bf%bf%bf%0a'); 1 }; like $@, qr/does not map to Unicode/; diff --git a/t/validate.t b/t/validate.t index 26704cd..46dc173 100644 --- a/t/validate.t +++ b/t/validate.t @@ -79,6 +79,10 @@ t { trim => 0 }, " Va\rl id \n ", " Va\rl id \n "; f {}, ' ', { validation => 'required' }, 'required value missing'; t { trim => 0 }, ' ', ' '; +# allow_control +f {}, "\b", { validation => 'allow_control' }, 'invalid control character'; +t { allow_control => 1 }, "\b", "\b"; + # accept_array t { default => undef, accept_array => 'first' }, [], undef; t { default => undef, accept_array => 'first' }, [' x '], 'x';