Move control character checking to FU::Validate, deprecate FU::Util::utf8_decode()

URI, JSON and formdata decoding no longer checks for control characters,
but FU::Validate now rejects control characters by default. This
decouples semantic validation from format parsing and gives better
control over when control characters are allowed.
This commit is contained in:
Yorhel 2025-08-22 09:51:56 +02:00
parent 2e9a40da69
commit a8ac435f85
8 changed files with 39 additions and 49 deletions

2
FU.pm
View file

@ -312,7 +312,7 @@ sub _read_req($c) {
($REQ->{path}, my $qs) = split /\?/, $REQ->{path}//'', 2; ($REQ->{path}, my $qs) = split /\?/, $REQ->{path}//'', 2;
$REQ->{qs} //= $qs; $REQ->{qs} //= $qs;
eval { $REQ->{path} = FU::Util::uri_unescape($REQ->{path}); 1; } || fu->error(400, $@); eval { $REQ->{path} = FU::Util::uri_unescape($REQ->{path}); FU::Util::check_control($REQ->{path}); 1; } || fu->error(400, $@);
fu->error(400, 'Invalid character in path') if $REQ->{path} =~ /[\r\n\t]/; # There are plenty other questionable characters, but newlines and tabs are definitely out fu->error(400, 'Invalid character in path') if $REQ->{path} =~ /[\r\n\t]/; # There are plenty other questionable characters, but newlines and tabs are definitely out
} }

View file

@ -175,9 +175,7 @@ this on large fields.
=item value =item value
Returns a copy of the field value as a Unicode string. Uses C<utf8_decode()> Returns a copy of the field value as a Unicode string.
from L<FU::Util>, so also throws an error if the value contains control
characters.
=item substr($off, $len) =item substr($off, $len)

View file

@ -11,20 +11,26 @@ use experimental 'builtin';
our @EXPORT_OK = qw/ our @EXPORT_OK = qw/
to_bool to_bool
json_format json_parse json_format json_parse
utf8_decode uri_escape uri_unescape has_control check_control utf8_decode
uri_escape uri_unescape
query_decode query_encode query_decode query_encode
httpdate_format httpdate_parse httpdate_format httpdate_parse
gzip_lib gzip_compress brotli_compress gzip_lib gzip_compress brotli_compress
fdpass_send fdpass_recv fdpass_send fdpass_recv
/; /;
# Internal utility function
sub has_control :prototype($) ($s) { defined $s && $s =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/ }
sub check_control :prototype($) ($s) { confess 'Invalid control character' if has_control $s; }
# Deprecated, call Encode::decode() directly.
sub utf8_decode :prototype($) { sub utf8_decode :prototype($) {
return if !defined $_[0]; return if !defined $_[0];
eval { eval {
$_[0] = Encode::decode('UTF-8', $_[0], Encode::FB_CROAK); $_[0] = Encode::decode('UTF-8', $_[0], Encode::FB_CROAK);
1 1
} || confess($@ =~ s/ at .+\n$//r); } || confess($@ =~ s/ at .+\n$//r);
confess 'Invalid control character' if $_[0] =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/;
$_[0] $_[0]
} }
@ -175,13 +181,6 @@ Supported C<%options>:
=over =over
=item allow_control
Boolean, set to true to allow (encoded) ASCII control characters in JSON
strings, such as C<\u0000>, C<\b>, C<\u007f>, etc. These characters are
permitted per RFC-8259, but disallowed by this parser by default. See
C<utf8_decode()> below.
=item utf8 =item utf8
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
@ -296,18 +295,6 @@ inputs, at the cost of flexibility.
=over =over
=item utf8_decode($bytes)
Convert a (perl-UTF-8 encoded) byte string into a sanitized perl Unicode
string. The conversion is performed in-place, so the C<$bytes> argument is
turned into a Unicode string. Returns the same string for convenience.
This function throws an error if the input is not valid UTF-8 or if it contains
ASCII control characters - that is, any character between C<0x00> and C<0x1f>
except for tab, newline and carriage return.
(This is a tiny wrapper around C<utf8::decode()> with some extra checks)
=item uri_escape($string) =item uri_escape($string)
Takes an Unicode string and returns a percent-encoded ASCII string, suitable Takes an Unicode string and returns a percent-encoded ASCII string, suitable
@ -316,8 +303,7 @@ for use in a query parameter.
=item uri_unescape($string) =item uri_unescape($string)
Takes an Unicode string potentially containing percent-encoding and returns a Takes an Unicode string potentially containing percent-encoding and returns a
decoded Unicode string. Also checks for ASCII control characters as per decoded Unicode string.
C<utf8_decode()>.
=item query_decode($string) =item query_decode($string)
@ -334,8 +320,7 @@ have a value are decoded as C<builtin::true>. Example:
# } # }
The input C<$string> is assumed to be a perl Unicode string. An error is thrown The input C<$string> is assumed to be a perl Unicode string. An error is thrown
if the resulting data decodes into invalid UTF-8 or contains control if the resulting data decodes into invalid UTF-8.
characters, as per C<utf8_decode>.
=item query_encode($hashref) =item query_encode($hashref)

View file

@ -4,7 +4,7 @@ use v5.36;
use experimental 'builtin', 'for_list'; use experimental 'builtin', 'for_list';
use builtin qw/true false blessed trim/; use builtin qw/true false blessed trim/;
use Carp 'confess'; use Carp 'confess';
use FU::Util 'to_bool'; use FU::Util 'to_bool', 'has_control';
# Unavailable as custom validation names # Unavailable as custom validation names
@ -12,7 +12,7 @@ my %builtin = map +($_,1), qw/
type type
default default
onerror onerror
trim trim allow_control
elems sort unique elems sort unique
accept_scalar accept_array accept_scalar accept_array
keys values unknown missing keys values unknown missing
@ -296,8 +296,13 @@ sub _validate_input {
$_[1] = $_[1]->@* == 0 ? undef : $c->{accept_array} eq 'first' ? $_[1][0] : $_[1][ $#{$_[1]} ] $_[1] = $_[1]->@* == 0 ? undef : $c->{accept_array} eq 'first' ? $_[1][0] : $_[1][ $#{$_[1]} ]
if $c->{accept_array} && ref $_[1] eq 'ARRAY'; if $c->{accept_array} && ref $_[1] eq 'ARRAY';
# trim (needs to be done before the 'default' test) # early scalar checks
$_[1] = trim $_[1] =~ s/\r//rg if defined $_[1] && !ref $_[1] && $type eq 'scalar' && (!exists $c->{trim} || $c->{trim}); if (defined $_[1] && !ref $_[1] && $type eq 'scalar') {
# trim needs to be done before the 'default' test
$_[1] = trim $_[1] =~ s/\r//rg if !exists $c->{trim} || $c->{trim};
return { validation => 'allow_control' } if !$c->{allow_control} && has_control $_[1];
}
# default # default
if (!defined $_[1] || (!ref $_[1] && $_[1] eq '')) { if (!defined $_[1] || (!ref $_[1] && $_[1] eq '')) {
@ -403,6 +408,7 @@ sub _inval($t,$v) { sprintf 'invalid %s: %s', $t, _fmtval $v }
# TODO: document. # TODO: document.
our %error_format = ( our %error_format = (
required => sub { 'required value missing' }, required => sub { 'required value missing' },
allow_control => sub { 'invalid control character' },
type => sub($e) { "invalid type, expected '$e->{expected}' but got '$e->{got}'" }, type => sub($e) { "invalid type, expected '$e->{expected}' but got '$e->{got}'" },
unknown => sub($e) { sprintf 'unknown key%s: %s', $e->{keys}->@* == 1 ? '' : 's', join ', ', map _fmtkey($_), $e->{keys}->@* }, unknown => sub($e) { sprintf 'unknown key%s: %s', $e->{keys}->@* == 1 ? '' : 's', join ', ', map _fmtkey($_), $e->{keys}->@* },
minlength => sub($e) { sprintf "input too short, expected minimum of %d but got %d", $e->{expected}, $e->{got} }, minlength => sub($e) { sprintf "input too short, expected minimum of %d but got %d", $e->{expected}, $e->{got} },
@ -590,6 +596,9 @@ Upon failure, the error object will look something like:
got => 'scalar' got => 'scalar'
} }
Beware: setting the type to I<any> causes the I<trim> and I<allow_control>
validations to be skipped.
=item default => $val =item default => $val
If not set, or set to C<\'required'> (note: scalarref), then a value is required If not set, or set to C<\'required'> (note: scalarref), then a value is required
@ -623,6 +632,12 @@ By default, any whitespace around scalar-type input is removed before testing
any other validations. Setting I<trim> to a false value will disable this any other validations. Setting I<trim> to a false value will disable this
behavior. behavior.
=item allow_control => 0/1
By default, ASCII control characters in the input are not permitted for scalar
values and trigger a validation error. Set this to a positive value to disable
the check.
=item keys => $hashref =item keys => $hashref
Implies C<< type => 'hash' >>, this option specifies which keys are permitted, Implies C<< type => 'hash' >>, this option specifies which keys are permitted,

View file

@ -2,7 +2,6 @@ typedef struct {
const unsigned char *buf; const unsigned char *buf;
const unsigned char *end; const unsigned char *end;
UV depth; UV depth;
int allow_control;
} fujson_parse_ctx; } fujson_parse_ctx;
static SV *fujson_parse(pTHX_ fujson_parse_ctx *); static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
@ -24,10 +23,10 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
case '"': *(r->cur++) = '\"'; break; case '"': *(r->cur++) = '\"'; break;
case '\\':*(r->cur++) = '\\'; break; case '\\':*(r->cur++) = '\\'; break;
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */ case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
case 'b': if (!ctx->allow_control) return 1; *(r->cur++) = 0x08; break; case 'b': *(r->cur++) = 0x08; break;
case 't': *(r->cur++) = 0x09; break; case 't': *(r->cur++) = 0x09; break;
case 'n': *(r->cur++) = 0x0a; break; case 'n': *(r->cur++) = 0x0a; break;
case 'f': if (!ctx->allow_control) return 1; *(r->cur++) = 0x0c; break; case 'f': *(r->cur++) = 0x0c; break;
case 'r': *(r->cur++) = 0x0d; break; case 'r': *(r->cur++) = 0x0d; break;
case 'u': case 'u':
/* (awful code adapted from ncdu) */ /* (awful code adapted from ncdu) */
@ -44,9 +43,6 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff)); n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
ctx->buf += 6; ctx->buf += 6;
} }
if (!ctx->allow_control &&
(n <= 8 || n == 0x0b || n == 0x0c || (n >= 0x0e && n <= 0x1f) || n == 0x7f))
return 1;
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n); r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
if (n >= 0x80) r->setutf8 = 1; if (n >= 0x80) r->setutf8 = 1;
break; break;
@ -269,7 +265,6 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
fujson_parse_ctx ctx; fujson_parse_ctx ctx;
ctx.depth = 0; ctx.depth = 0;
ctx.allow_control = 0;
while (i < argc) { while (i < argc) {
arg = SvPV_nolen(ST(i)); arg = SvPV_nolen(ST(i));
i++; i++;
@ -280,7 +275,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r); if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r); else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r); else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
else if (strcmp(arg, "allow_control") == 0) ctx.allow_control = SvTRUE(r); else if (strcmp(arg, "allow_control") == 0) {}
else if (strcmp(arg, "offset") == 0) offset = r; else if (strcmp(arg, "offset") == 0) offset = r;
else croak("Unknown flag: '%s'", arg); else croak("Unknown flag: '%s'", arg);
} }

View file

@ -24,11 +24,6 @@ my @error = (
'"\udc12\u1234"', '"\udc12\u1234"',
"\"\x{110000}\"", "\"\x{110000}\"",
'"\u0000"',
'"\b"',
'"\f"',
'"\u007f"',
'1.', '1.',
'01', '01',
'1e', '1e',
@ -87,6 +82,7 @@ sub str($in, $exp) {
} }
str '""', ''; str '""', '';
str '"hello, world"', 'hello, world'; str '"hello, world"', 'hello, world';
str '"\u0000\b"', "\x00\b";
str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}"; str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}";
str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}"; str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}";
str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}"; str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}";

View file

@ -7,9 +7,6 @@ is_deeply
query_decode('a&a&%c3%be=%26%3d%c3%be&a=3'), query_decode('a&a&%c3%be=%26%3d%c3%be&a=3'),
{ a => [ builtin::true, builtin::true, 3 ], "\xfe" => "&=\xfe" }; { a => [ builtin::true, builtin::true, 3 ], "\xfe" => "&=\xfe" };
ok !eval { query_decode('%10'); 1 };
like $@, qr/Invalid control character/;
ok !eval { query_decode('a=%fe%83%bf%bf%bf%bf%bf%0a'); 1 }; ok !eval { query_decode('a=%fe%83%bf%bf%bf%bf%bf%0a'); 1 };
like $@, qr/does not map to Unicode/; like $@, qr/does not map to Unicode/;

View file

@ -79,6 +79,10 @@ t { trim => 0 }, " Va\rl id \n ", " Va\rl id \n ";
f {}, ' ', { validation => 'required' }, 'required value missing'; f {}, ' ', { validation => 'required' }, 'required value missing';
t { trim => 0 }, ' ', ' '; t { trim => 0 }, ' ', ' ';
# allow_control
f {}, "\b", { validation => 'allow_control' }, 'invalid control character';
t { allow_control => 1 }, "\b", "\b";
# accept_array # accept_array
t { default => undef, accept_array => 'first' }, [], undef; t { default => undef, accept_array => 'first' }, [], undef;
t { default => undef, accept_array => 'first' }, [' x '], 'x'; t { default => undef, accept_array => 'first' }, [' x '], 'x';