Move control character checking to FU::Validate, deprecate FU::Util::utf8_decode()
URI, JSON and formdata decoding no longer checks for control characters, but FU::Validate now rejects control characters by default. This decouples semantic validation from format parsing and gives better control over when control characters are allowed.
This commit is contained in:
parent
2e9a40da69
commit
a8ac435f85
8 changed files with 39 additions and 49 deletions
2
FU.pm
2
FU.pm
|
|
@ -312,7 +312,7 @@ sub _read_req($c) {
|
||||||
|
|
||||||
($REQ->{path}, my $qs) = split /\?/, $REQ->{path}//'', 2;
|
($REQ->{path}, my $qs) = split /\?/, $REQ->{path}//'', 2;
|
||||||
$REQ->{qs} //= $qs;
|
$REQ->{qs} //= $qs;
|
||||||
eval { $REQ->{path} = FU::Util::uri_unescape($REQ->{path}); 1; } || fu->error(400, $@);
|
eval { $REQ->{path} = FU::Util::uri_unescape($REQ->{path}); FU::Util::check_control($REQ->{path}); 1; } || fu->error(400, $@);
|
||||||
fu->error(400, 'Invalid character in path') if $REQ->{path} =~ /[\r\n\t]/; # There are plenty other questionable characters, but newlines and tabs are definitely out
|
fu->error(400, 'Invalid character in path') if $REQ->{path} =~ /[\r\n\t]/; # There are plenty other questionable characters, but newlines and tabs are definitely out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -175,9 +175,7 @@ this on large fields.
|
||||||
|
|
||||||
=item value
|
=item value
|
||||||
|
|
||||||
Returns a copy of the field value as a Unicode string. Uses C<utf8_decode()>
|
Returns a copy of the field value as a Unicode string.
|
||||||
from L<FU::Util>, so also throws an error if the value contains control
|
|
||||||
characters.
|
|
||||||
|
|
||||||
=item substr($off, $len)
|
=item substr($off, $len)
|
||||||
|
|
||||||
|
|
|
||||||
35
FU/Util.pm
35
FU/Util.pm
|
|
@ -11,20 +11,26 @@ use experimental 'builtin';
|
||||||
our @EXPORT_OK = qw/
|
our @EXPORT_OK = qw/
|
||||||
to_bool
|
to_bool
|
||||||
json_format json_parse
|
json_format json_parse
|
||||||
utf8_decode uri_escape uri_unescape
|
has_control check_control utf8_decode
|
||||||
|
uri_escape uri_unescape
|
||||||
query_decode query_encode
|
query_decode query_encode
|
||||||
httpdate_format httpdate_parse
|
httpdate_format httpdate_parse
|
||||||
gzip_lib gzip_compress brotli_compress
|
gzip_lib gzip_compress brotli_compress
|
||||||
fdpass_send fdpass_recv
|
fdpass_send fdpass_recv
|
||||||
/;
|
/;
|
||||||
|
|
||||||
|
|
||||||
|
# Internal utility function
|
||||||
|
sub has_control :prototype($) ($s) { defined $s && $s =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/ }
|
||||||
|
sub check_control :prototype($) ($s) { confess 'Invalid control character' if has_control $s; }
|
||||||
|
|
||||||
|
# Deprecated, call Encode::decode() directly.
|
||||||
sub utf8_decode :prototype($) {
|
sub utf8_decode :prototype($) {
|
||||||
return if !defined $_[0];
|
return if !defined $_[0];
|
||||||
eval {
|
eval {
|
||||||
$_[0] = Encode::decode('UTF-8', $_[0], Encode::FB_CROAK);
|
$_[0] = Encode::decode('UTF-8', $_[0], Encode::FB_CROAK);
|
||||||
1
|
1
|
||||||
} || confess($@ =~ s/ at .+\n$//r);
|
} || confess($@ =~ s/ at .+\n$//r);
|
||||||
confess 'Invalid control character' if $_[0] =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/;
|
|
||||||
$_[0]
|
$_[0]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -175,13 +181,6 @@ Supported C<%options>:
|
||||||
|
|
||||||
=over
|
=over
|
||||||
|
|
||||||
=item allow_control
|
|
||||||
|
|
||||||
Boolean, set to true to allow (encoded) ASCII control characters in JSON
|
|
||||||
strings, such as C<\u0000>, C<\b>, C<\u007f>, etc. These characters are
|
|
||||||
permitted per RFC-8259, but disallowed by this parser by default. See
|
|
||||||
C<utf8_decode()> below.
|
|
||||||
|
|
||||||
=item utf8
|
=item utf8
|
||||||
|
|
||||||
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
|
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
|
||||||
|
|
@ -296,18 +295,6 @@ inputs, at the cost of flexibility.
|
||||||
|
|
||||||
=over
|
=over
|
||||||
|
|
||||||
=item utf8_decode($bytes)
|
|
||||||
|
|
||||||
Convert a (perl-UTF-8 encoded) byte string into a sanitized perl Unicode
|
|
||||||
string. The conversion is performed in-place, so the C<$bytes> argument is
|
|
||||||
turned into a Unicode string. Returns the same string for convenience.
|
|
||||||
|
|
||||||
This function throws an error if the input is not valid UTF-8 or if it contains
|
|
||||||
ASCII control characters - that is, any character between C<0x00> and C<0x1f>
|
|
||||||
except for tab, newline and carriage return.
|
|
||||||
|
|
||||||
(This is a tiny wrapper around C<utf8::decode()> with some extra checks)
|
|
||||||
|
|
||||||
=item uri_escape($string)
|
=item uri_escape($string)
|
||||||
|
|
||||||
Takes an Unicode string and returns a percent-encoded ASCII string, suitable
|
Takes an Unicode string and returns a percent-encoded ASCII string, suitable
|
||||||
|
|
@ -316,8 +303,7 @@ for use in a query parameter.
|
||||||
=item uri_unescape($string)
|
=item uri_unescape($string)
|
||||||
|
|
||||||
Takes an Unicode string potentially containing percent-encoding and returns a
|
Takes an Unicode string potentially containing percent-encoding and returns a
|
||||||
decoded Unicode string. Also checks for ASCII control characters as per
|
decoded Unicode string.
|
||||||
C<utf8_decode()>.
|
|
||||||
|
|
||||||
=item query_decode($string)
|
=item query_decode($string)
|
||||||
|
|
||||||
|
|
@ -334,8 +320,7 @@ have a value are decoded as C<builtin::true>. Example:
|
||||||
# }
|
# }
|
||||||
|
|
||||||
The input C<$string> is assumed to be a perl Unicode string. An error is thrown
|
The input C<$string> is assumed to be a perl Unicode string. An error is thrown
|
||||||
if the resulting data decodes into invalid UTF-8 or contains control
|
if the resulting data decodes into invalid UTF-8.
|
||||||
characters, as per C<utf8_decode>.
|
|
||||||
|
|
||||||
=item query_encode($hashref)
|
=item query_encode($hashref)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ use v5.36;
|
||||||
use experimental 'builtin', 'for_list';
|
use experimental 'builtin', 'for_list';
|
||||||
use builtin qw/true false blessed trim/;
|
use builtin qw/true false blessed trim/;
|
||||||
use Carp 'confess';
|
use Carp 'confess';
|
||||||
use FU::Util 'to_bool';
|
use FU::Util 'to_bool', 'has_control';
|
||||||
|
|
||||||
|
|
||||||
# Unavailable as custom validation names
|
# Unavailable as custom validation names
|
||||||
|
|
@ -12,7 +12,7 @@ my %builtin = map +($_,1), qw/
|
||||||
type
|
type
|
||||||
default
|
default
|
||||||
onerror
|
onerror
|
||||||
trim
|
trim allow_control
|
||||||
elems sort unique
|
elems sort unique
|
||||||
accept_scalar accept_array
|
accept_scalar accept_array
|
||||||
keys values unknown missing
|
keys values unknown missing
|
||||||
|
|
@ -296,8 +296,13 @@ sub _validate_input {
|
||||||
$_[1] = $_[1]->@* == 0 ? undef : $c->{accept_array} eq 'first' ? $_[1][0] : $_[1][ $#{$_[1]} ]
|
$_[1] = $_[1]->@* == 0 ? undef : $c->{accept_array} eq 'first' ? $_[1][0] : $_[1][ $#{$_[1]} ]
|
||||||
if $c->{accept_array} && ref $_[1] eq 'ARRAY';
|
if $c->{accept_array} && ref $_[1] eq 'ARRAY';
|
||||||
|
|
||||||
# trim (needs to be done before the 'default' test)
|
# early scalar checks
|
||||||
$_[1] = trim $_[1] =~ s/\r//rg if defined $_[1] && !ref $_[1] && $type eq 'scalar' && (!exists $c->{trim} || $c->{trim});
|
if (defined $_[1] && !ref $_[1] && $type eq 'scalar') {
|
||||||
|
# trim needs to be done before the 'default' test
|
||||||
|
$_[1] = trim $_[1] =~ s/\r//rg if !exists $c->{trim} || $c->{trim};
|
||||||
|
|
||||||
|
return { validation => 'allow_control' } if !$c->{allow_control} && has_control $_[1];
|
||||||
|
}
|
||||||
|
|
||||||
# default
|
# default
|
||||||
if (!defined $_[1] || (!ref $_[1] && $_[1] eq '')) {
|
if (!defined $_[1] || (!ref $_[1] && $_[1] eq '')) {
|
||||||
|
|
@ -403,6 +408,7 @@ sub _inval($t,$v) { sprintf 'invalid %s: %s', $t, _fmtval $v }
|
||||||
# TODO: document.
|
# TODO: document.
|
||||||
our %error_format = (
|
our %error_format = (
|
||||||
required => sub { 'required value missing' },
|
required => sub { 'required value missing' },
|
||||||
|
allow_control => sub { 'invalid control character' },
|
||||||
type => sub($e) { "invalid type, expected '$e->{expected}' but got '$e->{got}'" },
|
type => sub($e) { "invalid type, expected '$e->{expected}' but got '$e->{got}'" },
|
||||||
unknown => sub($e) { sprintf 'unknown key%s: %s', $e->{keys}->@* == 1 ? '' : 's', join ', ', map _fmtkey($_), $e->{keys}->@* },
|
unknown => sub($e) { sprintf 'unknown key%s: %s', $e->{keys}->@* == 1 ? '' : 's', join ', ', map _fmtkey($_), $e->{keys}->@* },
|
||||||
minlength => sub($e) { sprintf "input too short, expected minimum of %d but got %d", $e->{expected}, $e->{got} },
|
minlength => sub($e) { sprintf "input too short, expected minimum of %d but got %d", $e->{expected}, $e->{got} },
|
||||||
|
|
@ -590,6 +596,9 @@ Upon failure, the error object will look something like:
|
||||||
got => 'scalar'
|
got => 'scalar'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Beware: setting the type to I<any> causes the I<trim> and I<allow_control>
|
||||||
|
validations to be skipped.
|
||||||
|
|
||||||
=item default => $val
|
=item default => $val
|
||||||
|
|
||||||
If not set, or set to C<\'required'> (note: scalarref), then a value is required
|
If not set, or set to C<\'required'> (note: scalarref), then a value is required
|
||||||
|
|
@ -623,6 +632,12 @@ By default, any whitespace around scalar-type input is removed before testing
|
||||||
any other validations. Setting I<trim> to a false value will disable this
|
any other validations. Setting I<trim> to a false value will disable this
|
||||||
behavior.
|
behavior.
|
||||||
|
|
||||||
|
=item allow_control => 0/1
|
||||||
|
|
||||||
|
By default, ASCII control characters in the input are not permitted for scalar
|
||||||
|
values and trigger a validation error. Set this to a positive value to disable
|
||||||
|
the check.
|
||||||
|
|
||||||
=item keys => $hashref
|
=item keys => $hashref
|
||||||
|
|
||||||
Implies C<< type => 'hash' >>, this option specifies which keys are permitted,
|
Implies C<< type => 'hash' >>, this option specifies which keys are permitted,
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@ typedef struct {
|
||||||
const unsigned char *buf;
|
const unsigned char *buf;
|
||||||
const unsigned char *end;
|
const unsigned char *end;
|
||||||
UV depth;
|
UV depth;
|
||||||
int allow_control;
|
|
||||||
} fujson_parse_ctx;
|
} fujson_parse_ctx;
|
||||||
|
|
||||||
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
|
static SV *fujson_parse(pTHX_ fujson_parse_ctx *);
|
||||||
|
|
@ -24,10 +23,10 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
|
||||||
case '"': *(r->cur++) = '\"'; break;
|
case '"': *(r->cur++) = '\"'; break;
|
||||||
case '\\':*(r->cur++) = '\\'; break;
|
case '\\':*(r->cur++) = '\\'; break;
|
||||||
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
|
case '/': *(r->cur++) = '/'; break; /* We don't escape this one */
|
||||||
case 'b': if (!ctx->allow_control) return 1; *(r->cur++) = 0x08; break;
|
case 'b': *(r->cur++) = 0x08; break;
|
||||||
case 't': *(r->cur++) = 0x09; break;
|
case 't': *(r->cur++) = 0x09; break;
|
||||||
case 'n': *(r->cur++) = 0x0a; break;
|
case 'n': *(r->cur++) = 0x0a; break;
|
||||||
case 'f': if (!ctx->allow_control) return 1; *(r->cur++) = 0x0c; break;
|
case 'f': *(r->cur++) = 0x0c; break;
|
||||||
case 'r': *(r->cur++) = 0x0d; break;
|
case 'r': *(r->cur++) = 0x0d; break;
|
||||||
case 'u':
|
case 'u':
|
||||||
/* (awful code adapted from ncdu) */
|
/* (awful code adapted from ncdu) */
|
||||||
|
|
@ -44,9 +43,6 @@ static inline int fujson_parse_string_escape(pTHX_ fujson_parse_ctx *ctx, fustr
|
||||||
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
|
n = 0x10000 + (((n & 0x03ff) << 10) | (s & 0x03ff));
|
||||||
ctx->buf += 6;
|
ctx->buf += 6;
|
||||||
}
|
}
|
||||||
if (!ctx->allow_control &&
|
|
||||||
(n <= 8 || n == 0x0b || n == 0x0c || (n >= 0x0e && n <= 0x1f) || n == 0x7f))
|
|
||||||
return 1;
|
|
||||||
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
|
r->cur = (char *)uvchr_to_utf8((U8 *)r->cur, n);
|
||||||
if (n >= 0x80) r->setutf8 = 1;
|
if (n >= 0x80) r->setutf8 = 1;
|
||||||
break;
|
break;
|
||||||
|
|
@ -269,7 +265,6 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
|
||||||
fujson_parse_ctx ctx;
|
fujson_parse_ctx ctx;
|
||||||
|
|
||||||
ctx.depth = 0;
|
ctx.depth = 0;
|
||||||
ctx.allow_control = 0;
|
|
||||||
while (i < argc) {
|
while (i < argc) {
|
||||||
arg = SvPV_nolen(ST(i));
|
arg = SvPV_nolen(ST(i));
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -280,7 +275,7 @@ static SV *fujson_parse_xs(pTHX_ I32 ax, I32 argc, SV *val) {
|
||||||
if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
|
if (strcmp(arg, "utf8") == 0) decutf8 = SvTRUEx(r);
|
||||||
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
|
else if (strcmp(arg, "max_size") == 0) maxlen = SvUV(r);
|
||||||
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
|
else if (strcmp(arg, "max_depth") == 0) ctx.depth = SvUV(r);
|
||||||
else if (strcmp(arg, "allow_control") == 0) ctx.allow_control = SvTRUE(r);
|
else if (strcmp(arg, "allow_control") == 0) {}
|
||||||
else if (strcmp(arg, "offset") == 0) offset = r;
|
else if (strcmp(arg, "offset") == 0) offset = r;
|
||||||
else croak("Unknown flag: '%s'", arg);
|
else croak("Unknown flag: '%s'", arg);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,11 +24,6 @@ my @error = (
|
||||||
'"\udc12\u1234"',
|
'"\udc12\u1234"',
|
||||||
"\"\x{110000}\"",
|
"\"\x{110000}\"",
|
||||||
|
|
||||||
'"\u0000"',
|
|
||||||
'"\b"',
|
|
||||||
'"\f"',
|
|
||||||
'"\u007f"',
|
|
||||||
|
|
||||||
'1.',
|
'1.',
|
||||||
'01',
|
'01',
|
||||||
'1e',
|
'1e',
|
||||||
|
|
@ -87,6 +82,7 @@ sub str($in, $exp) {
|
||||||
}
|
}
|
||||||
str '""', '';
|
str '""', '';
|
||||||
str '"hello, world"', 'hello, world';
|
str '"hello, world"', 'hello, world';
|
||||||
|
str '"\u0000\b"', "\x00\b";
|
||||||
str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}";
|
str '"\u0099\u0234\u1234"', "\x{99}\x{234}\x{1234}";
|
||||||
str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}";
|
str "\"\x{99}\x{234}\x{1234}\x{12345}\"", "\x{99}\x{234}\x{1234}\x{12345}";
|
||||||
str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}";
|
str '"\/\"\\\\\t\n\r"', "/\"\\\x{09}\x{0a}\x{0d}";
|
||||||
|
|
|
||||||
|
|
@ -7,9 +7,6 @@ is_deeply
|
||||||
query_decode('a&a&%c3%be=%26%3d%c3%be&a=3'),
|
query_decode('a&a&%c3%be=%26%3d%c3%be&a=3'),
|
||||||
{ a => [ builtin::true, builtin::true, 3 ], "\xfe" => "&=\xfe" };
|
{ a => [ builtin::true, builtin::true, 3 ], "\xfe" => "&=\xfe" };
|
||||||
|
|
||||||
ok !eval { query_decode('%10'); 1 };
|
|
||||||
like $@, qr/Invalid control character/;
|
|
||||||
|
|
||||||
ok !eval { query_decode('a=%fe%83%bf%bf%bf%bf%bf%0a'); 1 };
|
ok !eval { query_decode('a=%fe%83%bf%bf%bf%bf%bf%0a'); 1 };
|
||||||
like $@, qr/does not map to Unicode/;
|
like $@, qr/does not map to Unicode/;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -79,6 +79,10 @@ t { trim => 0 }, " Va\rl id \n ", " Va\rl id \n ";
|
||||||
f {}, ' ', { validation => 'required' }, 'required value missing';
|
f {}, ' ', { validation => 'required' }, 'required value missing';
|
||||||
t { trim => 0 }, ' ', ' ';
|
t { trim => 0 }, ' ', ' ';
|
||||||
|
|
||||||
|
# allow_control
|
||||||
|
f {}, "\b", { validation => 'allow_control' }, 'invalid control character';
|
||||||
|
t { allow_control => 1 }, "\b", "\b";
|
||||||
|
|
||||||
# accept_array
|
# accept_array
|
||||||
t { default => undef, accept_array => 'first' }, [], undef;
|
t { default => undef, accept_array => 'first' }, [], undef;
|
||||||
t { default => undef, accept_array => 'first' }, [' x '], 'x';
|
t { default => undef, accept_array => 'first' }, [' x '], 'x';
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue