Move control character checking to FU::Validate, deprecate FU::Util::utf8_decode()

URI, JSON and formdata decoding no longer checks for control characters, but FU::Validate now rejects control characters by default. This decouples semantic validation from format parsing and gives better control over when control characters are allowed.
2025-08-22 09:51:56 +02:00 · 2025-08-22 09:51:56 +02:00 · a8ac435f85
commit a8ac435f85
parent 2e9a40da69
8 changed files with 39 additions and 49 deletions
--- a/FU/MultipartFormData.pm
+++ b/FU/MultipartFormData.pm
@ -175,9 +175,7 @@ this on large fields.

 =item value

-Returns a copy of the field value as a Unicode string. Uses C<utf8_decode()>
-from L<FU::Util>, so also throws an error if the value contains control
-characters.
+Returns a copy of the field value as a Unicode string.

 =item substr($off, $len)

--- a/FU/Util.pm
+++ b/FU/Util.pm
@ -11,20 +11,26 @@ use experimental 'builtin';
 our @EXPORT_OK = qw/
    to_bool
    json_format json_parse
-    utf8_decode uri_escape uri_unescape
+    has_control check_control utf8_decode
+    uri_escape uri_unescape
    query_decode query_encode
    httpdate_format httpdate_parse
    gzip_lib gzip_compress brotli_compress
    fdpass_send fdpass_recv
 /;

+
+# Internal utility functions, exported only for use by other FU modules (e.g. FU::Validate)
+sub has_control :prototype($) ($s) { defined $s && $s =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/ }
+sub check_control :prototype($) ($s) { confess 'Invalid control character' if has_control $s; }
+
+# Deprecated: no longer rejects control characters; call Encode::decode() directly.
 sub utf8_decode :prototype($) {
    return if !defined $_[0];
    eval {
        $_[0] = Encode::decode('UTF-8', $_[0], Encode::FB_CROAK);
        1
    } || confess($@ =~ s/ at .+\n$//r);
-    confess 'Invalid control character' if $_[0] =~ /[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]/;
    $_[0]
 }

@ -175,13 +181,6 @@ Supported C<%options>:

 =over

-=item allow_control
-
-Boolean, set to true to allow (encoded) ASCII control characters in JSON
-strings, such as C<\u0000>, C<\b>, C<\u007f>, etc.  These characters are
-permitted per RFC-8259, but disallowed by this parser by default. See
-C<utf8_decode()> below.
-
 =item utf8

 Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
@ -296,18 +295,6 @@ inputs, at the cost of flexibility.

 =over

-=item utf8_decode($bytes)
-
-Convert a (perl-UTF-8 encoded) byte string into a sanitized perl Unicode
-string. The conversion is performed in-place, so the C<$bytes> argument is
-turned into a Unicode string. Returns the same string for convenience.
-
-This function throws an error if the input is not valid UTF-8 or if it contains
-ASCII control characters - that is, any character between C<0x00> and C<0x1f>
-except for tab, newline and carriage return.
-
-(This is a tiny wrapper around C<utf8::decode()> with some extra checks)
-
 =item uri_escape($string)

 Takes an Unicode string and returns a percent-encoded ASCII string, suitable
@ -316,8 +303,7 @@ for use in a query parameter.
 =item uri_unescape($string)

 Takes an Unicode string potentially containing percent-encoding and returns a
-decoded Unicode string. Also checks for ASCII control characters as per
-C<utf8_decode()>.
+decoded Unicode string.

 =item query_decode($string)

@ -334,8 +320,7 @@ have a value are decoded as C<builtin::true>. Example:
    # }

 The input C<$string> is assumed to be a perl Unicode string. An error is thrown
-if the resulting data decodes into invalid UTF-8 or contains control
-characters, as per C<utf8_decode>.
+if the decoded data is not valid UTF-8.

 =item query_encode($hashref)

--- a/FU/Validate.pm
+++ b/FU/Validate.pm
@ -4,7 +4,7 @@ use v5.36;
 use experimental 'builtin', 'for_list';
 use builtin qw/true false blessed trim/;
 use Carp 'confess';
-use FU::Util 'to_bool';
+use FU::Util 'to_bool', 'has_control';


 # Unavailable as custom validation names
@ -12,7 +12,7 @@ my %builtin = map +($_,1), qw/
    type
    default
    onerror
-    trim
+    trim allow_control
    elems sort unique
    accept_scalar accept_array
    keys values unknown missing
@ -296,8 +296,13 @@ sub _validate_input {
    $_[1] = $_[1]->@* == 0 ? undef : $c->{accept_array} eq 'first' ? $_[1][0] : $_[1][ $#{$_[1]} ]
        if $c->{accept_array} && ref $_[1] eq 'ARRAY';

-    # trim (needs to be done before the 'default' test)
-    $_[1] = trim $_[1] =~ s/\r//rg if defined $_[1] && !ref $_[1] && $type eq 'scalar' && (!exists $c->{trim} || $c->{trim});
+    # early scalar checks
+    if (defined $_[1] && !ref $_[1] && $type eq 'scalar') {
+        # trim needs to be done before the 'default' test
+        $_[1] = trim $_[1] =~ s/\r//rg if !exists $c->{trim} || $c->{trim};
+
+        return { validation => 'allow_control' } if !$c->{allow_control} && has_control $_[1];
+    }

    # default
    if (!defined $_[1] || (!ref $_[1] && $_[1] eq '')) {
@ -403,6 +408,7 @@ sub _inval($t,$v) { sprintf 'invalid %s: %s', $t, _fmtval $v }
 # TODO: document.
 our %error_format = (
    required  => sub { 'required value missing' },
+    allow_control => sub { 'invalid control character' },
    type      => sub($e) { "invalid type, expected '$e->{expected}' but got '$e->{got}'" },
    unknown   => sub($e) { sprintf 'unknown key%s: %s', $e->{keys}->@* == 1 ? '' : 's', join ', ', map _fmtkey($_), $e->{keys}->@* },
    minlength => sub($e) { sprintf "input too short, expected minimum of %d but got %d", $e->{expected}, $e->{got} },
@ -590,6 +596,9 @@ Upon failure, the error object will look something like:
    got        => 'scalar'
  }

+Beware: setting the type to I<any> causes the I<trim> and I<allow_control>
+validations to be skipped.
+
 =item default => $val

 If not set, or set to C<\'required'> (note: scalarref), then a value is required
@ -623,6 +632,12 @@ By default, any whitespace around scalar-type input is removed before testing
 any other validations. Setting I<trim> to a false value will disable this
 behavior.

+=item allow_control => 0/1
+
+By default, ASCII control characters (other than tab, newline and carriage
+return) in scalar input trigger a validation error. Set this to a true value
+to disable the check.
+
 =item keys => $hashref

 Implies C<< type => 'hash' >>, this option specifies which keys are permitted,