From 17176738a0d942f5925e76afee4414349d9a6fbd Mon Sep 17 00:00:00 2001
From: Yorhel <git@yorhel.nl>
Date: Sat, 8 Mar 2025 14:02:51 +0100
Subject: [PATCH] FU: Support multipart file uploads + some doc fixes

API is not super convenient and implementation is lousy, but uploading
files is not a super common operation so that should be fine.

At least it supports large files with only a single in-memory copy.
---
 FU.pm                   |  25 +++--
 FU/MultipartFormData.pm | 209 ++++++++++++++++++++++++++++++++++++++++
 FU/Pg.pm                |   4 +-
 FU/SQL.pm               |   6 +-
 FU/XMLWriter.pm         |   2 +-
 t/multipart.t           |  47 +++++++++
 6 files changed, 280 insertions(+), 13 deletions(-)
 create mode 100644 FU/MultipartFormData.pm
 create mode 100644 t/multipart.t
diff --git a/FU.pm b/FU.pm
index 57123a5..4807e0b 100644
--- a/FU.pm
+++ b/FU.pm
@@ -253,8 +253,9 @@ sub _read_req_http($sock, $req) {
 
     $req->{body} = '';
     while ($len > 0) {
-        my $r = $sock->read($req->{body}, $len, -1);
-        fu->error(400, 'Client disconnect before request was read') if !$r
+        my $r = $sock->read($req->{body}, $len, length $req->{body});
+        fu->error(400, 'Client disconnect before request was read') if !$r;
+        $len -= $r;
     }
 }
 
@@ -654,7 +655,6 @@ sub query {
 sub formdata {
     shift;
     $FU::REQ->{formdata} ||= eval {
-        # TODO: Support multipart encoding
         confess "Invalid content type for form data"
             if (fu->header('content-type')||'') ne 'application/x-www-form-urlencoded';
         FU::Util::query_decode($FU::REQ->{data});
@@ -662,6 +662,13 @@ sub formdata {
     _getfield $FU::REQ->{formdata}, @_;
 }
 
+sub multipart {
+    require FU::MultipartFormData;
+    $FU::REQ->{multipart} ||= eval {
+        FU::MultipartFormData->parse(fu->header('content-type')||'', $FU::REQ->{body})
+    } || fu->error(400, $@);
+}
+
 
 
 
@@ -1239,12 +1246,18 @@ Parse, validate and return multiple query parameters.
 
 =item fu->formdata($schema)
 
-Like C<< fu->query() >> but returns data from the POST request body.
+Like C<< fu->query() >> but returns data from the POST request body. This
+method only supports form data encoded as C<application/x-www-form-urlencoded>,
+which is the default for HTML C<< <form> >>s. To handle multipart form data,
+use C<< fu->multipart >> instead.
+
+=item fu->multipart
+
+Parse the request body as C<multipart/form-data> and return an array of fields.
+Refer to L<FU::MultipartFormData> for more information.
 
 =back
 
-I<TODO:> Support C<multipart/form-data> and file uploads.
-
 I<TODO:> Support JSON bodies.
 
 I<TODO:> Cookie parsing.
diff --git a/FU/MultipartFormData.pm b/FU/MultipartFormData.pm
new file mode 100644
index 0000000..da415f8
--- /dev/null
+++ b/FU/MultipartFormData.pm
@@ -0,0 +1,209 @@
+package FU::MultipartFormData;
+use v5.36;
+use Carp 'confess';
+use FU::Util 'utf8_decode';
+
+sub _arg($d) { $d =~ s/^\s+//r =~ s/\s+$//r =~ s/^"(.+)"$/$1/r }
+
+sub parse($pkg, $header, $data) {
+    confess "Invalid multipart header '$header'"
+        if $header !~ m{^multipart/form-data\s*;\s*boundary\s*=(.+)$};
+    my $boundary = _arg $1;
+    confess "Invalid multipart boundary '$boundary'" if $boundary !~ /^[\x21-\x7e]+$/;
+    utf8::encode($boundary);
+
+    my @a;
+    while ($data =~ m{--\Q$boundary\E(?:--\r\n|\r\n((?:.+\r\n)+)\r\n)}xg) {
+        my $hdrs = $1;
+        $a[$#a]{length} = $-[0] - 2 - $a[$#a]{start} if @a;
+        if (!$hdrs) {
+            confess "Trailing garbage" if pos $data != length $data;
+            last;
+        }
+
+        my $d = bless {
+            data => $data,
+            start => pos $data,
+        }, $pkg;
+
+        confess "Missing content-disposition header" if $hdrs !~ /content-disposition:\s*form-data;(.+)/i;
+        my $v = $1;
+        confess "Missing 'name' parameter" if $v !~ /[;\s]name=([^[;\s]+)/;
+        $d->{name} = utf8_decode _arg $1;
+        $d->{filename} = utf8_decode _arg $1 if $v =~ /[;\s]filename=([^;\s]+)/;
+
+        if ($hdrs =~ /content-type:\s*([^;\s]+)(?:\s*;\s*charset=([^;\s]+))?/i) {
+            $d->{mime} = utf8_decode _arg $1;
+            $d->{charset} = utf8_decode _arg $2 if $2;
+        }
+        push @a, $d;
+    }
+    confess "Missing end-of-data marker" if @a && !defined $a[$#a]{length};
+    \@a
+}
+
+sub name     { $_[0]{name} }
+sub filename { $_[0]{filename} }
+sub mime     { $_[0]{mime} }
+sub charset  { $_[0]{charset} }
+sub length   { $_[0]{length} }
+
+sub substr($o,$off,$len=undef) {
+    $off += $o->{length} if $off < 0;
+    $off = 0 if $off < 0;
+    $off = $o->{length} if $off > $o->{length};
+
+    $len //= $o->{length} - $off;
+    $len += $o->{length} - 1 if $len < 0;
+    $len = 0 if $len < 0;
+    $len = $o->{length} - $off if $len > $o->{length} - $off;
+
+    substr $o->{data}, $o->{start} + $off, $len;
+}
+
+sub data     { $_[0]->substr(0) }
+sub value    { utf8_decode $_[0]->data }
+
+sub syswrite($o, $fh) {
+    my $off = $o->{start};
+    my $end = $o->{start} + $o->{length};
+    while ($off < $end) {
+        my $r = syswrite $fh, $o->{data}, $end-$off, $off;
+        return if !defined $r;
+        $off += $r;
+    }
+    $o->{length};
+}
+
+sub save($o, $fn) {
+    open my $F, '>', $fn or confess "Error opening '$fn': $!";
+    $o->syswrite($F) or confess "Error writing to '$fn': $!";
+}
+
+sub describe($o) {
+    my $head = eval { utf8_decode $o->substr(0, 100) };
+    if (defined $head && $head =~ /\n/) {
+        ($head) = split /\n/, $head, 2;
+        $head .= '...';
+    } elsif (defined $head && $o->{length} > 100) {
+        $head .= '...';
+    }
+    $o->{name}.': '.join ' ',
+        $o->{filename} ? "filename=$o->{filename}" : (),
+        $o->{mime} ? "mime=$o->{mime}" : (),
+        $o->{charset} ? "charset=$o->{charset}" : (),
+        "length=$o->{length}",
+        defined $head ? "value=$head" : ();
+}
+
+1;
+__END__
+
+=head1 NAME
+
+FU::MultipartFormData - Parse multipart/form-data
+
+=head1 SYNOPSIS
+
+  my $fields = FU::MultipartFormData->parse($content_type_header, $request_body);
+
+  for my $f (@$fields) {
+      print "%s   %d\n", $f->name, $f->length;
+
+      $f->save('file.png') if $f->name eq 'image';
+  }
+
+=head1 DESCRIPTION
+
+This is a tiny module to parse an HTTP request body encoded as
+C<multipart/form-data>, which is typically used to handle file uploads.
+
+The entire request body is assumed to be in memory as a Perl string, but this
+module makes an attempt to avoid any further copies of data values.
+
+=head2 Parsing
+
+=over FU::MultipartFormData->parse($header, $body)
+
+Returns an array of field objects from the given C<$header>, which must be a
+valid value for the C<Content-Type> request header, and the given C<$body>,
+which must hold the request body as a byte string. An error is thrown if the
+header is not valid or parsing failed.
+
+This module is pretty lousy and does not fully comform to any HTTP standards,
+but it does happen to be able to parse POST data from any browser that I've
+tried.
+
+=back
+
+=head2 Field Object
+
+Each field is parsed into a field object that supports the following methods:
+
+=over
+
+=item name
+
+Returns the field name as a Perl Unicode string.
+
+=item filename
+
+Returns the filename as a Perl Unicode string, or C<undef> if no filename was
+provided.
+
+=item mime
+
+Returns the mime type extracted from the field's C<Content-Type> header, or
+C<undef> if none was present.
+
+=item charset
+
+Returns the charset extracted from the field's C<Content-Type> header, or
+C<undef> if none was present.
+
+=item length
+
+Returns the byte length of the field value.
+
+=item data
+
+Returns a copy of the field value as a byte string. You'll want to avoid using
+this on large fields.
+
+=item value
+
+Returns a copy of the field value as a Unicode string. Uses C<utf8_decode()>
+from L<FU::Util>, so also throws an error if the value contains control
+characters.
+
+=item substr($off, $len)
+
+Equivalent to calling C<substr()> on the string returned by C<data>, but avoids
+a copy of the entire field value.
+
+=item syswrite($fh)
+
+Write the field value to C<$fh> using Perl's C<syswrite()>, returns C<undef> on
+error or the number of bytes written on success.
+
+Can be used to write uploaded file data to a file or send it over a socket or
+pipe, without making a full in-memory copy of the data.
+
+=item save($fn)
+
+Save the field value to the file C<$fn>, throws an error on failure.
+
+=item describe
+
+Returns a human-readable string to describe this field. Mainly for debugging
+purposes, the exact format is subject to change.
+
+=back
+
+=head1 COPYRIGHT
+
+MIT.
+
+=head1 AUTHOR
+
+Yorhel <projects@yorhel.nl>
diff --git a/FU/Pg.pm b/FU/Pg.pm
index 7b9de5d..176a8ac 100644
--- a/FU/Pg.pm
+++ b/FU/Pg.pm
@@ -400,9 +400,7 @@ results into Perl values.
 
 Observed query preparation time, in seconds, including network round-trip.
 Returns 0 if a cached prepared statement was used or C<undef> if the query was
-executed without a separate preparation phase (currently only happens with C<<
-$conn->exec() >>, but support for direct query execution may be added for other
-queries in the future as well).
+executed without a separate preparation phase.
 
 =item $st->get_cache
 
diff --git a/FU/SQL.pm b/FU/SQL.pm
index 73f5613..c86a3fb 100644
--- a/FU/SQL.pm
+++ b/FU/SQL.pm
@@ -356,12 +356,12 @@ values. This function results in different SQL depending on the C<in_style>
 option given to C<compile()>. The default C<'dbi'> style passes each value as a
 bind parameter:
 
-  SQL 'WHERE id', IN([1, 2, 3, 4]);
+  SQL 'WHERE id', IN [1, 2, 3, 4];
   # 'WHERE id IN(?, ?, ?, ?)', parameters: 1, 2, 3, 4
 
 The C<'pg'> style passes the entire array as a single bind parameter instead:
 
-  SQL 'WHERE id', IN([1, 2, 3, 4]);
+  SQL 'WHERE id', IN [1, 2, 3, 4];
   # 'WHERE id = ANY(?)', parameter: [1, 2, 3, 4]
 
 The C<'pg'> style allows for more efficient re-use of cached prepared
@@ -372,7 +372,7 @@ with L<DBD::Pg> or L<Pg::PQ>.
 
 Can be used in the C<$hashref> versions of C<AND>, C<OR> and C<WHERE> as well:
 
-  WHERE { id => IN([1, 2]) }
+  WHERE { id => IN [1, 2] }
   # 'WHERE id IN(?, ?)'
 
 =back
diff --git a/FU/XMLWriter.pm b/FU/XMLWriter.pm
index 0911ced..08fc54d 100644
--- a/FU/XMLWriter.pm
+++ b/FU/XMLWriter.pm
@@ -146,7 +146,7 @@ These functions all return a byte string with (UTF-8) encoded XML.
 
 =item fragment($block)
 
-Executes C<$block> and captures the output of all I</"Output functions">
+Executes C<$block> and captures the output of all L</"Output functions">
 called within the same scope into a string. This function can be safely nested:
 
   my $string = fragment {
diff --git a/t/multipart.t b/t/multipart.t
new file mode 100644
index 0000000..842b9cd
--- /dev/null
+++ b/t/multipart.t
@@ -0,0 +1,47 @@
+use v5.36;
+use Test::More;
+use FU::MultipartFormData;
+
+# Example based on https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/POST
+my $t = <<'_' =~ s/\n/\r\n/rg;
+--delimiter12345
+Content-Disposition: form-data; name="field1"
+content-type: hello; charset=x
+
+value1
+--delimiter12345
+Content-Type: text
+Content-Disposition: form-data; filename="example.txt"; name=field2
+
+value2
+--delimiter12345--
+_
+
+
+my $l = FU::MultipartFormData->parse('multipart/form-data;boundary="delimiter12345"', $t);
+is scalar @$l, 2;
+
+my $v = $l->[0];
+is $v->name, 'field1';
+is $v->filename, undef;
+is $v->mime, 'hello';
+is $v->charset, 'x';
+is $v->length, 6;
+is $v->data, 'value1';
+
+is $v->substr(4), 'e1';
+is $v->substr(1, 2), 'al';
+is $v->substr(-2, 1), 'e';
+is $v->substr(-2, 5), 'e1';
+is $v->substr(-100, 2), 'va';
+is $v->substr(1, -3), 'al';
+
+$v = $l->[1];
+is $v->name, 'field2';
+is $v->filename, 'example.txt';
+is $v->mime, 'text';
+is $v->charset, undef;
+is $v->length, 6;
+is $v->data, 'value2';
+
+done_testing;