And remove UTF-8 check in JSON writer. It honestly feels kind of silly to do that validation there while I've never done similar validations in any other output routines - including this XML writer. FU::XMLWriter is a copy of TUWF::XMLXS with a bunch of improvements applied: now uses refcounts to determine the current output instance, auto-generates XS functions and has faster escaped string output - inspired by the JSON writer. TODO: - Integrate into FU - Do something with bool attribute values - Benchmarks - Should $content be optional for all tags? The reason they weren't in TUWF::XMLXS is because TUWF::XML supports opening tags without closing them, but that idea turned out to suck and isn't supported anymore. This is hopefully the last XS module for the FU framework. The only C code being written now should be bug fixes and extending FU::Pg with some planned features. Already ended up with more C than I had planned...
314 lines
10 KiB
Perl
314 lines
10 KiB
Perl
package FU::Util 0.1;
|
|
|
|
use v5.36;
|
|
use FU::XS;
|
|
use Carp 'confess';
|
|
use Exporter 'import';
|
|
use experimental 'builtin';
|
|
|
|
our @EXPORT_OK = qw/
|
|
json_format json_parse
|
|
utf8_decode uri_escape uri_unescape
|
|
query_decode query_encode
|
|
fdpass_send fdpass_recv
|
|
/;
|
|
|
|
sub utf8_decode :prototype($) {
|
|
return if !defined $_[0];
|
|
confess 'Invalid UTF-8' if !utf8::decode($_[0]);
|
|
confess 'Invalid control character' if $_[0] =~ /[\x00-\x08\x0b\x0c\x0e-\x1f]/;
|
|
$_[0]
|
|
}
|
|
|
|
sub uri_escape :prototype($) ($s) {
|
|
utf8::encode($s);
|
|
$s =~ s/([^A-Za-z0-9._~-])/sprintf '%%%02x', ord $1/eg;
|
|
$s;
|
|
}
|
|
|
|
sub uri_unescape :prototype($) ($s) {
|
|
return if !defined $s;
|
|
utf8::encode($s);
|
|
$s =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
|
|
utf8_decode $s;
|
|
}
|
|
|
|
sub query_decode :prototype($) ($s) {
|
|
my %o;
|
|
for (split /&/, $s//'') {
|
|
my($k,$v) = map uri_unescape($_), split /=/;
|
|
$v //= builtin::true;
|
|
if (ref $o{$k}) { push $o{$k}->@*, $v }
|
|
elsif (exists $o{$k}) { $o{$k} = [ $o{$k}, $v ] }
|
|
else { $o{$k} = $v }
|
|
}
|
|
\%o
|
|
}
|
|
|
|
sub query_encode :prototype($) ($o) {
|
|
return join '&', map {
|
|
my($k, $v) = ($_, $o->{$_});
|
|
$k = uri_escape $k;
|
|
map {
|
|
my $a = $_;
|
|
$a = $a->TO_QUERY() if builtin::blessed($a) && $a->can('TO_QUERY');
|
|
!defined $a || (builtin::is_bool($a) && !$a)
|
|
? ()
|
|
: builtin::is_bool($a) ? $k
|
|
: $k.'='.uri_escape($a)
|
|
} ref $v eq 'ARRAY' ? @$v : ($v);
|
|
} sort keys %$o;
|
|
}
|
|
|
|
1;
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
FU::Util - Miscellaneous utility functions that really should have been part of
|
|
a core Perl installation but aren't for some reason because the Perl community
|
|
doesn't believe in the concept of a "batteries included" standard library.
|
|
</rant>
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use FU::Util qw/json_format/;
|
|
|
|
my $data = json_format [1, 2, 3];
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
=head2 JSON parsing & formatting
|
|
|
|
This module comes with a custom C-based JSON parser and formatter. These
|
|
functions conform strictly to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
|
|
non-standard extensions are not supported and never will be. It also happens to
|
|
be pretty fast, refer to L<FU::Benchmarks> for some numbers.
|
|
|
|
JSON booleans are parsed into C<builtin::true> and C<builtin::false>. When
|
|
formatting, those builtin constants are the I<only> recognized boolean values -
|
|
alternative representations such as C<JSON::PP::true> and C<JSON::PP::false>
|
|
are not recognized and attempting to format such values will croak.
|
|
|
|
JSON numbers that are too large fit into a Perl integer are parsed into a
|
|
floating point value instead. This obviously loses precision, but is consistent
|
|
with C<JSON.parse()> in JavaScript land - except Perl does support the full
|
|
range of a 64bit integer. JSON numbers with a fraction or exponent are also
|
|
converted into floating point, which may lose precision as well.
|
|
L<Math::BigInt> and L<Math::BigFloat> are not currently supported. Attempting
|
|
to format a floating point C<NaN> or C<Inf> results in an error.
|
|
|
|
=over
|
|
|
|
=item json_parse($string, %options)
|
|
|
|
Parse a JSON string and return a Perl value. With the default options, this
|
|
function is roughly similar to:
|
|
|
|
JSON::PP->new->allow_nonref->core_bools-decode($string);
|
|
|
|
Croaks on invalid JSON, but the error messages are not super useful. This
|
|
function also throws an error on JSON objects with duplicate keys, which is
|
|
consistent with the default behavior of L<Cpanel::JSON::XS> but inconsistent
|
|
with other modules.
|
|
|
|
Supported C<%options>:
|
|
|
|
=over
|
|
|
|
=item utf8
|
|
|
|
Boolean, interpret the input C<$string> as a UTF-8 encoded byte string instead
|
|
of a Perl Unicode string.
|
|
|
|
=item max_depth
|
|
|
|
Maximum permitted nesting depth of arrays and objects. Defaults to 512.
|
|
|
|
=item max_size
|
|
|
|
Throw an error if the JSON data is larger than the given size in bytes.
|
|
Defaults to 1 GiB.
|
|
|
|
=item offset
|
|
|
|
Takes a reference to a scalar that indicates from which byte offset in
|
|
C<$string> to start parsing. On success, the offset is updated to point to the
|
|
next non-whitespace character or C<undef> if the string has been fully
|
|
consumed.
|
|
|
|
This option can be used to parse a stream of JSON values:
|
|
|
|
my $data = '{"obj":1}{"obj":2}';
|
|
my $offset = 0;
|
|
my $obj1 = json_parse($data, offset => \$offset);
|
|
# $obj1 = {obj=>1}; $offset = 9;
|
|
my $obj2 = json_parse($data, offset => \$offset);
|
|
# $obj2 = {obj=>2}; $offset = undef;
|
|
|
|
=back
|
|
|
|
|
|
=item json_format($scalar, %options)
|
|
|
|
Format a Perl value as JSON. With the default options, this function behaves
|
|
roughly similar to:
|
|
|
|
JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar);
|
|
|
|
Some modules escape the slash character in encoded strings to prevent a
|
|
potential XSS vulnerability when embedding JSON inside C<< <script> ..
|
|
</script> >> tags. This function does I<not> do that because it might not even
|
|
be sufficient. The following is probably an improvement:
|
|
|
|
json_format($data) =~ s{</}{<\\/}rg =~ s/<!--/<\\u0021--/rg;
|
|
|
|
This function generates invalid JSON if you pass it a string with invalid
|
|
Unicode characters; I don't see how you'd ever accidentally end up with such a
|
|
string, anyway.
|
|
|
|
The following C<%options> are supported:
|
|
|
|
=over
|
|
|
|
=item canonical
|
|
|
|
Boolean, write hash keys in deterministic (sorted) order. This option currently
|
|
has no effect on tied hashes.
|
|
|
|
=item pretty
|
|
|
|
Boolean, format JSON with newlines and indentation for easier reading. Beauty
|
|
is in the eye of the beholder, this option currently follows the convention
|
|
used by L<JSON::XS> and others: 3 space indent and one space around the C<:>
|
|
separating object keys and values. The exact format might change in later
|
|
versions.
|
|
|
|
=item utf8
|
|
|
|
Boolean, returns a UTF-8 encoded byte string instead of a Perl Unicode string.
|
|
|
|
=item max_size
|
|
|
|
Maximum permitted size, in bytes, of the generated JSON string. Defaults to 1 GiB.
|
|
|
|
=item max_depth
|
|
|
|
Maximum permitted nesting depth of Perl values. Defaults to 512.
|
|
|
|
=back
|
|
|
|
=back
|
|
|
|
(Why the hell yet another JSON codec when CPAN is already full of them!? Well,
|
|
L<JSON::XS> is pretty cool but isn't going to be updated to support Perl's new
|
|
builtin booleans. L<JSON::PP> is slow and while L<Cpanel::JSON::XS> is
|
|
perfectly adequate, its codebase is too large and messy for my taste - too many
|
|
unnecessary features and C<#ifdef>s to support ancient perls and esoteric
|
|
configurations. Still, if you need anything not provided by these functions,
|
|
L<JSON::PP> and L<Cpanel::JSON::XS> are perfectly fine alternatives.
|
|
L<JSON::SIMD> and L<Mojo::JSON> also look like good and maintained candidates.)
|
|
|
|
|
|
=head2 URI-Related Functions
|
|
|
|
While URIs are capable of encoding arbitrary binary data, the functions below
|
|
assume you're only dealing with text. This makes them more robust against weird
|
|
inputs, at the cost of flexibility.
|
|
|
|
=over
|
|
|
|
=item utf8_decode($bytes)
|
|
|
|
Convert a (perl-UTF-8 encoded) byte string into a sanitized perl Unicode
|
|
string. The conversion is performed in-place, so the C<$bytes> argument is
|
|
turned into a Unicode string. Returns the same string for convenience.
|
|
|
|
This function throws an error if the input is not valid UTF-8 or if it contains
|
|
ASCII control characters - that is, any character between C<0x00> and C<0x1f>
|
|
except for tab, newline and carriage return.
|
|
|
|
(This is a tiny wrapper around C<utf8::decode()> with some extra checks)
|
|
|
|
=item uri_escape($string)
|
|
|
|
Takes an Unicode string and returns a percent-encoded ASCII string, suitable
|
|
for use in a query parameter.
|
|
|
|
=item uri_unescape($string)
|
|
|
|
Takes an Unicode string potentially containing percent-encoding and returns a
|
|
decoded Unicode string. Also checks for ASCII control characters as per
|
|
C<utf8_decode()>.
|
|
|
|
=item query_decode($string)
|
|
|
|
Decode a query string or C<application/x-www-form-urlencoded> format (they're
|
|
the same thing). Returns a hashref with decoded key/value pairs. Values for
|
|
duplicated keys are collected into a single array value. Bare keys that do not
|
|
have a value are decoded as C<builtin::true>. Example:
|
|
|
|
my $hash = query_decode 'bare&a=1&a=2&something=else';
|
|
# $hash = {
|
|
# bare => builtin::true,
|
|
# a => [ 1, 2 ],
|
|
# something => 'else'
|
|
# }
|
|
|
|
The input C<$string> is assumed to be a perl Unicode string. An error is thrown
|
|
if the resulting data decodes into invalid UTF-8 or contains control
|
|
characters, as per C<utf8_decode>.
|
|
|
|
=item query_encode($hashref)
|
|
|
|
The opposite of C<query_decode>. Takes a hashref of similar structure and
|
|
returns an ASCII-encoded query string. Keys with C<undef> or C<builtin::false>
|
|
values are omitted in the output.
|
|
|
|
If a given value is a blessed object with a C<TO_QUERY()> method, that method
|
|
is called and it should return either C<undef>, a boolean or a string, which is
|
|
then encoded.
|
|
|
|
=back
|
|
|
|
|
|
=head2 File Descriptor Passing
|
|
|
|
UNIX sockets (see L<IO::Socket::UNIX>) have the fancy property of letting you
|
|
send file descriptors over them, allowing you to pass, for example, a socket
|
|
from one process to another. This is a pretty low-level operation and not
|
|
something you'll often need, but two functions to use that feature are provided
|
|
here anyway because the L<FU> supervisor uses them:
|
|
|
|
=over
|
|
|
|
=item fdpass_send($send_fd, $pass_fd, $message)
|
|
|
|
Send a message and a file descriptor (C<$pass_fd>) over the given socket
|
|
(<$send_fd>). C<$message> must not be empty, even if you don't intend to do
|
|
anything with it on receipt. Both C<$send_fd> and C<$pass_fd> must be numeric
|
|
file descriptors, as obtained by C<fileno()>.
|
|
|
|
=item ($fd, $message) = fdpass_recv($recv_fd, $max_message_len)
|
|
|
|
Read a file descriptor and message from the given C<$recv_fd>, which must be
|
|
the numeric file descriptor of a socket. This function can be used as a
|
|
replacement for C<sysread()>: the returned C<$fd> is undef if no file
|
|
descriptor was received. The returned C<$message> is undef on error or an empty
|
|
string on EOF.
|
|
|
|
Like regular socket I/O, a single C<fdpass_send()> message may be split across
|
|
multiple C<fdpass_recv()> calls; in that case the C<$fd> will only be received
|
|
on the first call.
|
|
|
|
Don't use this function if the sender may include multiple file descriptors in
|
|
a single message, weird things can happen. File descriptors received this way
|
|
do not have the C<CLOEXEC> flag and will thus survive a call to C<exec()>.
|
|
Refer to L<this wonderful
|
|
discussion|https://gist.github.com/kentonv/bc7592af98c68ba2738f4436920868dc>
|
|
for more weirdness and edge cases.
|
|
|
|
=back
|
|
|
|
See also L<IO::FDPass> for a more portable solution, although that one does not
|
|
support passing along regular data.
|