From c16a9fa4932abd8daff068745f456b3325ec14fd Mon Sep 17 00:00:00 2001 From: Yorhel Date: Mon, 27 Jan 2025 15:37:05 +0100 Subject: [PATCH] Add initial JSON formatter It works and can format all "plain" Perl data, but has a few known bugs and limitations that still need to be worked out. It's about 8x smaller than JSON::XS's encoder and *much* smaller than Cpanel::JSON::XS, but this is just a first attempt, it'll grow. --- FU.pod | 13 ++++ FU.xs | 18 ++++++ FU/Util.pm | 17 +++++ FU/Util.pod | 60 ++++++++++++++++++ FU/XS.pm | 1 + Makefile.PL | 2 + README.md | 17 +++++ c/common.c | 43 +++++++++++++ c/jsonfmt.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++ t/json_format.t | 88 ++++++++++++++++++++++++++ 10 files changed, 421 insertions(+) create mode 100644 FU.pod create mode 100644 FU/Util.pm create mode 100644 FU/Util.pod create mode 100644 c/common.c create mode 100644 c/jsonfmt.c create mode 100644 t/json_format.t diff --git a/FU.pod b/FU.pod new file mode 100644 index 0000000..d11dd5b --- /dev/null +++ b/FU.pod @@ -0,0 +1,13 @@ +=head1 NAME + +FU - A collection of awesome modules plus a lean and efficient web framework. + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head2 Properties + +- Requires a moderately recent Perl (>= 5.36). +- Only works on 64-bit Linux (and possibly *BSD). +- Assumes that no threading is used; not all modules are thread-safe. diff --git a/FU.xs b/FU.xs index d0d44f0..9b9d17c 100644 --- a/FU.xs +++ b/FU.xs @@ -3,4 +3,22 @@ #include "perl.h" #include "XSUB.h" +#include "c/common.c" +#include "c/jsonfmt.c" + MODULE = FU PACKAGE = FU::XS + +PROTOTYPES: DISABLE + +SV *json_format(val) + SV *val + PREINIT: + SV *r; + fustr buf = {}; + CODE: + fujson_fmt(&buf, val); + r = fustr_sv(&buf); + SvUTF8_on(r); + RETVAL = r; + OUTPUT: + RETVAL diff --git a/FU/Util.pm b/FU/Util.pm new file mode 100644 index 0000000..1b40b45 --- /dev/null +++ b/FU/Util.pm @@ -0,0 +1,17 @@ +package FU::Util 0.1; + +use v5.36; +use FU::XS; +use Exporter 'import'; + +our @EXPORT_OK = qw/json_format/; + + +sub json_format($val, %opt) { + my $r = FU::XS::json_format($val); + # XXX: Does this go over the bytes? If so, not setting SvUTF8_on() in the first place would be much faster. + utf8::encode($r) if $opt{utf8}; + $r +} + +1; diff --git a/FU/Util.pod b/FU/Util.pod new file mode 100644 index 0000000..d543796 --- /dev/null +++ b/FU/Util.pod @@ -0,0 +1,60 @@ +=head1 NAME + +FU::Util - Miscellaneous utility functions that really should have been part of +a core Perl installation but aren't for some reason because the Perl community +doesn't believe in the concept of a "batteries included" standard library. + + +=head1 SYNOPSIS + + use FU::Util qw/json_format/; + + my $data = json_format [1, 2, 3]; + +=head1 DESCRIPTION + +=head2 JSON parsing & formatting + +This module comes with a custom C-based JSON parser and formatter. These +functions conform strictly to L, +non-standard extensions are not supported and never will be. + +JSON booleans are decoded into C and C. When +formatting, those builtin constants are the I recognized boolean values - +alternative representations such as C and C +are not recognized and attempting to format such values will croak. + +I + +I + +=over + +=item json_format($scalar, %options) + +Format a Perl value as JSON. + +With the default options, this function behaves roughly similar to: + + JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar); + +This function croaks when attempting to format a floating point C or +C. + +Some modules escape the slash character in encoded strings to prevent a +potential XSS vulnerability when embedding JSON inside C<< >> tags. This function does I do that because it might not even +be sufficient. The following is probably an improvement: + + json_format($data) =~ s{ is pretty cool but isn't going to be updated to support Perl's new +builtin booleans. L is slow and while L is +perfectly adequate, its codebase is a little too messy for my taste - too many +unnecessary features and C<#ifdef>s to support ancient perls and esoteric +configurations. Still, if you need anything not provided by these functions, +L and L are perfectly fine alternatives. +L and L also look like good and maintained candidates.) diff --git a/FU/XS.pm b/FU/XS.pm index 646ea10..f267ba3 100644 --- a/FU/XS.pm +++ b/FU/XS.pm @@ -1,3 +1,4 @@ +# This module is for internal use by other FU modules. package FU::XS 0.1; use Carp; # may be called by XS. use XSLoader; diff --git a/Makefile.PL b/Makefile.PL index 8e110be..f2d48be 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -8,4 +8,6 @@ WriteMakefile( META_MERGE => { dynamic_config => 0, }, + OPTIMIZE => '-g -O2 -Wall -Wextra', + depend => { '$(OBJECT)', 'c/*.c' }, ); diff --git a/README.md b/README.md index 5103de5..38543bc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # FU - Framework Ultimatum +WIP. + +*Contributing:* Refer to my [contribution guidelines)[https://dev.yorhel.nl/contributing]. + ## Build & Install ```sh @@ -7,3 +11,16 @@ perl Makefile.PL make make install ``` + +## Project ideas + +Things that may or may not happen: + +- FU - The website framework, taking inspiration from TUWF. +- FU::HTTPServer / FU::FastCGI - Minimal libs to support the web framework. +- FU::JSON - JSON::{XS,PP,etc}-compatible wrapper around FU::Util's JSON functions? I prolly won't need this myself, but could be handy. +- FU::Log - Basic logger. +- FU::PG - PostgreSQL client with support for custom types and a small query builder. +- FU::Util additions: `uri_escape`, `scrypt`, `urandom`. +- FU::Validate - TUWF::Validate & normalization with some improvements. +- FU::XML - TUWF::XMLXS with some improvements. diff --git a/c/common.c b/c/common.c new file mode 100644 index 0000000..c3f0ac8 --- /dev/null +++ b/c/common.c @@ -0,0 +1,43 @@ +/* Custom string builder, comparable to functionality provided by SV* + * functions, but with less magic and better inlineable. */ + +typedef struct { + size_t len; + size_t size; + char *buf; +} fustr; + +/* No need to call this, an empty fustr is already usable. + * This allows setting a custom initial size. */ +static void fustr_init(fustr *s, size_t prealloc) { + s->len = 0; + s->size = prealloc; + s->buf = safemalloc(prealloc); +} + +static void fustr_grow(fustr *s, size_t add) { + if (s->size == 0) s->size = 512; + while (s->size < s->len + add) + s->size *= 2; + s->buf = saferealloc(s->buf, s->size); +} + +#define fustr_reserve(s, n) do {\ + if (UNLIKELY((s)->size < (s)->len + (n))) fustr_grow(s, n);\ + } while(0) + +#define fustr_write(s, str, n) do {\ + fustr_reserve(s, n);\ + memcpy((s)->buf+(s)->len, str, (n));\ + (s)->len += (n);\ + } while(0) + +/* Move the string buffer into a new SV; fustr should be considered invalid after this call. + * Does not set the UTF8 flag. */ +static SV *fustr_sv(fustr *s) { + SV *r = newSV(0); + fustr_write(s, "", 1); // trailing nul + sv_usepvn_flags(r, s->buf, s->len-1, SV_HAS_TRAILING_NUL); + // TODO: SvPV_shrink_to_cur? + return r; +} diff --git a/c/jsonfmt.c b/c/jsonfmt.c new file mode 100644 index 0000000..c7c843f --- /dev/null +++ b/c/jsonfmt.c @@ -0,0 +1,162 @@ +static void fujson_fmt(fustr *, SV *); + +static void fujson_fmt_str(fustr *out, const char *stri, size_t len, int utf8) { + size_t off = 0, loff; + const unsigned char *str = (const unsigned char *)stri; + unsigned char x = 0; + + /* Validate entire string for conformance if this is flagged as a utf8 string, this lets us be lazy further on. */ + if (utf8 && !is_c9strict_utf8_string(str, len)) { + return; /* TODO: Throw error. */ + } + + fustr_write(out, "\"", 1); + fustr_reserve(out, len); + + while (off < len) { + /* Fast path: no escaping needed */ + loff = off; + if (utf8) { + /* we already validated everything >=0x80 */ + while (off < len) { + x = str[off]; + if (x <= 0x1f || x == '"' || x == '\\' || x == 0x7f) break; + off++; + } + } else { + /* binary strings need special handling for >=0x80 */ + while (off < len) { + x = str[off]; + if (x <= 0x1f || x == '"' || x == '\\' || x >= 0x7f) break; + off++; + } + } + fustr_write(out, str+loff, off-loff); + + if (off < len) { /* early break, which means current byte needs special processing */ + switch (x) { + case '"': fustr_write(out, "\\\"", 2); break; + case '\\': fustr_write(out, "\\\\", 2); break; + case 0x08: fustr_write(out, "\\b", 2); break; + case 0x09: fustr_write(out, "\\t", 2); break; + case 0x0a: fustr_write(out, "\\n", 2); break; + case 0x0c: fustr_write(out, "\\f", 2); break; + case 0x0d: fustr_write(out, "\\r", 2); break; + default: + if (x < 0x80) { + fustr_reserve(out, 6); + memcpy(out->buf+out->len, "\\u00", 4); + out->buf[out->len+4] = PL_hexdigit[(x >> 4) & 0x0f]; + out->buf[out->len+5] = PL_hexdigit[x & 0x0f]; + out->len += 6; + } else { /* x >= 0x80, !utf8, so encode as 2-byte UTF-8 */ + fustr_reserve(out, 2); + out->buf[out->len ] = 0xc0 | (x >> 6); + out->buf[out->len+1] = 0x80 | (x & 0x3f); + out->len += 2; + } + } + off++; + } + } + + fustr_write(out, "\"", 1); +} + +static void fujson_fmt_int(fustr *out, SV *val) { + char buf[32]; + size_t idx = 32; + int neg = 0; + IV iv; + UV uv; + + if (SvIsUV(val)) { /* Why is this macro not documented? */ + uv = SvUV_nomg(val); + } else { + iv = SvIV_nomg(val); + neg = iv < 0; + uv = neg ? -iv : iv; + } + + if (uv == 0) { + fustr_write(out, "0", 1); + return; + } + + while (uv > 0) { + /* TODO: can use a lookup table to optimize for 0 - 100; need benchmark */ + buf[--idx] = '0' + (uv % 10); + uv /= 10; + } + if (neg) buf[--idx] = '-'; + fustr_write(out, buf+idx, sizeof buf - idx); +} + +static void fujson_fmt_av(fustr *out, AV *av) { + int i, len = av_count(av); + fustr_write(out, "[", 1); + for (i=0; ibuf + out->len); + out->len += strlen(out->buf + out->len); + } else if (SvIOKp(val)) { + fujson_fmt_int(out, val); + } else if (SvROK(val)) { + SV *rv = SvRV(val); + SvGETMAGIC(rv); + if (UNLIKELY(SvOBJECT(rv))) { /* TODO: Check for TO_JSON */ } + else if (SvTYPE(rv) == SVt_PVHV) fujson_fmt_hv(out, (HV *)rv); + else if (SvTYPE(rv) == SVt_PVAV) fujson_fmt_av(out, (AV *)rv); + else return; /* TODO: error */ + } else if (!SvOK(val)) { + fustr_write(out, "null", 4); + } else { + /* TODO: error */ + } +} + +/* TODO: canonical */ +/* TODO: pretty */ +/* TODO: max depth? */ +/* TODO: threading support */ diff --git a/t/json_format.t b/t/json_format.t new file mode 100644 index 0000000..b5334a2 --- /dev/null +++ b/t/json_format.t @@ -0,0 +1,88 @@ +use v5.36; +use builtin 'true', 'false'; +use Test::More; +use Tie::Array; +use Tie::Hash; +use FU::Util 'json_format'; + +# TODO: gab some more tests from other JSON libs +# TODO: Test invalid utf8 + +my @tests = ( + undef, 'null', + true, 'true', + false, 'false', + 0, '0', + 1, '1', + -1, '-1', + -9223372036854775808, '-9223372036854775808', + 9223372036854775807, '9223372036854775807', + 18446744073709551615, '18446744073709551615', + 0.1, '0.1', + 0.000123, '0.000123', + -1e100, '-1e+100', + + do { use utf8; ( + "\x01é\r\n\x1f💩", '"\u0001é\r\n\u001f💩"', + )}, + + do { use bytes; ( + "\x011\r\n\x8c", "\"\\u00011\\r\\n\x8c\"", + "\xff\xff", "\"\xff\xff\"", + "\x{1f4a9}", do { use utf8; '"💩"' }, + )}, + + [], '[]', + [0,1], '[0,1]', + [true,'hi',-0.123, [undef]], '[true,"hi",-0.123,[null]]', + do { tie my @a, 'Tie::StdArray'; @a = (1,2); \@a }, '[1,2]', + + {}, '{}', + {'a',1}, '{"a":1}', + do { tie my %h, 'Tie::StdHash'; %h = ('a',1); \%h }, '{"a":1}', + do { tie my %h, 'MyOrderedHash', one => 1, two => undef, three => []; \%h }, '{"one":1,"two":null,"three":[]}', + + # from http://e-choroba.eu/18-yapc + $$, $$, + ''.$$, '"'.$$.'"', + do { my $x = 12; utf8::decode($x); $x }, '"12"', + do { no warnings 'numeric'; my $x = '19a'; $x += 0; $x }, '19', + 1844674407370955161 / 10, '1.84467440737096e+17', +); + +plan tests => @tests + 6; + +for my($in, $exp) (@tests) { + my $out = json_format $in; + is $out, $exp; + ok utf8::is_utf8($out); +} + + + +# http://e-choroba.eu/18-yapc slide 6 + +tie my $incs, 'MyIncrementer', 'Xa'; +is json_format($incs), '"Xa"'; +is json_format($incs), '"Xb"'; +is json_format($incs), '"Xc"'; + +tie my $incu, 'MyIncrementer', 4; +is json_format($incu), 4; +is json_format($incu), 5; +is json_format($incu), 6; + +package MyIncrementer; +use Tie::Scalar; +use parent -norequire => 'Tie::StdScalar'; +sub TIESCALAR { my ($class, $val) = @_; bless \$val, $class } +sub FETCH { my $s = shift; $$s++ } + + +package MyOrderedHash; +sub TIEHASH { shift; bless [ [ map $_[$_*2], 0..$#_/2 ], +{@_}, 0 ], __PACKAGE__ }; +sub FETCH { $_[0][1]{$_[1]} } +sub EXISTS { exists $_[0][1]{$_[1]} } +sub FIRSTKEY { $_[0][2] = 0; &NEXTKEY } +sub NEXTKEY { $_[0][0][ $_[0][2]++ ] } +sub SCALAR { scalar $_[0][0]->@* }