Add initial JSON formatter

It works and can format all "plain" Perl data, but has a few known bugs
and limitations that still need to be worked out.

It's about 8x smaller than JSON::XS's encoder and *much* smaller than
Cpanel::JSON::XS, but this is just a first attempt, it'll grow.
This commit is contained in:
Yorhel 2025-01-27 15:37:05 +01:00
parent 9c80f2465a
commit c16a9fa493
10 changed files with 421 additions and 0 deletions

13
FU.pod Normal file
View file

@ -0,0 +1,13 @@
=head1 NAME
FU - A collection of awesome modules plus a lean and efficient web framework.
=head1 SYNOPSIS
=head1 DESCRIPTION
=head2 Properties
- Requires a moderately recent Perl (>= 5.36).
- Only works on 64-bit Linux (and possibly *BSD).
- Assumes that no threading is used; not all modules are thread-safe.

18
FU.xs
View file

@ -3,4 +3,22 @@
#include "perl.h" #include "perl.h"
#include "XSUB.h" #include "XSUB.h"
#include "c/common.c"
#include "c/jsonfmt.c"
MODULE = FU PACKAGE = FU::XS MODULE = FU PACKAGE = FU::XS
PROTOTYPES: DISABLE
SV *json_format(val)
SV *val
PREINIT:
SV *r;
fustr buf = {};
CODE:
fujson_fmt(&buf, val);
r = fustr_sv(&buf);
SvUTF8_on(r);
RETVAL = r;
OUTPUT:
RETVAL

17
FU/Util.pm Normal file
View file

@ -0,0 +1,17 @@
package FU::Util 0.1;
use v5.36;
use FU::XS;
use Exporter 'import';
our @EXPORT_OK = qw/json_format/;
sub json_format($val, %opt) {
my $r = FU::XS::json_format($val);
# XXX: Does this go over the bytes? If so, not setting SvUTF8_on() in the first place would be much faster.
utf8::encode($r) if $opt{utf8};
$r
}
1;

60
FU/Util.pod Normal file
View file

@ -0,0 +1,60 @@
=head1 NAME
FU::Util - Miscellaneous utility functions that really should have been part of
a core Perl installation but aren't for some reason because the Perl community
doesn't believe in the concept of a "batteries included" standard library.
</rant>
=head1 SYNOPSIS
use FU::Util qw/json_format/;
my $data = json_format [1, 2, 3];
=head1 DESCRIPTION
=head2 JSON parsing & formatting
This module comes with a custom C-based JSON parser and formatter. These
functions conform strictly to L<RFC-8259|https://tools.ietf.org/html/rfc8259>,
non-standard extensions are not supported and never will be.
JSON booleans are decoded into C<builtin::true> and C<builtin::false>. When
formatting, those builtin constants are the I<only> recognized boolean values -
alternative representations such as C<JSON::PP::true> and C<JSON::PP::false>
are not recognized and attempting to format such values will croak.
I<TODO: point to benchmarks.>
I<TODO: FU::JSON wrapper with somewhat-compatible JSON::{PP,XS} API>
=over
=item json_format($scalar, %options)
Format a Perl value as JSON.
With the default options, this function behaves roughly similar to:
JSON::PP->new->allow_nonref->core_bools->convert_blessed->encode($scalar);
This function croaks when attempting to format a floating point C<NaN> or
C<Inf>.
Some modules escape the slash character in encoded strings to prevent a
potential XSS vulnerability when embedding JSON inside C<< <script> ..
</script> >> tags. This function does I<not> do that because it might not even
be sufficient. The following is probably an improvement:
json_format($data) =~ s{</}{<\\/}rg =~ s/<!--/<\\u0021--/rg;
=back
(Why the hell yet another JSON codec when CPAN is already full of them!? Well,
L<JSON::XS> is pretty cool but isn't going to be updated to support Perl's new
builtin booleans. L<JSON::PP> is slow and while L<Cpanel::JSON::XS> is
perfectly adequate, its codebase is a little too messy for my taste - too many
unnecessary features and C<#ifdef>s to support ancient perls and esoteric
configurations. Still, if you need anything not provided by these functions,
L<JSON::PP> and L<Cpanel::JSON::XS> are perfectly fine alternatives.
L<JSON::SIMD> and L<Mojo::JSON> also look like good and maintained candidates.)

View file

@ -1,3 +1,4 @@
# This module is for internal use by other FU modules.
package FU::XS 0.1; package FU::XS 0.1;
use Carp; # may be called by XS. use Carp; # may be called by XS.
use XSLoader; use XSLoader;

View file

@ -8,4 +8,6 @@ WriteMakefile(
META_MERGE => { META_MERGE => {
dynamic_config => 0, dynamic_config => 0,
}, },
OPTIMIZE => '-g -O2 -Wall -Wextra',
depend => { '$(OBJECT)', 'c/*.c' },
); );

View file

@ -1,5 +1,9 @@
# FU - Framework Ultimatum # FU - Framework Ultimatum
WIP.
*Contributing:* Refer to my [contribution guidelines)[https://dev.yorhel.nl/contributing].
## Build & Install ## Build & Install
```sh ```sh
@ -7,3 +11,16 @@ perl Makefile.PL
make make
make install make install
``` ```
## Project ideas
Things that may or may not happen:
- FU - The website framework, taking inspiration from TUWF.
- FU::HTTPServer / FU::FastCGI - Minimal libs to support the web framework.
- FU::JSON - JSON::{XS,PP,etc}-compatible wrapper around FU::Util's JSON functions? I prolly won't need this myself, but could be handy.
- FU::Log - Basic logger.
- FU::PG - PostgreSQL client with support for custom types and a small query builder.
- FU::Util additions: `uri_escape`, `scrypt`, `urandom`.
- FU::Validate - TUWF::Validate & normalization with some improvements.
- FU::XML - TUWF::XMLXS with some improvements.

43
c/common.c Normal file
View file

@ -0,0 +1,43 @@
/* Custom string builder, comparable to functionality provided by SV*
* functions, but with less magic and better inlineable. */
typedef struct {
size_t len;
size_t size;
char *buf;
} fustr;
/* No need to call this, an empty fustr is already usable.
* This allows setting a custom initial size. */
static void fustr_init(fustr *s, size_t prealloc) {
s->len = 0;
s->size = prealloc;
s->buf = safemalloc(prealloc);
}
static void fustr_grow(fustr *s, size_t add) {
if (s->size == 0) s->size = 512;
while (s->size < s->len + add)
s->size *= 2;
s->buf = saferealloc(s->buf, s->size);
}
#define fustr_reserve(s, n) do {\
if (UNLIKELY((s)->size < (s)->len + (n))) fustr_grow(s, n);\
} while(0)
#define fustr_write(s, str, n) do {\
fustr_reserve(s, n);\
memcpy((s)->buf+(s)->len, str, (n));\
(s)->len += (n);\
} while(0)
/* Move the string buffer into a new SV; fustr should be considered invalid after this call.
* Does not set the UTF8 flag. */
static SV *fustr_sv(fustr *s) {
SV *r = newSV(0);
fustr_write(s, "", 1); // trailing nul
sv_usepvn_flags(r, s->buf, s->len-1, SV_HAS_TRAILING_NUL);
// TODO: SvPV_shrink_to_cur?
return r;
}

162
c/jsonfmt.c Normal file
View file

@ -0,0 +1,162 @@
static void fujson_fmt(fustr *, SV *);
static void fujson_fmt_str(fustr *out, const char *stri, size_t len, int utf8) {
size_t off = 0, loff;
const unsigned char *str = (const unsigned char *)stri;
unsigned char x = 0;
/* Validate entire string for conformance if this is flagged as a utf8 string, this lets us be lazy further on. */
if (utf8 && !is_c9strict_utf8_string(str, len)) {
return; /* TODO: Throw error. */
}
fustr_write(out, "\"", 1);
fustr_reserve(out, len);
while (off < len) {
/* Fast path: no escaping needed */
loff = off;
if (utf8) {
/* we already validated everything >=0x80 */
while (off < len) {
x = str[off];
if (x <= 0x1f || x == '"' || x == '\\' || x == 0x7f) break;
off++;
}
} else {
/* binary strings need special handling for >=0x80 */
while (off < len) {
x = str[off];
if (x <= 0x1f || x == '"' || x == '\\' || x >= 0x7f) break;
off++;
}
}
fustr_write(out, str+loff, off-loff);
if (off < len) { /* early break, which means current byte needs special processing */
switch (x) {
case '"': fustr_write(out, "\\\"", 2); break;
case '\\': fustr_write(out, "\\\\", 2); break;
case 0x08: fustr_write(out, "\\b", 2); break;
case 0x09: fustr_write(out, "\\t", 2); break;
case 0x0a: fustr_write(out, "\\n", 2); break;
case 0x0c: fustr_write(out, "\\f", 2); break;
case 0x0d: fustr_write(out, "\\r", 2); break;
default:
if (x < 0x80) {
fustr_reserve(out, 6);
memcpy(out->buf+out->len, "\\u00", 4);
out->buf[out->len+4] = PL_hexdigit[(x >> 4) & 0x0f];
out->buf[out->len+5] = PL_hexdigit[x & 0x0f];
out->len += 6;
} else { /* x >= 0x80, !utf8, so encode as 2-byte UTF-8 */
fustr_reserve(out, 2);
out->buf[out->len ] = 0xc0 | (x >> 6);
out->buf[out->len+1] = 0x80 | (x & 0x3f);
out->len += 2;
}
}
off++;
}
}
fustr_write(out, "\"", 1);
}
static void fujson_fmt_int(fustr *out, SV *val) {
char buf[32];
size_t idx = 32;
int neg = 0;
IV iv;
UV uv;
if (SvIsUV(val)) { /* Why is this macro not documented? */
uv = SvUV_nomg(val);
} else {
iv = SvIV_nomg(val);
neg = iv < 0;
uv = neg ? -iv : iv;
}
if (uv == 0) {
fustr_write(out, "0", 1);
return;
}
while (uv > 0) {
/* TODO: can use a lookup table to optimize for 0 - 100; need benchmark */
buf[--idx] = '0' + (uv % 10);
uv /= 10;
}
if (neg) buf[--idx] = '-';
fustr_write(out, buf+idx, sizeof buf - idx);
}
static void fujson_fmt_av(fustr *out, AV *av) {
int i, len = av_count(av);
fustr_write(out, "[", 1);
for (i=0; i<len; i++) {
if (i) fustr_write(out, ",", 1);
SV **sv = av_fetch(av, i, 0);
if (sv) fujson_fmt(out, *sv); /* sv will have magic if av is tied, but fujson_fmt() handles that. */
else fustr_write(out, "null", 4);
}
fustr_write(out, "]", 1);
}
static void fujson_fmt_hv(fustr *out, HV *hv) {
HE *he;
STRLEN helen;
char *hestr = NULL;
hv_iterinit(hv);
fustr_write(out, "{", 1);
while ((he = hv_iternext(hv))) {
if (hestr) fustr_write(out, ",", 1);
hestr = HePV(he, helen);
fujson_fmt_str(out, hestr, helen, HeUTF8(he));
fustr_write(out, ":", 1);
fujson_fmt(out, UNLIKELY(SvMAGICAL(hv)) ? hv_iterval(hv, he) : HeVAL(he));
}
fustr_write(out, "}", 1);
}
/* BUG: Leaks *out on error, that should be on the temp stack */
static void fujson_fmt(fustr *out, SV *val) {
SvGETMAGIC(val);
/* XXX: &PL_sv_yes and &PL_sv_no are proper booleans under 5.40, so no need
* to explicitly check for those; does this work in 5.36 as well? */
if (SvIsBOOL(val)) { /* Must check before IOKp & POKp, because bool implies both flags */
if (BOOL_INTERNALS_sv_isbool_true(val)) fustr_write(out, "true", 4);
else fustr_write(out, "false", 5);
} else if (SvPOKp(val)) {
fujson_fmt_str(out, SvPVX(val), SvCUR(val), SvUTF8(val));
} else if (SvNOKp(val)) { /* Must check before IOKp, because integer conversion might have been lossy */
/* TODO: quadmath? */
NV nv = SvNV_nomg(val);
if (isinfnan(nv)) return; /* TODO: error */
fustr_reserve(out, NV_DIG+1);
Gconvert(nv, NV_DIG, 0, out->buf + out->len);
out->len += strlen(out->buf + out->len);
} else if (SvIOKp(val)) {
fujson_fmt_int(out, val);
} else if (SvROK(val)) {
SV *rv = SvRV(val);
SvGETMAGIC(rv);
if (UNLIKELY(SvOBJECT(rv))) { /* TODO: Check for TO_JSON */ }
else if (SvTYPE(rv) == SVt_PVHV) fujson_fmt_hv(out, (HV *)rv);
else if (SvTYPE(rv) == SVt_PVAV) fujson_fmt_av(out, (AV *)rv);
else return; /* TODO: error */
} else if (!SvOK(val)) {
fustr_write(out, "null", 4);
} else {
/* TODO: error */
}
}
/* TODO: canonical */
/* TODO: pretty */
/* TODO: max depth? */
/* TODO: threading support */

88
t/json_format.t Normal file
View file

@ -0,0 +1,88 @@
use v5.36;
use builtin 'true', 'false';
use Test::More;
use Tie::Array;
use Tie::Hash;
use FU::Util 'json_format';
# TODO: gab some more tests from other JSON libs
# TODO: Test invalid utf8
my @tests = (
undef, 'null',
true, 'true',
false, 'false',
0, '0',
1, '1',
-1, '-1',
-9223372036854775808, '-9223372036854775808',
9223372036854775807, '9223372036854775807',
18446744073709551615, '18446744073709551615',
0.1, '0.1',
0.000123, '0.000123',
-1e100, '-1e+100',
do { use utf8; (
"\x01é\r\n\x1f💩", '"\u0001é\r\n\u001f💩"',
)},
do { use bytes; (
"\x011\r\n\x8c", "\"\\u00011\\r\\n\x8c\"",
"\xff\xff", "\"\xff\xff\"",
"\x{1f4a9}", do { use utf8; '"💩"' },
)},
[], '[]',
[0,1], '[0,1]',
[true,'hi',-0.123, [undef]], '[true,"hi",-0.123,[null]]',
do { tie my @a, 'Tie::StdArray'; @a = (1,2); \@a }, '[1,2]',
{}, '{}',
{'a',1}, '{"a":1}',
do { tie my %h, 'Tie::StdHash'; %h = ('a',1); \%h }, '{"a":1}',
do { tie my %h, 'MyOrderedHash', one => 1, two => undef, three => []; \%h }, '{"one":1,"two":null,"three":[]}',
# from http://e-choroba.eu/18-yapc
$$, $$,
''.$$, '"'.$$.'"',
do { my $x = 12; utf8::decode($x); $x }, '"12"',
do { no warnings 'numeric'; my $x = '19a'; $x += 0; $x }, '19',
1844674407370955161 / 10, '1.84467440737096e+17',
);
plan tests => @tests + 6;
for my($in, $exp) (@tests) {
my $out = json_format $in;
is $out, $exp;
ok utf8::is_utf8($out);
}
# http://e-choroba.eu/18-yapc slide 6
tie my $incs, 'MyIncrementer', 'Xa';
is json_format($incs), '"Xa"';
is json_format($incs), '"Xb"';
is json_format($incs), '"Xc"';
tie my $incu, 'MyIncrementer', 4;
is json_format($incu), 4;
is json_format($incu), 5;
is json_format($incu), 6;
package MyIncrementer;
use Tie::Scalar;
use parent -norequire => 'Tie::StdScalar';
sub TIESCALAR { my ($class, $val) = @_; bless \$val, $class }
sub FETCH { my $s = shift; $$s++ }
package MyOrderedHash;
sub TIEHASH { shift; bless [ [ map $_[$_*2], 0..$#_/2 ], +{@_}, 0 ], __PACKAGE__ };
sub FETCH { $_[0][1]{$_[1]} }
sub EXISTS { exists $_[0][1]{$_[1]} }
sub FIRSTKEY { $_[0][2] = 0; &NEXTKEY }
sub NEXTKEY { $_[0][0][ $_[0][2]++ ] }
sub SCALAR { scalar $_[0][0]->@* }