Large-ish SQL schema revamp/optimizations

Primarily aimed at reducing the size of the old 'man' (now: files)
table, using smaller integers to refer to man contents and text fields,
and storing a shorthash as an integer for quick lookups. This better
normalization also removes the need to keep a separate 'man_index' cache
for the search function.

The old schema wasn't necessarily bad, but I was in the mood for some
optimizations. And a little cleanup.

Prolly introduces a bunch of new bugs, I haven't tested this too well.
This commit is contained in:
Yorhel 2021-12-14 15:06:05 +01:00
parent 6f7f59c6df
commit f376f1f137
6 changed files with 268 additions and 128 deletions

View file

@ -73,8 +73,11 @@ sub sysbyshort { state $s ||= { map +($_->{short}, $_), systems->@* } }
# Firefox seems to escape [ and ] in URLs. It doesn't really have to...
sub normalize_name { $_[0] =~ s/%5b/[/irg =~ s/%5d/]/irg =~ s/%20/ /rg }
sub shorthash_to_hex { unpack 'H*', pack 'i', $_[0] } # int -> hex
sub shorthash_to_int { unpack 'i', pack 'H*', $_[0] } # hex -> int
# Subquery returning all packages that have a man page.
my $packages_with_man = '(SELECT * FROM packages p WHERE EXISTS(SELECT 1 FROM package_versions pv WHERE pv.package = p.id AND EXISTS(SELECT 1 FROM man m WHERE m.package = pv.id)))';
my $packages_with_man = '(SELECT * FROM packages p WHERE EXISTS(SELECT 1 FROM package_versions pv WHERE pv.package = p.id AND EXISTS(SELECT 1 FROM files f WHERE f.pkgver = pv.id)))';
sub escape_like { $_[0] =~ s/([_%\\])/\\$1/rg }
@ -86,13 +89,6 @@ sub sql_join {
sub sql_and { @_ ? sql_join 'AND', map sql('(', $_, ')'), @_ : sql '1=1' }
sub sql_or { @_ ? sql_join 'OR', map sql('(', $_, ')'), @_ : sql '1=0' }
# Subquery to match $sql_expr::bytea against a $prefix (hex string). Hopefully indexable.
sub sql_hash_prefix {
my($sql_expr, $prefix) = @_;
my $esc = unpack 'H*', escape_like pack 'H*', $prefix;
sql '(', $sql_expr, "like ('\\x$esc'::bytea||'%'))"
}
sub pkg_frompath {
my($sys_where, $path) = @_;
@ -139,27 +135,29 @@ sub man_pref {
# 7. sysrel: Prefer a more recent system release over an older release
# 8. secorder: Lower sections before higher sections (because man does it this way, for some reason)
# 9. pkgdate: Prefer more recent packages (cross-distro)
# 10. Fall back on hash comparison, to ensure the result is stable
# 10. Fall back on shorthash comparison, to ensure the result is stable
state $archid = sysbyshort->{arch}{id};
state $debid = (sort { $b->{id} <=> $a->{id} } grep $_->{short} =~ /^debian-/, systems->@*)[0]{id};
tuwf->dbRowi(q{
WITH unfiltered AS (
SELECT s AS sys, p AS pkg, v AS ver, m AS man
FROM man m
JOIN package_versions v ON v.id = m.package
SELECT m.name, m.section, l.locale, f.shorthash, f.content, f.filename, s AS sys, p AS pkg, v AS ver
FROM files f
JOIN locales l ON l.id = f.locale
JOIN mans m ON m.id = f.man
JOIN package_versions v ON v.id = f.pkgver
JOIN packages p ON p.id = v.package
JOIN systems s ON s.id = p.system
WHERE}, $where, q{
), f_english AS(
SELECT * FROM unfiltered WHERE NOT EXISTS(SELECT 1 FROM unfiltered WHERE is_english_locale((man).locale)) OR is_english_locale((man).locale)
SELECT * FROM unfiltered WHERE NOT EXISTS(SELECT 1 FROM unfiltered WHERE is_english_locale(locale)) OR is_english_locale(locale)
), f_pkgver AS(
SELECT * FROM f_english a WHERE NOT EXISTS(SELECT 1 FROM f_english b WHERE (a.ver).package = (b.ver).package AND (a.ver).released < (b.ver).released)
), f_stdloc AS(
SELECT * FROM f_pkgver WHERE NOT EXISTS(SELECT 1 FROM f_pkgver WHERE is_standard_man_location((man).filename)) OR is_standard_man_location((man).filename)
SELECT * FROM f_pkgver WHERE NOT EXISTS(SELECT 1 FROM f_pkgver WHERE is_standard_man_location(filename)) OR is_standard_man_location(filename)
), f_secmatch AS(
SELECT * FROM f_stdloc WHERE NOT EXISTS(SELECT 1 FROM f_stdloc WHERE (man).section =}, \$section, q{) OR (man).section =}, \$section, q{
SELECT * FROM f_stdloc WHERE NOT EXISTS(SELECT 1 FROM f_stdloc WHERE section =}, \$section, q{) OR section =}, \$section, q{
), f_arch AS(
SELECT * FROM f_secmatch WHERE NOT EXISTS(SELECT 1 FROM}, length $section ? 'f_secmatch' : 'f_stdloc', qq{WHERE (sys).id = $archid) OR (sys).id = $archid
), f_debian AS(
@ -167,13 +165,13 @@ sub man_pref {
), f_sysrel AS(
SELECT * FROM f_debian a WHERE NOT EXISTS(SELECT 1 FROM f_debian b WHERE (a.sys).name = (b.sys).name AND (a.sys).id < (b.sys).id)
), f_secorder AS(
SELECT * FROM f_sysrel a WHERE NOT EXISTS(SELECT 1 FROM f_sysrel b WHERE (a.man).section > (b.man).section)
SELECT * FROM f_sysrel a WHERE NOT EXISTS(SELECT 1 FROM f_sysrel b WHERE section > section)
), f_pkgdate AS(
SELECT * FROM f_secorder a WHERE NOT EXISTS(SELECT 1 FROM f_secorder b WHERE (a.ver).released < (b.ver).released)
)
SELECT (pkg).system, (pkg).category, (pkg).name AS package, (ver).version, (ver).released, (ver).id AS verid,
(man).name, (man).section, (man).filename, (man).locale, encode((man).hash, 'hex') AS hash
FROM f_pkgdate ORDER BY (man).hash LIMIT 1
name, section, filename, locale, shorthash, content
FROM f_pkgdate ORDER BY shorthash LIMIT 1
});
}
@ -562,13 +560,11 @@ sub search_man {
my $sect = $q =~ s/^([0-9])\s+// || $q =~ s/\(([a-zA-Z0-9]+)\)$// ? $1 : '';
my $name = $q =~ s/^([a-zA-Z0-9,.:_-]+)// ? $1 : '';
return !$name ? [] : tuwf->dbAll(
'SELECT name, section FROM man_index !W ORDER BY name, section LIMIT ?',
{
'lower(name) LIKE ?' => escape_like(lc $name).'%',
$sect ? ('section ILIKE ?' => escape_like(lc $sect).'%') : (),
},
$limit
return !$name ? [] : tuwf->dbAlli(
'SELECT name, section FROM mans WHERE', sql_and(
sql('lower(name) LIKE', \(escape_like(lc $name).'%')),
$sect ? sql('section ILIKE', \(escape_like(lc $sect).'%')) : (),
), 'ORDER BY name, section LIMIT', \$limit,
);
}
@ -614,10 +610,11 @@ TUWF::get qr{/([^/]+)/([0-9a-f]{8})/src} => sub {
my $nfo = tuwf->dbRowi('
SELECT m.name, m.section, v.released, c.content
FROM man m
JOIN package_versions v ON v.id = m.package
JOIN contents c ON c.hash = m.hash
WHERE m.name =', \$name, 'AND', sql_hash_prefix('m.hash', $hash), '
FROM files f
JOIN mans m ON m.id = f.man
JOIN package_versions v ON v.id = f.pkgver
JOIN contents c ON c.id = f.content
WHERE m.name =', \$name, 'AND f.shorthash =', \shorthash_to_int($hash), '
LIMIT 1'
);
return tuwf->resNotFound if !$nfo->{name};
@ -633,13 +630,16 @@ sub _man_nav {
my($man, $toc) = @_;
my @sect = map $_->{section}, tuwf->dbAlli(
'SELECT DISTINCT section FROM man WHERE name =', \$man->{name}, 'ORDER BY section'
'SELECT DISTINCT section FROM mans WHERE name =', \$man->{name}, 'ORDER BY section'
)->@*;
my @lang = map $_->{lang}, tuwf->dbAlli(
"SELECT DISTINCT substring(locale from '^[^.]+') AS lang
FROM man WHERE name =", \$man->{name}, 'AND section =', \$man->{section}, "
ORDER BY substring(locale from '^[^.]+') NULLS FIRST"
"SELECT DISTINCT substring(l.locale from '^[^.]+') AS lang
FROM files f
JOIN mans m ON m.id = f.man
JOIN locales l ON l.id = f.locale
WHERE m.name =", \$man->{name}, 'AND m.section =', \$man->{section}, "
ORDER BY substring(l.locale from '^[^.]+') NULLS FIRST"
)->@*;
return if !@sect && !@lang && !@$toc;
@ -707,7 +707,7 @@ sub soelim {
my($man) = $verid ? man_pref_name $name, sql 'v.id =', \$verid : ();
$man->{name}
# Recursive soelim, but the second call gets $verid=0 so we don't keep checking the database
? soelim(0, tuwf->dbVali("SELECT content FROM contents WHERE hash = decode(", \$man->{hash}, ", 'hex')"))
? soelim(0, tuwf->dbVali("SELECT content FROM contents WHERE id =", \$man->{content}))
: ".in -10\n.sp\n\[\[\[MANNEDINCLUDE$path\]\]\]"
}emg;
$src;
@ -727,12 +727,12 @@ TUWF::get qr{/(?<name>[^/]+)(?:/(?<hash>[0-9a-f]{8}))?} => sub {
# the same package as the requested man page. Use the man_pref logic here
# to deterministically select a good package.
my($man, undef) = $shorthash
? man_pref undef, sql 'm.name =', \$name, 'AND', sql_hash_prefix 'm.hash', $shorthash
? man_pref undef, sql 'm.name =', \$name, 'AND f.shorthash =', shorthash_to_int($shorthash)
: man_pref_name $name, 'true';
return tuwf->resNotFound() if !$man->{name};
my $fmt = ManUtils::html ManUtils::fmt_block soelim $man->{verid},
tuwf->dbVali("SELECT content FROM contents WHERE hash = decode(", \$man->{hash}, ", 'hex')");
my $content = tuwf->dbRowi('SELECT encode(hash, \'hex\') AS hash, content FROM contents WHERE id =', \$man->{content});
my $fmt = ManUtils::html ManUtils::fmt_block soelim $man->{verid}, $content->{content};
my @toc;
$fmt =~ s{\n<b>(.+?)<\/b>\n}{
push @toc, $1;
@ -741,9 +741,14 @@ TUWF::get qr{/(?<name>[^/]+)(?:/(?<hash>[0-9a-f]{8}))?} => sub {
}eg;
my $hasversions = tuwf->dbVali(
'SELECT 1 FROM man WHERE name =', \$man->{name}, 'AND section =', \$man->{section},
'AND locale IS NOT DISTINCT FROM', \$man->{locale},
'AND hash <> decode(', \$man->{hash}, ", 'hex') LIMIT 1"
'SELECT 1
FROM files f
JOIN mans m ON m.id = f.man
JOIN locales l ON l.id = f.locale
WHERE m.name =', \$man->{name}, 'AND m.section =', \$man->{section}, '
AND l.locale =', \$man->{locale}, '
AND f.shorthash <> ', \shorthash_to_int($man->{shorthash}), '
LIMIT 1'
);
tuwf->resLastMod($man->{released});
@ -751,14 +756,14 @@ TUWF::get qr{/(?<name>[^/]+)(?:/(?<hash>[0-9a-f]{8}))?} => sub {
_man_nav $man, \@toc;
div_ id => 'manbuttons', sub {
h1_ $man->{name};
ul_ 'data-hash' => $man->{hash},
ul_ 'data-hash' => $content->{hash},
'data-name' => $man->{name},
'data-section' => $man->{section},
'data-locale' => $man->{locale}||'',
'data-hasversions' => $hasversions?1:0,
sub {
li_ sub { a_ href => "/$man->{name}/".substr($man->{hash}, 0, 8).'/src', 'source' };
li_ sub { a_ href => "/$man->{name}/".substr($man->{hash}, 0, 8), 'permalink' };
li_ sub { a_ href => "/$man->{name}/".shorthash_to_hex($man->{shorthash}).'/src', 'source' };
li_ sub { a_ href => "/$man->{name}/".shorthash_to_hex($man->{shorthash}), 'permalink' };
}
};
div_ id => 'manres', class => 'hidden', '';
@ -828,7 +833,7 @@ TUWF::get qr{/pkg/([^/]+)/(.+)} => sub {
SELECT id, version, released
FROM package_versions v
WHERE package =', \$pkg->{id}, '
AND EXISTS(SELECT 1 FROM man m WHERE m.package = v.id)
AND EXISTS(SELECT 1 FROM files f WHERE f.pkgver = v.id)
ORDER BY released DESC'
);
my $sel = $ver ? (grep $_->{version} eq $ver, @$vers)[0] : $vers->[0];
@ -836,11 +841,14 @@ TUWF::get qr{/pkg/([^/]+)/(.+)} => sub {
my $p = tuwf->validate(get => p => { onerror => 1, uint => 1, range => [1,100] })->data;
my $count = tuwf->dbVali('SELECT count(*) FROM man WHERE package =', \$sel->{id});
my $count = tuwf->dbVali('SELECT count(*) FROM files WHERE pkgver =', \$sel->{id});
my $mans = tuwf->dbPagei({ results => 200, page => $p },
"SELECT name, encode(hash, 'hex') AS hash, section, locale, filename
FROM man WHERE package =", \$sel->{id}, '
ORDER BY name, locale NULLS FIRST, filename'
"SELECT m.name, m.section, f.shorthash, f.filename, l.locale
FROM files f
JOIN locales l ON l.id = f.locale
JOIN mans m ON m.id = f.man
WHERE f.pkgver =", \$sel->{id}, '
ORDER BY m.name, l.locale, f.filename'
);
# Latest version of this package determines last modification date of the page.
@ -871,7 +879,7 @@ TUWF::get qr{/pkg/([^/]+)/(.+)} => sub {
paginate_ "/pkg/$sys->{short}/$pkg->{category}/$pkg->{name}/$sel->{version}?p=", $count, 200, $p;
ul_ sub {
li_ sub {
a_ href => "/$_->{name}/".substr($_->{hash},0,8), "$_->{name}($_->{section})";
a_ href => "/$_->{name}/".shorthash_to_hex($_->{shorthash}), "$_->{name}($_->{section})";
b_ " $_->{locale}" if $_->{locale};
small_ " $_->{filename}";
} for(@$mans);
@ -922,7 +930,7 @@ TUWF::get qr{/man/([^/]+)/(.+)} => sub {
}
return tuwf->resNotFound if !$man;
tuwf->resRedirect("/$man->{name}/".substr($man->{hash}, 0, 8), 'temp');
tuwf->resRedirect("/$man->{name}/".shorthash_to_hex($man->{shorthash}), 'temp');
};
@ -932,9 +940,9 @@ TUWF::get qr{/lang/([^/]+)/([^/]+)} => sub {
my $lang = tuwf->capture(1);
my $name = normalize_name tuwf->capture(2);
my($man, undef) = man_pref_name $name,
sql "substring(locale from '^[^.]+') ilike", \escape_like $lang;
return tuwf->resNotFound if !$man->{name};
tuwf->resRedirect("/$man->{name}/".substr($man->{hash}, 0, 8), 'temp');
sql "substring(l.locale from '^[^.]+') ilike", \(escape_like($lang).'%');
return tuwf->resNotFound if !length $man->{name};
tuwf->resRedirect("/$man->{name}/".shorthash_to_hex($man->{shorthash}), 'temp');
};
@ -949,22 +957,24 @@ TUWF::get '/json/tree.json' => sub {
return tuwf->resNotFound() if !$f->{hash} && !($f->{section} && $f->{name});
my $l = tuwf->dbAlli("
SELECT p.system, p.category, p.name AS package, v.version, v.released, v.id AS verid, m.name, m.section, m.filename, m.locale, encode(m.hash, 'hex') AS hash
FROM man m
JOIN package_versions v ON v.id = m.package
SELECT p.system, p.category, p.name AS package, v.version, v.released, v.id AS verid, m.name, m.section, f.filename, f.shorthash, l.locale
FROM files f
JOIN locales l ON l.id = f.locale
JOIN mans m ON m.id = f.man
JOIN package_versions v ON v.id = f.pkgver
JOIN packages p ON p.id = v.package
JOIN systems s ON s.id = p.system
WHERE", sql_and(
length $f->{hash} ? sql 'm.hash = decode(', \$f->{hash}, ", 'hex')" : (),
length $f->{hash} ? sql 'f.content = (SELECT id FROM contents WHERE hash = decode(', \$f->{hash}, ", 'hex'))" : (),
length $f->{name} ? sql 'm.name =', \$f->{name} : (),
length $f->{section} ? sql 'm.section =', \$f->{section} : (),
length $f->{locale} ? sql 'm.locale =', \$f->{locale} : (),
defined $f->{locale} && $f->{locale} eq '' ? 'm.locale IS NULL' : (),
defined $f->{locale} ? sql 'l.locale =', \$f->{locale} : (),
), '
ORDER BY s.name, s.id DESC, p.name, v.released DESC, m.name, m.locale NULLS FIRST, m.filename
ORDER BY s.name, s.id DESC, p.name, v.released DESC, m.name, l.locale, f.filename
');
# Convert the list into a tree
my $cur = $f->{cur} ? shorthash_to_int substr $f->{cur}, 0, 8 : 0;
my $tree = [];
my($sys, $sysver, $pkg, $pkgver);
for my $m (@$l) {
@ -992,12 +1002,12 @@ TUWF::get '/json/tree.json' => sub {
$pkgver && $pkgver eq $m->{version} ? {name=>''} :
{name => $m->{version}, href => "/pkg/".sysbyid->{$m->{system}}{short}."/$m->{category}/$m->{package}/$m->{version}"},
{ name => "$m->{name}($m->{section})",
$f->{hash} || lc($m->{hash}) eq lc($f->{cur}) ? ()
: (href => sprintf('/%s/%s', $m->{name}, substr $m->{hash}, 0, 8))
$f->{hash} || $cur == $m->{shorthash} ? ()
: (href => sprintf('/%s/%s', $m->{name}, shorthash_to_hex $m->{shorthash}))
},
{ name => substr($m->{hash}, 0, 8),
$f->{hash} || lc($m->{hash}) eq lc($f->{cur}) ? ()
: (href => sprintf('/%s/%s', $m->{name}, substr $m->{hash}, 0, 8))
{ name => shorthash_to_hex($m->{shorthash}),
$f->{hash} || $cur == $m->{shorthash} ? ()
: (href => sprintf('/%s/%s', $m->{name}, shorthash_to_hex $m->{shorthash}))
},
{ name => $m->{filename} }
];