Large-ish SQL schema revamp/optimizations

Primarily aimed at reducing the size of the old 'man' (now: files)
table, using smaller integers to refer to man contents and text fields,
and storing a shorthash as an integer for quick lookups. This better
normalization also removes the need to keep a separate 'man_index' cache
for the search function.

The old schema wasn't necessarily bad, but I was in the mood for some
optimizations. And a little cleanup.

This probably introduces a bunch of new bugs; I haven't tested it thoroughly.
This commit is contained in:
Yorhel 2021-12-14 15:06:05 +01:00
parent 6f7f59c6df
commit f376f1f137
6 changed files with 268 additions and 128 deletions

View file

@ -7,15 +7,44 @@ CREATE TABLE systems (
short varchar NOT NULL
);
-- Man page contents, keyed by the hash of the original file.
-- NOTE(review): the two hash/content column pairs below look like the removed
-- and the added side of a diff shown together; only one pair can belong to the
-- real table definition -- confirm against the actual schema file.
CREATE TABLE contents (
id SERIAL PRIMARY KEY,
-- 'hash' is the SHA1 of the man page file after decompression but *before*
-- encoding conversion and removing 0-bytes. This means taking sha1(content)
-- may not necessarily match the hash, and it's possible for the same content
-- to be in the database under multiple hashes (but I suspect that's rare).
hash bytea PRIMARY KEY,
content varchar NOT NULL
hash bytea NOT NULL UNIQUE,
content text NOT NULL
);
-- One row per distinct man page; a page is identified by its (name, section) pair.
CREATE TABLE mans (
  id      serial,
  name    text NOT NULL,
  section text NOT NULL,
  PRIMARY KEY (id),
  UNIQUE (name, section)
);
-- Expression index on the lower-cased name, using the text_pattern_ops operator class.
CREATE INDEX mans_name ON mans (lower(name) text_pattern_ops);
-- Lookup table of man page locales, so that other tables can refer to a locale
-- by a small integer id. Some locale names still carry an encoding suffix; that
-- is neither correct nor needed (everything is converted to UTF-8 anyway) and
-- can be cleaned up later.
CREATE TABLE locales (
  id     smallserial,
  locale text NOT NULL,
  PRIMARY KEY (id),
  UNIQUE (locale)
);
-- Lookup table of encodings, so that other tables can refer to an encoding by
-- a small integer id.
CREATE TABLE encodings (
  id       smallserial,
  encoding text NOT NULL,
  PRIMARY KEY (id),
  UNIQUE (encoding)
);
CREATE TABLE packages (
id SERIAL PRIMARY KEY,
system integer NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE,
@ -31,6 +60,7 @@ CREATE TABLE packages (
UNIQUE(system, name, category) -- Note the order, lookups on (system,name) are common
);
CREATE TABLE package_versions (
id SERIAL PRIMARY KEY,
package integer NOT NULL REFERENCES packages(id) ON DELETE CASCADE,
@ -40,43 +70,36 @@ CREATE TABLE package_versions (
UNIQUE(package, version)
);
CREATE TABLE man (
package integer NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE,
name varchar NOT NULL,
filename varchar NOT NULL,
locale varchar,
hash bytea NOT NULL,
section varchar NOT NULL,
encoding varchar,
UNIQUE(package, filename)
-- One row per man page file found in a package version. The row only holds
-- small integer references (mans, contents, locales, encodings) plus the
-- filename.
CREATE TABLE files (
  pkgver    integer  NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE,
  man       integer  NOT NULL REFERENCES mans(id),
  -- The referenced table is 'contents' (plural); 'content' does not exist.
  content   integer  NOT NULL REFERENCES contents(id),
  shorthash integer  NOT NULL, -- cache: hash_to_shorthash(contents.hash)
  locale    smallint NOT NULL REFERENCES locales(id),
  -- The original encoding the man page was found in. This column isn't really
  -- used at the moment, but is potentially useful when investigating encoding
  -- issues.
  encoding  smallint NOT NULL REFERENCES encodings(id),
  filename  text     NOT NULL,
  PRIMARY KEY(pkgver, filename)
);
-- Lookup by man page, optionally narrowed by shorthash.
CREATE INDEX ON files (man, shorthash);
-- Lookup of all files sharing the same contents row.
CREATE INDEX ON files (content);
-- Single-column lookup indexes on the man table.
CREATE INDEX ON man USING btree (hash);
CREATE INDEX ON man USING btree (name);
-- For stats_cache
\i util/update_indices.sql
-- Distinct (name, section) pairs taken from man, indexed on the lower-cased name.
CREATE TABLE man_index AS
  SELECT DISTINCT name, section
  FROM man;
CREATE INDEX ON man_index (lower(name) text_pattern_ops);
-- Summary counts over man: distinct hashes, distinct names, rows, distinct packages.
CREATE TABLE stats_cache AS
  SELECT
    count(DISTINCT hash)    AS hashes,
    count(DISTINCT name)    AS mans,
    count(*)                AS files,
    count(DISTINCT package) AS packages
  FROM man;
-- Strips the leading directory path and any trailing compression extensions
-- (.gz, .lzma, .xz, .bz2, .zst, possibly repeated) from a filename.
-- A value that does not match the pattern, e.g. one without any '/', is
-- returned unchanged.
CREATE OR REPLACE FUNCTION basename_from_filename(fn text)
RETURNS text
LANGUAGE SQL
AS $body$
  SELECT regexp_replace(
    fn,
    '^.+/([^/][^/]*?)(?:\.gz|\.lzma|\.xz|\.bz2|\.zst)*$',
    '\1'
  );
$body$;
-- Returns the part of the basename after its last '.', i.e. the section
-- suffix. A basename without any '.' is returned unchanged.
CREATE OR REPLACE FUNCTION section_from_filename(text)
RETURNS text
LANGUAGE SQL
AS $body$
  SELECT regexp_replace(
    basename_from_filename($1),
    '^.+\.([^.]+)$',
    '\1'
  );
$body$;
-- Returns the part of the basename before its last '.', i.e. the man page
-- name without the section suffix. A basename without any '.' is returned
-- unchanged.
CREATE OR REPLACE FUNCTION name_from_filename(text)
RETURNS text
LANGUAGE SQL
AS $body$
  SELECT regexp_replace(
    basename_from_filename($1),
    '^(.+)\.[^.]+$',
    '\1'
  );
$body$;
-- Reads the first 4 bytes of 'hash' as a signed 32-bit integer, with byte 0 as
-- the least significant byte. The top bit of byte 3 is the sign bit: it is
-- applied as a separate -2147483648 term so that the lower 31 bits can be
-- summed without leaving the integer range.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea)
RETURNS integer
LANGUAGE SQL
IMMUTABLE
AS $body$
  SELECT CASE WHEN get_byte(hash, 3) >= 128 THEN -2147483648 ELSE 0 END
       + ((get_byte(hash, 3) % 128) << 24)
       + (get_byte(hash, 2) << 16)
       + (get_byte(hash, 1) << 8)
       + get_byte(hash, 0);
$body$;
CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$

106
sql/update-2021-12-14.sql Normal file
View file

@ -0,0 +1,106 @@
-- WARNING: This script does a complete rewrite of the two largest tables.
-- This requires ~25G of temporary space and takes anywhere between 10 and 20 minutes.
-- The site needs to be taken down during that time.
-- Drop the three filename-parsing helper functions. Each statement fails on
-- its own if the function does not exist (no IF EXISTS).
-- NOTE(review): presumably they are obsolete because name and section are now
-- stored in the mans table -- confirm nothing else still calls them.
DROP FUNCTION section_from_filename(text);
DROP FUNCTION name_from_filename(text);
DROP FUNCTION basename_from_filename(text);
-- Reads the first 4 bytes of 'hash' as a signed 32-bit integer, with byte 0 as
-- the least significant byte. The top bit of byte 3 is the sign bit: it is
-- applied as a separate -2147483648 term so that the lower 31 bits can be
-- summed without leaving the integer range.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea)
RETURNS integer
LANGUAGE SQL
IMMUTABLE
AS $body$
  SELECT CASE WHEN get_byte(hash, 3) >= 128 THEN -2147483648 ELSE 0 END
       + ((get_byte(hash, 3) % 128) << 24)
       + (get_byte(hash, 2) << 16)
       + (get_byte(hash, 1) << 8)
       + get_byte(hash, 0);
$body$;
-- Rebuild 'contents' with a new integer id column. The old table is moved out
-- of the way first (its primary key constraint is dropped so the name
-- contents_pkey is free again), then copied row by row into the new layout.
ALTER TABLE contents
  DROP CONSTRAINT contents_pkey;
ALTER TABLE contents
  RENAME TO contents_old;

-- The id sequence is created by hand and attached to the column afterwards.
CREATE SEQUENCE contents_id_seq AS integer;
CREATE TABLE contents (
  id      integer DEFAULT nextval('contents_id_seq'::regclass) NOT NULL,
  hash    bytea   NOT NULL,
  content text    NOT NULL
);
ALTER SEQUENCE contents_id_seq
  OWNED BY contents.id;

-- Copy step: 4m15s; start 29.3G end 53.3G; +24G
INSERT INTO contents (hash, content)
  SELECT o.hash, o.content
  FROM contents_old AS o;
-- Dropping the old copy brings the size back to 29.3G
DROP TABLE contents_old;

-- Keys are added only after the bulk load.
ALTER TABLE contents
  ADD PRIMARY KEY (id);
ALTER TABLE contents
  ADD UNIQUE (hash);
-- 29.7G at this point
-- One row per distinct (name, section) pair found in the old man table.
CREATE TABLE mans (
  id      serial,
  name    text NOT NULL,
  section text NOT NULL,
  PRIMARY KEY (id),
  UNIQUE (name, section)
);
INSERT INTO mans (name, section)
  SELECT DISTINCT o.name, o.section
  FROM man AS o;
-- Expression index on the lower-cased name, created after the bulk load.
CREATE INDEX mans_name ON mans (lower(name) text_pattern_ops);
-- Lookup table mapping each locale string to a small integer id.
CREATE TABLE locales (
  id     SMALLSERIAL PRIMARY KEY,
  locale text NOT NULL UNIQUE
);
-- 0 for default locale is handy, I guess. The explicit id does not touch the
-- sequence, which hands out ids from 1 for the rows inserted below.
INSERT INTO locales (id, locale) VALUES (0, '');
-- Remaining locales in alphabetical order. An empty-string locale in the old
-- data is skipped: '' already exists as id 0 and inserting it again would
-- violate the UNIQUE constraint and abort this statement.
INSERT INTO locales (locale)
  SELECT locale
  FROM man
  WHERE locale IS NOT NULL
    AND locale <> ''
  GROUP BY locale
  ORDER BY locale;
-- Encodings are stored for reference but are never actually used anywhere.
-- Lookup table mapping each encoding string to a small integer id.
CREATE TABLE encodings (
  id       SMALLSERIAL PRIMARY KEY,
  encoding text NOT NULL UNIQUE
);
-- id 0 stands for "no encoding recorded"; the explicit id does not touch the
-- sequence, which hands out ids from 1 for the rows inserted below.
INSERT INTO encodings (id, encoding) VALUES (0, '');
-- Remaining encodings in alphabetical order. An empty-string encoding in the
-- old data is skipped: '' already exists as id 0 and inserting it again would
-- violate the UNIQUE constraint and abort this statement.
INSERT INTO encodings (encoding)
  SELECT encoding
  FROM man
  WHERE encoding IS NOT NULL
    AND encoding <> ''
  GROUP BY encoding
  ORDER BY encoding;
-- Replaces the 'man' table; It's the largest table in terms of number of rows
-- and it's pretty frequently accessed, so keeping the rows small helps.
-- The table is created bare; the primary key, indexes and foreign keys are
-- added only after the bulk load.
CREATE TABLE files (
  pkgver    integer  NOT NULL, -- package_versions.id
  man       integer  NOT NULL, -- mans.id
  content   integer  NOT NULL, -- contents.id
  shorthash integer  NOT NULL, -- cache: hash_to_shorthash(contents.hash)
  locale    smallint NOT NULL, -- locales.id
  encoding  smallint NOT NULL, -- encodings.id
  filename  text     NOT NULL
);
-- 1min; 29.7G -> 31.8G
-- The target columns are listed explicitly so the load does not depend on the
-- column order of the CREATE TABLE above. NULL locale/encoding values in the
-- old table map to the '' row of the lookup tables.
-- NOTE(review): these are inner joins, so a man row whose hash has no match in
-- contents would be silently left out -- confirm no such rows exist.
INSERT INTO files (pkgver, man, content, shorthash, locale, encoding, filename)
  SELECT o.package, m.id, c.id, hash_to_shorthash(o.hash), l.id, e.id, o.filename
  FROM man o
  JOIN mans m ON m.name = o.name AND m.section = o.section
  JOIN contents c ON c.hash = o.hash
  JOIN locales l ON l.locale = coalesce(o.locale, '')
  JOIN encodings e ON e.encoding = coalesce(o.encoding, '');
-- 1min; 31.8G -> 33.7G for both indices
ALTER TABLE files ADD PRIMARY KEY (pkgver, filename);
CREATE INDEX ON files (man, shorthash);
CREATE INDEX ON files (content);
-- 20sec to verify
ALTER TABLE files
  ADD CONSTRAINT files_pkgver_fkey FOREIGN KEY (pkgver) REFERENCES package_versions (id) ON DELETE CASCADE,
  ADD CONSTRAINT files_man_fkey FOREIGN KEY (man) REFERENCES mans (id),
  ADD CONSTRAINT files_content_fkey FOREIGN KEY (content) REFERENCES contents (id),
  ADD CONSTRAINT files_locale_fkey FOREIGN KEY (locale) REFERENCES locales (id),
  ADD CONSTRAINT files_encoding_fkey FOREIGN KEY (encoding) REFERENCES encodings (id);
-- The old tables are no longer needed once 'files' and 'mans' are populated.
DROP TABLE man;
DROP TABLE man_index;
-- final: 29.1G; we saved a whole 300M! \o/
-- Only about a hundred contents rows are not referenced by any file; they are
-- leftovers from removals of incorrectly indexed packages. Remove them while
-- we're at it.
DELETE FROM contents AS c
WHERE NOT EXISTS (
  SELECT 1
  FROM files AS f
  WHERE f.content = c.id
);
-- Reclaim space and refresh planner statistics for the new and rewritten tables.
VACUUM (ANALYZE) mans, files, contents, locales, encodings;