Large-ish SQL schema revamp/optimizations

Primarily aimed at reducing the size of the old 'man' (now: files) table, using smaller integers to refer to man contents and text fields, and storing a shorthash as an integer for quick lookups. This better normalization also removes the need to keep a separate 'man_index' cache for the search function. The old schema wasn't necessarily bad, but I was in the mood for some optimizations. And a little cleanup. Probably introduces a bunch of new bugs, I haven't tested this too well.
2021-12-14 15:06:05 +01:00 · 2021-12-14 15:06:05 +01:00 · f376f1f137
commit f376f1f137
parent 6f7f59c6df
6 changed files with 268 additions and 128 deletions
--- a/sql/update-2021-12-14.sql
+++ b/sql/update-2021-12-14.sql
@ -0,0 +1,106 @@
+-- WARNING: This script does a complete rewrite of the two largest tables.
+-- This requires ~25G of temporary space and takes anywhere between 10 and 20 minutes.
+-- The site needs to be taken down during that time.
+
+
+-- Remove three single-argument (text) helper functions.
+-- NOTE(review): going by their names these derived the section, name and
+-- basename from a man page filename, and are presumably obsolete now that
+-- name/section are stored in the 'mans' table -- confirm there are no
+-- remaining callers before running this.
+DROP FUNCTION section_from_filename(text);
+DROP FUNCTION name_from_filename(text);
+DROP FUNCTION basename_from_filename(text);
+
+-- Interpret the first 4 bytes of hash as a little-endian signed 32-bit
+-- integer: byte 0 is the least significant byte and the top bit of byte 3 is
+-- the sign bit. Any bytes after the fourth are ignored.
+-- The hash must be at least 4 bytes long; get_byte() raises an error
+-- otherwise. A NULL hash yields NULL.
+CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$
+  -- Assemble the low 31 bits as a non-negative integer, then subtract 2^31
+  -- when the sign bit is set. The literal 2147483648 does not fit in an
+  -- integer and is therefore a bigint, so the result is cast back explicitly
+  -- instead of relying on an implicit coercion to the declared return type.
+  SELECT (
+      ((get_byte(hash, 3) & 127) << 24)
+    + (get_byte(hash, 2) << 16)
+    + (get_byte(hash, 1) << 8)
+    + get_byte(hash, 0)
+    - CASE WHEN get_byte(hash, 3) >= 128 THEN 2147483648 ELSE 0 END
+  )::integer;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+-- Rebuild 'contents' with a compact integer surrogate key.
+-- The existing table is moved out of the way first. Its primary key is
+-- dropped before the rename, presumably so that the 'contents_pkey' name is
+-- free for the replacement table.
+ALTER TABLE contents
+    DROP CONSTRAINT contents_pkey;
+ALTER TABLE contents
+    RENAME TO contents_old;
+
+CREATE SEQUENCE contents_id_seq
+    AS integer;
+
+CREATE TABLE contents (
+    id      integer NOT NULL DEFAULT nextval('contents_id_seq'::regclass),
+    hash    bytea   NOT NULL,
+    content text    NOT NULL
+);
+
+-- Make the sequence a dependent of the column it feeds.
+ALTER SEQUENCE contents_id_seq
+    OWNED BY contents.id;
+
+-- Copy step: 4m15s; database size grew from 29.3G to 53.3G (+24G).
+INSERT INTO contents (hash, content)
+    SELECT co.hash, co.content
+      FROM contents_old AS co;
+
+-- Dropping the old copy brings the size back to 29.3G.
+DROP TABLE contents_old;
+
+-- Keys are added only after the bulk load. The constraint names are the ones
+-- PostgreSQL would have generated by default.
+ALTER TABLE contents
+    ADD CONSTRAINT contents_pkey PRIMARY KEY (id);
+ALTER TABLE contents
+    ADD CONSTRAINT contents_hash_key UNIQUE (hash);
+-- 29.7G at this point
+
+
+
+-- One row per distinct (name, section) pair found in the old 'man' table.
+-- Constraint names are the ones PostgreSQL would have generated by default.
+CREATE TABLE mans (
+    id      serial,
+    name    text NOT NULL,
+    section text NOT NULL,
+    CONSTRAINT mans_pkey PRIMARY KEY (id),
+    CONSTRAINT mans_name_section_key UNIQUE (name, section)
+);
+
+INSERT INTO mans (name, section)
+    SELECT DISTINCT o.name, o.section
+      FROM man AS o;
+
+-- Expression index on the lower-cased name using text_pattern_ops.
+-- NOTE(review): presumably there to serve case-insensitive prefix (LIKE)
+-- lookups on the name -- confirm against the search queries.
+CREATE INDEX mans_name
+    ON mans USING btree (lower(name) text_pattern_ops);
+
+-- Lookup table mapping each locale string to a small integer id.
+CREATE TABLE locales (
+  id      SMALLSERIAL PRIMARY KEY,
+  locale  text NOT NULL UNIQUE
+);
+-- Id 0 with an empty string represents the default locale, which is handy.
+-- The explicit id does not touch the sequence, which starts handing out ids at 1.
+INSERT INTO locales (id, locale) VALUES (0, '');
+-- Skip empty strings as well as NULLs: the empty locale is already covered by
+-- id 0, and inserting it again would violate the UNIQUE constraint and abort
+-- the migration.
+INSERT INTO locales (locale)
+    SELECT locale
+      FROM man
+     WHERE locale IS NOT NULL
+       AND locale <> ''
+     GROUP BY locale
+     ORDER BY locale;
+
+-- Lookup table mapping each encoding string to a small integer id.
+-- Encodings are stored for reference but are never actually used anywhere.
+CREATE TABLE encodings (
+  id       SMALLSERIAL PRIMARY KEY,
+  encoding text NOT NULL UNIQUE
+);
+-- Id 0 with an empty string represents "no encoding recorded".
+-- The explicit id does not touch the sequence, which starts handing out ids at 1.
+INSERT INTO encodings (id, encoding) VALUES (0, '');
+-- Skip empty strings as well as NULLs: the empty encoding is already covered
+-- by id 0, and inserting it again would violate the UNIQUE constraint and
+-- abort the migration.
+INSERT INTO encodings (encoding)
+    SELECT encoding
+      FROM man
+     WHERE encoding IS NOT NULL
+       AND encoding <> ''
+     GROUP BY encoding
+     ORDER BY encoding;
+
+
+-- Replacement for the 'man' table. It has the most rows of any table and is
+-- accessed pretty frequently, so the rows are kept as small as possible.
+-- Keys and indexes are not declared here; they are added after the bulk load.
+CREATE TABLE files (
+    -- package_versions.id
+    pkgver    integer  NOT NULL,
+    -- mans.id
+    man       integer  NOT NULL,
+    -- contents.id
+    content   integer  NOT NULL,
+    -- cache: hash_to_shorthash(contents.hash)
+    shorthash integer  NOT NULL,
+    -- locales.id
+    locale    smallint NOT NULL,
+    -- encodings.id
+    encoding  smallint NOT NULL,
+    filename  text     NOT NULL
+);
+
+-- Populate 'files' from the old 'man' table, replacing each text/hash value
+-- with the id of its row in the corresponding lookup table. A NULL locale or
+-- encoding is matched against the '' row of its lookup table.
+-- The target columns are listed explicitly so the statement does not depend
+-- on the physical column order of 'files'.
+-- 1min; 29.7G -> 31.8G
+INSERT INTO files (pkgver, man, content, shorthash, locale, encoding, filename)
+    SELECT o.package, m.id, c.id, hash_to_shorthash(o.hash), l.id, e.id, o.filename
+      FROM man o
+      INNER JOIN mans m ON m.name = o.name AND m.section = o.section
+      INNER JOIN contents c ON c.hash = o.hash
+      INNER JOIN locales l ON l.locale = coalesce(o.locale, '')
+      INNER JOIN encodings e ON e.encoding = coalesce(o.encoding, '');
+
+-- Primary key and secondary indexes for 'files', built after the bulk load.
+-- The names are the ones PostgreSQL would have generated by default.
+-- 1min; 31.8G -> 33.7G for both indices
+ALTER TABLE files
+    ADD CONSTRAINT files_pkey PRIMARY KEY (pkgver, filename);
+CREATE INDEX files_man_shorthash_idx
+    ON files (man, shorthash);
+CREATE INDEX files_content_idx
+    ON files (content);
+
+-- Foreign keys for every id column of 'files', added in a single ALTER TABLE.
+-- Only the package_versions reference cascades deletes; the other four use
+-- the default action.
+-- Verification takes about 20sec.
+ALTER TABLE files
+    ADD CONSTRAINT files_pkgver_fkey
+        FOREIGN KEY (pkgver) REFERENCES package_versions (id) ON DELETE CASCADE,
+    ADD CONSTRAINT files_man_fkey
+        FOREIGN KEY (man) REFERENCES mans (id),
+    ADD CONSTRAINT files_content_fkey
+        FOREIGN KEY (content) REFERENCES contents (id),
+    ADD CONSTRAINT files_locale_fkey
+        FOREIGN KEY (locale) REFERENCES locales (id),
+    ADD CONSTRAINT files_encoding_fkey
+        FOREIGN KEY (encoding) REFERENCES encodings (id);
+
+-- The old 'man' table has been copied into 'files' above and is no longer
+-- needed. 'man_index' was the separate cache kept for the search function,
+-- which the normalized schema makes unnecessary.
+DROP TABLE man;
+DROP TABLE man_index;
+-- final: 29.1G; we saved a whole 300M! \o/
+
+
+-- There are only about 100 unreferenced rows, leftovers from removals of
+-- incorrectly indexed packages. Let's remove them while we're at it.
+-- Column references are qualified because both 'contents' and 'files' have a
+-- column named 'content'.
+DELETE FROM contents
+ WHERE NOT EXISTS (SELECT 1 FROM files WHERE files.content = contents.id);
+
+-- Reclaim dead space and refresh planner statistics for the five tables
+-- created by this script. VACUUM cannot run inside a transaction block.
+VACUUM (ANALYZE) mans, files, contents, locales, encodings;