-- WARNING: This script does a complete rewrite of the two largest tables.
-- This requires ~25G of temporary space and takes anywhere between 10 and 20 minutes.
-- The site needs to be taken down during that time.

DROP FUNCTION section_from_filename(text);
DROP FUNCTION name_from_filename(text);
DROP FUNCTION basename_from_filename(text);

-- Interpret first 4 bytes of hash as a signed 32-bit integer.
-- Byte 0 is the least significant byte and byte 3 the most significant one.
-- When the top bit of byte 3 is set (value >= 128), that bit is stripped
-- before shifting and the result is offset by -2147483648 instead, so the
-- int arithmetic never has to represent a value above the positive int range.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$
  SELECT CASE
    WHEN get_byte(hash, 3) < 128 THEN
        (get_byte(hash, 3)::int<<24)
      + (get_byte(hash, 2)::int<<16)
      + (get_byte(hash, 1)::int<<8)
      + get_byte(hash, 0)
    ELSE
        -2147483648
      + ((get_byte(hash, 3)::int - 128)<<24)
      + (get_byte(hash, 2)::int<<16)
      + (get_byte(hash, 1)::int<<8)
      + get_byte(hash, 0)
  END;
$$ LANGUAGE SQL IMMUTABLE;

-- Rebuild 'contents' with an integer surrogate key. The old table is renamed
-- out of the way, copied into the new layout and then dropped.
ALTER TABLE contents DROP CONSTRAINT contents_pkey;
ALTER TABLE contents RENAME TO contents_old;

CREATE SEQUENCE contents_id_seq AS integer;
CREATE TABLE contents (
  id      integer NOT NULL DEFAULT nextval('contents_id_seq'::regclass),
  hash    bytea   NOT NULL,
  content text    NOT NULL
);
ALTER SEQUENCE contents_id_seq OWNED BY contents.id;

-- 4m15s; start 29.3G end 53.3G; +24G
INSERT INTO contents (hash, content)
  SELECT hash, content FROM contents_old;
DROP TABLE contents_old; -- back to 29.3G
-- Keys are added only after the bulk copy has finished.
ALTER TABLE contents ADD PRIMARY KEY (id);
ALTER TABLE contents ADD UNIQUE (hash);
-- 29.7G at this point

-- One row per distinct (name, section) pair found in 'man'.
CREATE TABLE mans (
  id      SERIAL PRIMARY KEY,
  name    text NOT NULL,
  section text NOT NULL,
  UNIQUE(name, section)
);
INSERT INTO mans (name, section)
  SELECT DISTINCT name, section FROM man;
CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops);

CREATE TABLE locales (
  id     SMALLSERIAL PRIMARY KEY,
  locale text NOT NULL UNIQUE
);
INSERT INTO locales (id, locale) VALUES (0,''); -- 0 for default locale is handy, I guess
-- The empty string is already present as id 0; skipping it here keeps the
-- UNIQUE constraint from aborting the script if 'man' stores '' as a locale.
INSERT INTO locales (locale)
  SELECT locale
    FROM man
   WHERE locale IS NOT NULL
     AND locale <> ''
   GROUP BY locale
   ORDER BY locale;

-- Encodings are stored for reference but are never
-- actually used anywhere.
CREATE TABLE encodings (
  id       SMALLSERIAL PRIMARY KEY,
  encoding text NOT NULL UNIQUE
);
INSERT INTO encodings (id, encoding) VALUES (0,'');
-- The empty string is already present as id 0; skipping it here keeps the
-- UNIQUE constraint from aborting the script if 'man' stores '' as an encoding.
INSERT INTO encodings (encoding)
  SELECT encoding
    FROM man
   WHERE encoding IS NOT NULL
     AND encoding <> ''
   GROUP BY encoding
   ORDER BY encoding;

-- Replaces the 'man' table; It's the largest table in terms of number of rows
-- and it's pretty frequently accessed, so keeping the rows small helps.
CREATE TABLE files (
  pkgver    integer  NOT NULL, -- package_versions.id
  man       integer  NOT NULL, -- mans.id
  content   integer  NOT NULL, -- contents.id
  shorthash integer  NOT NULL, -- cache: hash_to_shorthash(contents.hash)
  locale    smallint NOT NULL, -- locales.id
  encoding  smallint NOT NULL, -- encodings.id
  filename  text     NOT NULL
);

-- 1min; 29.7G -> 31.8G
-- The target columns are listed explicitly so the mapping does not depend on
-- the column order of 'files'. A NULL locale/encoding in 'man' maps to the
-- '' row (id 0) of the respective lookup table.
INSERT INTO files (pkgver, man, content, shorthash, locale, encoding, filename)
  SELECT o.package,
         m.id,
         c.id,
         hash_to_shorthash(o.hash),
         l.id,
         e.id,
         o.filename
    FROM man o
    JOIN mans m      ON m.name = o.name AND m.section = o.section
    JOIN contents c  ON c.hash = o.hash
    JOIN locales l   ON l.locale = coalesce(o.locale, '')
    JOIN encodings e ON e.encoding = coalesce(o.encoding, '');

-- 1min; 31.8G -> 33.7G for both indices
ALTER TABLE files ADD PRIMARY KEY (pkgver, filename);
CREATE INDEX ON files (man, shorthash);
CREATE INDEX ON files (content);

-- 20sec to verify
ALTER TABLE files
  ADD CONSTRAINT files_pkgver_fkey   FOREIGN KEY (pkgver)   REFERENCES package_versions (id) ON DELETE CASCADE,
  ADD CONSTRAINT files_man_fkey      FOREIGN KEY (man)      REFERENCES mans (id),
  ADD CONSTRAINT files_content_fkey  FOREIGN KEY (content)  REFERENCES contents (id),
  ADD CONSTRAINT files_locale_fkey   FOREIGN KEY (locale)   REFERENCES locales (id),
  ADD CONSTRAINT files_encoding_fkey FOREIGN KEY (encoding) REFERENCES encodings (id);

DROP TABLE man;
DROP TABLE man_index;

-- final: 29.1G; we saved a whole 300M! \o/

-- There's only about a 100 unreferenced rows, leftovers from removals of
-- incorrectly indexed packages. Let's remove them while we're at it.
-- Delete every 'contents' row that no 'files' row points at.
-- Both sides of the correlation are qualified on purpose: 'contents' and
-- 'files' each have a column named "content", so the unqualified form only
-- gives the intended result through name-resolution order.
DELETE FROM contents AS c
 WHERE NOT EXISTS (
         SELECT 1
           FROM files AS f
          WHERE f.content = c.id
       );

-- Reclaim dead space and refresh planner statistics on the rewritten tables.
VACUUM ANALYZE mans, files, contents, locales, encodings;