-- To import an existing database dump, make sure the data is available in -- $table.tsv.zst files in the current directory and then run: -- -- psql -U manned -f import.sql -- -- Missing some data files is fine if you're only interested in a subset of the -- data, error messages relating to the missing tables can be safely ignored. -- -- -- To stream and import a database dump without saving the files separately: -- -- URL=https://dl.manned.org/2025-10-07 && curl -s $URL/import.sql \ -- | sed "s#zstd -dc \([^ ]\+.tsv.zst\)#curl -s $URL/\\1 | zstd -d#" \ -- | psql -U manned -- -- -- To initialize a fresh, empty database: -- -- grep -v '^\\copy' import.sql | psql -U manned -- -- -- All numeric identifiers should be considered internal and unstable. They are -- not exposed through the web interface. -- -- The `stats_cache` table is missing from this file, refer to -- util/update_indices.sql. -- -- Tables -- CREATE TABLE systems ( -- Manually assigned number. The id is also used for ordering different -- releases of the same system, as identified by 'name'. id integer NOT NULL, name text NOT NULL, release text, short text NOT NULL ); CREATE TABLE contents ( id SERIAL NOT NULL, -- 'hash' is the SHA1 of the man page file after decompression but *before* -- encoding conversion and removing 0-bytes. This means taking sha1(content) -- may not necessary match the hash, and it's possible for the same content -- to be in the database under multiple hashes (but I suspect that's rare). hash bytea NOT NULL, content text NOT NULL, html text ); -- Unique man page, as identified by name & section CREATE TABLE mans ( id SERIAL NOT NULL, name text NOT NULL, section text NOT NULL ); -- List of man page locales for efficient referencing. Some locales include -- the encoding in their name, which isn't really correct or even necessary -- since we convert everything to UTF-8 anyway, but w/e, Can fix later. CREATE TABLE locales ( id SMALLSERIAL NOT NULL, locale text NOT NULL ); -- List of encodings for efficient referencing. CREATE TABLE encodings ( id SMALLSERIAL NOT NULL, encoding text NOT NULL ); CREATE TABLE packages ( id SERIAL NOT NULL, system integer NOT NULL, name text NOT NULL, -- Whether this package has been seen in the last repository update. This -- field is only updated for a few systems that are likely to delete packages -- over time; non-rolling-release distros tend to not delete packages after -- all. -- Packages where the latest version does not have any man pages may also be -- marked as dead even if the package is still available in the repos. dead boolean NOT NULL DEFAULT FALSE, -- Whether this package has at least one man page indexed in the database. -- The indexer uses this table to keep track of which packages it has -- already indexed, but not all packages seen by the indexer have a man page. -- This cache helps the web front-end filter out irrelevant packages faster. c_hasman boolean NOT NULL DEFAULT FALSE ); CREATE TABLE package_versions ( id SERIAL NOT NULL, package integer NOT NULL, version text NOT NULL, released date NOT NULL, arch text ); CREATE TABLE files ( pkgver integer NOT NULL, man integer NOT NULL, content integer NOT NULL, shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash) locale smallint NOT NULL, -- The original encoding the man page was found in. This column isn't really -- used at the moment, but is potentially useful when investigating encoding -- issues. encoding smallint NOT NULL, filename text NOT NULL ); -- -- Data -- \copy systems from program 'zstd -dc systems.tsv.zst' \copy contents (id, hash, content) from program 'zstd -dc contents.tsv.zst' \copy mans from program 'zstd -dc mans.tsv.zst' \copy locales from program 'zstd -dc locales.tsv.zst' \copy encodings from program 'zstd -dc encodings.tsv.zst' \copy packages from program 'zstd -dc packages.tsv.zst' \copy package_versions from program 'zstd -dc package_versions.tsv.zst' \copy files from program 'zstd -dc files.tsv.zst' -- -- Primary keys & indices -- ALTER TABLE systems ADD CONSTRAINT systems_pkey PRIMARY KEY (id); ALTER TABLE contents ADD CONSTRAINT contents_pkey PRIMARY KEY (id); ALTER TABLE mans ADD CONSTRAINT mans_pkey PRIMARY KEY (id); ALTER TABLE locales ADD CONSTRAINT locales_pkey PRIMARY KEY (id); ALTER TABLE encodings ADD CONSTRAINT encodings_pkey PRIMARY KEY (id); ALTER TABLE packages ADD CONSTRAINT packages_pkey PRIMARY KEY (id); ALTER TABLE package_versions ADD CONSTRAINT package_versions_pkey PRIMARY KEY (id); ALTER TABLE files ADD CONSTRAINT files_pkey PRIMARY KEY (pkgver, filename); CREATE UNIQUE INDEX contents_hash_key ON contents (hash); CREATE UNIQUE INDEX mans_name_section_key ON mans (name, section); CREATE UNIQUE INDEX locales_locale_key ON locales (locale); CREATE UNIQUE INDEX encodings_encoding_key ON encodings (encoding); CREATE UNIQUE INDEX packages_system_name_key ON packages (system, name) INCLUDE (id, c_hasman, dead); CREATE UNIQUE INDEX package_versions_package_version_key ON package_versions (package, version); CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL; CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops); CREATE INDEX files_man_shorthash_idx ON files (man, shorthash); CREATE INDEX files_content_idx ON files (content); -- -- Constraints -- ALTER TABLE packages ADD CONSTRAINT packages_system_fkey FOREIGN KEY (system) REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE; ALTER TABLE package_versions ADD CONSTRAINT package_versions_package_fkey FOREIGN KEY (package) REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE; ALTER TABLE files ADD CONSTRAINT files_pkgver_fkey FOREIGN KEY (pkgver) REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE, ADD CONSTRAINT files_man_fkey FOREIGN KEY (man) REFERENCES mans(id), ADD CONSTRAINT files_content_fkey FOREIGN KEY (content) REFERENCES contents(id), ADD CONSTRAINT files_locale_fkey FOREIGN KEY (locale) REFERENCES locales(id), ADD CONSTRAINT files_encoding_fkey FOREIGN KEY (encoding) REFERENCES encodings(id); -- -- Sequences -- SELECT setval('contents_id_seq', (SELECT MAX(id) FROM contents)); SELECT setval('encodings_id_seq', (SELECT MAX(id) FROM encodings)); SELECT setval('locales_id_seq', (SELECT MAX(id) FROM locales)); SELECT setval('mans_id_seq', (SELECT MAX(id) FROM mans)); SELECT setval('package_versions_id_seq', (SELECT MAX(id) FROM package_versions)); SELECT setval('packages_id_seq', (SELECT MAX(id) FROM packages)); -- -- Utility functions -- -- Interpret first 4 bytes of hash as a signed 32-bit integer. -- TODO: Postgres 18 allows casting between bytea and int, see if that can be used instead. CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$ SELECT CASE WHEN get_byte(hash, 3) < 128 THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) END; $$ IMMUTABLE LANGUAGE SQL; CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$ SELECT locale IS NULL OR locale = '' OR locale LIKE 'en%'; $$ IMMUTABLE LANGUAGE SQL; CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$ SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%'; $$ IMMUTABLE LANGUAGE sql; -- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. -- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ SELECT CASE WHEN chr = '0' THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) END; $$ IMMUTABLE LANGUAGE SQL;