From 902048e2820496c4e7ca648c0c1aca6fa421e75c Mon Sep 17 00:00:00 2001 From: Yorhel Date: Wed, 8 Oct 2025 09:40:50 +0200 Subject: [PATCH] Change database dump format + add import & export scripts The new format allows for downloading and importing only a part of the database - useful when only the metadata is required - and doesn't include the wasteful preformatted HTML cache. This also ensures that the new import.sql script is actually usable and in sync with the actual database. The old schema.sql was neither. (And this simplifies my backup scripts) --- .gitignore | 1 + README.md | 1 - import.sql | 227 ++++++++++++++++++++++++++++++++++++++++ schema.sql | 127 ---------------------- util/cron.sh | 3 + util/export.sh | 34 ++++++ util/update_indices.sql | 2 +- www/index.pl | 19 ++-- 8 files changed, 273 insertions(+), 141 deletions(-) create mode 100644 import.sql delete mode 100644 schema.sql create mode 100755 util/export.sh diff --git a/.gitignore b/.gitignore index 2f129b8..a23fd4b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ indexer/target web/target util/.config +dl/ diff --git a/README.md b/README.md index 05c5b33..837abfe 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,6 @@ Ironically, documentation about how things work is completely lacking. - **indexer/** -> The Rust program that scans package repositories for updates, fetches new packages and extracts the man pages. - **ManUtils/** -> Perl/XS helper module to format man pages into HTML (uses **web/**). -- **sql/** -> Database schema & updates. - **util/** -> Cron job and scripts to run **indexer/** on the right repositories. - **web/** -> Badly named Rust library to convert man pages into HTML. - **www/** -> The web front-end. 
diff --git a/import.sql b/import.sql new file mode 100644 index 0000000..1bec1fb --- /dev/null +++ b/import.sql @@ -0,0 +1,227 @@ +-- To import an existing database dump, make sure the data is available in +-- $table.tsv.zst files in the current directory and then run: +-- +-- psql -U manned -f import.sql +-- +-- Missing some data files is fine if you're only interested in a subset of the +-- data, error messages relating to the missing tables can be safely ignored. +-- +-- +-- To stream and import a database dump without saving the files separately: +-- +-- URL=https://dl.manned.org/2025-10-07 && curl -s $URL/import.sql \ +-- | sed "s#zstd -dc \([^ ]\+.tsv.zst\)#curl -s $URL/\\1 | zstd -d#" \ +-- | psql -U manned +-- +-- +-- To initialize a fresh, empty database: +-- +-- grep -v '^\\copy' import.sql | psql -U manned +-- +-- +-- All numeric identifiers should be considered internal and unstable. They are +-- not exposed through the web interface. +-- +-- The `stats_cache` table is missing from this file, refer to +-- util/update_indices.sql. + + +-- +-- Tables +-- + +CREATE TABLE systems ( + -- Manually assigned number. The id is also used for ordering different + -- releases of the same system, as identified by 'name'. + id integer NOT NULL, + name text NOT NULL, + release text, + short text NOT NULL +); + + +CREATE TABLE contents ( + id SERIAL NOT NULL, + -- 'hash' is the SHA1 of the man page file after decompression but *before* + -- encoding conversion and removing 0-bytes. This means taking sha1(content) + -- may not necessarily match the hash, and it's possible for the same content + -- to be in the database under multiple hashes (but I suspect that's rare). + hash bytea NOT NULL, + content text NOT NULL, + html text +); + + +-- Unique man page, as identified by name & section +CREATE TABLE mans ( + id SERIAL NOT NULL, + name text NOT NULL, + section text NOT NULL +); + + +-- List of man page locales for efficient referencing. 
Some locales include +-- the encoding in their name, which isn't really correct or even necessary +-- since we convert everything to UTF-8 anyway, but w/e, Can fix later. +CREATE TABLE locales ( + id SMALLSERIAL NOT NULL, + locale text NOT NULL +); + + +-- List of encodings for efficient referencing. +CREATE TABLE encodings ( + id SMALLSERIAL NOT NULL, + encoding text NOT NULL +); + + +CREATE TABLE packages ( + id SERIAL NOT NULL, + system integer NOT NULL, + name text NOT NULL, + -- Whether this package has been seen in the last repository update. This + -- field is only updated for a few systems that are likely to delete packages + -- over time; non-rolling-release distros tend to not delete packages after + -- all. + -- Packages where the latest version does not have any man pages may also be + -- marked as dead even if the package is still available in the repos. + dead boolean NOT NULL DEFAULT FALSE, + -- Whether this package has at least one man page indexed in the database. + -- The indexer uses this table to keep track of which packages it has + -- already indexed, but not all packages seen by the indexer have a man page. + -- This cache helps the web front-end filter out irrelevant packages faster. + c_hasman boolean NOT NULL DEFAULT FALSE +); + + +CREATE TABLE package_versions ( + id SERIAL NOT NULL, + package integer NOT NULL, + version text NOT NULL, + released date NOT NULL, + arch text +); + + +CREATE TABLE files ( + pkgver integer NOT NULL, + man integer NOT NULL, + content integer NOT NULL, + shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash) + locale smallint NOT NULL, + -- The original encoding the man page was found in. This column isn't really + -- used at the moment, but is potentially useful when investigating encoding + -- issues. 
+ encoding smallint NOT NULL, + filename text NOT NULL +); + + + +-- +-- Data +-- + +\copy systems from program 'zstd -dc systems.tsv.zst' +\copy contents (id, hash, content) from program 'zstd -dc contents.tsv.zst' +\copy mans from program 'zstd -dc mans.tsv.zst' +\copy locales from program 'zstd -dc locales.tsv.zst' +\copy encodings from program 'zstd -dc encodings.tsv.zst' +\copy packages from program 'zstd -dc packages.tsv.zst' +\copy package_versions from program 'zstd -dc package_versions.tsv.zst' +\copy files from program 'zstd -dc files.tsv.zst' + + + +-- +-- Primary keys & indices +-- + +ALTER TABLE systems ADD CONSTRAINT systems_pkey PRIMARY KEY (id); +ALTER TABLE contents ADD CONSTRAINT contents_pkey PRIMARY KEY (id); +ALTER TABLE mans ADD CONSTRAINT mans_pkey PRIMARY KEY (id); +ALTER TABLE locales ADD CONSTRAINT locales_pkey PRIMARY KEY (id); +ALTER TABLE encodings ADD CONSTRAINT encodings_pkey PRIMARY KEY (id); +ALTER TABLE packages ADD CONSTRAINT packages_pkey PRIMARY KEY (id); +ALTER TABLE package_versions ADD CONSTRAINT package_versions_pkey PRIMARY KEY (id); +ALTER TABLE files ADD CONSTRAINT files_pkey PRIMARY KEY (pkgver, filename); + + +CREATE UNIQUE INDEX contents_hash_key ON contents (hash); +CREATE UNIQUE INDEX mans_name_section_key ON mans (name, section); +CREATE UNIQUE INDEX locales_locale_key ON locales (locale); +CREATE UNIQUE INDEX encodings_encoding_key ON encodings (encoding); +CREATE UNIQUE INDEX packages_system_name_key ON packages (system, name) INCLUDE (id, c_hasman, dead); +CREATE UNIQUE INDEX package_versions_package_version_key ON package_versions (package, version); +CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL; +CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops); +CREATE INDEX files_man_shorthash_idx ON files (man, shorthash); +CREATE INDEX files_content_idx ON files (content); + + + +-- +-- Constraints +-- + +ALTER TABLE packages + ADD CONSTRAINT packages_system_fkey FOREIGN KEY 
(system) REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE; + +ALTER TABLE package_versions + ADD CONSTRAINT package_versions_package_fkey FOREIGN KEY (package) REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE; + +ALTER TABLE files + ADD CONSTRAINT files_pkgver_fkey FOREIGN KEY (pkgver) REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE, + ADD CONSTRAINT files_man_fkey FOREIGN KEY (man) REFERENCES mans(id), + ADD CONSTRAINT files_content_fkey FOREIGN KEY (content) REFERENCES contents(id), + ADD CONSTRAINT files_locale_fkey FOREIGN KEY (locale) REFERENCES locales(id), + ADD CONSTRAINT files_encoding_fkey FOREIGN KEY (encoding) REFERENCES encodings(id); + + + +-- +-- Sequences +-- + +SELECT setval('contents_id_seq', (SELECT MAX(id) FROM contents)); +SELECT setval('encodings_id_seq', (SELECT MAX(id) FROM encodings)); +SELECT setval('locales_id_seq', (SELECT MAX(id) FROM locales)); +SELECT setval('mans_id_seq', (SELECT MAX(id) FROM mans)); +SELECT setval('package_versions_id_seq', (SELECT MAX(id) FROM package_versions)); +SELECT setval('packages_id_seq', (SELECT MAX(id) FROM packages)); + + + +-- +-- Utility functions +-- + + +-- Interpret first 4 bytes of hash as a signed 32-bit integer. +-- TODO: Postgres 18 allows casting between bytea and int, see if that can be used instead. 
+CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$ + SELECT CASE WHEN get_byte(hash, 3) < 128 + THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) + ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) + END; +$$ IMMUTABLE LANGUAGE SQL; + + +CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$ + SELECT locale IS NULL OR locale = '' OR locale LIKE 'en%'; +$$ IMMUTABLE LANGUAGE SQL; + + +CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$ + SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%'; +$$ IMMUTABLE LANGUAGE sql; + +-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. +-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. +CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ + SELECT CASE WHEN chr = '0' + THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) + ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) + END; +$$ IMMUTABLE LANGUAGE SQL; diff --git a/schema.sql b/schema.sql deleted file mode 100644 index 45eb069..0000000 --- a/schema.sql +++ /dev/null @@ -1,127 +0,0 @@ -CREATE TABLE systems ( - -- Manually assigned number. The id is also used for ordering different - -- releases of the same system, as identified by 'name'. - id integer PRIMARY KEY, - name varchar NOT NULL, - release varchar, - short varchar NOT NULL -); - - -CREATE TABLE contents ( - id SERIAL PRIMARY KEY, - -- 'hash' is the SHA1 of the man page file after decompression but *before* - -- encoding conversion and removing 0-bytes. 
This means taking sha1(content) - -- may not necessary match the hash, and it's possible for the same content - -- to be in the database under multiple hashes (but I suspect that's rare). - hash bytea NOT NULL UNIQUE, - content text NOT NULL, - html text -); -CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL; - - --- Unique man page, as identified by name & section -CREATE TABLE mans ( - id SERIAL PRIMARY KEY, - name text NOT NULL, - section text NOT NULL, - UNIQUE(name, section) -); -CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops); - - --- List of man page locales for efficient referencing. Some locales include --- the encoding in their name, which isn't really correct or even necessary --- since we convert everything to UTF-8 anyway, but w/e, Can fix later. -CREATE TABLE locales ( - id SMALLSERIAL PRIMARY KEY, - locale text NOT NULL UNIQUE -); - - --- List of encodings for efficient referencing. -CREATE TABLE encodings ( - id SMALLSERIAL PRIMARY KEY, - encoding text NOT NULL UNIQUE -); - - -CREATE TABLE packages ( - id SERIAL PRIMARY KEY, - system integer NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE, - name varchar NOT NULL, - -- Whether this package has been seen in the last repository update. This - -- field is only updated for a few systems that are likely to delete packages - -- over time; non-rolling-release distros tend to not delete packages after - -- all. - -- Packages where the latest version does not have any man pages may also be - -- marked as dead even if the package is still available in the repos. - dead boolean NOT NULL DEFAULT FALSE, - -- Whether this package has at least one man page indexed in the database. - -- The indexer uses this table to keep track of which packages it has - -- already indexed, but not all packages seen by the indexer have a man page. - -- This cache helps the web front-end filter out irrelevant packages faster. 
- c_hasman boolean NOT NULL DEFAULT FALSE, - UNIQUE(system, name) INCLUDE (id, c_hasman, dead) -); - - -CREATE TABLE package_versions ( - id SERIAL PRIMARY KEY, - package integer NOT NULL REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE, - version varchar NOT NULL, - released date NOT NULL, - arch varchar, - UNIQUE(package, version) -); - - -CREATE TABLE files ( - pkgver integer NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE, - man integer NOT NULL REFERENCES mans(id), - content integer NOT NULL REFERENCES content(id), - shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash) - locale smallint NOT NULL REFERENCES locales(id) - -- The original encoding the man page was found in. This column isn't really - -- used at the moment, but is potentially useful when investigating encoding - -- issues. - encoding smallint NOT NULL REFERENCES encodings(id), - filename text NOT NULL, - PRIMARY KEY(pkgver, filename) -); -CREATE INDEX ON files (man, shorthash); -CREATE INDEX ON files (content); - - --- For stats_cache -\i util/update_indices.sql - - - --- Interpret first 4 bytes of hash as a signed 32-bit integer. 
-CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$ - SELECT CASE WHEN get_byte(hash, 3) < 128 - THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) - ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) - END; -$$ LANGUAGE SQL IMMUTABLE; - - -CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$ - SELECT locale IS NULL OR locale = '' OR locale LIKE 'en%'; -$$ IMMUTABLE LANGUAGE SQL; - - -CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$ - SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%'; -$$ IMMUTABLE LANGUAGE sql; - --- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. --- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. -CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ - SELECT CASE WHEN chr = '0' - THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) - ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) - END; -$$ LANGUAGE SQL IMMUTABLE; diff --git a/util/cron.sh b/util/cron.sh index 4487061..07cb0b4 100755 --- a/util/cron.sh +++ b/util/cron.sh @@ -16,3 +16,6 @@ $PSQL -f update_indices.sql echo "============ Updating HTML cache" test -f .config && source ./.config ./cache-html.pl --batch=5 --delay=0.5 --maxbatches=100 + +echo "============ Updating database dumps" +./export.sh diff --git a/util/export.sh b/util/export.sh new file mode 100755 index 0000000..8b3619b --- /dev/null +++ b/util/export.sh @@ -0,0 +1,34 @@ +#!/bin/sh + +set -e + +mkdir -p ../dl + +# Only run once a week +[ -e ../dl/current ] && [ -z $(find ../dl/current -daystart -mtime +7) ] && exit + +# Only keep the last dump +rm -rf $(printf '%s\n' ../dl/????-??-?? 
| sort | head -n -1) + +# Create a new dump +OUT=../dl/.work +export OUT +rm -fr $OUT +mkdir -p $OUT + +cp ../import.sql $OUT/import.sql + +psql -wqU manned <../dl/current diff --git a/util/update_indices.sql b/util/update_indices.sql index d1f510d..aa00fdc 100644 --- a/util/update_indices.sql +++ b/util/update_indices.sql @@ -6,7 +6,7 @@ CREATE TABLE stats_cache_new AS SELECT (SELECT count(*) FROM contents) AS hashes, (SELECT count(distinct name) FROM mans) AS mans, * FROM (SELECT count(*), count(distinct pkgver) FROM files) x(files, packages); -DROP TABLE stats_cache; +DROP TABLE IF EXISTS stats_cache; ALTER TABLE stats_cache_new RENAME TO stats_cache; COMMIT; diff --git a/www/index.pl b/www/index.pl index 90ab5d4..7af2197 100755 --- a/www/index.pl +++ b/www/index.pl @@ -496,19 +496,14 @@ FU::get '/info/about' => sub {

Database download

- This site is backed by a PostgreSQL database containing all the man pages. - Weekly dumps of the full database are available for download at - http://dl.manned.org/dumps/. + This site is backed by a PostgreSQL database containing all the metadata + and man pages. Weekly dumps of the full database are available for + download at https://dl.manned.org/.

- Be warned that the download server may not be terribly fast or reliable, - so it is advisable to use a client that supports resumption of partial - downloads. See wget's -c or - curl's -C. -

- The database schema is "documented" at schema.sql - in the git repo. Keep in mind that these dumps don't constitute a stable - API and, while this won't happen frequently, incompatible schema changes + The database schema is "documented" in import.sql. + Keep in mind that these dumps don't constitute a stable API and, while + this won't happen frequently, incompatible schema changes, format changes or Postgres major version bumps will occassionally occur.