Change database dump format + add import & export scripts

The new format allows for downloading and importing only a part of the
database - useful when only the metadata is required - and doesn't
include the wasteful preformatted HTML cache.

This also ensures that the new import.sql script is actually usable and
in sync with the actual database. The old schema.sql was neither.

(And this simplifies my backup scripts)
This commit is contained in:
Yorhel 2025-10-08 09:40:50 +02:00
parent 23b2686672
commit 902048e282
8 changed files with 273 additions and 141 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@
indexer/target
web/target
util/.config
dl/

View file

@ -24,7 +24,6 @@ Ironically, documentation about how things work is completely lacking.
- **indexer/** -> The Rust program that scans package repositories for updates, fetches new packages and extracts the man pages.
- **ManUtils/** -> Perl/XS helper module to format man pages into HTML (uses **web/**).
- **sql/** -> Database schema & updates.
- **util/** -> Cron job and scripts to run **indexer/** on the right repositories.
- **web/** -> Badly named Rust library to convert man pages into HTML.
- **www/** -> The web front-end.

227
import.sql Normal file
View file

@ -0,0 +1,227 @@
-- To import an existing database dump, make sure the data is available in
-- $table.tsv.zst files in the current directory and then run:
--
-- psql -U manned -f import.sql
--
-- Missing some data files is fine if you're only interested in a subset of the
-- data; error messages relating to the missing tables can be safely ignored.
--
--
-- To stream and import a database dump without saving the files separately:
--
-- URL=https://dl.manned.org/2025-10-07 && curl -s $URL/import.sql \
-- | sed "s#zstd -dc \([^ ]\+.tsv.zst\)#curl -s $URL/\\1 | zstd -d#" \
-- | psql -U manned
--
--
-- To initialize a fresh, empty database:
--
-- grep -v '^\\copy' import.sql | psql -U manned
--
--
-- All numeric identifiers should be considered internal and unstable. They are
-- not exposed through the web interface.
--
-- The `stats_cache` table is missing from this file, refer to
-- util/update_indices.sql.
--
-- Tables
--
-- Tables are created bare here; primary keys, indices, foreign keys and
-- sequence positions are set up *after* the data load further down.
CREATE TABLE systems (
-- Manually assigned number. The id is also used for ordering different
-- releases of the same system, as identified by 'name'.
id integer NOT NULL,
name text NOT NULL,
release text,
short text NOT NULL
);
CREATE TABLE contents (
id SERIAL NOT NULL,
-- 'hash' is the SHA1 of the man page file after decompression but *before*
-- encoding conversion and removing 0-bytes. This means taking sha1(content)
-- may not necessarily match the hash, and it's possible for the same content
-- to be in the database under multiple hashes (but I suspect that's rare).
hash bytea NOT NULL,
content text NOT NULL,
-- Preformatted HTML cache; deliberately excluded from the dumps (the \copy
-- for this table below only lists id, hash and content).
html text
);
-- Unique man page, as identified by name & section
CREATE TABLE mans (
id SERIAL NOT NULL,
name text NOT NULL,
section text NOT NULL
);
-- List of man page locales for efficient referencing. Some locales include
-- the encoding in their name, which isn't really correct or even necessary
-- since we convert everything to UTF-8 anyway; can be fixed later.
CREATE TABLE locales (
id SMALLSERIAL NOT NULL,
locale text NOT NULL
);
-- List of encodings for efficient referencing.
CREATE TABLE encodings (
id SMALLSERIAL NOT NULL,
encoding text NOT NULL
);
CREATE TABLE packages (
id SERIAL NOT NULL,
system integer NOT NULL,
name text NOT NULL,
-- Whether this package has been seen in the last repository update. This
-- field is only updated for a few systems that are likely to delete packages
-- over time; non-rolling-release distros tend to not delete packages after
-- all.
-- Packages where the latest version does not have any man pages may also be
-- marked as dead even if the package is still available in the repos.
dead boolean NOT NULL DEFAULT FALSE,
-- Whether this package has at least one man page indexed in the database.
-- The indexer uses this table to keep track of which packages it has
-- already indexed, but not all packages seen by the indexer have a man page.
-- This cache helps the web front-end filter out irrelevant packages faster.
c_hasman boolean NOT NULL DEFAULT FALSE
);
CREATE TABLE package_versions (
id SERIAL NOT NULL,
package integer NOT NULL,
version text NOT NULL,
released date NOT NULL,
arch text
);
-- One row per man page file found in a package version.
CREATE TABLE files (
pkgver integer NOT NULL,
man integer NOT NULL,
content integer NOT NULL,
shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash)
locale smallint NOT NULL,
-- The original encoding the man page was found in. This column isn't really
-- used at the moment, but is potentially useful when investigating encoding
-- issues.
encoding smallint NOT NULL,
-- NOTE(review): presumably the full path of the file within the package
-- (cf. is_standard_man_location() below) — confirm against the indexer.
filename text NOT NULL
);
--
-- Data
--
-- NOTE: the exact "zstd -dc <table>.tsv.zst" text of these commands is
-- pattern-matched by the sed invocation in the streaming recipe at the top of
-- this file — keep the two in sync when changing anything here.
-- contents.html is deliberately not part of the dump, hence the explicit
-- column list on that \copy.
\copy systems from program 'zstd -dc systems.tsv.zst'
\copy contents (id, hash, content) from program 'zstd -dc contents.tsv.zst'
\copy mans from program 'zstd -dc mans.tsv.zst'
\copy locales from program 'zstd -dc locales.tsv.zst'
\copy encodings from program 'zstd -dc encodings.tsv.zst'
\copy packages from program 'zstd -dc packages.tsv.zst'
\copy package_versions from program 'zstd -dc package_versions.tsv.zst'
\copy files from program 'zstd -dc files.tsv.zst'
--
-- Primary keys & indices
--
-- Keys and indices are created after the bulk load above rather than in the
-- CREATE TABLEs, so the load doesn't have to maintain them row by row.
ALTER TABLE systems ADD CONSTRAINT systems_pkey PRIMARY KEY (id);
ALTER TABLE contents ADD CONSTRAINT contents_pkey PRIMARY KEY (id);
ALTER TABLE mans ADD CONSTRAINT mans_pkey PRIMARY KEY (id);
ALTER TABLE locales ADD CONSTRAINT locales_pkey PRIMARY KEY (id);
ALTER TABLE encodings ADD CONSTRAINT encodings_pkey PRIMARY KEY (id);
ALTER TABLE packages ADD CONSTRAINT packages_pkey PRIMARY KEY (id);
ALTER TABLE package_versions ADD CONSTRAINT package_versions_pkey PRIMARY KEY (id);
-- files has no surrogate id; a package version cannot contain the same
-- filename twice.
ALTER TABLE files ADD CONSTRAINT files_pkey PRIMARY KEY (pkgver, filename);
CREATE UNIQUE INDEX contents_hash_key ON contents (hash);
CREATE UNIQUE INDEX mans_name_section_key ON mans (name, section);
CREATE UNIQUE INDEX locales_locale_key ON locales (locale);
CREATE UNIQUE INDEX encodings_encoding_key ON encodings (encoding);
-- INCLUDE'd columns enable index-only scans for common package lookups.
CREATE UNIQUE INDEX packages_system_name_key ON packages (system, name) INCLUDE (id, c_hasman, dead);
CREATE UNIQUE INDEX package_versions_package_version_key ON package_versions (package, version);
-- Partial index: quickly find contents rows whose HTML cache still needs to
-- be generated.
CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL;
-- text_pattern_ops on lower(name) supports case-insensitive prefix (LIKE)
-- searches on man page names.
CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops);
CREATE INDEX files_man_shorthash_idx ON files (man, shorthash);
CREATE INDEX files_content_idx ON files (content);
--
-- Constraints
--
-- Foreign keys are added after the data load, for the same reason as the
-- indices above.
ALTER TABLE packages
ADD CONSTRAINT packages_system_fkey FOREIGN KEY (system) REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE;
ALTER TABLE package_versions
ADD CONSTRAINT package_versions_package_fkey FOREIGN KEY (package) REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE;
-- files rows follow their package version on delete/update; the man, content,
-- locale and encoding references use the default restrict behavior.
ALTER TABLE files
ADD CONSTRAINT files_pkgver_fkey FOREIGN KEY (pkgver) REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE,
ADD CONSTRAINT files_man_fkey FOREIGN KEY (man) REFERENCES mans(id),
ADD CONSTRAINT files_content_fkey FOREIGN KEY (content) REFERENCES contents(id),
ADD CONSTRAINT files_locale_fkey FOREIGN KEY (locale) REFERENCES locales(id),
ADD CONSTRAINT files_encoding_fkey FOREIGN KEY (encoding) REFERENCES encodings(id);
--
-- Sequences
--
-- Make each SERIAL sequence continue after the highest imported id.
-- The plain  setval(seq, MAX(id))  form errors out on an empty table (MAX is
-- NULL there), which broke this file's own documented fresh-database
-- (grep -v '^\\copy') and partial-import use cases. The three-argument form
-- setval(seq, n, false) makes the next nextval() return n itself, so
-- COALESCE(MAX(id), 0) + 1 is correct for both empty and populated tables.
SELECT setval('contents_id_seq', COALESCE((SELECT MAX(id) FROM contents), 0) + 1, false);
SELECT setval('encodings_id_seq', COALESCE((SELECT MAX(id) FROM encodings), 0) + 1, false);
SELECT setval('locales_id_seq', COALESCE((SELECT MAX(id) FROM locales), 0) + 1, false);
SELECT setval('mans_id_seq', COALESCE((SELECT MAX(id) FROM mans), 0) + 1, false);
SELECT setval('package_versions_id_seq', COALESCE((SELECT MAX(id) FROM package_versions), 0) + 1, false);
SELECT setval('packages_id_seq', COALESCE((SELECT MAX(id) FROM packages), 0) + 1, false);
--
-- Utility functions
--
-- Interpret first 4 bytes of hash as a signed 32-bit integer.
-- Byte 0 is the least significant byte (little-endian). Byte 3 carries the
-- sign bit and is handled in a separate branch because shifting a value >= 128
-- left by 24 would overflow a signed 32-bit integer.
-- TODO: Postgres 18 allows casting between bytea and int, see if that can be used instead.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$
SELECT CASE WHEN get_byte(hash, 3) < 128
THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
END;
$$ IMMUTABLE LANGUAGE SQL;
-- TRUE for English or unspecified locales: NULL, the empty string, or any
-- locale name starting with 'en'.
CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$
SELECT COALESCE(locale, '') = '' OR locale LIKE 'en%';
$$ IMMUTABLE LANGUAGE SQL;
-- TRUE if the given path is inside one of the two standard man page
-- directory trees.
-- (Fix: 'LANGUAGE sql' was lowercase here, inconsistent with the uppercase
-- 'LANGUAGE SQL' used by every other function in this file.)
CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$
SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%';
$$ IMMUTABLE LANGUAGE SQL;
-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'.
-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient.
-- '0' acts as the catch-all bucket: it matches any string whose first
-- character is not an ASCII letter (97-122 = 'a'-'z', 65-90 = 'A'-'Z').
-- For a letter argument, both the lowercase and uppercase form match.
CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$
SELECT CASE WHEN chr = '0'
THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90)
ELSE ascii(str) IN(ascii(chr),ascii(upper(chr)))
END;
$$ IMMUTABLE LANGUAGE SQL;

View file

@ -1,127 +0,0 @@
CREATE TABLE systems (
-- Manually assigned number. The id is also used for ordering different
-- releases of the same system, as identified by 'name'.
id integer PRIMARY KEY,
name varchar NOT NULL,
release varchar,
short varchar NOT NULL
);
CREATE TABLE contents (
id SERIAL PRIMARY KEY,
-- 'hash' is the SHA1 of the man page file after decompression but *before*
-- encoding conversion and removing 0-bytes. This means taking sha1(content)
-- may not necessary match the hash, and it's possible for the same content
-- to be in the database under multiple hashes (but I suspect that's rare).
hash bytea NOT NULL UNIQUE,
content text NOT NULL,
html text
);
CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL;
-- Unique man page, as identified by name & section
CREATE TABLE mans (
id SERIAL PRIMARY KEY,
name text NOT NULL,
section text NOT NULL,
UNIQUE(name, section)
);
CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops);
-- List of man page locales for efficient referencing. Some locales include
-- the encoding in their name, which isn't really correct or even necessary
-- since we convert everything to UTF-8 anyway, but w/e, Can fix later.
CREATE TABLE locales (
id SMALLSERIAL PRIMARY KEY,
locale text NOT NULL UNIQUE
);
-- List of encodings for efficient referencing.
CREATE TABLE encodings (
id SMALLSERIAL PRIMARY KEY,
encoding text NOT NULL UNIQUE
);
CREATE TABLE packages (
id SERIAL PRIMARY KEY,
system integer NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE,
name varchar NOT NULL,
-- Whether this package has been seen in the last repository update. This
-- field is only updated for a few systems that are likely to delete packages
-- over time; non-rolling-release distros tend to not delete packages after
-- all.
-- Packages where the latest version does not have any man pages may also be
-- marked as dead even if the package is still available in the repos.
dead boolean NOT NULL DEFAULT FALSE,
-- Whether this package has at least one man page indexed in the database.
-- The indexer uses this table to keep track of which packages it has
-- already indexed, but not all packages seen by the indexer have a man page.
-- This cache helps the web front-end filter out irrelevant packages faster.
c_hasman boolean NOT NULL DEFAULT FALSE,
UNIQUE(system, name) INCLUDE (id, c_hasman, dead)
);
CREATE TABLE package_versions (
id SERIAL PRIMARY KEY,
package integer NOT NULL REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE,
version varchar NOT NULL,
released date NOT NULL,
arch varchar,
UNIQUE(package, version)
);
-- One row per man page file found in a package version.
CREATE TABLE files (
pkgver integer NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE,
man integer NOT NULL REFERENCES mans(id),
-- FIX: referenced table is 'contents', not 'content' (the old text did not
-- even parse against the actual database).
content integer NOT NULL REFERENCES contents(id),
shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash)
-- FIX: the comma after the locale column was missing, making this
-- CREATE TABLE a syntax error.
locale smallint NOT NULL REFERENCES locales(id),
-- The original encoding the man page was found in. This column isn't really
-- used at the moment, but is potentially useful when investigating encoding
-- issues.
encoding smallint NOT NULL REFERENCES encodings(id),
filename text NOT NULL,
PRIMARY KEY(pkgver, filename)
);
CREATE INDEX ON files (man, shorthash);
CREATE INDEX ON files (content);
-- For stats_cache
\i util/update_indices.sql
-- Interpret first 4 bytes of hash as a signed 32-bit integer.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$
SELECT CASE WHEN get_byte(hash, 3) < 128
THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
END;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$
SELECT locale IS NULL OR locale = '' OR locale LIKE 'en%';
$$ IMMUTABLE LANGUAGE SQL;
CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$
SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%';
$$ IMMUTABLE LANGUAGE sql;
-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'.
-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient.
CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$
SELECT CASE WHEN chr = '0'
THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90)
ELSE ascii(str) IN(ascii(chr),ascii(upper(chr)))
END;
$$ LANGUAGE SQL IMMUTABLE;

View file

@ -16,3 +16,6 @@ $PSQL -f update_indices.sql
echo "============ Updating HTML cache"
test -f .config && source ./.config
./cache-html.pl --batch=5 --delay=0.5 --maxbatches=100
echo "============ Updating database dumps"
./export.sh

34
util/export.sh Executable file
View file

@ -0,0 +1,34 @@
#!/bin/sh
# Export a dated database dump into ../dl/YYYY-MM-DD: one zstd-compressed TSV
# file per table plus the import.sql needed to restore them. Intended to be
# run from util/ (see the cron script, which invokes ./export.sh).
set -e
mkdir -p ../dl

# Only run once a week: '../dl/current' was written by the previous run and
# find only prints it when it is older than 7 days, so an empty result means
# the last dump is recent enough. The command substitution must be quoted:
# unquoted, an empty result turns the test into `[ -z ]`, which relies on
# one-argument test semantics and breaks if the path ever contains spaces.
[ -e ../dl/current ] && [ -z "$(find ../dl/current -daystart -mtime +7)" ] && exit

# Only keep the last dump: list all dated directories, drop the newest from
# the list (GNU `head -n -1`) and delete the rest. If no dated directory
# exists the glob stays literal and `rm -f` silently ignores it. Word
# splitting of the substitution is intentional here (one argument per dir).
rm -rf $(printf '%s\n' ../dl/????-??-?? | sort | head -n -1)

# Build the new dump in a hidden work directory first so ../dl never exposes
# a partially written dump.
OUT=../dl/.work
export OUT
rm -fr $OUT
mkdir -p $OUT
cp ../import.sql $OUT/import.sql
# $OUT is expanded by the shell inside the unquoted heredoc, so psql sees the
# literal output paths.
psql -wqU manned <<EOF
\copy systems to program 'zstd - -qo $OUT/systems.tsv.zst'
\copy contents (id, hash, content) to program 'zstd - -qo $OUT/contents.tsv.zst'
\copy mans to program 'zstd - -qo $OUT/mans.tsv.zst'
\copy locales to program 'zstd - -qo $OUT/locales.tsv.zst'
\copy encodings to program 'zstd - -qo $OUT/encodings.tsv.zst'
\copy packages to program 'zstd - -qo $OUT/packages.tsv.zst'
\copy package_versions to program 'zstd - -qo $OUT/package_versions.tsv.zst'
\copy files to program 'zstd - -qo $OUT/files.tsv.zst'
EOF
# Publish the finished dump under today's date and update the freshness
# marker. (`mv -T` is a GNU coreutils extension.)
DATE=$(date +%F)
mv -T $OUT ../dl/$DATE
echo $DATE >../dl/current

View file

@ -6,7 +6,7 @@ CREATE TABLE stats_cache_new AS
SELECT (SELECT count(*) FROM contents) AS hashes,
(SELECT count(distinct name) FROM mans) AS mans, *
FROM (SELECT count(*), count(distinct pkgver) FROM files) x(files, packages);
DROP TABLE stats_cache;
DROP TABLE IF EXISTS stats_cache;
ALTER TABLE stats_cache_new RENAME TO stats_cache;
COMMIT;

View file

@ -496,19 +496,14 @@ FU::get '/info/about' => sub {
<h2 id="database-download">Database download</h2>
<p>
This site is backed by a PostgreSQL database containing all the man pages.
Weekly dumps of the full database are available for download at
<a href="http://dl.manned.org/dumps/">http://dl.manned.org/dumps/</a>.
This site is backed by a PostgreSQL database containing all the metadata
and man pages. Weekly dumps of the full database are available for
download at <a href="https://dl.manned.org/">https://dl.manned.org/</a>.
<br /><br />
Be warned that the download server may not be terribly fast or reliable,
so it is advisable to use a client that supports resumption of partial
downloads. See <a href="/wget">wget's -c</a> or
<a href="/curl">curl's -C</a>.
<br /><br />
The database schema is "documented" at <a
href="https://code.blicky.net/yorhel/manned/src/branch/master/schema.sql">schema.sql</a>
in the git repo. Keep in mind that these dumps don't constitute a stable
API and, while this won't happen frequently, incompatible schema changes
The database schema is "documented" in <a
href="https://code.blicky.net/yorhel/manned/src/branch/master/import.sql">import.sql</a>.
Keep in mind that these dumps don't constitute a stable API and, while
this won't happen frequently, incompatible schema changes, format changes
or Postgres major version bumps will occasionally occur.
</p>