Change database dump format + add import & export scripts

The new format allows for downloading and importing only a part of the
database - useful when only the metadata is required - and doesn't
include the wasteful preformatted HTML cache.

This also ensures that the new import.sql script is actually usable and
in sync with the actual database. The old schema.sql was neither.

(And this simplifies my backup scripts)
This commit is contained in:
Yorhel 2025-10-08 09:40:50 +02:00
parent 23b2686672
commit 902048e282
8 changed files with 273 additions and 141 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@
indexer/target
web/target
util/.config
dl/

View file

@ -24,7 +24,6 @@ Ironically, documentation about how things work is completely lacking.
- **indexer/** -> The Rust program that scans package repositories for updates, fetches new packages and extracts the man pages.
- **ManUtils/** -> Perl/XS helper module to format man pages into HTML (uses **web/**).
- **sql/** -> Database schema & updates.
- **util/** -> Cron job and scripts to run **indexer/** on the right repositories.
- **web/** -> Badly named Rust library to convert man pages into HTML.
- **www/** -> The web front-end.

227
import.sql Normal file
View file

@ -0,0 +1,227 @@
-- To import an existing database dump, make sure the data is available in
-- $table.tsv.zst files in the current directory and then run:
--
-- psql -U manned -f import.sql
--
-- Missing some data files is fine if you're only interested in a subset of the
-- data; error messages relating to the missing tables can be safely ignored.
--
--
-- To stream and import a database dump without saving the files separately:
--
-- URL=https://dl.manned.org/2025-10-07 && curl -s $URL/import.sql \
-- | sed "s#zstd -dc \([^ ]\+.tsv.zst\)#curl -s $URL/\\1 | zstd -d#" \
-- | psql -U manned
--
--
-- To initialize a fresh, empty database:
--
-- grep -v '^\\copy' import.sql | psql -U manned
--
--
-- All numeric identifiers should be considered internal and unstable. They are
-- not exposed through the web interface.
--
-- The `stats_cache` table is missing from this file, refer to
-- util/update_indices.sql.
--
-- Tables
--
-- Tables are created bare here; primary keys, indices, foreign keys and
-- sequence positions are set up *after* the data load further down.
CREATE TABLE systems (
-- Manually assigned number. The id is also used for ordering different
-- releases of the same system, as identified by 'name'.
id integer NOT NULL,
name text NOT NULL,
release text,
short text NOT NULL
);
CREATE TABLE contents (
id SERIAL NOT NULL,
-- 'hash' is the SHA1 of the man page file after decompression but *before*
-- encoding conversion and removing 0-bytes. This means taking sha1(content)
-- may not necessarily match the hash, and it's possible for the same content
-- to be in the database under multiple hashes (but I suspect that's rare).
hash bytea NOT NULL,
content text NOT NULL,
-- Preformatted HTML cache; deliberately excluded from the dumps (the \copy
-- for this table below only lists id, hash and content).
html text
);
-- Unique man page, as identified by name & section
CREATE TABLE mans (
id SERIAL NOT NULL,
name text NOT NULL,
section text NOT NULL
);
-- List of man page locales for efficient referencing. Some locales include
-- the encoding in their name, which isn't really correct or even necessary
-- since we convert everything to UTF-8 anyway; can be fixed later.
CREATE TABLE locales (
id SMALLSERIAL NOT NULL,
locale text NOT NULL
);
-- List of encodings for efficient referencing.
CREATE TABLE encodings (
id SMALLSERIAL NOT NULL,
encoding text NOT NULL
);
CREATE TABLE packages (
id SERIAL NOT NULL,
system integer NOT NULL,
name text NOT NULL,
-- Whether this package has been seen in the last repository update. This
-- field is only updated for a few systems that are likely to delete packages
-- over time; non-rolling-release distros tend to not delete packages after
-- all.
-- Packages where the latest version does not have any man pages may also be
-- marked as dead even if the package is still available in the repos.
dead boolean NOT NULL DEFAULT FALSE,
-- Whether this package has at least one man page indexed in the database.
-- The indexer uses this table to keep track of which packages it has
-- already indexed, but not all packages seen by the indexer have a man page.
-- This cache helps the web front-end filter out irrelevant packages faster.
c_hasman boolean NOT NULL DEFAULT FALSE
);
CREATE TABLE package_versions (
id SERIAL NOT NULL,
package integer NOT NULL,
version text NOT NULL,
released date NOT NULL,
arch text
);
-- One row per man page file found in a package version.
CREATE TABLE files (
pkgver integer NOT NULL,
man integer NOT NULL,
content integer NOT NULL,
shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash)
locale smallint NOT NULL,
-- The original encoding the man page was found in. This column isn't really
-- used at the moment, but is potentially useful when investigating encoding
-- issues.
encoding smallint NOT NULL,
-- NOTE(review): presumably the full path of the file within the package
-- (cf. is_standard_man_location() below) — confirm against the indexer.
filename text NOT NULL
);
--
-- Data
--
-- NOTE: the exact "zstd -dc <table>.tsv.zst" text of these commands is
-- pattern-matched by the sed invocation in the streaming recipe at the top of
-- this file — keep the two in sync when changing anything here.
-- contents.html is deliberately not part of the dump, hence the explicit
-- column list on that \copy.
\copy systems from program 'zstd -dc systems.tsv.zst'
\copy contents (id, hash, content) from program 'zstd -dc contents.tsv.zst'
\copy mans from program 'zstd -dc mans.tsv.zst'
\copy locales from program 'zstd -dc locales.tsv.zst'
\copy encodings from program 'zstd -dc encodings.tsv.zst'
\copy packages from program 'zstd -dc packages.tsv.zst'
\copy package_versions from program 'zstd -dc package_versions.tsv.zst'
\copy files from program 'zstd -dc files.tsv.zst'
--
-- Primary keys & indices
--
-- Keys and indices are created after the bulk load above rather than in the
-- CREATE TABLEs, so the load doesn't have to maintain them row by row.
ALTER TABLE systems ADD CONSTRAINT systems_pkey PRIMARY KEY (id);
ALTER TABLE contents ADD CONSTRAINT contents_pkey PRIMARY KEY (id);
ALTER TABLE mans ADD CONSTRAINT mans_pkey PRIMARY KEY (id);
ALTER TABLE locales ADD CONSTRAINT locales_pkey PRIMARY KEY (id);
ALTER TABLE encodings ADD CONSTRAINT encodings_pkey PRIMARY KEY (id);
ALTER TABLE packages ADD CONSTRAINT packages_pkey PRIMARY KEY (id);
ALTER TABLE package_versions ADD CONSTRAINT package_versions_pkey PRIMARY KEY (id);
-- files has no surrogate id; a package version cannot contain the same
-- filename twice.
ALTER TABLE files ADD CONSTRAINT files_pkey PRIMARY KEY (pkgver, filename);
CREATE UNIQUE INDEX contents_hash_key ON contents (hash);
CREATE UNIQUE INDEX mans_name_section_key ON mans (name, section);
CREATE UNIQUE INDEX locales_locale_key ON locales (locale);
CREATE UNIQUE INDEX encodings_encoding_key ON encodings (encoding);
-- INCLUDE'd columns enable index-only scans for common package lookups.
CREATE UNIQUE INDEX packages_system_name_key ON packages (system, name) INCLUDE (id, c_hasman, dead);
CREATE UNIQUE INDEX package_versions_package_version_key ON package_versions (package, version);
-- Partial index: quickly find contents rows whose HTML cache still needs to
-- be generated.
CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL;
-- text_pattern_ops on lower(name) supports case-insensitive prefix (LIKE)
-- searches on man page names.
CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops);
CREATE INDEX files_man_shorthash_idx ON files (man, shorthash);
CREATE INDEX files_content_idx ON files (content);
--
-- Constraints
--
-- Foreign keys are added after the data load, for the same reason as the
-- indices above.
ALTER TABLE packages
ADD CONSTRAINT packages_system_fkey FOREIGN KEY (system) REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE;
ALTER TABLE package_versions
ADD CONSTRAINT package_versions_package_fkey FOREIGN KEY (package) REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE;
-- files rows follow their package version on delete/update; the man, content,
-- locale and encoding references use the default restrict behavior.
ALTER TABLE files
ADD CONSTRAINT files_pkgver_fkey FOREIGN KEY (pkgver) REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE,
ADD CONSTRAINT files_man_fkey FOREIGN KEY (man) REFERENCES mans(id),
ADD CONSTRAINT files_content_fkey FOREIGN KEY (content) REFERENCES contents(id),
ADD CONSTRAINT files_locale_fkey FOREIGN KEY (locale) REFERENCES locales(id),
ADD CONSTRAINT files_encoding_fkey FOREIGN KEY (encoding) REFERENCES encodings(id);
--
-- Sequences
--
-- Make each SERIAL sequence continue after the highest imported id.
-- The plain  setval(seq, MAX(id))  form errors out on an empty table (MAX is
-- NULL there), which broke this file's own documented fresh-database
-- (grep -v '^\\copy') and partial-import use cases. The three-argument form
-- setval(seq, n, false) makes the next nextval() return n itself, so
-- COALESCE(MAX(id), 0) + 1 is correct for both empty and populated tables.
SELECT setval('contents_id_seq', COALESCE((SELECT MAX(id) FROM contents), 0) + 1, false);
SELECT setval('encodings_id_seq', COALESCE((SELECT MAX(id) FROM encodings), 0) + 1, false);
SELECT setval('locales_id_seq', COALESCE((SELECT MAX(id) FROM locales), 0) + 1, false);
SELECT setval('mans_id_seq', COALESCE((SELECT MAX(id) FROM mans), 0) + 1, false);
SELECT setval('package_versions_id_seq', COALESCE((SELECT MAX(id) FROM package_versions), 0) + 1, false);
SELECT setval('packages_id_seq', COALESCE((SELECT MAX(id) FROM packages), 0) + 1, false);
--
-- Utility functions
--
-- Interpret first 4 bytes of hash as a signed 32-bit integer.
-- Byte 0 is the least significant byte (little-endian). Byte 3 carries the
-- sign bit and is handled in a separate branch because shifting a value >= 128
-- left by 24 would overflow a signed 32-bit integer.
-- TODO: Postgres 18 allows casting between bytea and int, see if that can be used instead.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$
SELECT CASE WHEN get_byte(hash, 3) < 128
THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
END;
$$ IMMUTABLE LANGUAGE SQL;
-- TRUE for English or unspecified locales: NULL, the empty string, or any
-- locale name starting with 'en'.
CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$
SELECT COALESCE(locale, '') = '' OR locale LIKE 'en%';
$$ IMMUTABLE LANGUAGE SQL;
-- TRUE if the given path is inside one of the two standard man page
-- directory trees.
-- (Fix: 'LANGUAGE sql' was lowercase here, inconsistent with the uppercase
-- 'LANGUAGE SQL' used by every other function in this file.)
CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$
SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%';
$$ IMMUTABLE LANGUAGE SQL;
-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'.
-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient.
-- '0' acts as the catch-all bucket: it matches any string whose first
-- character is not an ASCII letter (97-122 = 'a'-'z', 65-90 = 'A'-'Z').
-- For a letter argument, both the lowercase and uppercase form match.
CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$
SELECT CASE WHEN chr = '0'
THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90)
ELSE ascii(str) IN(ascii(chr),ascii(upper(chr)))
END;
$$ IMMUTABLE LANGUAGE SQL;

View file

@ -1,127 +0,0 @@
CREATE TABLE systems (
-- Manually assigned number. The id is also used for ordering different
-- releases of the same system, as identified by 'name'.
id integer PRIMARY KEY,
name varchar NOT NULL,
release varchar,
short varchar NOT NULL
);
CREATE TABLE contents (
id SERIAL PRIMARY KEY,
-- 'hash' is the SHA1 of the man page file after decompression but *before*
-- encoding conversion and removing 0-bytes. This means taking sha1(content)
-- may not necessary match the hash, and it's possible for the same content
-- to be in the database under multiple hashes (but I suspect that's rare).
hash bytea NOT NULL UNIQUE,
content text NOT NULL,
html text
);
CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL;
-- Unique man page, as identified by name & section
CREATE TABLE mans (
id SERIAL PRIMARY KEY,
name text NOT NULL,
section text NOT NULL,
UNIQUE(name, section)
);
CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops);
-- List of man page locales for efficient referencing. Some locales include
-- the encoding in their name, which isn't really correct or even necessary
-- since we convert everything to UTF-8 anyway, but w/e, Can fix later.
CREATE TABLE locales (
id SMALLSERIAL PRIMARY KEY,
locale text NOT NULL UNIQUE
);
-- List of encodings for efficient referencing.
CREATE TABLE encodings (
id SMALLSERIAL PRIMARY KEY,
encoding text NOT NULL UNIQUE
);
CREATE TABLE packages (
id SERIAL PRIMARY KEY,
system integer NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE,
name varchar NOT NULL,
-- Whether this package has been seen in the last repository update. This
-- field is only updated for a few systems that are likely to delete packages
-- over time; non-rolling-release distros tend to not delete packages after
-- all.
-- Packages where the latest version does not have any man pages may also be
-- marked as dead even if the package is still available in the repos.
dead boolean NOT NULL DEFAULT FALSE,
-- Whether this package has at least one man page indexed in the database.
-- The indexer uses this table to keep track of which packages it has
-- already indexed, but not all packages seen by the indexer have a man page.
-- This cache helps the web front-end filter out irrelevant packages faster.
c_hasman boolean NOT NULL DEFAULT FALSE,
UNIQUE(system, name) INCLUDE (id, c_hasman, dead)
);
CREATE TABLE package_versions (
id SERIAL PRIMARY KEY,
package integer NOT NULL REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE,
version varchar NOT NULL,
released date NOT NULL,
arch varchar,
UNIQUE(package, version)
);
-- One row per man page file found in a package version.
CREATE TABLE files (
pkgver integer NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE,
man integer NOT NULL REFERENCES mans(id),
-- FIX: referenced table is 'contents', not 'content' (the old text did not
-- even parse against the actual database).
content integer NOT NULL REFERENCES contents(id),
shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash)
-- FIX: the comma after the locale column was missing, making this
-- CREATE TABLE a syntax error.
locale smallint NOT NULL REFERENCES locales(id),
-- The original encoding the man page was found in. This column isn't really
-- used at the moment, but is potentially useful when investigating encoding
-- issues.
encoding smallint NOT NULL REFERENCES encodings(id),
filename text NOT NULL,
PRIMARY KEY(pkgver, filename)
);
CREATE INDEX ON files (man, shorthash);
CREATE INDEX ON files (content);
-- For stats_cache
\i util/update_indices.sql
-- Interpret first 4 bytes of hash as a signed 32-bit integer.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$
SELECT CASE WHEN get_byte(hash, 3) < 128
THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0)
END;
$$ LANGUAGE SQL IMMUTABLE;
CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$
SELECT locale IS NULL OR locale = '' OR locale LIKE 'en%';
$$ IMMUTABLE LANGUAGE SQL;
CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$
SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%';
$$ IMMUTABLE LANGUAGE sql;
-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'.
-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient.
CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$
SELECT CASE WHEN chr = '0'
THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90)
ELSE ascii(str) IN(ascii(chr),ascii(upper(chr)))
END;
$$ LANGUAGE SQL IMMUTABLE;

View file

@ -16,3 +16,6 @@ $PSQL -f update_indices.sql
echo "============ Updating HTML cache"
test -f .config && source ./.config
./cache-html.pl --batch=5 --delay=0.5 --maxbatches=100
echo "============ Updating database dumps"
./export.sh

34
util/export.sh Executable file
View file

@ -0,0 +1,34 @@
#!/bin/sh
# Export a dated database dump into ../dl/YYYY-MM-DD: one zstd-compressed TSV
# file per table plus the import.sql needed to restore them. Intended to be
# run from util/ (see the cron script, which invokes ./export.sh).
set -e
mkdir -p ../dl

# Only run once a week: '../dl/current' was written by the previous run and
# find only prints it when it is older than 7 days, so an empty result means
# the last dump is recent enough. The command substitution must be quoted:
# unquoted, an empty result turns the test into `[ -z ]`, which relies on
# one-argument test semantics and breaks if the path ever contains spaces.
[ -e ../dl/current ] && [ -z "$(find ../dl/current -daystart -mtime +7)" ] && exit

# Only keep the last dump: list all dated directories, drop the newest from
# the list (GNU `head -n -1`) and delete the rest. If no dated directory
# exists the glob stays literal and `rm -f` silently ignores it. Word
# splitting of the substitution is intentional here (one argument per dir).
rm -rf $(printf '%s\n' ../dl/????-??-?? | sort | head -n -1)

# Build the new dump in a hidden work directory first so ../dl never exposes
# a partially written dump.
OUT=../dl/.work
export OUT
rm -fr $OUT
mkdir -p $OUT
cp ../import.sql $OUT/import.sql
# $OUT is expanded by the shell inside the unquoted heredoc, so psql sees the
# literal output paths.
psql -wqU manned <<EOF
\copy systems to program 'zstd - -qo $OUT/systems.tsv.zst'
\copy contents (id, hash, content) to program 'zstd - -qo $OUT/contents.tsv.zst'
\copy mans to program 'zstd - -qo $OUT/mans.tsv.zst'
\copy locales to program 'zstd - -qo $OUT/locales.tsv.zst'
\copy encodings to program 'zstd - -qo $OUT/encodings.tsv.zst'
\copy packages to program 'zstd - -qo $OUT/packages.tsv.zst'
\copy package_versions to program 'zstd - -qo $OUT/package_versions.tsv.zst'
\copy files to program 'zstd - -qo $OUT/files.tsv.zst'
EOF
# Publish the finished dump under today's date and update the freshness
# marker. (`mv -T` is a GNU coreutils extension.)
DATE=$(date +%F)
mv -T $OUT ../dl/$DATE
echo $DATE >../dl/current

View file

@ -6,7 +6,7 @@ CREATE TABLE stats_cache_new AS
SELECT (SELECT count(*) FROM contents) AS hashes,
(SELECT count(distinct name) FROM mans) AS mans, *
FROM (SELECT count(*), count(distinct pkgver) FROM files) x(files, packages);
DROP TABLE stats_cache;
DROP TABLE IF EXISTS stats_cache;
ALTER TABLE stats_cache_new RENAME TO stats_cache;
COMMIT;

View file

@ -496,19 +496,14 @@ FU::get '/info/about' => sub {
<h2 id="database-download">Database download</h2>
<p>
This site is backed by a PostgreSQL database containing all the man pages.
Weekly dumps of the full database are available for download at
<a href="http://dl.manned.org/dumps/">http://dl.manned.org/dumps/</a>.
This site is backed by a PostgreSQL database containing all the metadata
and man pages. Weekly dumps of the full database are available for
download at <a href="https://dl.manned.org/">https://dl.manned.org/</a>.
<br /><br />
Be warned that the download server may not be terribly fast or reliable,
so it is advisable to use a client that supports resumption of partial
downloads. See <a href="/wget">wget's -c</a> or
<a href="/curl">curl's -C</a>.
<br /><br />
The database schema is "documented" at <a
href="https://code.blicky.net/yorhel/manned/src/branch/master/schema.sql">schema.sql</a>
in the git repo. Keep in mind that these dumps don't constitute a stable
API and, while this won't happen frequently, incompatible schema changes
The database schema is "documented" in <a
href="https://code.blicky.net/yorhel/manned/src/branch/master/import.sql">import.sql</a>.
Keep in mind that these dumps don't constitute a stable API and, while
this won't happen frequently, incompatible schema changes, format changes
or Postgres major version bumps will occasionally occur.
</p>