diff --git a/.gitignore b/.gitignore index 2f129b8..a23fd4b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ indexer/target web/target util/.config +dl/ diff --git a/README.md b/README.md index 05c5b33..837abfe 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,6 @@ Ironically, documentation about how things work is completely lacking. - **indexer/** -> The Rust program that scans package repositories for updates, fetches new packages and extracts the man pages. - **ManUtils/** -> Perl/XS helper module to format man pages into HTML (uses **web/**). -- **sql/** -> Database schema & updates. - **util/** -> Cron job and scripts to run **indexer/** on the right repositories. - **web/** -> Badly named Rust library to convert man pages into HTML. - **www/** -> The web front-end. diff --git a/import.sql b/import.sql new file mode 100644 index 0000000..1bec1fb --- /dev/null +++ b/import.sql @@ -0,0 +1,227 @@ +-- To import an existing database dump, make sure the data is available in +-- $table.tsv.zst files in the current directory and then run: +-- +-- psql -U manned -f import.sql +-- +-- Missing some data files is fine if you're only interested in a subset of the +-- data, error messages relating to the missing tables can be safely ignored. +-- +-- +-- To stream and import a database dump without saving the files separately: +-- +-- URL=https://dl.manned.org/2025-10-07 && curl -s $URL/import.sql \ +-- | sed "s#zstd -dc \([^ ]\+.tsv.zst\)#curl -s $URL/\\1 | zstd -d#" \ +-- | psql -U manned +-- +-- +-- To initialize a fresh, empty database: +-- +-- grep -v '^\\copy' import.sql | psql -U manned +-- +-- +-- All numeric identifiers should be considered internal and unstable. They are +-- not exposed through the web interface. +-- +-- The `stats_cache` table is missing from this file, refer to +-- util/update_indices.sql. + + +-- +-- Tables +-- + +CREATE TABLE systems ( + -- Manually assigned number. The id is also used for ordering different + -- releases of the same system, as identified by 'name'. + id integer NOT NULL, + name text NOT NULL, + release text, + short text NOT NULL +); + + +CREATE TABLE contents ( + id SERIAL NOT NULL, + -- 'hash' is the SHA1 of the man page file after decompression but *before* + -- encoding conversion and removing 0-bytes. This means taking sha1(content) + -- may not necessary match the hash, and it's possible for the same content + -- to be in the database under multiple hashes (but I suspect that's rare). + hash bytea NOT NULL, + content text NOT NULL, + html text +); + + +-- Unique man page, as identified by name & section +CREATE TABLE mans ( + id SERIAL NOT NULL, + name text NOT NULL, + section text NOT NULL +); + + +-- List of man page locales for efficient referencing. Some locales include +-- the encoding in their name, which isn't really correct or even necessary +-- since we convert everything to UTF-8 anyway, but w/e, Can fix later. +CREATE TABLE locales ( + id SMALLSERIAL NOT NULL, + locale text NOT NULL +); + + +-- List of encodings for efficient referencing. +CREATE TABLE encodings ( + id SMALLSERIAL NOT NULL, + encoding text NOT NULL +); + + +CREATE TABLE packages ( + id SERIAL NOT NULL, + system integer NOT NULL, + name text NOT NULL, + -- Whether this package has been seen in the last repository update. This + -- field is only updated for a few systems that are likely to delete packages + -- over time; non-rolling-release distros tend to not delete packages after + -- all. + -- Packages where the latest version does not have any man pages may also be + -- marked as dead even if the package is still available in the repos. + dead boolean NOT NULL DEFAULT FALSE, + -- Whether this package has at least one man page indexed in the database. + -- The indexer uses this table to keep track of which packages it has + -- already indexed, but not all packages seen by the indexer have a man page. + -- This cache helps the web front-end filter out irrelevant packages faster. + c_hasman boolean NOT NULL DEFAULT FALSE +); + + +CREATE TABLE package_versions ( + id SERIAL NOT NULL, + package integer NOT NULL, + version text NOT NULL, + released date NOT NULL, + arch text +); + + +CREATE TABLE files ( + pkgver integer NOT NULL, + man integer NOT NULL, + content integer NOT NULL, + shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash) + locale smallint NOT NULL, + -- The original encoding the man page was found in. This column isn't really + -- used at the moment, but is potentially useful when investigating encoding + -- issues. + encoding smallint NOT NULL, + filename text NOT NULL +); + + + +-- +-- Data +-- + +\copy systems from program 'zstd -dc systems.tsv.zst' +\copy contents (id, hash, content) from program 'zstd -dc contents.tsv.zst' +\copy mans from program 'zstd -dc mans.tsv.zst' +\copy locales from program 'zstd -dc locales.tsv.zst' +\copy encodings from program 'zstd -dc encodings.tsv.zst' +\copy packages from program 'zstd -dc packages.tsv.zst' +\copy package_versions from program 'zstd -dc package_versions.tsv.zst' +\copy files from program 'zstd -dc files.tsv.zst' + + + +-- +-- Primary keys & indices +-- + +ALTER TABLE systems ADD CONSTRAINT systems_pkey PRIMARY KEY (id); +ALTER TABLE contents ADD CONSTRAINT contents_pkey PRIMARY KEY (id); +ALTER TABLE mans ADD CONSTRAINT mans_pkey PRIMARY KEY (id); +ALTER TABLE locales ADD CONSTRAINT locales_pkey PRIMARY KEY (id); +ALTER TABLE encodings ADD CONSTRAINT encodings_pkey PRIMARY KEY (id); +ALTER TABLE packages ADD CONSTRAINT packages_pkey PRIMARY KEY (id); +ALTER TABLE package_versions ADD CONSTRAINT package_versions_pkey PRIMARY KEY (id); +ALTER TABLE files ADD CONSTRAINT files_pkey PRIMARY KEY (pkgver, filename); + + +CREATE UNIQUE INDEX contents_hash_key ON contents (hash); +CREATE UNIQUE INDEX mans_name_section_key ON mans (name, section); +CREATE UNIQUE INDEX locales_locale_key ON locales (locale); +CREATE UNIQUE INDEX encodings_encoding_key ON encodings (encoding); +CREATE UNIQUE INDEX packages_system_name_key ON packages (system, name) INCLUDE (id, c_hasman, dead); +CREATE UNIQUE INDEX package_versions_package_version_key ON package_versions (package, version); +CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL; +CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops); +CREATE INDEX files_man_shorthash_idx ON files (man, shorthash); +CREATE INDEX files_content_idx ON files (content); + + + +-- +-- Constraints +-- + +ALTER TABLE packages + ADD CONSTRAINT packages_system_fkey FOREIGN KEY (system) REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE; + +ALTER TABLE package_versions + ADD CONSTRAINT package_versions_package_fkey FOREIGN KEY (package) REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE; + +ALTER TABLE files + ADD CONSTRAINT files_pkgver_fkey FOREIGN KEY (pkgver) REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE, + ADD CONSTRAINT files_man_fkey FOREIGN KEY (man) REFERENCES mans(id), + ADD CONSTRAINT files_content_fkey FOREIGN KEY (content) REFERENCES contents(id), + ADD CONSTRAINT files_locale_fkey FOREIGN KEY (locale) REFERENCES locales(id), + ADD CONSTRAINT files_encoding_fkey FOREIGN KEY (encoding) REFERENCES encodings(id); + + + +-- +-- Sequences +-- + +SELECT setval('contents_id_seq', (SELECT MAX(id) FROM contents)); +SELECT setval('encodings_id_seq', (SELECT MAX(id) FROM encodings)); +SELECT setval('locales_id_seq', (SELECT MAX(id) FROM locales)); +SELECT setval('mans_id_seq', (SELECT MAX(id) FROM mans)); +SELECT setval('package_versions_id_seq', (SELECT MAX(id) FROM package_versions)); +SELECT setval('packages_id_seq', (SELECT MAX(id) FROM packages)); + + + +-- +-- Utility functions +-- + + +-- Interpret first 4 bytes of hash as a signed 32-bit integer. +-- TODO: Postgres 18 allows casting between bytea and int, see if that can be used instead. +CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$ + SELECT CASE WHEN get_byte(hash, 3) < 128 + THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) + ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) + END; +$$ IMMUTABLE LANGUAGE SQL; + + +CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$ + SELECT locale IS NULL OR locale = '' OR locale LIKE 'en%'; +$$ IMMUTABLE LANGUAGE SQL; + + +CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$ + SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%'; +$$ IMMUTABLE LANGUAGE sql; + +-- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. +-- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. +CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ + SELECT CASE WHEN chr = '0' + THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) + ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) + END; +$$ IMMUTABLE LANGUAGE SQL; diff --git a/schema.sql b/schema.sql deleted file mode 100644 index 45eb069..0000000 --- a/schema.sql +++ /dev/null @@ -1,127 +0,0 @@ -CREATE TABLE systems ( - -- Manually assigned number. The id is also used for ordering different - -- releases of the same system, as identified by 'name'. - id integer PRIMARY KEY, - name varchar NOT NULL, - release varchar, - short varchar NOT NULL -); - - -CREATE TABLE contents ( - id SERIAL PRIMARY KEY, - -- 'hash' is the SHA1 of the man page file after decompression but *before* - -- encoding conversion and removing 0-bytes. This means taking sha1(content) - -- may not necessary match the hash, and it's possible for the same content - -- to be in the database under multiple hashes (but I suspect that's rare). - hash bytea NOT NULL UNIQUE, - content text NOT NULL, - html text -); -CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL; - - --- Unique man page, as identified by name & section -CREATE TABLE mans ( - id SERIAL PRIMARY KEY, - name text NOT NULL, - section text NOT NULL, - UNIQUE(name, section) -); -CREATE INDEX mans_name ON mans USING btree(lower(name) text_pattern_ops); - - --- List of man page locales for efficient referencing. Some locales include --- the encoding in their name, which isn't really correct or even necessary --- since we convert everything to UTF-8 anyway, but w/e, Can fix later. -CREATE TABLE locales ( - id SMALLSERIAL PRIMARY KEY, - locale text NOT NULL UNIQUE -); - - --- List of encodings for efficient referencing. -CREATE TABLE encodings ( - id SMALLSERIAL PRIMARY KEY, - encoding text NOT NULL UNIQUE -); - - -CREATE TABLE packages ( - id SERIAL PRIMARY KEY, - system integer NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE, - name varchar NOT NULL, - -- Whether this package has been seen in the last repository update. This - -- field is only updated for a few systems that are likely to delete packages - -- over time; non-rolling-release distros tend to not delete packages after - -- all. - -- Packages where the latest version does not have any man pages may also be - -- marked as dead even if the package is still available in the repos. - dead boolean NOT NULL DEFAULT FALSE, - -- Whether this package has at least one man page indexed in the database. - -- The indexer uses this table to keep track of which packages it has - -- already indexed, but not all packages seen by the indexer have a man page. - -- This cache helps the web front-end filter out irrelevant packages faster. - c_hasman boolean NOT NULL DEFAULT FALSE, - UNIQUE(system, name) INCLUDE (id, c_hasman, dead) -); - - -CREATE TABLE package_versions ( - id SERIAL PRIMARY KEY, - package integer NOT NULL REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE, - version varchar NOT NULL, - released date NOT NULL, - arch varchar, - UNIQUE(package, version) -); - - -CREATE TABLE files ( - pkgver integer NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE, - man integer NOT NULL REFERENCES mans(id), - content integer NOT NULL REFERENCES content(id), - shorthash integer NOT NULL, -- cache: hash_to_shorthash(content.hash) - locale smallint NOT NULL REFERENCES locales(id) - -- The original encoding the man page was found in. This column isn't really - -- used at the moment, but is potentially useful when investigating encoding - -- issues. - encoding smallint NOT NULL REFERENCES encodings(id), - filename text NOT NULL, - PRIMARY KEY(pkgver, filename) -); -CREATE INDEX ON files (man, shorthash); -CREATE INDEX ON files (content); - - --- For stats_cache -\i util/update_indices.sql - - - --- Interpret first 4 bytes of hash as a signed 32-bit integer. -CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer AS $$ - SELECT CASE WHEN get_byte(hash, 3) < 128 - THEN (get_byte(hash, 3)::int<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) - ELSE -2147483648 + ((get_byte(hash, 3)::int - 128)<<24) + (get_byte(hash, 2)::int<<16) + (get_byte(hash, 1)::int<<8) + get_byte(hash, 0) - END; -$$ LANGUAGE SQL IMMUTABLE; - - -CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool AS $$ - SELECT locale IS NULL OR locale = '' OR locale LIKE 'en%'; -$$ IMMUTABLE LANGUAGE SQL; - - -CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool AS $$ - SELECT path LIKE '/usr/share/man/man%' OR path LIKE '/usr/local/man/man%'; -$$ IMMUTABLE LANGUAGE sql; - --- Convenient function to match the first character of a string. Second argument must be lowercase 'a'-'z' or '0'. --- Postgres can inline and partially evaluate this function into the query plan, so it's fairly efficient. -CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean AS $$ - SELECT CASE WHEN chr = '0' - THEN (ascii(str) < 97 OR ascii(str) > 122) AND (ascii(str) < 65 OR ascii(str) > 90) - ELSE ascii(str) IN(ascii(chr),ascii(upper(chr))) - END; -$$ LANGUAGE SQL IMMUTABLE; diff --git a/util/cron.sh b/util/cron.sh index 4487061..07cb0b4 100755 --- a/util/cron.sh +++ b/util/cron.sh @@ -16,3 +16,6 @@ $PSQL -f update_indices.sql echo "============ Updating HTML cache" test -f .config && source ./.config ./cache-html.pl --batch=5 --delay=0.5 --maxbatches=100 + +echo "============ Updating database dumps" +./export.sh diff --git a/util/export.sh b/util/export.sh new file mode 100755 index 0000000..8b3619b --- /dev/null +++ b/util/export.sh @@ -0,0 +1,34 @@ +#!/bin/sh + +set -e + +mkdir -p ../dl + +# Only run once a week +[ -e ../dl/current ] && [ -z $(find ../dl/current -daystart -mtime +7) ] && exit + +# Only keep the last dump +rm -rf $(printf '%s\n' ../dl/????-??-?? | sort | head -n -1) + +# Create a new dump +OUT=../dl/.work +export OUT +rm -fr $OUT +mkdir -p $OUT + +cp ../import.sql $OUT/import.sql + +psql -wqU manned <../dl/current diff --git a/util/update_indices.sql b/util/update_indices.sql index d1f510d..aa00fdc 100644 --- a/util/update_indices.sql +++ b/util/update_indices.sql @@ -6,7 +6,7 @@ CREATE TABLE stats_cache_new AS SELECT (SELECT count(*) FROM contents) AS hashes, (SELECT count(distinct name) FROM mans) AS mans, * FROM (SELECT count(*), count(distinct pkgver) FROM files) x(files, packages); -DROP TABLE stats_cache; +DROP TABLE IF EXISTS stats_cache; ALTER TABLE stats_cache_new RENAME TO stats_cache; COMMIT; diff --git a/www/index.pl b/www/index.pl index 90ab5d4..7af2197 100755 --- a/www/index.pl +++ b/www/index.pl @@ -496,19 +496,14 @@ FU::get '/info/about' => sub {

Database download

- This site is backed by a PostgreSQL database containing all the man pages. - Weekly dumps of the full database are available for download at - http://dl.manned.org/dumps/. + This site is backed by a PostgreSQL database containing all the metadata + and man pages. Weekly dumps of the full database are available for + download at https://dl.manned.org/.

- Be warned that the download server may not be terribly fast or reliable, - so it is advisable to use a client that supports resumption of partial - downloads. See wget's -c or - curl's -C. -

- The database schema is "documented" at schema.sql - in the git repo. Keep in mind that these dumps don't constitute a stable - API and, while this won't happen frequently, incompatible schema changes + The database schema is "documented" in import.sql. + Keep in mind that these dumps don't constitute a stable API and, while + this won't happen frequently, incompatible schema changes, format changes or Postgres major version bumps will occassionally occur.