manned/schema.sql
Yorhel d3bebc8888 Add support for caching HTML-rendered pages
Downside is that this consumes significant disk space, requires
recreating the entire cache when changing something to the way that
pages are rendered and removes flexibility to add dynamic
render-influencing settings in the future.

Alas, crawlers are getting more aggressive and I don't like the idea of
adding more invasive anti-bot tech.
This might not be enough in the long term; we also have a few slow SQL
queries that I'm not yet sure how to optimize. But this ought to give us
more time, at least.
2025-05-25 14:41:15 +02:00

127 lines
4.8 KiB
PL/PgSQL

-- One row per release of a system; rows belonging to the same system share
-- the same 'name'.
CREATE TABLE systems (
  -- Assigned by hand, not generated. Besides identifying the row, the id
  -- determines the ordering of the releases that share a 'name'.
  id      integer PRIMARY KEY,
  name    varchar NOT NULL,
  release varchar,             -- optional
  short   varchar NOT NULL
);
-- Man page contents, one row per distinct hash.
CREATE TABLE contents (
  id      SERIAL PRIMARY KEY,
  -- SHA1 of the man page file, taken after decompression but *before*
  -- encoding conversion and removal of 0-bytes. As a consequence,
  -- sha1(content) does not necessarily equal 'hash', and identical content
  -- can end up stored under more than one hash (suspected to be rare).
  hash    bytea NOT NULL UNIQUE,
  content text  NOT NULL,
  -- NULL until filled in; presumably the cached HTML rendering of 'content'.
  html    text
);

-- Partial index that only contains the rows whose 'html' is still NULL.
CREATE INDEX contents_nohtml ON contents (id)
  WHERE html IS NULL;
-- A man page is uniquely identified by the combination of its name and
-- section; every such combination gets exactly one row.
CREATE TABLE mans (
  id      SERIAL PRIMARY KEY,
  name    text NOT NULL,
  section text NOT NULL,
  UNIQUE (name, section)
);

-- Expression index on the lowercased name. The text_pattern_ops operator
-- class lets the index serve pattern matches such as LIKE 'prefix%'.
CREATE INDEX mans_name ON mans
  USING btree (lower(name) text_pattern_ops);
-- Lookup table of man page locales, so that other tables can refer to a
-- locale by a compact id. Some locale names carry an encoding suffix; that is
-- neither really correct nor necessary because everything is converted to
-- UTF-8 anyway. This can be cleaned up later.
CREATE TABLE locales (
  id     SMALLSERIAL PRIMARY KEY,
  locale text NOT NULL UNIQUE
);
-- Lookup table of encodings, so that other tables can refer to an encoding
-- by a compact id.
CREATE TABLE encodings (
  id       SMALLSERIAL PRIMARY KEY,
  encoding text NOT NULL UNIQUE
);
-- Packages, each belonging to exactly one system. Removing or renumbering a
-- system cascades to its packages.
CREATE TABLE packages (
  id       SERIAL PRIMARY KEY,
  system   integer NOT NULL
             REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE,
  name     varchar NOT NULL,
  -- TRUE when the package was not seen in the last repository update. Only
  -- maintained for the few systems that are likely to drop packages over
  -- time; non-rolling-release distros tend not to delete packages at all.
  -- A package whose latest version ships no man pages may be flagged as dead
  -- as well, even though it is still available in the repos.
  dead     boolean NOT NULL DEFAULT FALSE,
  -- Cache: TRUE when at least one man page of this package is indexed in the
  -- database. The indexer records every package it has processed in this
  -- table, including those without any man page, so this flag lets the web
  -- front-end skip the irrelevant ones faster.
  c_hasman boolean NOT NULL DEFAULT FALSE,
  -- The unique index also carries id, c_hasman and dead as non-key columns.
  UNIQUE (system, name) INCLUDE (id, c_hasman, dead)
);
-- Versions of a package; a version string occurs at most once per package.
-- Removing or renumbering a package cascades to its versions.
CREATE TABLE package_versions (
  id       SERIAL PRIMARY KEY,
  package  integer NOT NULL
             REFERENCES packages(id) ON DELETE CASCADE ON UPDATE CASCADE,
  version  varchar NOT NULL,
  released date    NOT NULL,
  arch     varchar,            -- optional
  UNIQUE (package, version)
);
-- Man page files found in a package version. Each row ties a file name inside
-- a package version to the man page it represents and to its stored contents.
-- Removing or renumbering a package version cascades to its files.
CREATE TABLE files (
  pkgver    integer  NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE ON UPDATE CASCADE,
  man       integer  NOT NULL REFERENCES mans(id),
  -- Must reference the 'contents' table; there is no table named 'content'.
  content   integer  NOT NULL REFERENCES contents(id),
  shorthash integer  NOT NULL, -- cache: hash_to_shorthash(contents.hash)
  locale    smallint NOT NULL REFERENCES locales(id),
  -- The original encoding the man page was found in. This column isn't really
  -- used at the moment, but is potentially useful when investigating encoding
  -- issues.
  encoding  smallint NOT NULL REFERENCES encodings(id),
  filename  text     NOT NULL,
  PRIMARY KEY (pkgver, filename)
);
-- Secondary indexes on files. Neither is given an explicit name, so Postgres
-- picks the index names.
CREATE INDEX ON files (man, shorthash);
CREATE INDEX ON files (content);
-- For stats_cache
-- NOTE(review): stats_cache is not defined in the visible part of this file;
-- presumably the included script sets up what it needs -- confirm.
-- \i is a psql meta-command, so this file has to be loaded through psql.
\i util/update_indices.sql
-- Read the first 4 bytes of 'hash' as a little-endian, signed 32-bit integer:
-- byte 0 is the least significant byte, byte 3 holds the sign bit.
CREATE OR REPLACE FUNCTION hash_to_shorthash(hash bytea) RETURNS integer
LANGUAGE SQL IMMUTABLE AS $$
  SELECT CASE WHEN get_byte(hash, 3) >= 128
    -- Sign bit set: start from the most negative integer and add the
    -- remaining 31 bits, so that no intermediate value exceeds the int range.
    THEN -2147483648
         + ((get_byte(hash, 3) - 128) << 24)
         + (get_byte(hash, 2) << 16)
         + (get_byte(hash, 1) << 8)
         + get_byte(hash, 0)
    -- Sign bit clear: plain positional sum of the four bytes.
    ELSE (get_byte(hash, 3) << 24)
         + (get_byte(hash, 2) << 16)
         + (get_byte(hash, 1) << 8)
         + get_byte(hash, 0)
  END;
$$;
-- TRUE when 'locale' is NULL, the empty string, or starts with 'en'
-- (case-sensitive).
CREATE OR REPLACE FUNCTION is_english_locale(locale text) RETURNS bool
LANGUAGE SQL IMMUTABLE AS $$
  SELECT locale LIKE 'en%'
      OR locale = ''
      OR locale IS NULL;
$$;
-- TRUE when 'path' starts with one of the two recognised man directory
-- prefixes. Yields NULL for a NULL path.
CREATE OR REPLACE FUNCTION is_standard_man_location(path text) RETURNS bool
LANGUAGE sql IMMUTABLE AS $$
  SELECT path LIKE ANY (ARRAY[
    '/usr/share/man/man%',
    '/usr/local/man/man%'
  ]);
$$;
-- Test the first character of 'str' against 'chr', which must be a lowercase
-- letter 'a'-'z' or '0'. A letter matches case-insensitively; '0' matches any
-- first character that is not an ASCII letter.
-- Postgres can inline and partially evaluate this function into the query
-- plan, so it's fairly efficient.
CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text) RETURNS boolean
LANGUAGE SQL IMMUTABLE AS $$
  SELECT CASE WHEN chr = '0'
    -- 97-122 is 'a'-'z', 65-90 is 'A'-'Z'.
    THEN (ascii(str) NOT BETWEEN 97 AND 122)
     AND (ascii(str) NOT BETWEEN 65 AND 90)
    ELSE ascii(str) = ascii(chr)
      OR ascii(str) = ascii(upper(chr))
  END;
$$;