-- manned/sql/schema.sql

-- One row per release of a system; rows that share 'name' are different
-- releases of the same system.
CREATE TABLE systems (
  -- Manually assigned number. The id is also used for ordering different
  -- releases of the same system, as identified by 'name'.
  id       integer PRIMARY KEY,
  -- Identifies the system; shared between all releases of that system.
  name     varchar NOT NULL,
  -- Release label. Nullable -- presumably for systems that have no distinct
  -- releases (TODO confirm).
  release  varchar,
  -- Short identifier for this row. NOTE(review): no UNIQUE constraint is
  -- declared here; confirm whether callers rely on it being unique.
  short    varchar NOT NULL
);

-- Man page contents, stored once per hash and referenced from man.hash.
CREATE TABLE contents (
  -- 'hash' is the SHA1 of the man page file after decompression but *before*
  -- encoding conversion and removing 0-bytes. This means taking sha1(content)
  -- may not necessarily match the hash, and it's possible for the same content
  -- to be in the database under multiple hashes (but I suspect that's rare).
  hash    bytea      PRIMARY KEY,
  -- The man page text as stored, i.e. after the encoding conversion and
  -- 0-byte removal mentioned above.
  content varchar    NOT NULL
);

-- A package within a system; unique per (system, name, category).
CREATE TABLE packages (
  id       SERIAL    PRIMARY KEY,
  -- Owning system. Deleting the system deletes its packages, and a change of
  -- systems.id is propagated to this column.
  system   integer   NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE,
  category varchar   NOT NULL,
  name     varchar   NOT NULL,
  -- Whether this package has been seen in the last repository update. This
  -- field is only updated for a few systems that are likely to delete packages
  -- over time; non-rolling-release distros tend to not delete packages after
  -- all.
  -- Packages where the latest version does not have any man pages may also be
  -- marked as dead even if the package is still available in the repos.
  dead     boolean   NOT NULL DEFAULT FALSE,
  UNIQUE(system, name, category) -- Note the order, lookups on (system,name) are common
);

-- A single version of a package; each (package, version) pair is stored once.
CREATE TABLE package_versions (
  id       SERIAL    PRIMARY KEY,
  -- Owning package; versions are deleted together with their package.
  package  integer   NOT NULL REFERENCES packages(id) ON DELETE CASCADE,
  version  varchar   NOT NULL,
  -- Calendar date (no time component) on which this version was released.
  released date      NOT NULL,
  -- Architecture label; nullable. NOTE(review): arch is not part of the
  -- unique key below, so a version can be stored for one arch only --
  -- confirm that this is intended.
  arch     varchar,
  UNIQUE(package, version)
);

-- One row per man page file in a package version; a file is identified by
-- (package, filename).
CREATE TABLE man (
  -- Despite the column name, this references package_versions(id), not
  -- packages(id). Rows are deleted together with their package version.
  package  integer   NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE,
  -- Presumably the page name derived from 'filename' (cf.
  -- name_from_filename()) -- TODO confirm against the code that fills this.
  name     varchar   NOT NULL,
  filename varchar   NOT NULL,
  -- Nullable; is_english_locale() treats a NULL locale as English.
  locale   varchar,
  -- Content of this file. No ON DELETE action is given, so a contents row
  -- cannot be deleted while a man row still references it.
  hash     bytea     NOT NULL REFERENCES contents(hash),
  -- Presumably the man section derived from 'filename' (cf.
  -- section_from_filename()) -- TODO confirm.
  section  varchar   NOT NULL,
  encoding varchar,
  UNIQUE(package, filename)
);

-- Finds all man rows that share a content hash. This is also the referencing
-- side of the foreign key to contents(hash).
CREATE INDEX ON man (hash);
-- Supports lookups of man rows by exact name.
CREATE INDEX ON man (name);


-- Distinct (name, section) pairs copied from man at the moment this statement
-- runs. CREATE TABLE AS stores a snapshot; nothing visible here keeps it in
-- sync with man, so it is presumably rebuilt elsewhere -- TODO confirm.
CREATE TABLE man_index AS SELECT DISTINCT name, section FROM man;
-- Expression index on lower(name). The text_pattern_ops operator class lets it
-- serve left-anchored LIKE patterns independent of the database locale.
CREATE INDEX ON man_index USING btree(lower(name) text_pattern_ops);

-- Single-row table holding aggregate counts over man, computed once when this
-- statement runs (CREATE TABLE AS stores a snapshot of the query result):
--   hashes   - number of distinct content hashes
--   mans     - number of distinct man page names
--   files    - number of rows in man
--   packages - number of distinct man.package values
CREATE TABLE stats_cache AS
  SELECT
    count(DISTINCT hash)    AS hashes,
    count(DISTINCT name)    AS mans,
    count(*)                AS files,
    count(DISTINCT package) AS packages
  FROM man;


-- Removes any path components and compression extensions from the filename.
--
--   fn       file path, e.g. '/usr/share/man/man1/ls.1.gz'
--   returns  the last path component with every trailing '.gz', '.lzma' or
--            '.bz2' suffix removed, including stacked ones such as
--            '.gz.bz2'; e.g. 'ls.1'. NULL input yields NULL.
--
-- The suffixes are stripped with a single end-anchored regex instead of a
-- "strip until nothing changes" loop: a loop that exits on "tmp = ret" never
-- terminates for NULL input, because that comparison evaluates to NULL.
--
-- The result depends only on the argument, hence IMMUTABLE.
CREATE OR REPLACE FUNCTION basename_from_filename(fn text) RETURNS text AS $$
  SELECT regexp_replace(
    -- Keep only what follows the last '/'. The pattern needs at least one
    -- character before that slash.
    regexp_replace(fn, '^.+/([^/]+)', E'\\1'),
    -- One or more compression extensions at the very end of the name.
    E'(?:\\.(?:gz|lzma|bz2))+$',
    ''
  );
$$ LANGUAGE sql IMMUTABLE;


-- Returns the part of the basename (see basename_from_filename) that follows
-- its last '.'. If the basename has no '.' preceded by at least one
-- character, the pattern does not match and the basename is returned as is.
CREATE OR REPLACE FUNCTION section_from_filename(text)
RETURNS text
LANGUAGE sql
AS $$
  SELECT regexp_replace(
    basename_from_filename($1),
    E'^.+\\.([^.]+)$',
    E'\\1'
  );
$$;


-- Returns the part of the basename (see basename_from_filename) that precedes
-- its last '.'. If the basename has no '.' with at least one character on
-- both sides, the pattern does not match and the basename is returned as is.
CREATE OR REPLACE FUNCTION name_from_filename(text)
RETURNS text
LANGUAGE sql
AS $$
  SELECT regexp_replace(
    basename_from_filename($1),
    E'^(.+)\\.[^.]+$',
    E'\\1'
  );
$$;


-- TRUE when the locale counts as English: either no locale is given (NULL)
-- or the locale string starts with 'en'.
CREATE OR REPLACE FUNCTION is_english_locale(locale text)
RETURNS bool
LANGUAGE sql
IMMUTABLE
AS $$
  SELECT locale LIKE 'en%'
      OR locale IS NULL;
$$;


-- TRUE when the path starts with '/usr/share/man/man' or '/usr/local/man/man'.
-- A NULL path yields NULL.
CREATE OR REPLACE FUNCTION is_standard_man_location(path text)
RETURNS bool
LANGUAGE sql
IMMUTABLE
AS $$
  SELECT path LIKE '/usr/local/man/man%'
      OR path LIKE '/usr/share/man/man%';
$$;

-- Tests the first character of 'str' against 'chr', which must be a lowercase
-- letter 'a'-'z' or '0':
--   chr = '0'   -> TRUE when the first character is not an ASCII letter
--   chr = a..z  -> TRUE when the first character is that letter, in either case
-- Written as a single SQL expression so that Postgres can inline it into the
-- calling query and partially evaluate it there, which keeps it fairly cheap.
CREATE OR REPLACE FUNCTION match_firstchar(str text, chr text)
RETURNS boolean
LANGUAGE sql
IMMUTABLE
AS $$
  SELECT CASE
    WHEN chr = '0' THEN
      -- 97..122 is 'a'..'z', 65..90 is 'A'..'Z'.
      ascii(str) NOT BETWEEN 97 AND 122
      AND ascii(str) NOT BETWEEN 65 AND 90
    ELSE
      ascii(str) = ascii(chr)
      OR ascii(str) = ascii(upper(chr))
  END;
$$;