diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs
index 4d0fe0c..654de1a 100644
--- a/indexer/src/pkg.rs
+++ b/indexer/src/pkg.rs
@@ -51,10 +51,14 @@ pub struct PkgOpt<'a> {
 fn insert_pkg(tr: &mut postgres::Transaction, opt: &PkgOpt) -> Option<i32> {
     let pkginfo = format!("sys {} / {} / {} - {} @ {:?} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path);
 
-    // The ON CONFLICT .. DO UPDATE is used instead of DO NOTHING because in that case the
-    // RETURNING clause wouldn't give us a package id.
-    let q = "INSERT INTO packages (system, category, name) VALUES($1, $2, $3)
-        ON CONFLICT ON CONSTRAINT packages_system_name_category_key DO UPDATE SET name=$3 RETURNING id";
+    // Use a custom CTE-based insert-or-update. Using an INSERT with an ON CONFLICT clause would be
+    // easier, but has the downside of allocating a new package id even if one already exists.
+    // The separate UPDATE query makes sure to unflag the package as dead while not causing any
+    // database writes when the row's already fine.
+    let q = "WITH p(id) AS (SELECT id FROM packages WHERE system = $1 AND category = $2 AND name = $3),
+        u AS (UPDATE packages SET dead = FALSE FROM p WHERE packages.id = p.id AND dead),
+        i(id) AS (INSERT INTO packages (system, category, name) SELECT $1, $2, $3 WHERE NOT EXISTS(SELECT 1 FROM p) RETURNING id)
+        SELECT id FROM p UNION SELECT id FROM i";
     let pkgid: i32 = match tr.query_one(q, &[&opt.sys, &opt.cat, &opt.pkg]) {
         Err(e) => {
             error!("Can't insert package in database: {}", e);
diff --git a/indexer/src/sys_arch.rs b/indexer/src/sys_arch.rs
index 665a220..4c0ea41 100644
--- a/indexer/src/sys_arch.rs
+++ b/indexer/src/sys_arch.rs
@@ -1,5 +1,6 @@
 use std::str::FromStr;
 use std::io::{Read,BufRead,BufReader,Result};
+use std::collections::HashSet;
 use regex::Regex;
 use chrono::NaiveDateTime;
 use postgres;
@@ -91,6 +92,7 @@ pub fn sync<T: postgres::GenericClient>(pg: &mut T, sys: i32, mirror: &str, repo
 
     let mut hasman = false;
     let mut meta = None;
+    let mut allpkgs = HashSet::new();
     let r = archive::walk(ent, |x| {
         if x.filetype() == archive::FileType::Directory {
             hasman = false;
@@ -120,6 +122,7 @@ pub fn sync<T: postgres::GenericClient>(pg: &mut T, sys: i32, mirror: &str, repo
                     canbelocal: false,
                 },
             });
+            allpkgs.insert(m.name.into_boxed_str());
         }
 
         Ok(true)
@@ -128,4 +131,43 @@ pub fn sync<T: postgres::GenericClient>(pg: &mut T, sys: i32, mirror: &str, repo
     if let Err(e) = r {
         error!("Error reading package index: {}", e);
+        // The index was only partially read, so allpkgs is incomplete. Marking
+        // packages dead now would wrongly flag packages that are still available.
+        return;
     }
+    mark_dead(pg, sys, repo, allpkgs);
+}
+
+/// Flag packages that are in the database but were not seen in the repository as dead.
+///
+/// `pkgs` holds the names of all packages collected while reading the index of
+/// `repo`; every non-dead package of `sys`/`repo` whose name is not in that set gets
+/// `dead = TRUE`. Database errors are logged with `error!` and abort the operation
+/// instead of panicking.
+fn mark_dead<T: postgres::GenericClient>(pg: &mut T, sys: i32, repo: &str, pkgs: HashSet<Box<str>>) {
+    let rows = match pg.query("SELECT id, name FROM packages WHERE system = $1 AND category = $2 AND NOT dead", &[&sys,&repo]) {
+        Ok(rows) => rows,
+        Err(e) => {
+            error!("Can't fetch packages from database: {}", e);
+            return;
+        },
+    };
+
+    let mut dead = Vec::new();
+    for row in rows {
+        let id: i32 = row.get(0);
+        let name: &str = row.get(1);
+        if !pkgs.contains(name) {
+            info!("Package not available in repository anymore, marking dead; sys {} / {} / pkg {} ({})", sys, repo, id, name);
+            dead.push(id);
+        }
+    }
+    if dead.is_empty() {
+        return;
+    }
+
+    // A single statement is atomic, so flagging all rows at once needs no explicit
+    // transaction and avoids one round trip per package.
+    if let Err(e) = pg.execute("UPDATE packages SET dead = TRUE WHERE id = ANY($1)", &[&dead]) {
+        error!("Can't mark packages as dead: {}", e);
+    }
 }
diff --git a/sql/schema.sql b/sql/schema.sql
index c81d284..2ab0ca9 100644
--- a/sql/schema.sql
+++ b/sql/schema.sql
@@ -19,8 +19,15 @@ CREATE TABLE contents (
 CREATE TABLE packages (
   id       SERIAL PRIMARY KEY,
   system   integer NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE,
-  category varchar,
+  category varchar NOT NULL,
   name     varchar NOT NULL,
+  -- Whether this package has been seen in the last repository update. This
+  -- field is only updated for a few systems that are likely to delete packages
+  -- over time; non-rolling-release distros tend to not delete packages after
+  -- all.
+  -- Packages where the latest version does not have any man pages may also be
+  -- marked as dead even if the package is still available in the repos.
+  dead     boolean NOT NULL DEFAULT FALSE,
   UNIQUE(system, name, category) -- Note the order, lookups on (system,name) are common
 );
 
diff --git a/sql/update-2021-12-13b.sql b/sql/update-2021-12-13b.sql
new file mode 100644
index 0000000..9dad7ae
--- /dev/null
+++ b/sql/update-2021-12-13b.sql
@@ -0,0 +1,3 @@
+ALTER TABLE packages
+    ALTER COLUMN category SET NOT NULL,
+    ADD COLUMN dead boolean NOT NULL DEFAULT FALSE;
diff --git a/www/index.pl b/www/index.pl
index acd7ec7..c6143da 100755
--- a/www/index.pl
+++ b/www/index.pl
@@ -781,10 +781,10 @@ TUWF::get qr{/pkg/([^/]+)} => sub {
         p => { onerror => 1, uint => 1, range => [1,200] },
     )->data;
 
-    my $where = sql 'system =', \$sys->{id}, $f->{c} ne 'all' ? ('AND match_firstchar(name,', \$f->{c}, ')') : ();
+    my $where = sql 'NOT dead AND system =', \$sys->{id}, $f->{c} ne 'all' ? ('AND match_firstchar(name,', \$f->{c}, ')') : ();
     my $count = tuwf->dbVali('SELECT count(*) FROM', $packages_with_man, 'p WHERE', $where);
     my $pkg = tuwf->dbPagei({ results => 200, page => $f->{p} },
-        'SELECT id, system, name, category FROM', $packages_with_man, 'p WHERE', $where, 'ORDER BY name, category'
+        'SELECT id, system, name, category, dead FROM', $packages_with_man, 'p WHERE', $where, 'ORDER BY name, category'
     );
 
     my $title = $sys->{name}.($sys->{release}?" $sys->{release}":"");