From b27d55215af58b7a4e6e707a87ee450a8c06aeda Mon Sep 17 00:00:00 2001
From: Yorhel
Date: Mon, 13 Dec 2021 08:18:13 +0100
Subject: [PATCH] Arch: Mark deleted packages as dead and hide them from listings

We've got a lot of packages in the DB that have long been removed from
the Arch repos. These are still indexed, but won't clutter the package
listing anymore.

Also fixed an issue with packages.id numbers getting rather large
because the indexer allocates a new ID for every package on every
update.
---
 indexer/src/pkg.rs         | 12 ++++++++----
 indexer/src/sys_arch.rs    | 29 +++++++++++++++++++++++++++++
 sql/schema.sql             |  9 ++++++++-
 sql/update-2021-12-13b.sql |  3 +++
 www/index.pl               |  4 ++--
 5 files changed, 50 insertions(+), 7 deletions(-)
 create mode 100644 sql/update-2021-12-13b.sql

diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs
index 4d0fe0c..654de1a 100644
--- a/indexer/src/pkg.rs
+++ b/indexer/src/pkg.rs
@@ -51,10 +51,14 @@ pub struct PkgOpt<'a> {
 fn insert_pkg(tr: &mut postgres::Transaction, opt: &PkgOpt) -> Option<i32> {
     let pkginfo = format!("sys {} / {} / {} - {} @ {:?} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path);

-    // The ON CONFLICT .. DO UPDATE is used instead of DO NOTHING because in that case the
-    // RETURNING clause wouldn't give us a package id.
-    let q = "INSERT INTO packages (system, category, name) VALUES($1, $2, $3)
-        ON CONFLICT ON CONSTRAINT packages_system_name_category_key DO UPDATE SET name=$3 RETURNING id";
+    // Use a custom CTE-based insert-or-update. Using an INSERT with an ON CONFLICT clause would be
+    // easier, but has the downside of allocating a new package id even if one already exists.
+    // The separate UPDATE query makes sure to unflag the package as dead while not causing any
+    // database writes when the row's already fine.
+    let q = "WITH p(id) AS (SELECT id FROM packages WHERE system = $1 AND category = $2 AND name = $3),
+                  u AS (UPDATE packages SET dead = FALSE FROM p WHERE packages.id = p.id AND dead),
+                  i(id) AS (INSERT INTO packages (system, category, name) SELECT $1, $2, $3 WHERE NOT EXISTS(SELECT 1 FROM p) RETURNING id)
+             SELECT id FROM p UNION SELECT id FROM i";
     let pkgid: i32 = match tr.query_one(q, &[&opt.sys, &opt.cat, &opt.pkg]) {
         Err(e) => {
             error!("Can't insert package in database: {}", e);
diff --git a/indexer/src/sys_arch.rs b/indexer/src/sys_arch.rs
index 665a220..4c0ea41 100644
--- a/indexer/src/sys_arch.rs
+++ b/indexer/src/sys_arch.rs
@@ -1,5 +1,6 @@
 use std::str::FromStr;
 use std::io::{Read,BufRead,BufReader,Result};
+use std::collections::HashSet;
 use regex::Regex;
 use chrono::NaiveDateTime;
 use postgres;
@@ -91,6 +92,7 @@ pub fn sync<T: postgres::GenericClient>(pg: &mut T, sys: i32, mirror: &str, repo

     let mut hasman = false;
     let mut meta = None;
+    let mut allpkgs = HashSet::new();
     let r = archive::walk(ent, |x| {
         if x.filetype() == archive::FileType::Directory {
             hasman = false;
@@ -120,6 +122,7 @@ pub fn sync<T: postgres::GenericClient>(pg: &mut T, sys: i32, mirror: &str, repo
                     canbelocal: false,
                 },
             });
+            allpkgs.insert(m.name.into_boxed_str());
         }

         Ok(true)
@@ -128,4 +131,30 @@ pub fn sync<T: postgres::GenericClient>(pg: &mut T, sys: i32, mirror: &str, repo
     if let Err(e) = r {
         error!("Error reading package index: {}", e);
     }
+    mark_dead(pg, sys, repo, allpkgs);
+}
+
+fn mark_dead<T: postgres::GenericClient>(pg: &mut T, sys: i32, repo: &str, pkgs: HashSet<Box<str>>) {
+    let mut dead = Vec::new();
+    for row in pg.query("SELECT id, name FROM packages WHERE system = $1 AND category = $2 AND NOT dead", &[&sys,&repo]).unwrap() {
+        let id: i32 = row.get(0);
+        let name: &str = row.get(1);
+        if !pkgs.contains(name) {
+            info!("Package not available in database anymore, marking dead; sys {} / {} / pkg {} ({})", sys, repo, id, name);
+            dead.push(id);
+        }
+    }
+    if dead.is_empty() {
+        return;
+    }
+
+    let mut tr = pg.transaction().unwrap();
+    let q = tr.prepare("UPDATE packages SET dead = TRUE WHERE id = $1").unwrap();
+    for id in dead {
+        tr.execute(&q, &[&id]).unwrap();
+    }
+
+    if let Err(e) = tr.commit() {
+        error!("Error finishing transaction: {}", e);
+    }
 }
diff --git a/sql/schema.sql b/sql/schema.sql
index c81d284..2ab0ca9 100644
--- a/sql/schema.sql
+++ b/sql/schema.sql
@@ -19,8 +19,15 @@ CREATE TABLE contents (
 CREATE TABLE packages (
   id SERIAL PRIMARY KEY,
   system integer NOT NULL REFERENCES systems(id) ON DELETE CASCADE ON UPDATE CASCADE,
-  category varchar,
+  category varchar NOT NULL,
   name varchar NOT NULL,
+  -- Whether this package has been seen in the last repository update. This
+  -- field is only updated for a few systems that are likely to delete packages
+  -- over time; non-rolling-release distros tend to not delete packages after
+  -- all.
+  -- Packages where the latest version does not have any man pages may also be
+  -- marked as dead even if the package is still available in the repos.
+  dead boolean NOT NULL DEFAULT FALSE,
   UNIQUE(system, name, category) -- Note the order, lookups on (system,name) are common
 );

diff --git a/sql/update-2021-12-13b.sql b/sql/update-2021-12-13b.sql
new file mode 100644
index 0000000..9dad7ae
--- /dev/null
+++ b/sql/update-2021-12-13b.sql
@@ -0,0 +1,3 @@
+ALTER TABLE packages
+    ALTER COLUMN category SET NOT NULL,
+    ADD COLUMN dead boolean NOT NULL DEFAULT FALSE;
diff --git a/www/index.pl b/www/index.pl
index acd7ec7..c6143da 100755
--- a/www/index.pl
+++ b/www/index.pl
@@ -781,10 +781,10 @@ TUWF::get qr{/pkg/([^/]+)} => sub {
         p => { onerror => 1, uint => 1, range => [1,200] },
     )->data;

-    my $where = sql 'system =', \$sys->{id}, $f->{c} ne 'all' ? ('AND match_firstchar(name,', \$f->{c}, ')') : ();
+    my $where = sql 'NOT dead AND system =', \$sys->{id}, $f->{c} ne 'all' ? ('AND match_firstchar(name,', \$f->{c}, ')') : ();
     my $count = tuwf->dbVali('SELECT count(*) FROM', $packages_with_man, 'p WHERE', $where);
     my $pkg = tuwf->dbPagei({ results => 200, page => $f->{p} },
-        'SELECT id, system, name, category FROM', $packages_with_man, 'p WHERE', $where, 'ORDER BY name, category'
+        'SELECT id, system, name, category, dead FROM', $packages_with_man, 'p WHERE', $where, 'ORDER BY name, category'
     );

     my $title = $sys->{name}.($sys->{release}?" $sys->{release}":"");