Add postgres package indexing + cli options
This commit is contained in:
parent
0cab758665
commit
aff68205b0
5 changed files with 376 additions and 38 deletions
|
|
@ -1,17 +1,71 @@
|
|||
#[macro_use] extern crate log;
|
||||
#[macro_use] extern crate lazy_static;
|
||||
#[macro_use] extern crate clap;
|
||||
extern crate env_logger;
|
||||
extern crate regex;
|
||||
extern crate libarchive3_sys;
|
||||
extern crate libc;
|
||||
extern crate ring;
|
||||
extern crate encoding;
|
||||
extern crate postgres;
|
||||
|
||||
mod archive;
|
||||
mod archread;
|
||||
mod man;
|
||||
mod pkg;
|
||||
|
||||
|
||||
// Convenience function to get a system id by short-name. Panics if the system doesn't exist.
|
||||
fn sysbyshort(conn: &postgres::GenericConnection, short: &str) -> i32 {
|
||||
let r = conn.query("SELECT id FROM systems WHERE short = $1", &[&short]).unwrap();
|
||||
if r.is_empty() {
|
||||
panic!("Invalid system: {}", short);
|
||||
}
|
||||
r.get(0).get(0)
|
||||
}
|
||||
|
||||
|
||||
fn main() {
|
||||
env_logger::init().unwrap();
|
||||
info!("Hello, world!");
|
||||
let arg = clap_app!(indexer =>
|
||||
(about: "Manned.org man page indexer")
|
||||
(@arg v: -v +multiple "Increase verbosity")
|
||||
(@arg host: -h +required +takes_value "PostgreSQL connection string")
|
||||
(@subcommand pkg =>
|
||||
(about: "Index a single package")
|
||||
(@arg sys: --sys +required +takes_value "System short-name")
|
||||
(@arg cat: --cat +required +takes_value "Package category")
|
||||
(@arg pkg: --pkg +required +takes_value "Package name")
|
||||
(@arg ver: --ver +required +takes_value "Package version")
|
||||
(@arg date: --date +required +takes_value "Package release date")
|
||||
(@arg FILE: +required "Package file")
|
||||
)
|
||||
).get_matches();
|
||||
|
||||
let verbose = arg.occurrences_of("v");
|
||||
env_logger::LogBuilder::new()
|
||||
.filter(Some("indexer"), match verbose {
|
||||
0 => log::LogLevelFilter::Warn,
|
||||
1 => log::LogLevelFilter::Info,
|
||||
2 => log::LogLevelFilter::Debug,
|
||||
_ => log::LogLevelFilter::Trace,
|
||||
})
|
||||
.filter(Some("postgres"), if verbose >= 4 { log::LogLevelFilter::Trace } else { log::LogLevelFilter::Info })
|
||||
.init().unwrap();
|
||||
|
||||
let db = match postgres::Connection::connect(arg.value_of("host").unwrap(), postgres::TlsMode::None) {
|
||||
Ok(x) => x,
|
||||
Err(x) => { error!("Can't connect to postgres: {}", x); return },
|
||||
};
|
||||
debug!("Connected to database");
|
||||
|
||||
if let Some(matches) = arg.subcommand_matches("pkg") {
|
||||
pkg::pkg(&db, pkg::PkgOpt {
|
||||
sys: sysbyshort(&db, matches.value_of("sys").unwrap()),
|
||||
cat: matches.value_of("cat").unwrap(),
|
||||
pkg: matches.value_of("pkg").unwrap(),
|
||||
ver: matches.value_of("ver").unwrap(),
|
||||
date: matches.value_of("date").unwrap(),
|
||||
file: matches.value_of("FILE").unwrap()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ const MIN_MAN_SIZE: u64 = 9;
|
|||
|
||||
// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page
|
||||
// location, otherwise Some((manPageName, Section, Locale)).
|
||||
fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
|
||||
pub fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
|
||||
// Roughly: man[/locale]/man1/manpage.section[.compression]+
|
||||
lazy_static! {
|
||||
static ref RE: Regex = Regex::new(r"(?x)
|
||||
|
|
@ -112,8 +112,8 @@ fn codec_from_tag(data: &Vec<u8>) -> Option<EncodingRef> {
|
|||
// latin-1 isn't in the whatwg spec under that name
|
||||
"latin-1" => Some(all::WINDOWS_1252),
|
||||
|
||||
// Waaaaaaaaah we can't decode this :(
|
||||
"armscii-8" => None,
|
||||
// armscii isn't in the whatwg spec at all
|
||||
"armscii-8" => Some(all::ARMSCII_8),
|
||||
|
||||
// Anything else should be found by its whatwg label.
|
||||
x => match encoding_from_whatwg_label(x) {
|
||||
|
|
|
|||
139
indexer/src/pkg.rs
Normal file
139
indexer/src/pkg.rs
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
use std;
|
||||
use std::io::Read;
|
||||
use postgres;
|
||||
|
||||
use archive;
|
||||
use archread;
|
||||
use man;
|
||||
use archive::Archive;
|
||||
|
||||
pub struct PkgOpt<'a> {
|
||||
pub sys: i32,
|
||||
pub cat: &'a str,
|
||||
pub pkg: &'a str,
|
||||
pub ver: &'a str,
|
||||
pub date: &'a str,
|
||||
pub file: &'a str
|
||||
}
|
||||
|
||||
|
||||
fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i32> {
|
||||
// The ON CONFLICT .. DO UPDATE is used instead of DO NOTHING because in that case the
|
||||
// RETURNING clause wouldn't give us a package id.
|
||||
let q = "INSERT INTO packages (system, category, name) VALUES($1, $2, $3)
|
||||
ON CONFLICT ON CONSTRAINT packages_system_name_category_key DO UPDATE SET name=$3 RETURNING id";
|
||||
let pkgid: i32 = match tr.query(q, &[&opt.sys, &opt.cat, &opt.pkg]) {
|
||||
Err(e) => {
|
||||
error!("Can't insert package in database: {}", e);
|
||||
return None;
|
||||
},
|
||||
Ok(r) => r.get(0).get(0),
|
||||
};
|
||||
|
||||
// TODO: option to overwrite an existing package version
|
||||
let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id";
|
||||
let verid: i32 = match tr.query(q, &[&pkgid, &opt.ver, &opt.date]) {
|
||||
Err(e) => {
|
||||
error!("Can't insert package version in database: {}", e);
|
||||
return None;
|
||||
},
|
||||
Ok(r) => r.get(0).get(0),
|
||||
};
|
||||
trace!("Package pkgid {} verid {}", pkgid, verid);
|
||||
Some(verid)
|
||||
}
|
||||
|
||||
|
||||
fn insert_man_row(tr: &postgres::GenericConnection, verid: i32, path: &str, hash: &[u8]) {
|
||||
// TODO: Store 'encoding' in the database
|
||||
let (name, sect, locale) = man::parse_path(path).unwrap();
|
||||
if let Err(e) = tr.execute(
|
||||
"INSERT INTO man (package, name, filename, locale, hash, section) VALUES ($1, $2, '/'||$3, $4, $5, $6)",
|
||||
&[&verid, &name, &path, &locale, &hash, §]
|
||||
) {
|
||||
// I think this can only happen if archread gives us the same file twice, which really
|
||||
// shouldn't happen. But I'd rather continue with an error logged than panic.
|
||||
error!("Can't insert verid {} fn {}: {}", verid, path, e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn insert_man(tr: &postgres::GenericConnection, verid: i32, paths: &[&str], ent: &mut Read) {
|
||||
let (dig, enc, cont) = match man::decode(paths, ent) {
|
||||
Err(e) => { error!("Error decoding {}: {}", paths[0], e); return },
|
||||
Ok(x) => x,
|
||||
};
|
||||
|
||||
// TODO: Overwrite entry if the contents are different? It's possible that earlier decoding
|
||||
// implementations didn't properly detect the encoding. (On the other hand, due to differences
|
||||
// in filenames it's also possible that THIS decoding step went wrong. Ugh)
|
||||
tr.execute(
|
||||
"INSERT INTO contents (hash, content) VALUES($1, $2) ON CONFLICT (hash) DO NOTHING",
|
||||
&[&dig.as_ref(), &cont]
|
||||
).unwrap();
|
||||
|
||||
for path in paths {
|
||||
insert_man_row(tr, verid, path, dig.as_ref());
|
||||
debug!("Inserted man page: {} ({})", path, enc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &str) {
|
||||
let hash = tr.query("SELECT hash FROM man WHERE package = $1 AND filename = '/'||$2", &[&verid, &dest]).unwrap();
|
||||
if hash.is_empty() { /* Can happen if man::decode() failed previously. */
|
||||
error!("Link to unindexed man page: {} -> {}", src, dest);
|
||||
return;
|
||||
}
|
||||
let hash: Vec<u8> = hash.get(0).get(0);
|
||||
insert_man_row(tr, verid, src, &hash);
|
||||
debug!("Inserted man link: {} -> {}", src, dest);
|
||||
}
|
||||
|
||||
|
||||
fn with_pkg<T,F>(file: &str, cb: F) -> std::io::Result<T>
|
||||
where F: FnOnce(Option<archive::ArchiveEntry>) -> std::io::Result<T>
|
||||
{
|
||||
// TODO: Support streaming from URLs
|
||||
// TODO: How does .deb support fit into this? (Or anything else with metadata)
|
||||
let mut f = try!(std::fs::File::open(file));
|
||||
let ent = try!(Archive::open_archive(&mut f));
|
||||
cb(ent)
|
||||
}
|
||||
|
||||
|
||||
fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> {
|
||||
let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| {
|
||||
insert_man(tr, verid, paths, ent);
|
||||
Ok(()) /* Don't propagate errors, continue handling other man pages */
|
||||
};
|
||||
|
||||
let missed = try!(
|
||||
with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) })
|
||||
).links(|src, dest| { insert_link(tr, verid, src, dest) });
|
||||
|
||||
if let Some(missed) = missed {
|
||||
warn!("Some links were missed, reading package again");
|
||||
try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) }))
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) {
|
||||
info!("Handling pkg: {} / {} / {} - {} @ {} in {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file);
|
||||
|
||||
let tr = conn.transaction().unwrap();
|
||||
tr.set_rollback();
|
||||
|
||||
let verid = match insert_pkg(&tr, &opt) { Some(x) => x, None => return };
|
||||
|
||||
match index_pkg(&tr, &opt, verid) {
|
||||
Err(e) => error!("Error reading package: {}", e),
|
||||
Ok(_) => tr.set_commit()
|
||||
}
|
||||
|
||||
if let Err(e) = tr.finish() {
|
||||
error!("Error finishing transaction: {}", e);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue