diff --git a/indexer/src/main.rs b/indexer/src/main.rs index ed6f200..47142ee 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -35,6 +35,7 @@ fn main() { let arg = clap_app!(indexer => (about: "Manned.org man page indexer") (@arg v: -v +multiple "Increase verbosity") + (@arg dry: --dryrun "Don't actually download and index packages") (@subcommand pkg => (about: "Index a single package") (@arg force: --force "Overwrite existing indexed package") @@ -56,11 +57,13 @@ fn main() { (about: "Index a Debian repository") (@arg sys: --sys +required +takes_value "System short-name") (@arg mirror: --mirror +required +takes_value "Mirror URL") - (@arg contents: --contents +required +takes_value "Contents file") + (@arg contents: --contents +takes_value "Contents file") (@arg packages: --packages +required +takes_value "Packages file") ) ).get_matches(); + unsafe { pkg::DRY_RUN = arg.is_present("dry") }; + let verbose = arg.occurrences_of("v"); env_logger::LogBuilder::new() .filter(Some("indexer"), match verbose { @@ -85,7 +88,7 @@ fn main() { Ok(x) => x, Err(x) => { error!("Can't connect to postgres: {}", x); return }, }; - debug!("Connected to database"); + trace!("Connected to database"); if let Some(matches) = arg.subcommand_matches("pkg") { let date = match matches.value_of("date").unwrap() { @@ -116,8 +119,10 @@ fn main() { sys_deb::sync(&db, sysbyshort(&db, matches.value_of("sys").unwrap()), matches.value_of("mirror").unwrap(), - open::Path{ path: matches.value_of("contents").unwrap(), cache: true, canbelocal: true}, + matches.value_of("contents").map(|e| { open::Path{ path: e, cache: true, canbelocal: true} }), open::Path{ path: matches.value_of("packages").unwrap(), cache: true, canbelocal: true}, ); } + + trace!("Exiting"); } diff --git a/indexer/src/man.rs b/indexer/src/man.rs index 024f652..9cb50e0 100644 --- a/indexer/src/man.rs +++ b/indexer/src/man.rs @@ -204,6 +204,7 @@ pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'st let dig = digest::digest(&digest::SHA1, &data); // TODO: Handle BOM? UTF-16? + // TODO: This fails badly for ISO-2022-JP. How the hell do we cleanly fix that? // If it passes as UTF-8, then just consider it UTF-8. if let Ok(_) = str::from_utf8(&data) { return Ok((dig, "utf8", unsafe { String::from_utf8_unchecked(data) } )); diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs index a6fa0e1..1975cee 100644 --- a/indexer/src/pkg.rs +++ b/indexer/src/pkg.rs @@ -8,6 +8,8 @@ use archread; use man; use archive::{Format,Archive,ArchiveEntry}; +pub static mut DRY_RUN: bool = false; + #[derive(Debug,Clone,Copy)] pub enum Date<'a> { @@ -72,6 +74,7 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option x, None => return }; + if unsafe { DRY_RUN } { + return; + } match index_pkg(&tr, opt, verid) { Err(e) => error!("Error reading package: {}", e), diff --git a/indexer/src/sys_deb.rs b/indexer/src/sys_deb.rs index 971aab1..45a40f8 100644 --- a/indexer/src/sys_deb.rs +++ b/indexer/src/sys_deb.rs @@ -2,6 +2,7 @@ use std::io::{Result,BufReader,BufRead}; use std::collections::HashSet; use std::str; use postgres; +use regex; use regex::bytes::Regex; use man; @@ -11,7 +12,8 @@ use archive; // Reference: https://wiki.debian.org/RepositoryFormat -fn get_contents(f: open::Path) -> Result> { +fn get_contents(f: Option) -> Result> { + let f = match f { Some(f) => f, None => return Ok(HashSet::new()) }; let mut fd = f.open()?; let rd = archive::Archive::open_raw(&mut fd)?; let brd = BufReader::new(rd); @@ -53,14 +55,24 @@ struct Pkg { fn handlepkg(pg: &postgres::GenericConnection, sys: i32, mirror: &str, manpkgs: &HashSet, pkg: &Pkg) { let name = match pkg.name { Some(ref x) => x, None => return }; - if !manpkgs.contains(name) { + if manpkgs.len() > 0 && !manpkgs.contains(name) { return } let section = match pkg.section { Some(ref x) => x, None => { error!("Package {} has no section", name); return } }; - let arch = match pkg.arch { Some(ref x) => x, None => { error!("Package {} has no arch", name); return } }; let version = match pkg.version { Some(ref x) => x, None => { error!("Package {} has no version", name); return } }; let filename = match pkg.filename { Some(ref x) => x, None => { error!("Package {} has no filename", name); return } }; - let uri = format!("{}{}", mirror, filename); + + // Workarounds for some bad repos + let uri = if sys == 18 || sys == 19 { + let filename = regex::Regex::new("^(Debian-1.[12])/").unwrap().replace(filename, "dists/$1/main/"); + if filename.starts_with("contrib/") { + format!("{}dists/Debian-1.{}/{}", mirror, if sys == 18 { 1 } else { 2 }, filename) + } else { + format!("{}{}", mirror, filename) + } + } else { + format!("{}{}", mirror, filename) + }; pkg::pkg(pg, pkg::PkgOpt{ force: false, @@ -69,7 +81,7 @@ fn handlepkg(pg: &postgres::GenericConnection, sys: i32, mirror: &str, manpkgs: pkg: &name, ver: &version, date: pkg::Date::Deb, - arch: Some(arch), + arch: pkg.arch.as_ref().map(|e| &e[..]), file: open::Path{ path: &uri, cache: false, @@ -79,9 +91,9 @@ fn handlepkg(pg: &postgres::GenericConnection, sys: i32, mirror: &str, manpkgs: } -pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, contents: open::Path, packages: open::Path) { +pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, contents: Option, packages: open::Path) { let manpkgs = match get_contents(contents) { - Err(e) => { error!("Can't read {}: {}", contents.path, e); return }, + Err(e) => { error!("Can't read {}: {}", contents.unwrap().path, e); return }, Ok(x) => x, }; @@ -110,12 +122,13 @@ pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, contents: } if let Some(cap) = kv.captures(&line) { let val = str::from_utf8(cap.at(2).unwrap()).unwrap(); - match str::from_utf8(cap.at(1).unwrap()).unwrap() { - "Package" => pkg.name = Some(val.to_string()), - "Section" => pkg.section = Some(val.to_string()), - "Version" => pkg.version = Some(val.to_string()), - "Architecture" => pkg.arch = Some(val.to_string()), - "Filename" => pkg.filename = Some(val.to_string()), + // Use case-insensitive matching, older package archives used lowercase keys + match str::from_utf8(cap.at(1).unwrap()).unwrap().to_lowercase().as_ref() { + "package" => pkg.name = Some(val.to_string()), + "section" => pkg.section = Some(val.to_string()), + "version" => pkg.version = Some(val.to_string()), + "architecture" => pkg.arch = Some(val.to_string()), + "filename" => pkg.filename = Some(val.to_string()), _ => {} } }