indexer: Add file caching + Arch Linux indexing

This commit is contained in:
Yorhel 2016-11-06 13:34:22 +01:00
parent 35fab522d6
commit 1ca43665a1
6 changed files with 295 additions and 39 deletions

48
indexer/Cargo.lock generated
View file

@ -2,6 +2,7 @@
name = "indexer"
version = "0.1.0"
dependencies = [
"chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
@ -13,6 +14,7 @@ dependencies = [
"postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
"ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -43,6 +45,15 @@ name = "byteorder"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "chrono"
version = "0.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "clap"
version = "2.17.1"
@ -282,6 +293,38 @@ dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num"
version = "0.1.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
"num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-integer"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-iter"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-traits"
version = "0.1.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "num_cpus"
version = "1.1.0"
@ -584,6 +627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
"checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074"
"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855"
"checksum chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)" = "9213f7cd7c27e95c2b57c49f0e69b1ea65b27138da84a170133fd21b07659c00"
"checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430"
"checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626"
"checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
@ -614,6 +658,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd"
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
"checksum mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5c93a4bd787ddc6e7833c519b73a50883deb5863d76d9b71eb8216fb7f94e66"
"checksum num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "bde7c03b09e7c6a301ee81f6ddf66d7a28ec305699e3d3b056d2fc56470e3120"
"checksum num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "fb24d9bfb3f222010df27995441ded1e954f8f69cd35021f6bef02ca9552fb92"
"checksum num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "287a1c9969a847055e1122ec0ea7a5c5d6f72aad97934e131c83d5c08ab4e45c"
"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c"
"checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad"
"checksum openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "c4117b6244aac42ed0150a6019b4d953d28247c5dd6ae6f46ae469b5f2318733"
"checksum openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)" = "89c47ee94c352eea9ddaf8e364be7f978a3bb6d66d73176572484238dd5a5c3f"

View file

@ -15,3 +15,5 @@ ring = "0.5.3"
postgres = "0.12.0"
clap = "2.16.3"
hyper = "0.9.11"
url = "1.2.3"
chrono = "0.2.25"

View file

@ -9,11 +9,15 @@ extern crate ring;
extern crate encoding;
extern crate postgres;
extern crate hyper;
extern crate url;
extern crate chrono;
mod archive;
mod archread;
mod man;
mod open;
mod pkg;
mod sys_arch;
// Convenience function to get a system id by short-name. Panics if the system doesn't exist.
@ -40,6 +44,12 @@ fn main() {
(@arg date: --date +required +takes_value "Package release date")
(@arg FILE: +required "Package file")
)
(@subcommand arch =>
(about: "Index an Arch Linux repository")
(@arg sys: --sys +required +takes_value "System short-name")
(@arg mirror: --mirror +required +takes_value "Mirror URL")
(@arg repo: --repo +required +takes_value "Repository name")
)
).get_matches();
let verbose = arg.occurrences_of("v");
@ -71,7 +81,15 @@ fn main() {
pkg: matches.value_of("pkg").unwrap(),
ver: matches.value_of("ver").unwrap(),
date: matches.value_of("date").unwrap(),
file: matches.value_of("FILE").unwrap()
file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true},
});
}
if let Some(matches) = arg.subcommand_matches("arch") {
sys_arch::sync(&db,
sysbyshort(&db, matches.value_of("sys").unwrap()),
matches.value_of("mirror").unwrap(),
matches.value_of("repo").unwrap()
);
}
}

82
indexer/src/open.rs Normal file
View file

@ -0,0 +1,82 @@
use std::io::{Read,Result,Error,ErrorKind,copy};
use std::fs::{File,create_dir_all,metadata};
use std::hash::{Hash,Hasher,SipHasher};
use std::time::{Duration,SystemTime};
use url::Url;
use hyper;
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
const CACHE_TIME: u64 = 24*3600;
pub struct Path<'a> {
pub path: &'a str,
pub cache: bool,
pub canbelocal: bool,
}
fn cache_fn(url: &Url) -> String {
let name = url.path_segments().unwrap().last().unwrap();
let name = if name == "" { "index" } else { name };
let mut hash = SipHasher::new();
url.hash(&mut hash);
format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash.finish())
}
fn fetch(url: &str) -> Result<Box<Read>> {
let res = try!(hyper::Client::new()
.get(url)
.header(hyper::header::UserAgent("Man page crawler (info@manned.org; https://manned.org/)".to_owned()))
.send()
.map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e)))
);
if !res.status.is_success() {
return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) ));
}
Ok(Box::new(res) as Box<Read>)
}
fn file(path: &str) -> Result<Box<Read>> {
Ok(Box::new(try!(File::open(path))) as Box<Read>)
}
impl<'a> Path<'a> {
pub fn open(&self) -> Result<Box<Read>> {
if let Ok(url) = Url::parse(self.path) {
if url.scheme() != "http" && url.scheme() != "https" {
return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
}
if self.cache {
let cfn = cache_fn(&url);
if let Ok(m) = metadata(&cfn) {
if m.modified().unwrap() > SystemTime::now() - Duration::from_secs(CACHE_TIME) {
return file(&cfn);
}
}
try!(create_dir_all(CACHE_PATH));
{
let mut rd = try!(fetch(url.as_str()));
let mut wr = try!(File::create(&cfn));
try!(copy(&mut rd, &mut wr));
}
file(&cfn)
} else {
fetch(url.as_str())
}
} else if self.canbelocal {
file(self.path)
} else {
Err(Error::new(ErrorKind::Other, "Invalid URL"))
}
}
}

View file

@ -1,12 +1,11 @@
use std;
use std::io::Read;
use postgres;
use hyper;
use archive;
use open;
use archread;
use man;
use archive::Archive;
use archive::{Archive,ArchiveEntry};
pub struct PkgOpt<'a> {
pub force: bool,
@ -15,7 +14,7 @@ pub struct PkgOpt<'a> {
pub pkg: &'a str,
pub ver: &'a str,
pub date: &'a str, // TODO: Option to extract date from package metadata itself
pub file: &'a str
pub file: open::Path<'a>
}
@ -32,19 +31,19 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i
Ok(r) => r.get(0).get(0),
};
let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2 AND released = $3::text::date";
let res = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap();
let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2";
let res = tr.query(q, &[&pkgid, &opt.ver]).unwrap();
let verid : i32;
if res.is_empty() {
let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id";
verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0);
trace!("New package pkgid {} verid {}", pkgid, verid);
info!("New package pkgid {} verid {}", pkgid, verid);
Some(verid)
} else if opt.force {
verid = res.get(0).get(0);
trace!("Overwriting package pkgid {} verid {}", pkgid, verid);
info!("Overwriting package pkgid {} verid {}", pkgid, verid);
tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap();
Some(verid)
@ -103,50 +102,29 @@ fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &s
}
fn with_pkg<T,F>(file: &str, cb: F) -> std::io::Result<T>
where F: FnOnce(Option<archive::ArchiveEntry>) -> std::io::Result<T>
{
// TODO: .deb support
if file.starts_with("http://") || file.starts_with("https://") {
let mut res = try!(
hyper::Client::new().get(file).send()
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("Hyper: {}", e)))
);
if !res.status.is_success() {
return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("HTTP: {}", res.status) ));
}
let ent = try!(Archive::open_archive(&mut res));
cb(ent)
} else {
let mut res = try!(std::fs::File::open(file));
let ent = try!(Archive::open_archive(&mut res));
cb(ent)
}
}
fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> {
let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| {
let indexfunc = |paths: &[&str], ent: &mut ArchiveEntry| {
insert_man(tr, verid, paths, ent);
Ok(()) /* Don't propagate errors, continue handling other man pages */
};
let missed = try!(
with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) })
).links(|src, dest| { insert_link(tr, verid, src, dest) });
let mut rd = try!(opt.file.open());
let missed = try!(archread::FileList::read(
try!(Archive::open_archive(&mut rd)),
man::ismanpath, &indexfunc))
.links(|src, dest| { insert_link(tr, verid, src, dest) });
if let Some(missed) = missed {
warn!("Some links were missed, reading package again");
try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) }))
let mut rd = try!(opt.file.open());
try!(missed.read(try!(Archive::open_archive(&mut rd)), indexfunc));
}
Ok(())
}
pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) {
info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file);
info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path);
let tr = conn.transaction().unwrap();
tr.set_rollback();

128
indexer/src/sys_arch.rs Normal file
View file

@ -0,0 +1,128 @@
use std::str::FromStr;
use std::io::{Read,BufRead,BufReader,Result};
use regex::Regex;
use chrono::NaiveDateTime;
use postgres;
use archive;
use open;
use man;
use pkg;
struct Meta {
filename: String,
name: String,
version: String,
date: String,
}
fn read_files<T: Read>(lst: T) -> Result<bool> {
let rd = BufReader::new(lst);
for line in rd.lines() {
let line = try!(line);
if man::ismanpath(&line) {
return Ok(true);
}
}
Ok(false)
}
fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
let mut data = String::new();
try!(rd.take(64*1024).read_to_string(&mut data));
let path = rd.path().unwrap();
lazy_static! {
static ref RE: Regex = Regex::new(r"\s*%([^%]+)%\s*\n\s*([^\n]+)\s*\n").unwrap();
}
let mut filename = None;
let mut name = None;
let mut version = None;
let mut builddate = None;
for kv in RE.captures_iter(&data) {
let key = kv.at(1).unwrap();
let val = kv.at(2).unwrap();
trace!("{}: {} = {}", path, key, val);
match key {
"FILENAME" => filename = Some(val),
"NAME" => name = Some(val),
"VERSION" => version = Some(val),
"BUILDDATE" => builddate = i64::from_str(val).ok(),
_ => {},
}
}
if filename.is_some() && name.is_some() && version.is_some() && builddate.is_some() {
Ok(Some(Meta {
filename: filename.unwrap().to_string(),
name: name.unwrap().to_string(),
version: version.unwrap().to_string(),
date: NaiveDateTime::from_timestamp(builddate.unwrap(), 0).format("%Y-%m-%d").to_string(),
}))
} else {
warn!("Metadata missing from package description: {}", path);
Ok(None)
}
}
// TODO: Switch to x86_64 instead of i686
pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) {
info!("Reading packages from {} {}", mirror, repo);
let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo);
let path = open::Path{ path: &path, cache: true, canbelocal: false };
let mut index = match path.open() {
Err(e) => { error!("Can't read package index: {}", e); return },
Ok(x) => x,
};
let ent = match archive::Archive::open_archive(&mut index) {
Err(e) => { error!("Can't read package index: {}", e); return },
Ok(x) => x,
};
let mut hasman = false;
let mut meta = None;
let r = archive::walk(ent, |x| {
if x.filetype() == archive::FileType::Directory {
hasman = false;
meta = None;
} else if x.path().unwrap().ends_with("/files") {
hasman = try!(read_files(x));
} else if x.path().unwrap().ends_with("/desc") {
meta = try!(read_desc(x));
}
if hasman && meta.is_some() {
hasman = false;
let m = meta.take().unwrap();
let p = format!("{}/{}/os/i686/{}", mirror, repo, m.filename);
pkg::pkg(pg, pkg::PkgOpt{
force: false,
sys: sys,
cat: repo,
pkg: &m.name,
ver: &m.version,
date: &m.date,
file: open::Path{
path: &p,
cache: false,
canbelocal: false,
},
});
}
Ok(true)
});
if let Err(e) = r {
error!("Error reading package index: {}", e);
}
}