indexer: Add file caching + Arch Linux indexing
This commit is contained in:
parent
35fab522d6
commit
1ca43665a1
6 changed files with 295 additions and 39 deletions
|
|
@ -9,11 +9,15 @@ extern crate ring;
|
|||
extern crate encoding;
|
||||
extern crate postgres;
|
||||
extern crate hyper;
|
||||
extern crate url;
|
||||
extern crate chrono;
|
||||
|
||||
mod archive;
|
||||
mod archread;
|
||||
mod man;
|
||||
mod open;
|
||||
mod pkg;
|
||||
mod sys_arch;
|
||||
|
||||
|
||||
// Convenience function to get a system id by short-name. Panics if the system doesn't exist.
|
||||
|
|
@ -40,6 +44,12 @@ fn main() {
|
|||
(@arg date: --date +required +takes_value "Package release date")
|
||||
(@arg FILE: +required "Package file")
|
||||
)
|
||||
(@subcommand arch =>
|
||||
(about: "Index an Arch Linux repository")
|
||||
(@arg sys: --sys +required +takes_value "System short-name")
|
||||
(@arg mirror: --mirror +required +takes_value "Mirror URL")
|
||||
(@arg repo: --repo +required +takes_value "Repository name")
|
||||
)
|
||||
).get_matches();
|
||||
|
||||
let verbose = arg.occurrences_of("v");
|
||||
|
|
@ -71,7 +81,15 @@ fn main() {
|
|||
pkg: matches.value_of("pkg").unwrap(),
|
||||
ver: matches.value_of("ver").unwrap(),
|
||||
date: matches.value_of("date").unwrap(),
|
||||
file: matches.value_of("FILE").unwrap()
|
||||
file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true},
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(matches) = arg.subcommand_matches("arch") {
|
||||
sys_arch::sync(&db,
|
||||
sysbyshort(&db, matches.value_of("sys").unwrap()),
|
||||
matches.value_of("mirror").unwrap(),
|
||||
matches.value_of("repo").unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
82
indexer/src/open.rs
Normal file
82
indexer/src/open.rs
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
use std::io::{Read,Result,Error,ErrorKind,copy};
|
||||
use std::fs::{File,create_dir_all,metadata};
|
||||
use std::hash::{Hash,Hasher,SipHasher};
|
||||
use std::time::{Duration,SystemTime};
|
||||
use url::Url;
|
||||
use hyper;
|
||||
|
||||
|
||||
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
|
||||
const CACHE_TIME: u64 = 24*3600;
|
||||
|
||||
|
||||
pub struct Path<'a> {
|
||||
pub path: &'a str,
|
||||
pub cache: bool,
|
||||
pub canbelocal: bool,
|
||||
}
|
||||
|
||||
|
||||
fn cache_fn(url: &Url) -> String {
|
||||
let name = url.path_segments().unwrap().last().unwrap();
|
||||
let name = if name == "" { "index" } else { name };
|
||||
|
||||
let mut hash = SipHasher::new();
|
||||
url.hash(&mut hash);
|
||||
format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash.finish())
|
||||
}
|
||||
|
||||
|
||||
fn fetch(url: &str) -> Result<Box<Read>> {
|
||||
let res = try!(hyper::Client::new()
|
||||
.get(url)
|
||||
.header(hyper::header::UserAgent("Man page crawler (info@manned.org; https://manned.org/)".to_owned()))
|
||||
.send()
|
||||
.map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e)))
|
||||
);
|
||||
if !res.status.is_success() {
|
||||
return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) ));
|
||||
}
|
||||
Ok(Box::new(res) as Box<Read>)
|
||||
}
|
||||
|
||||
|
||||
fn file(path: &str) -> Result<Box<Read>> {
|
||||
Ok(Box::new(try!(File::open(path))) as Box<Read>)
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Path<'a> {
|
||||
pub fn open(&self) -> Result<Box<Read>> {
|
||||
if let Ok(url) = Url::parse(self.path) {
|
||||
if url.scheme() != "http" && url.scheme() != "https" {
|
||||
return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
|
||||
}
|
||||
|
||||
if self.cache {
|
||||
let cfn = cache_fn(&url);
|
||||
if let Ok(m) = metadata(&cfn) {
|
||||
if m.modified().unwrap() > SystemTime::now() - Duration::from_secs(CACHE_TIME) {
|
||||
return file(&cfn);
|
||||
}
|
||||
}
|
||||
try!(create_dir_all(CACHE_PATH));
|
||||
{
|
||||
let mut rd = try!(fetch(url.as_str()));
|
||||
let mut wr = try!(File::create(&cfn));
|
||||
try!(copy(&mut rd, &mut wr));
|
||||
}
|
||||
file(&cfn)
|
||||
|
||||
} else {
|
||||
fetch(url.as_str())
|
||||
}
|
||||
|
||||
} else if self.canbelocal {
|
||||
file(self.path)
|
||||
|
||||
} else {
|
||||
Err(Error::new(ErrorKind::Other, "Invalid URL"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,12 +1,11 @@
|
|||
use std;
|
||||
use std::io::Read;
|
||||
use postgres;
|
||||
use hyper;
|
||||
|
||||
use archive;
|
||||
use open;
|
||||
use archread;
|
||||
use man;
|
||||
use archive::Archive;
|
||||
use archive::{Archive,ArchiveEntry};
|
||||
|
||||
pub struct PkgOpt<'a> {
|
||||
pub force: bool,
|
||||
|
|
@ -15,7 +14,7 @@ pub struct PkgOpt<'a> {
|
|||
pub pkg: &'a str,
|
||||
pub ver: &'a str,
|
||||
pub date: &'a str, // TODO: Option to extract date from package metadata itself
|
||||
pub file: &'a str
|
||||
pub file: open::Path<'a>
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -32,19 +31,19 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i
|
|||
Ok(r) => r.get(0).get(0),
|
||||
};
|
||||
|
||||
let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2 AND released = $3::text::date";
|
||||
let res = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap();
|
||||
let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2";
|
||||
let res = tr.query(q, &[&pkgid, &opt.ver]).unwrap();
|
||||
|
||||
let verid : i32;
|
||||
if res.is_empty() {
|
||||
let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id";
|
||||
verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0);
|
||||
trace!("New package pkgid {} verid {}", pkgid, verid);
|
||||
info!("New package pkgid {} verid {}", pkgid, verid);
|
||||
Some(verid)
|
||||
|
||||
} else if opt.force {
|
||||
verid = res.get(0).get(0);
|
||||
trace!("Overwriting package pkgid {} verid {}", pkgid, verid);
|
||||
info!("Overwriting package pkgid {} verid {}", pkgid, verid);
|
||||
tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap();
|
||||
Some(verid)
|
||||
|
||||
|
|
@ -103,50 +102,29 @@ fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &s
|
|||
}
|
||||
|
||||
|
||||
fn with_pkg<T,F>(file: &str, cb: F) -> std::io::Result<T>
|
||||
where F: FnOnce(Option<archive::ArchiveEntry>) -> std::io::Result<T>
|
||||
{
|
||||
// TODO: .deb support
|
||||
|
||||
if file.starts_with("http://") || file.starts_with("https://") {
|
||||
let mut res = try!(
|
||||
hyper::Client::new().get(file).send()
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("Hyper: {}", e)))
|
||||
);
|
||||
if !res.status.is_success() {
|
||||
return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("HTTP: {}", res.status) ));
|
||||
}
|
||||
let ent = try!(Archive::open_archive(&mut res));
|
||||
cb(ent)
|
||||
|
||||
} else {
|
||||
let mut res = try!(std::fs::File::open(file));
|
||||
let ent = try!(Archive::open_archive(&mut res));
|
||||
cb(ent)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> {
|
||||
let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| {
|
||||
let indexfunc = |paths: &[&str], ent: &mut ArchiveEntry| {
|
||||
insert_man(tr, verid, paths, ent);
|
||||
Ok(()) /* Don't propagate errors, continue handling other man pages */
|
||||
};
|
||||
|
||||
let missed = try!(
|
||||
with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) })
|
||||
).links(|src, dest| { insert_link(tr, verid, src, dest) });
|
||||
let mut rd = try!(opt.file.open());
|
||||
let missed = try!(archread::FileList::read(
|
||||
try!(Archive::open_archive(&mut rd)),
|
||||
man::ismanpath, &indexfunc))
|
||||
.links(|src, dest| { insert_link(tr, verid, src, dest) });
|
||||
|
||||
if let Some(missed) = missed {
|
||||
warn!("Some links were missed, reading package again");
|
||||
try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) }))
|
||||
let mut rd = try!(opt.file.open());
|
||||
try!(missed.read(try!(Archive::open_archive(&mut rd)), indexfunc));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) {
|
||||
info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file);
|
||||
info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path);
|
||||
|
||||
let tr = conn.transaction().unwrap();
|
||||
tr.set_rollback();
|
||||
|
|
|
|||
128
indexer/src/sys_arch.rs
Normal file
128
indexer/src/sys_arch.rs
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
use std::str::FromStr;
|
||||
use std::io::{Read,BufRead,BufReader,Result};
|
||||
use regex::Regex;
|
||||
use chrono::NaiveDateTime;
|
||||
use postgres;
|
||||
|
||||
use archive;
|
||||
use open;
|
||||
use man;
|
||||
use pkg;
|
||||
|
||||
|
||||
struct Meta {
|
||||
filename: String,
|
||||
name: String,
|
||||
version: String,
|
||||
date: String,
|
||||
}
|
||||
|
||||
|
||||
fn read_files<T: Read>(lst: T) -> Result<bool> {
|
||||
let rd = BufReader::new(lst);
|
||||
for line in rd.lines() {
|
||||
let line = try!(line);
|
||||
if man::ismanpath(&line) {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
|
||||
fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
|
||||
let mut data = String::new();
|
||||
try!(rd.take(64*1024).read_to_string(&mut data));
|
||||
|
||||
let path = rd.path().unwrap();
|
||||
lazy_static! {
|
||||
static ref RE: Regex = Regex::new(r"\s*%([^%]+)%\s*\n\s*([^\n]+)\s*\n").unwrap();
|
||||
}
|
||||
|
||||
let mut filename = None;
|
||||
let mut name = None;
|
||||
let mut version = None;
|
||||
let mut builddate = None;
|
||||
|
||||
for kv in RE.captures_iter(&data) {
|
||||
let key = kv.at(1).unwrap();
|
||||
let val = kv.at(2).unwrap();
|
||||
trace!("{}: {} = {}", path, key, val);
|
||||
match key {
|
||||
"FILENAME" => filename = Some(val),
|
||||
"NAME" => name = Some(val),
|
||||
"VERSION" => version = Some(val),
|
||||
"BUILDDATE" => builddate = i64::from_str(val).ok(),
|
||||
_ => {},
|
||||
}
|
||||
}
|
||||
|
||||
if filename.is_some() && name.is_some() && version.is_some() && builddate.is_some() {
|
||||
Ok(Some(Meta {
|
||||
filename: filename.unwrap().to_string(),
|
||||
name: name.unwrap().to_string(),
|
||||
version: version.unwrap().to_string(),
|
||||
date: NaiveDateTime::from_timestamp(builddate.unwrap(), 0).format("%Y-%m-%d").to_string(),
|
||||
}))
|
||||
} else {
|
||||
warn!("Metadata missing from package description: {}", path);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// TODO: Switch to x86_64 instead of i686
|
||||
pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) {
|
||||
info!("Reading packages from {} {}", mirror, repo);
|
||||
|
||||
let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo);
|
||||
let path = open::Path{ path: &path, cache: true, canbelocal: false };
|
||||
let mut index = match path.open() {
|
||||
Err(e) => { error!("Can't read package index: {}", e); return },
|
||||
Ok(x) => x,
|
||||
};
|
||||
|
||||
let ent = match archive::Archive::open_archive(&mut index) {
|
||||
Err(e) => { error!("Can't read package index: {}", e); return },
|
||||
Ok(x) => x,
|
||||
};
|
||||
|
||||
let mut hasman = false;
|
||||
let mut meta = None;
|
||||
let r = archive::walk(ent, |x| {
|
||||
if x.filetype() == archive::FileType::Directory {
|
||||
hasman = false;
|
||||
meta = None;
|
||||
} else if x.path().unwrap().ends_with("/files") {
|
||||
hasman = try!(read_files(x));
|
||||
} else if x.path().unwrap().ends_with("/desc") {
|
||||
meta = try!(read_desc(x));
|
||||
}
|
||||
|
||||
if hasman && meta.is_some() {
|
||||
hasman = false;
|
||||
let m = meta.take().unwrap();
|
||||
|
||||
let p = format!("{}/{}/os/i686/{}", mirror, repo, m.filename);
|
||||
pkg::pkg(pg, pkg::PkgOpt{
|
||||
force: false,
|
||||
sys: sys,
|
||||
cat: repo,
|
||||
pkg: &m.name,
|
||||
ver: &m.version,
|
||||
date: &m.date,
|
||||
file: open::Path{
|
||||
path: &p,
|
||||
cache: false,
|
||||
canbelocal: false,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
});
|
||||
|
||||
if let Err(e) = r {
|
||||
error!("Error reading package index: {}", e);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue