indexer: Add file caching + Arch Linux indexing
This commit is contained in:
parent
35fab522d6
commit
1ca43665a1
6 changed files with 295 additions and 39 deletions
48
indexer/Cargo.lock
generated
48
indexer/Cargo.lock
generated
|
|
@ -2,6 +2,7 @@
|
|||
name = "indexer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)",
|
||||
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
|
|
@ -13,6 +14,7 @@ dependencies = [
|
|||
"postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -43,6 +45,15 @@ name = "byteorder"
|
|||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.2.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.17.1"
|
||||
|
|
@ -282,6 +293,38 @@ dependencies = [
|
|||
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.1.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-iter"
|
||||
version = "0.1.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.1.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.1.0"
|
||||
|
|
@ -584,6 +627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
|
||||
"checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074"
|
||||
"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855"
|
||||
"checksum chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)" = "9213f7cd7c27e95c2b57c49f0e69b1ea65b27138da84a170133fd21b07659c00"
|
||||
"checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430"
|
||||
"checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626"
|
||||
"checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
|
||||
|
|
@ -614,6 +658,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
"checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd"
|
||||
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
|
||||
"checksum mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5c93a4bd787ddc6e7833c519b73a50883deb5863d76d9b71eb8216fb7f94e66"
|
||||
"checksum num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "bde7c03b09e7c6a301ee81f6ddf66d7a28ec305699e3d3b056d2fc56470e3120"
|
||||
"checksum num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "fb24d9bfb3f222010df27995441ded1e954f8f69cd35021f6bef02ca9552fb92"
|
||||
"checksum num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "287a1c9969a847055e1122ec0ea7a5c5d6f72aad97934e131c83d5c08ab4e45c"
|
||||
"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c"
|
||||
"checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad"
|
||||
"checksum openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "c4117b6244aac42ed0150a6019b4d953d28247c5dd6ae6f46ae469b5f2318733"
|
||||
"checksum openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)" = "89c47ee94c352eea9ddaf8e364be7f978a3bb6d66d73176572484238dd5a5c3f"
|
||||
|
|
|
|||
|
|
@ -15,3 +15,5 @@ ring = "0.5.3"
|
|||
postgres = "0.12.0"
|
||||
clap = "2.16.3"
|
||||
hyper = "0.9.11"
|
||||
url = "1.2.3"
|
||||
chrono = "0.2.25"
|
||||
|
|
|
|||
|
|
@ -9,11 +9,15 @@ extern crate ring;
|
|||
extern crate encoding;
|
||||
extern crate postgres;
|
||||
extern crate hyper;
|
||||
extern crate url;
|
||||
extern crate chrono;
|
||||
|
||||
mod archive;
|
||||
mod archread;
|
||||
mod man;
|
||||
mod open;
|
||||
mod pkg;
|
||||
mod sys_arch;
|
||||
|
||||
|
||||
// Convenience function to get a system id by short-name. Panics if the system doesn't exist.
|
||||
|
|
@ -40,6 +44,12 @@ fn main() {
|
|||
(@arg date: --date +required +takes_value "Package release date")
|
||||
(@arg FILE: +required "Package file")
|
||||
)
|
||||
(@subcommand arch =>
|
||||
(about: "Index an Arch Linux repository")
|
||||
(@arg sys: --sys +required +takes_value "System short-name")
|
||||
(@arg mirror: --mirror +required +takes_value "Mirror URL")
|
||||
(@arg repo: --repo +required +takes_value "Repository name")
|
||||
)
|
||||
).get_matches();
|
||||
|
||||
let verbose = arg.occurrences_of("v");
|
||||
|
|
@ -71,7 +81,15 @@ fn main() {
|
|||
pkg: matches.value_of("pkg").unwrap(),
|
||||
ver: matches.value_of("ver").unwrap(),
|
||||
date: matches.value_of("date").unwrap(),
|
||||
file: matches.value_of("FILE").unwrap()
|
||||
file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true},
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(matches) = arg.subcommand_matches("arch") {
|
||||
sys_arch::sync(&db,
|
||||
sysbyshort(&db, matches.value_of("sys").unwrap()),
|
||||
matches.value_of("mirror").unwrap(),
|
||||
matches.value_of("repo").unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
82
indexer/src/open.rs
Normal file
82
indexer/src/open.rs
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
use std::io::{Read,Result,Error,ErrorKind,copy};
|
||||
use std::fs::{File,create_dir_all,metadata};
|
||||
use std::hash::{Hash,Hasher,SipHasher};
|
||||
use std::time::{Duration,SystemTime};
|
||||
use url::Url;
|
||||
use hyper;
|
||||
|
||||
|
||||
/// Directory in which downloaded files are stored when caching is requested.
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
|
||||
/// Maximum age, in seconds, of a cached file before it is downloaded again (24 hours).
const CACHE_TIME: u64 = 24*3600;
|
||||
|
||||
|
||||
/// A location to read data from: an http(s) URL or, optionally, a local file.
///
/// The flags control how `open()` treats the location.
#[derive(Debug, Clone, Copy)]
pub struct Path<'a> {
    /// URL or local filesystem path to read from.
    pub path: &'a str,
    /// Whether a download of `path` may be served from, and stored in, the on-disk cache.
    pub cache: bool,
    /// Whether `path` may be opened as a local file when it does not parse as a URL.
    pub canbelocal: bool,
}
|
||||
|
||||
|
||||
fn cache_fn(url: &Url) -> String {
|
||||
let name = url.path_segments().unwrap().last().unwrap();
|
||||
let name = if name == "" { "index" } else { name };
|
||||
|
||||
let mut hash = SipHasher::new();
|
||||
url.hash(&mut hash);
|
||||
format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash.finish())
|
||||
}
|
||||
|
||||
|
||||
fn fetch(url: &str) -> Result<Box<Read>> {
|
||||
let res = try!(hyper::Client::new()
|
||||
.get(url)
|
||||
.header(hyper::header::UserAgent("Man page crawler (info@manned.org; https://manned.org/)".to_owned()))
|
||||
.send()
|
||||
.map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e)))
|
||||
);
|
||||
if !res.status.is_success() {
|
||||
return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) ));
|
||||
}
|
||||
Ok(Box::new(res) as Box<Read>)
|
||||
}
|
||||
|
||||
|
||||
fn file(path: &str) -> Result<Box<Read>> {
|
||||
Ok(Box::new(try!(File::open(path))) as Box<Read>)
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Path<'a> {
|
||||
pub fn open(&self) -> Result<Box<Read>> {
|
||||
if let Ok(url) = Url::parse(self.path) {
|
||||
if url.scheme() != "http" && url.scheme() != "https" {
|
||||
return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
|
||||
}
|
||||
|
||||
if self.cache {
|
||||
let cfn = cache_fn(&url);
|
||||
if let Ok(m) = metadata(&cfn) {
|
||||
if m.modified().unwrap() > SystemTime::now() - Duration::from_secs(CACHE_TIME) {
|
||||
return file(&cfn);
|
||||
}
|
||||
}
|
||||
try!(create_dir_all(CACHE_PATH));
|
||||
{
|
||||
let mut rd = try!(fetch(url.as_str()));
|
||||
let mut wr = try!(File::create(&cfn));
|
||||
try!(copy(&mut rd, &mut wr));
|
||||
}
|
||||
file(&cfn)
|
||||
|
||||
} else {
|
||||
fetch(url.as_str())
|
||||
}
|
||||
|
||||
} else if self.canbelocal {
|
||||
file(self.path)
|
||||
|
||||
} else {
|
||||
Err(Error::new(ErrorKind::Other, "Invalid URL"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,12 +1,11 @@
|
|||
use std;
|
||||
use std::io::Read;
|
||||
use postgres;
|
||||
use hyper;
|
||||
|
||||
use archive;
|
||||
use open;
|
||||
use archread;
|
||||
use man;
|
||||
use archive::Archive;
|
||||
use archive::{Archive,ArchiveEntry};
|
||||
|
||||
pub struct PkgOpt<'a> {
|
||||
pub force: bool,
|
||||
|
|
@ -15,7 +14,7 @@ pub struct PkgOpt<'a> {
|
|||
pub pkg: &'a str,
|
||||
pub ver: &'a str,
|
||||
pub date: &'a str, // TODO: Option to extract date from package metadata itself
|
||||
pub file: &'a str
|
||||
pub file: open::Path<'a>
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -32,19 +31,19 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i
|
|||
Ok(r) => r.get(0).get(0),
|
||||
};
|
||||
|
||||
let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2 AND released = $3::text::date";
|
||||
let res = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap();
|
||||
let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2";
|
||||
let res = tr.query(q, &[&pkgid, &opt.ver]).unwrap();
|
||||
|
||||
let verid : i32;
|
||||
if res.is_empty() {
|
||||
let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id";
|
||||
verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0);
|
||||
trace!("New package pkgid {} verid {}", pkgid, verid);
|
||||
info!("New package pkgid {} verid {}", pkgid, verid);
|
||||
Some(verid)
|
||||
|
||||
} else if opt.force {
|
||||
verid = res.get(0).get(0);
|
||||
trace!("Overwriting package pkgid {} verid {}", pkgid, verid);
|
||||
info!("Overwriting package pkgid {} verid {}", pkgid, verid);
|
||||
tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap();
|
||||
Some(verid)
|
||||
|
||||
|
|
@ -103,50 +102,29 @@ fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &s
|
|||
}
|
||||
|
||||
|
||||
fn with_pkg<T,F>(file: &str, cb: F) -> std::io::Result<T>
|
||||
where F: FnOnce(Option<archive::ArchiveEntry>) -> std::io::Result<T>
|
||||
{
|
||||
// TODO: .deb support
|
||||
|
||||
if file.starts_with("http://") || file.starts_with("https://") {
|
||||
let mut res = try!(
|
||||
hyper::Client::new().get(file).send()
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("Hyper: {}", e)))
|
||||
);
|
||||
if !res.status.is_success() {
|
||||
return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("HTTP: {}", res.status) ));
|
||||
}
|
||||
let ent = try!(Archive::open_archive(&mut res));
|
||||
cb(ent)
|
||||
|
||||
} else {
|
||||
let mut res = try!(std::fs::File::open(file));
|
||||
let ent = try!(Archive::open_archive(&mut res));
|
||||
cb(ent)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> {
|
||||
let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| {
|
||||
let indexfunc = |paths: &[&str], ent: &mut ArchiveEntry| {
|
||||
insert_man(tr, verid, paths, ent);
|
||||
Ok(()) /* Don't propagate errors, continue handling other man pages */
|
||||
};
|
||||
|
||||
let missed = try!(
|
||||
with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) })
|
||||
).links(|src, dest| { insert_link(tr, verid, src, dest) });
|
||||
let mut rd = try!(opt.file.open());
|
||||
let missed = try!(archread::FileList::read(
|
||||
try!(Archive::open_archive(&mut rd)),
|
||||
man::ismanpath, &indexfunc))
|
||||
.links(|src, dest| { insert_link(tr, verid, src, dest) });
|
||||
|
||||
if let Some(missed) = missed {
|
||||
warn!("Some links were missed, reading package again");
|
||||
try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) }))
|
||||
let mut rd = try!(opt.file.open());
|
||||
try!(missed.read(try!(Archive::open_archive(&mut rd)), indexfunc));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) {
|
||||
info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file);
|
||||
info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path);
|
||||
|
||||
let tr = conn.transaction().unwrap();
|
||||
tr.set_rollback();
|
||||
|
|
|
|||
128
indexer/src/sys_arch.rs
Normal file
128
indexer/src/sys_arch.rs
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
use std::str::FromStr;
|
||||
use std::io::{Read,BufRead,BufReader,Result};
|
||||
use regex::Regex;
|
||||
use chrono::NaiveDateTime;
|
||||
use postgres;
|
||||
|
||||
use archive;
|
||||
use open;
|
||||
use man;
|
||||
use pkg;
|
||||
|
||||
|
||||
/// Package metadata collected from a `desc` entry of the repository index.
#[derive(Debug)]
struct Meta {
    /// Value of the `FILENAME` field.
    filename: String,
    /// Value of the `NAME` field.
    name: String,
    /// Value of the `VERSION` field.
    version: String,
    /// The `BUILDDATE` timestamp formatted as `%Y-%m-%d`.
    date: String,
}
|
||||
|
||||
|
||||
fn read_files<T: Read>(lst: T) -> Result<bool> {
|
||||
let rd = BufReader::new(lst);
|
||||
for line in rd.lines() {
|
||||
let line = try!(line);
|
||||
if man::ismanpath(&line) {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
|
||||
fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
|
||||
let mut data = String::new();
|
||||
try!(rd.take(64*1024).read_to_string(&mut data));
|
||||
|
||||
let path = rd.path().unwrap();
|
||||
lazy_static! {
|
||||
static ref RE: Regex = Regex::new(r"\s*%([^%]+)%\s*\n\s*([^\n]+)\s*\n").unwrap();
|
||||
}
|
||||
|
||||
let mut filename = None;
|
||||
let mut name = None;
|
||||
let mut version = None;
|
||||
let mut builddate = None;
|
||||
|
||||
for kv in RE.captures_iter(&data) {
|
||||
let key = kv.at(1).unwrap();
|
||||
let val = kv.at(2).unwrap();
|
||||
trace!("{}: {} = {}", path, key, val);
|
||||
match key {
|
||||
"FILENAME" => filename = Some(val),
|
||||
"NAME" => name = Some(val),
|
||||
"VERSION" => version = Some(val),
|
||||
"BUILDDATE" => builddate = i64::from_str(val).ok(),
|
||||
_ => {},
|
||||
}
|
||||
}
|
||||
|
||||
if filename.is_some() && name.is_some() && version.is_some() && builddate.is_some() {
|
||||
Ok(Some(Meta {
|
||||
filename: filename.unwrap().to_string(),
|
||||
name: name.unwrap().to_string(),
|
||||
version: version.unwrap().to_string(),
|
||||
date: NaiveDateTime::from_timestamp(builddate.unwrap(), 0).format("%Y-%m-%d").to_string(),
|
||||
}))
|
||||
} else {
|
||||
warn!("Metadata missing from package description: {}", path);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// TODO: Switch to x86_64 instead of i686
|
||||
pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) {
|
||||
info!("Reading packages from {} {}", mirror, repo);
|
||||
|
||||
let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo);
|
||||
let path = open::Path{ path: &path, cache: true, canbelocal: false };
|
||||
let mut index = match path.open() {
|
||||
Err(e) => { error!("Can't read package index: {}", e); return },
|
||||
Ok(x) => x,
|
||||
};
|
||||
|
||||
let ent = match archive::Archive::open_archive(&mut index) {
|
||||
Err(e) => { error!("Can't read package index: {}", e); return },
|
||||
Ok(x) => x,
|
||||
};
|
||||
|
||||
let mut hasman = false;
|
||||
let mut meta = None;
|
||||
let r = archive::walk(ent, |x| {
|
||||
if x.filetype() == archive::FileType::Directory {
|
||||
hasman = false;
|
||||
meta = None;
|
||||
} else if x.path().unwrap().ends_with("/files") {
|
||||
hasman = try!(read_files(x));
|
||||
} else if x.path().unwrap().ends_with("/desc") {
|
||||
meta = try!(read_desc(x));
|
||||
}
|
||||
|
||||
if hasman && meta.is_some() {
|
||||
hasman = false;
|
||||
let m = meta.take().unwrap();
|
||||
|
||||
let p = format!("{}/{}/os/i686/{}", mirror, repo, m.filename);
|
||||
pkg::pkg(pg, pkg::PkgOpt{
|
||||
force: false,
|
||||
sys: sys,
|
||||
cat: repo,
|
||||
pkg: &m.name,
|
||||
ver: &m.version,
|
||||
date: &m.date,
|
||||
file: open::Path{
|
||||
path: &p,
|
||||
cache: false,
|
||||
canbelocal: false,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
});
|
||||
|
||||
if let Err(e) = r {
|
||||
error!("Error reading package index: {}", e);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue