From 022e9acc4f7e42b807bb63e021655cc79ce9f398 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 22 Oct 2016 14:54:35 +0200 Subject: [PATCH 1/7] WIP: Rewritten man page indexer in Rust Currently just figuring out how to read archives. Turns out to not be as simple as I had expected. --- .gitignore | 2 + indexer/Cargo.lock | 143 ++++++++++++ indexer/Cargo.toml | 10 + indexer/src/archive.rs | 368 +++++++++++++++++++++++++++++++ indexer/src/main.rs | 97 ++++++++ indexer/tests/mktar.sh | 49 ++++ indexer/tests/testarchive.tar.xz | Bin 0 -> 616 bytes 7 files changed, 669 insertions(+) create mode 100644 indexer/Cargo.lock create mode 100644 indexer/Cargo.toml create mode 100644 indexer/src/archive.rs create mode 100644 indexer/src/main.rs create mode 100755 indexer/tests/mktar.sh create mode 100644 indexer/tests/testarchive.tar.xz diff --git a/.gitignore b/.gitignore index 0da3d84..a80d19e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ !/lib/ManUtils/Build.PL !/lib/ManUtils/ManUtils.pm !/lib/ManUtils/ManUtils.xs +indexer/target + diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock new file mode 100644 index 0000000..53c701d --- /dev/null +++ b/indexer/Cargo.lock @@ -0,0 +1,143 @@ +[root] +name = "indexer" +version = "0.1.0" +dependencies = [ + "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "aho-corasick" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "env_logger" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 
0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libarchive" +version = "0.1.1" +source = "git+https://github.com/17dec/libarchive-rust#3f723cf0064561f21f0cebbd534a75076e6dbcaa" +dependencies = [ + "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libarchive3-sys" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libc" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "log" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memchr" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "pkg-config" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "regex" +version = "0.1.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.2.7 
(registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "thread-id" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-ranges" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" +"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" +"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +"checksum libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)" = "" +"checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7" +"checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = 
"044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8" +"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" +"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" +"checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" +"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" +"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11" +"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" +"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" +"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" +"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml new file mode 100644 index 0000000..7b1b918 --- /dev/null +++ b/indexer/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "indexer" +version = "0.1.0" +authors = ["Yorhel "] + +[dependencies] +regex = "0.1.77" +log = "0.3.6" +env_logger = "0.3.5" +libarchive = { git = "https://github.com/17dec/libarchive-rust" } diff --git a/indexer/src/archive.rs 
b/indexer/src/archive.rs new file mode 100644 index 0000000..f1b9403 --- /dev/null +++ b/indexer/src/archive.rs @@ -0,0 +1,368 @@ +use std::path::Path; +use std::collections::HashMap; +use libarchive::reader::Reader as ArchiveReader; +use libarchive::reader::{FileReader,Builder}; +use libarchive::archive::{Entry,FileType,ReadFormat,ReadFilter}; +use libarchive::error::ArchiveResult; + + +pub fn open_file>(path: T) -> ArchiveResult { + let mut builder = Builder::new(); + try!(builder.support_format(ReadFormat::All)); + try!(builder.support_filter(ReadFilter::All)); + builder.open_file(path) +} + + +#[derive(Clone,Debug,PartialEq,Eq)] +pub enum EntryType { + // Regular file that has been handled/indexed + Handled, + // Regular file that hasn't been handled because the caller wasn't interested in it. Could + // still be an interesting file if it is referenced from an interesting path. + Regular, + // Link to another file (interesting or not is irrelevant) + Link(String), + // Directory; need this information when resolving links + Directory, + // Something that couldn't be a an interesting file (chardev/socket/etc); If any link resolves + // to this we know we're done. + Other, +} + + +/* + * I had hoped that reading man pages from an archive would just be a simple: + * + * 1. Walk through all files in the archive in a streaming fashion + * 2. Parse/index man pages + * + * But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to... + * + * 1. Walk through all entries in the archive in a streaming fashion + * 2. Parse/index regular file man pages + * 3. Keep track of all paths in the archive + * 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file + * 5. Read the entire damn archive again if one of the links resolved to a file that was not + * recognized as a man page in step (2). Luckily, this isn't very common. 
+ * + * And this doesn't even cover the problem of duplicate entries in a tar, which is also quite + * annoying to handle. + * + * What annoys me the most about all of this is that it's not possible to stream an archive from + * the network and read/index the entire thing in a single step. Now we have to buffer packages to + * disk in order to be able to read the archive a second time. + * + * (Note that it is possible to resolve links while walking through the entries, which will allow + * us to match files found later in the archive against links found earlier, thus potentially + * saving the need to read the archive a second time. This is merely a performance improvement for + * an uncommon case, and it certainly won't simplify the code) + * + * (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the + * need for a second archive read, but that's going to significantly slow down the common case in + * order to handle a rare case. It's possible to further optimize this using some heuristics to + * determine whether a file is potentially a man page, but that's both complex and may not even + * save much) + * + * (* So apparently some man pages are close to 10MB...) + */ +pub struct Reader { + // List of seen files. This is used to resolve links + seen: HashMap, + // List of interesting links + links: Vec, + // List of files we have to read in a second walk through the archive + missedfiles: HashMap>, +} + + +// Generalized API: +// 1. Read once +// reader.read(file, interest_cb, file_cb) -> Error +// file: A libarchive::Reader +// interest_cb(path) -> bool +// Called on every file/link name, should return whether it's a file the caller is interested +// in. (e.g. parse_path(), but also +DESC and other metadata). +// file_cb(path, reader, entry) -> Error +// Called on every interesting (actual) file, given the (normalized?) path, the +// libarchive::Reader and a ReaderEntry +// +// 2. 
Read links +// reader.links(link_cb) -> Error +// link_cb(path, dest) -> Error +// Called on every link which has as 'dest' a file path that has already been given to +// file_cb() before. +// +// 3. (Optionally) read a second time +// if reader.need_reread() { +// reader.reread(file, file_cb) +// } +impl Reader { + pub fn new() -> Reader { + Reader { + seen: HashMap::new(), + links: Vec::new(), + missedfiles: HashMap::new(), + } + } + + // Convenience function to read the path/type/link from the next header. + fn read_header(rd: &mut ArchiveReader) -> Option<(String, EntryType)> { + let ent = match rd.next_header() { + Some(x) => x, + None => return None, + }; + let path = ent.pathname().trim_left_matches('/').trim_left_matches("./").trim_right_matches('/').to_string(); + + // Hard links are apparently relative to the root of the archive. + let link = ent.hardlink().map(|x| format!("/{}", x)) + .or(ent.symlink().map(str::to_string)); + + let(fts, ret) = match ent.filetype() { + FileType::BlockDevice => ("blk", EntryType::Other), + FileType::SymbolicLink => ("sym", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), + FileType::Socket => ("sck", EntryType::Other), + FileType::CharacterDevice => ("chr", EntryType::Other), + FileType::Directory => ("dir", EntryType::Directory), + FileType::NamedPipe => ("fif", EntryType::Other), + FileType::Mount => ("mnt", EntryType::Other), + FileType::RegularFile => ("reg", EntryType::Regular), + FileType::Unknown => ("unk", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), + }; + + trace!("Archive entry: {}{:10} bytes, path={:?} type={:?}", fts, ent.size(), path, ret); + Some((path, ret)) + } + + pub fn read(&mut self, rd: &mut ArchiveReader, interest_cb: F, mut file_cb: G) -> ArchiveResult<()> + where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> + { + while let Some((path, t)) = Self::read_header(rd) { + // We ought to throw away the result of the previous 
entry with the same name and use + // this new entry instead, but fuck it. This case is too rare, so let's just warn! it. + if let Some(_) = self.seen.get(&path) { + warn!("Duplicate file entry: {}", path); + continue; + } + + let mut newt = t; + match newt { + EntryType::Regular if interest_cb(&path) => { + let pathv = [&path as &str]; + try!(file_cb(&pathv[..], rd)); + newt = EntryType::Handled + }, + EntryType::Link(_) if interest_cb(&path) => { + self.links.push(path.clone()); + }, + _ => () + }; + self.seen.insert(path, newt); + } + Ok(()) + } + + // This is basically realpath(), using the virtual filesystem in self.seen. + // This method is not particularly efficient, it allocates like crazy. + fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec)> { + if depth < 1 { + warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path); + return None + } + + // Remove filename from the base + let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None }; + + let comp : Vec<&str> = + if path.starts_with('/') { path.split('/').collect() } + else { basedir.split('/').chain(path.split('/')).collect() }; + + let mut dest = Vec::new(); + + for (i, &c) in comp.iter().enumerate() { + if c == "" || c == "." { + continue; + } + if c == ".." { + if dest.len() > 1 { + dest.pop(); + } + continue; + } + dest.push(c.to_string()); + let curpath = dest.join("/"); + match self.seen.get(&curpath) { + + // If it's a directory, we're good + Some(&EntryType::Directory) => (), + + // If it's a file or man page, it must be the last item. + Some(& ref x@ EntryType::Regular) | + Some(& ref x@ EntryType::Handled) => return + if i == comp.len()-1 { + Some((x.clone(), dest)) + } else { + warn!("Unresolved link: {} -> {}; Non-directory component", base, path); + None + }, + + // Links... 
Ugh + Some(&EntryType::Link(ref d)) => { + match self.resolve_link(&curpath, &d, depth-1) { + // Same as above, with dirs we can continue, files have to be last + Some((EntryType::Directory, d)) => dest = d, + x@Some((EntryType::Regular, _)) | + x@Some((EntryType::Handled, _)) => return + if i == comp.len()-1 { x } + else { + warn!("Unresolved link: {} -> {}; Non-directory link component", base, path); + None + }, + _ => return None, + } + }, + + // Don't care about anything else, just stop. + _ => { + warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path); + return None + } + } + } + Some((EntryType::Directory, dest)) + } + + pub fn links(&mut self, mut cb: F) where F: FnMut(&str, &str) { + for p in self.links.iter() { + let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() }; + + match self.resolve_link(&p, dest, 32) { + Some((EntryType::Handled, d)) => { + let dstr = d.join("/"); + cb(&p, &dstr) + }, + Some((EntryType::Regular, d)) => { + let dstr = d.join("/"); + self.missedfiles.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); + } + _ => {}, + } + } + // We can reclaim this memory early. 
+ self.links = Vec::new(); + self.seen = HashMap::new(); + } + + pub fn need_reread(&self) -> bool { + self.missedfiles.len() > 0 + } + + pub fn reread(&mut self, rd: &mut ArchiveReader, mut file_cb: G) -> ArchiveResult<()> + where G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> + { + while let Some((path, _)) = Self::read_header(rd) { + if let Some(f) = self.missedfiles.remove(&path) { + let v: Vec<&str> = f.iter().map(|x| x as &str).collect(); + try!(file_cb(&v, rd)) + } + if self.missedfiles.len() < 1 { + break; + } + } + Ok(()) + } +} + + + + +#[cfg(test)] +mod tests { + use super::*; + use env_logger; + + fn test_read(r: &mut Reader) { + let mut f = open_file("tests/testarchive.tar.xz").unwrap(); + let mut files = Vec::new(); + r.read(&mut f, + |p| p.starts_with("man/man"), + |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } + ).unwrap(); + assert_eq!(files, vec!["man/man3/helloworld.3".to_string()]); + } + + fn test_resolve_links(r: &mut Reader) { + let res = |p| { + if let Some(&EntryType::Link(ref l)) = r.seen.get(p) { + r.resolve_link(p, &l, 5) + } else { + panic!("Not found or not a link: {}", p); + } + }; + let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()])); + + assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()]))); + assert_eq!(res("man/man6/hardlink.6"), helloworld); + assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); + assert_eq!(res("man/man6/symlinkafter.6"), helloworld); + + assert_eq!(res("man/man1/badsymlink1.1"), None); + assert_eq!(res("man/man1/badsymlink2.1"), None); + assert_eq!(res("man/man1/badsymlink3.1"), None); + assert_eq!(res("man/man1/badsymlink4.1"), None); + assert_eq!(res("man/man1/badsymlink5.1"), None); + + assert_eq!(res("man/man1/doublesymlink1.1"), helloworld); + assert_eq!(res("man/man1/doublesymlink2.1"), helloworld); + assert_eq!(res("man/man1/triplesymlink.1"), helloworld); + 
assert_eq!(res("man/man1/infinitesymlink.1"), None); + } + + fn test_links(r: &mut Reader) { + let mut links = Vec::new(); + r.links(|p,d| links.push((p.to_string(), d.to_string()))); + links.sort(); + + { + let mut res = |p:&str| { + let r = links.remove(0); + assert_eq!(r.0, p.to_string()); + assert_eq!(r.1, "man/man3/helloworld.3".to_string()); + }; + res("man/man1/doublesymlink1.1"); + res("man/man1/doublesymlink2.1"); + res("man/man1/symlinkbefore.1"); + res("man/man1/triplesymlink.1"); + res("man/man6/hardlink.6"); + res("man/man6/symlinkafter.6"); + } + assert_eq!(links.len(), 0); + } + + fn test_reread(r: &mut Reader) { + assert!(r.need_reread()); + + let mut f = open_file("tests/testarchive.tar.xz").unwrap(); + let mut files = Vec::new(); + r.reread(&mut f, + |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } + ).unwrap(); + + files.sort(); + assert_eq!(files, vec![ + "man/man3/needreread.3".to_string(), + "man/man6/needreread.6".to_string() + ]); + } + + #[test] + fn test_reader() { + env_logger::init().unwrap(); + + let mut r = Reader::new(); + test_read(&mut r); + test_resolve_links(&mut r); + test_links(&mut r); + test_reread(&mut r); + } +} diff --git a/indexer/src/main.rs b/indexer/src/main.rs new file mode 100644 index 0000000..d0eb4e2 --- /dev/null +++ b/indexer/src/main.rs @@ -0,0 +1,97 @@ +#[macro_use] extern crate log; +extern crate env_logger; +extern crate libarchive; +extern crate regex; + +use regex::Regex; + +mod archive; + + +// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page +// location, otherwise Some((manPageName, Section, Locale)). +fn parse_path(path: &str) -> Option<(&str, &str, &str)> { + // Roughly: man[/locale]/man1/manpage.section[.compression]+ + // TODO: lazy_static + let re = Regex::new(r"(?x) + man + (?: / ([^/]+) )? # Optional locale + /man[a-z0-9]/ # Subdir + ([^/]+?) # Man page name (non-greedy) + \. ([^/\.]+) # Section + (?: \. 
(?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions + ").unwrap(); + + let cap = match re.captures(path) { Some(x) => x, None => return None }; + let locale = cap.at(1).unwrap_or(""); + let name = cap.at(2).unwrap(); + let section = cap.at(3).unwrap(); + + // Not everything matching the regex is necessarily a man page, exclude some special cases. + match (name, section, locale) { + // Files that totally aren't man pages + ("Makefile", "in", _) | + ("Makefile", "am", _) | + (".cvsignore", _, _) | + (_, "gz", _) | + (_, "lzma", _) | + (_, "bz2", _) | + (_, "xz", _) | + (_, "html", _) => None, + // Some weird directories that happen to match the locale + (n, s, "5man") | + (n, s, "c") | + (n, s, "man1") | + (n, s, "man2") | + (n, s, "man3") | + (n, s, "man4") | + (n, s, "man5") | + (n, s, "man6") | + (n, s, "man7") | + (n, s, "man8") | + (n, s, "Man-Part1") | + (n, s, "Man-Part2") => Some((n, s, "")), + // Nothing special! + x => Some(x) + } +} + + +fn main() { + env_logger::init().unwrap(); + info!("Hello, world!"); +} + + +#[test] +fn test_parse_path() { + // Generic tests + assert_eq!(parse_path("/"), None); + assert_eq!(parse_path("/man1/ncdu.1"), None); + assert_eq!(parse_path("/man/man?/ncdu.1"), None); + assert_eq!(parse_path("/man/man1/ncdu.1"), Some(("ncdu", "1", ""))); + assert_eq!(parse_path("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz"), Some(("ncdu", "1", ""))); // This stuff happens + assert_eq!(parse_path("/man/en_US.UTF-8/man1/ncdu.1"), Some(("ncdu", "1", "en_US.UTF-8"))); + + // Special cases + assert_eq!(parse_path("/usr/share/man/man1/INDEX"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.am"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.in"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore.gz"), None); + + // Some actual locations + 
assert_eq!(parse_path("/usr/local/man/man1/list_audio_tracks.1.gz"), Some(("list_audio_tracks", "1", ""))); + assert_eq!(parse_path("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz"), Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))); + assert_eq!(parse_path("/usr/man/man3/exit.3tk"), Some(("exit", "3tk", ""))); + assert_eq!(parse_path("/usr/local/brlcad/share/man/mann/exit.nged.gz"), Some(("exit", "nged", ""))); + assert_eq!(parse_path("/usr/X11R6/man/man3/intro.3xglut.gz"), Some(("intro", "3xglut", ""))); + assert_eq!(parse_path("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz"), Some(("intro", "3", "ko_KR.eucKR"))); + + assert_eq!(parse_path("/usr/lib/scilab/man/Man-Part1/man1/ans.1"), Some(("ans", "1", ""))); + assert_eq!(parse_path("/heirloom/usr/share/man/5man/man1/chgrp.1.gz"), Some(("chgrp", "1", ""))); + + assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); + assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None); +} diff --git a/indexer/tests/mktar.sh b/indexer/tests/mktar.sh new file mode 100755 index 0000000..3f5ba4e --- /dev/null +++ b/indexer/tests/mktar.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +# The order of inserting the files into the tar is not fully deterministic this +# way. The tests will fail quite badly if hardlink.6 is considered the +# "original" version. + +mkdir man +cd man + +mkdir man1 +mkdir man3 +mkdir man6 +ln -s man3 mans + +echo 'Hello World' >man3/helloworld.3 +echo 'Not a very interesting file' >notinteresting +echo 'Potentially interesting file' >possiblyinteresting + +ln man3/helloworld.3 man6/hardlink.6 + +ln -s ../man3/helloworld.3 man1/symlinkbefore.1 +ln -s ../man3/helloworld.3 man6/symlinkafter.6 + +ln -s notadir/../../man3/helloworld.3 man1/badsymlink1.1 +ln -s man3/helloworld.3 man1/badsymlink2.1 +ln -s ../man3/helloworld.3/. 
man1/badsymlink3.1 +ln -s ../man3/helloworld.3/../helloworld.3 man1/badsymlink4.1 +ln -s ../man1/symlinkbefore.1/../../man1/helloworld.3 man1/badsymlink5.1 + +ln -s symlinkbefore.1 man1/doublesymlink1.1 +ln -s ../mans/helloworld.3 man1/doublesymlink2.1 +ln -s ../mans/../man1/symlinkbefore.1 man1/triplesymlink.1 +ln -s infinitesymlink.1 man1/infinitesymlink.1 + +ln -s ../possiblyinteresting man3/needreread.3 +ln -s ../possiblyinteresting man6/needreread.6 + +cd .. +rm -f testarchive.tar +tar -cf testarchive.tar man/ +rm -r man/ + +mkdir man +echo 'Overwritten file' >man/possiblyinteresting +tar -rf testarchive.tar man/ +rm -r man/ + +rm -f testarchive.tar.xz +xz testarchive.tar diff --git a/indexer/tests/testarchive.tar.xz b/indexer/tests/testarchive.tar.xz new file mode 100644 index 0000000000000000000000000000000000000000..0fe9760170b551b0ef3438c652ae8f7a4f9f4ac2 GIT binary patch literal 616 zcmV-u0+;>$H+ooF000E$*0e?f03iVu0001VFXf})PyYfXT>v(iN*x^x(XuqR&o7_G zqKJ_QaG??>54EHV`~R9~@-$qVakc37Igz++Ye>*PW3Fd=eOl$6n&`e4NsNE<4=t6w zS`9CCx}(t#Z>BrXMfF;+wTH0tnp;8i%GyqxP-pJeiwX<#QRl_Gj~L6}6XNKZNJ7vL z{}#7}Z9hiJyK(!$u%0Y%`5$*;J#I6T{n#=SG@DD= z&*>1DD{ItdQr-Pnb#7GiLBoULZnFe}81bZ@LG%E_)BNrK9hE}aw}j0P{%Q( zqA~9VMUF2`90K(?c@K8RNEBy%2>8sb#+D*3qn14)Y5#AkoXaZwL{?lkKDZ~>G7$AN zlUilzW>cOW(2k(KEttbOlUS9;E>b*;Xzod^qJNiQaYaHKOHbCAI>W8*}* zv7r5qOIVwt@0*c@UQRk9Ijj#?<-p2S3)ilT(O-QU^B&-o9mJdylOU39(@Jn|UXdjI z%sM(NeaitdL@Skfs&D`R000124H0kp!jrlH0l@@-paB3DWn00q#Ao{g000001X)_j CkRw Date: Wed, 26 Oct 2016 18:26:06 +0200 Subject: [PATCH 2/7] Use libarchive3-sys crate directly + improve archread API This all should offer a more convenient and robust interface to handle all sorts of archives. 
--- indexer/Cargo.lock | 30 +- indexer/Cargo.toml | 4 +- indexer/src/archive.rs | 593 +++++++++++++------------------ indexer/src/archread.rs | 363 +++++++++++++++++++ indexer/src/main.rs | 92 +---- indexer/src/man.rs | 85 +++++ indexer/tests/mktar.sh | 13 + indexer/tests/simpletest.tar.gz | Bin 0 -> 247 bytes indexer/tests/testarchive.tar.xz | Bin 616 -> 616 bytes 9 files changed, 732 insertions(+), 448 deletions(-) create mode 100644 indexer/src/archread.rs create mode 100644 indexer/src/man.rs create mode 100644 indexer/tests/simpletest.tar.gz diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index 53c701d..dbd3057 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -3,9 +3,11 @@ name = "indexer" version = "0.1.0" dependencies = [ "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)", + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -22,7 +24,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -35,13 +37,9 @@ dependencies = [ ] [[package]] -name = "libarchive" -version = "0.1.1" -source = "git+https://github.com/17dec/libarchive-rust#3f723cf0064561f21f0cebbd534a75076e6dbcaa" -dependencies = [ - "libarchive3-sys 0.1.2 
(registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", -] +name = "lazy_static" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "libarchive3-sys" @@ -77,19 +75,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "regex" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "regex-syntax" -version = "0.3.7" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -128,14 +126,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -"checksum libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)" = "" +"checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" "checksum libarchive3-sys 0.1.2 
(registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7" "checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8" "checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" "checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" -"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" -"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11" +"checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" +"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 7b1b918..4fd665f 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -7,4 +7,6 @@ authors = ["Yorhel "] regex = "0.1.77" log = "0.3.6" 
env_logger = "0.3.5" -libarchive = { git = "https://github.com/17dec/libarchive-rust" } +lazy_static = "0.2.1" +libc = "0.2.17" +libarchive3-sys = "0.1.2" diff --git a/indexer/src/archive.rs b/indexer/src/archive.rs index f1b9403..17e85c4 100644 --- a/indexer/src/archive.rs +++ b/indexer/src/archive.rs @@ -1,368 +1,273 @@ -use std::path::Path; -use std::collections::HashMap; -use libarchive::reader::Reader as ArchiveReader; -use libarchive::reader::{FileReader,Builder}; -use libarchive::archive::{Entry,FileType,ReadFormat,ReadFilter}; -use libarchive::error::ArchiveResult; +use std::str; +use std::ptr; +use std::error::Error as ErrorTrait; +use std::io::{Result,Error,Read}; +use std::ffi::{CStr,CString}; + +use libc::{c_void,ssize_t}; +use libarchive3_sys::ffi; -pub fn open_file>(path: T) -> ArchiveResult { - let mut builder = Builder::new(); - try!(builder.support_format(ReadFormat::All)); - try!(builder.support_filter(ReadFilter::All)); - builder.open_file(path) -} - - -#[derive(Clone,Debug,PartialEq,Eq)] -pub enum EntryType { - // Regular file that has been handled/indexed - Handled, - // Regular file that hasn't been handled because the caller wasn't interested in it. Could - // still be an interesting file if it is referenced from an interesting path. - Regular, - // Link to another file (interesting or not is irrelevant) - Link(String), - // Directory; need this information when resolving links - Directory, - // Something that couldn't be a an interesting file (chardev/socket/etc); If any link resolves - // to this we know we're done. - Other, -} - - -/* - * I had hoped that reading man pages from an archive would just be a simple: +/* This is a safe, limited and opinionated wrapper around the libarchive C bindings. + * I initially used the libarchive crate, but it has several issues. Some of which are not fixable + * without a complete rewrite. 
+ * - Panics on non-UTF8 path names + * - Panics on hard links (PR #6) + * - API is far too flexible, easy to misuse and get panics/segfaults + * - Impossible to correctly read files from an archive (issue #7) + * - Does not provide a convenient Read interface for files * - * 1. Walk through all files in the archive in a streaming fashion - * 2. Parse/index man pages - * - * But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to... - * - * 1. Walk through all entries in the archive in a streaming fashion - * 2. Parse/index regular file man pages - * 3. Keep track of all paths in the archive - * 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file - * 5. Read the entire damn archive again if one of the links resolved to a file that was not - * recognized as a man page in step (2). Luckily, this isn't very common. - * - * And this doesn't even cover the problem of duplicate entries in a tar, which is also quite - * annoying to handle. - * - * What annoys me the most about all of this is that it's not possible to stream an archive from - * the network and read/index the entire thing in a single step. Now we have to buffer packages to - * disk in order to be able to read the archive a second time. - * - * (Note that it is possible to resolve links while walking through the entries, which will allow - * us to match files found later in the archive against links found earlier, thus potentially - * saving the need to read the archive a second time. This is merely a performance improvement for - * an uncommon case, and it certainly won't simplify the code) - * - * (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the - * need for a second archive read, but that's going to significantly slow down the common case in - * order to handle a rare case. 
It's possible to further optimize this using some heuristics to - * determine whether a file is potentially a man page, but that's both complex and may not even - * save much) - * - * (* So apparently some man pages are close to 10MB...) + * Barring any unexpected behaviour or bugs in libarchive, the API below should not panic or + * segfault for any archive or usage pattern. */ -pub struct Reader { - // List of seen files. This is used to resolve links - seen: HashMap, - // List of interesting links - links: Vec, - // List of files we have to read in a second walk through the archive - missedfiles: HashMap>, + +pub struct Archive<'a> { + a: *mut ffi::Struct_archive, + rd: &'a mut Read, + buf: Vec, + err: Option, } -// Generalized API: -// 1. Read once -// reader.read(file, interest_cb, file_cb) -> Error -// file: A libarchive::Reader -// interest_cb(path) -> bool -// Called on every file/link name, should return whether it's a file the caller is interested -// in. (e.g. parse_path(), but also +DESC and other metadata). -// file_cb(path, reader, entry) -> Error -// Called on every interesting (actual) file, given the (normalized?) path, the -// libarchive::Reader and a ReaderEntry -// -// 2. Read links -// reader.links(link_cb) -> Error -// link_cb(path, dest) -> Error -// Called on every link which has as 'dest' a file path that has already been given to -// file_cb() before. -// -// 3. 
(Optionally) read a second time -// if reader.need_reread() { -// reader.reread(file, file_cb) -// } -impl Reader { - pub fn new() -> Reader { - Reader { - seen: HashMap::new(), - links: Vec::new(), - missedfiles: HashMap::new(), +pub struct ArchiveEntry<'a> { + a: Box>, + e: *mut ffi::Struct_archive_entry, +} + + +#[derive(Debug,PartialEq,Eq)] +pub enum FileType { + File, + Directory, + Link(String), + Other, // Also includes Link() +} + + +unsafe extern "C" fn archive_read_cb(_: *mut ffi::Struct_archive, data: *mut c_void, buf: *mut *const c_void) -> ssize_t { + let arch: &mut Archive = &mut *(data as *mut Archive); + *buf = arch.buf.as_mut_ptr() as *mut c_void; + match arch.rd.read(&mut arch.buf[..]) { + Ok(s) => s as ssize_t, + Err(e) => { + let desc = CString::new(e.description()).unwrap(); + let fmt = CString::new("%s").unwrap(); + ffi::archive_set_error(arch.a, e.raw_os_error().unwrap_or(0), fmt.as_ptr(), desc.as_ptr()); + arch.err = Some(e); + -1 } } - - // Convenience function to read the path/type/link from the next header. - fn read_header(rd: &mut ArchiveReader) -> Option<(String, EntryType)> { - let ent = match rd.next_header() { - Some(x) => x, - None => return None, - }; - let path = ent.pathname().trim_left_matches('/').trim_left_matches("./").trim_right_matches('/').to_string(); - - // Hard links are apparently relative to the root of the archive. 
- let link = ent.hardlink().map(|x| format!("/{}", x)) - .or(ent.symlink().map(str::to_string)); - - let(fts, ret) = match ent.filetype() { - FileType::BlockDevice => ("blk", EntryType::Other), - FileType::SymbolicLink => ("sym", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), - FileType::Socket => ("sck", EntryType::Other), - FileType::CharacterDevice => ("chr", EntryType::Other), - FileType::Directory => ("dir", EntryType::Directory), - FileType::NamedPipe => ("fif", EntryType::Other), - FileType::Mount => ("mnt", EntryType::Other), - FileType::RegularFile => ("reg", EntryType::Regular), - FileType::Unknown => ("unk", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), - }; - - trace!("Archive entry: {}{:10} bytes, path={:?} type={:?}", fts, ent.size(), path, ret); - Some((path, ret)) - } - - pub fn read(&mut self, rd: &mut ArchiveReader, interest_cb: F, mut file_cb: G) -> ArchiveResult<()> - where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> - { - while let Some((path, t)) = Self::read_header(rd) { - // We ought to throw away the result of the previous entry with the same name and use - // this new entry instead, but fuck it. This case is too rare, so let's just warn! it. - if let Some(_) = self.seen.get(&path) { - warn!("Duplicate file entry: {}", path); - continue; - } - - let mut newt = t; - match newt { - EntryType::Regular if interest_cb(&path) => { - let pathv = [&path as &str]; - try!(file_cb(&pathv[..], rd)); - newt = EntryType::Handled - }, - EntryType::Link(_) if interest_cb(&path) => { - self.links.push(path.clone()); - }, - _ => () - }; - self.seen.insert(path, newt); - } - Ok(()) - } - - // This is basically realpath(), using the virtual filesystem in self.seen. - // This method is not particularly efficient, it allocates like crazy. 
- fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec)> { - if depth < 1 { - warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path); - return None - } - - // Remove filename from the base - let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None }; - - let comp : Vec<&str> = - if path.starts_with('/') { path.split('/').collect() } - else { basedir.split('/').chain(path.split('/')).collect() }; - - let mut dest = Vec::new(); - - for (i, &c) in comp.iter().enumerate() { - if c == "" || c == "." { - continue; - } - if c == ".." { - if dest.len() > 1 { - dest.pop(); - } - continue; - } - dest.push(c.to_string()); - let curpath = dest.join("/"); - match self.seen.get(&curpath) { - - // If it's a directory, we're good - Some(&EntryType::Directory) => (), - - // If it's a file or man page, it must be the last item. - Some(& ref x@ EntryType::Regular) | - Some(& ref x@ EntryType::Handled) => return - if i == comp.len()-1 { - Some((x.clone(), dest)) - } else { - warn!("Unresolved link: {} -> {}; Non-directory component", base, path); - None - }, - - // Links... Ugh - Some(&EntryType::Link(ref d)) => { - match self.resolve_link(&curpath, &d, depth-1) { - // Same as above, with dirs we can continue, files have to be last - Some((EntryType::Directory, d)) => dest = d, - x@Some((EntryType::Regular, _)) | - x@Some((EntryType::Handled, _)) => return - if i == comp.len()-1 { x } - else { - warn!("Unresolved link: {} -> {}; Non-directory link component", base, path); - None - }, - _ => return None, - } - }, - - // Don't care about anything else, just stop. 
- _ => { - warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path); - return None - } - } - } - Some((EntryType::Directory, dest)) - } - - pub fn links(&mut self, mut cb: F) where F: FnMut(&str, &str) { - for p in self.links.iter() { - let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() }; - - match self.resolve_link(&p, dest, 32) { - Some((EntryType::Handled, d)) => { - let dstr = d.join("/"); - cb(&p, &dstr) - }, - Some((EntryType::Regular, d)) => { - let dstr = d.join("/"); - self.missedfiles.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); - } - _ => {}, - } - } - // We can reclaim this memory early. - self.links = Vec::new(); - self.seen = HashMap::new(); - } - - pub fn need_reread(&self) -> bool { - self.missedfiles.len() > 0 - } - - pub fn reread(&mut self, rd: &mut ArchiveReader, mut file_cb: G) -> ArchiveResult<()> - where G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> - { - while let Some((path, _)) = Self::read_header(rd) { - if let Some(f) = self.missedfiles.remove(&path) { - let v: Vec<&str> = f.iter().map(|x| x as &str).collect(); - try!(file_cb(&v, rd)) - } - if self.missedfiles.len() < 1 { - break; - } - } - Ok(()) - } } +impl<'a> Archive<'a> { + fn new(rd: &mut Read, a: *mut ffi::Struct_archive) -> Result> { + let bufsize = 64*1024; + let mut buf = Vec::with_capacity(bufsize); + unsafe { buf.set_len(bufsize) }; + let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None }); + + let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void; + let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) }; + if r == ffi::ARCHIVE_FATAL { + return Err(ret.error()); + } + Ok(ret) + } + + fn error(&mut self) -> Error { + // TODO: Do something with the description + self.err.take().unwrap_or_else(|| + Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) }) + ) + } + + fn entry(self: Box) -> Result>> { + let mut ent = ArchiveEntry { 
+ a: self, + e: ptr::null_mut() + }; + let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) }; + match res { + ffi::ARCHIVE_EOF => Ok(None), + ffi::ARCHIVE_FATAL => Err(ent.a.error()), + _ => Ok(Some(ent)) + } + } + + fn read(&mut self, buf: &mut [u8]) -> Result { + let cbuf = buf.as_mut_ptr() as *mut c_void; + let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) }; + if n >= 0 { + Ok(n as usize) + } else { + Err(self.error()) + } + } + + pub fn open_archive(rd: &mut Read) -> Result> { + let a = unsafe { + let a = ffi::archive_read_new(); + ffi::archive_read_support_filter_all(a); + ffi::archive_read_support_format_all(a); + a + }; + try!(Self::new(rd, a)).entry() + } +} + + +impl<'a> Drop for Archive<'a> { + fn drop(&mut self) { + unsafe { + ffi::archive_read_free(self.a); + } + } +} + + +impl<'a> ArchiveEntry<'a> { + pub fn next(self) -> Result>> { + self.a.entry() + } + + // Returns None on NULL (when does that even happen?) or on invalid UTF-8. + pub fn path(&self) -> Option<&str> { + let c_str: &CStr = unsafe { + let ptr = ffi::archive_entry_pathname(self.e); + if ptr.is_null() { + return None; + } + CStr::from_ptr(ptr) + }; + str::from_utf8(c_str.to_bytes()).ok() + // Perform some simple opinionated normalization. Full normalization might be better, + // but also slower and more complex. This solution covers the most important cases.
+ .map(|s| s.trim_left_matches('/').trim_left_matches("./").trim_right_matches('/')) + } + + pub fn size(&self) -> usize { + unsafe { ffi::archive_entry_size(self.e) as usize } + } + + fn symlink(&self) -> Option { + let c_str: &CStr = unsafe { + let ptr = ffi::archive_entry_symlink(self.e); + if ptr.is_null() { + return None; + } + CStr::from_ptr(ptr) + }; + str::from_utf8(c_str.to_bytes()).map(str::to_string).ok() + } + + fn hardlink(&self) -> Option { + let c_str: &CStr = unsafe { + let ptr = ffi::archive_entry_hardlink(self.e); + if ptr.is_null() { + return None; + } + CStr::from_ptr(ptr) + }; + // Hard links have the same name as an earlier pathname(), and those typically don't have a + // preceding slash. Add this slash here so that the same resolution logic can be used for + // both hardlinks and symlinks. I really don't care about the difference between these two. + str::from_utf8(c_str.to_bytes()).map(|p| format!("/{}", p)).ok() + } + + pub fn filetype(&self) -> FileType { + // If it has a symlink/hardlink path, then just consider it a link regardless of what + // _filetype() says. + if let Some(l) = self.symlink().or(self.hardlink()) { + return FileType::Link(l); + } + match unsafe { ffi::archive_entry_filetype(self.e) } { + ffi::AE_IFDIR => FileType::Directory, + ffi::AE_IFREG => FileType::File, + _ => FileType::Other, + } + } +} + + +impl<'a> Read for ArchiveEntry<'a> { + fn read(&mut self, buf: &mut [u8]) -> Result { + self.a.read(buf) + } +} + + +// We can't provide an Iterator object for ArchiveEntries because Rust doesn't support streaming +// iterators. Let's instead provide a walk function for convenience. 
+// cb should return Ok(true) to continue, Ok(false) to break +pub fn walk(ent: Option, mut cb: F) -> Result<()> + where F: FnMut(&mut ArchiveEntry) -> Result +{ + let mut ent = ent; + while let Some(mut e) = ent { + if !try!(cb(&mut e)) { + break; + } + ent = try!(e.next()); + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; - use env_logger; + use std; + use std::io::Read; + use std::fs::File; - fn test_read(r: &mut Reader) { - let mut f = open_file("tests/testarchive.tar.xz").unwrap(); - let mut files = Vec::new(); - r.read(&mut f, - |p| p.starts_with("man/man"), - |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } - ).unwrap(); - assert_eq!(files, vec!["man/man3/helloworld.3".to_string()]); - } - - fn test_resolve_links(r: &mut Reader) { - let res = |p| { - if let Some(&EntryType::Link(ref l)) = r.seen.get(p) { - r.resolve_link(p, &l, 5) - } else { - panic!("Not found or not a link: {}", p); - } - }; - let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()])); - - assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()]))); - assert_eq!(res("man/man6/hardlink.6"), helloworld); - assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); - assert_eq!(res("man/man6/symlinkafter.6"), helloworld); - - assert_eq!(res("man/man1/badsymlink1.1"), None); - assert_eq!(res("man/man1/badsymlink2.1"), None); - assert_eq!(res("man/man1/badsymlink3.1"), None); - assert_eq!(res("man/man1/badsymlink4.1"), None); - assert_eq!(res("man/man1/badsymlink5.1"), None); - - assert_eq!(res("man/man1/doublesymlink1.1"), helloworld); - assert_eq!(res("man/man1/doublesymlink2.1"), helloworld); - assert_eq!(res("man/man1/triplesymlink.1"), helloworld); - assert_eq!(res("man/man1/infinitesymlink.1"), None); - } - - fn test_links(r: &mut Reader) { - let mut links = Vec::new(); - r.links(|p,d| links.push((p.to_string(), d.to_string()))); - links.sort(); - - { - let mut res = 
|p:&str| { - let r = links.remove(0); - assert_eq!(r.0, p.to_string()); - assert_eq!(r.1, "man/man3/helloworld.3".to_string()); - }; - res("man/man1/doublesymlink1.1"); - res("man/man1/doublesymlink2.1"); - res("man/man1/symlinkbefore.1"); - res("man/man1/triplesymlink.1"); - res("man/man6/hardlink.6"); - res("man/man6/symlinkafter.6"); - } - assert_eq!(links.len(), 0); - } - - fn test_reread(r: &mut Reader) { - assert!(r.need_reread()); - - let mut f = open_file("tests/testarchive.tar.xz").unwrap(); - let mut files = Vec::new(); - r.reread(&mut f, - |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } - ).unwrap(); - - files.sort(); - assert_eq!(files, vec![ - "man/man3/needreread.3".to_string(), - "man/man6/needreread.6".to_string() - ]); + #[test] + fn invalid_archive() { + let mut r = std::io::repeat(0x0a).take(64*1024); + let ent = Archive::open_archive(&mut r); + assert!(ent.is_err()); } #[test] - fn test_reader() { - env_logger::init().unwrap(); + fn zerolength_archive() { + let mut r = std::io::empty(); + let ent = Archive::open_archive(&mut r); + // I expected an error here rather than None, whatever. 
+ assert!(ent.unwrap().is_none()); + } - let mut r = Reader::new(); - test_read(&mut r); - test_resolve_links(&mut r); - test_links(&mut r); - test_reread(&mut r); + #[test] + fn read() { + let mut f = File::open("tests/simpletest.tar.gz").unwrap(); + let mut ent = Archive::open_archive(&mut f).unwrap().unwrap(); + + let t = |e:&mut ArchiveEntry, path, size, ft, cont| { + assert_eq!(e.path(), path); + assert_eq!(e.size(), size); + assert_eq!(e.filetype(), ft); + let mut contents = String::new(); + assert_eq!(e.read_to_string(&mut contents).unwrap(), size); + assert_eq!(&contents, cont); + }; + + t(&mut ent, Some("simple"), 0, FileType::Directory, ""); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/file"), 3, FileType::File, "Hi\n"); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/link"), 0, FileType::Link("file".to_string()), ""); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/hardlink"), 0, FileType::Link("/simple/file".to_string()), ""); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/fifo"), 0, FileType::Other, ""); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, None, 0, FileType::File, ""); + + assert!(ent.next().unwrap().is_none()); } } diff --git a/indexer/src/archread.rs b/indexer/src/archread.rs new file mode 100644 index 0000000..22086f8 --- /dev/null +++ b/indexer/src/archread.rs @@ -0,0 +1,363 @@ +use std::io::Result; +use std::collections::HashMap; + +use archive::{walk,ArchiveEntry,FileType}; + +/* I had hoped that reading man pages from an archive would just be a simple: + * + * 1. Walk through all files in the archive in a streaming fashion + * 2. Parse/index man pages + * + * But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to... + * + * 1. Walk through all entries in the archive in a streaming fashion + * 2. Parse/index regular file man pages + * 3. Keep track of all paths in the archive + * 4. 
Use the result of step (3) to resolve symlinks/hardlinks to their actual file + * 5. Read the entire damn archive again if one of the links resolved to a file that was not + * recognized as a man page in step (2). Luckily, this isn't very common. + * + * And this doesn't even cover the problem of duplicate entries in a tar, which is also quite + * annoying to handle. + * + * What annoys me the most about all of this is that it's not possible to stream an archive from + * the network and read/index the entire thing in a single step. Now we either have to buffer + * packages to disk or redownload the archive in order to be able to follow all links to man pages. + * + * (Note that it is possible to resolve links while walking through the entries, which will allow + * us to match files found later in the archive against links found earlier, thus potentially + * saving the need to read the archive a second time. This is merely a performance improvement for + * an uncommon case, and it certainly won't simplify the code) + * + * (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the + * need for a second archive read, but that's going to significantly slow down the common case in + * order to handle a rare case. It's possible to further optimize this using some heuristics to + * determine whether a file is potentially a man page, but that's both complex and may not even + * save much) + * + * (* So apparently some man pages are close to 10MB...) + */ + + +#[derive(Clone,Debug,PartialEq,Eq)] +pub enum EntryType { + // Regular file that has been handled/indexed + Handled, + // Regular file that hasn't been handled because the caller wasn't interested in it. Could + // still be an interesting file if it is referenced from an interesting path. 
+ Regular, + // Link to another file (interesting or not is irrelevant) + Link(String), + // Directory; need this information when resolving links + Directory, + // Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to + // this we know we're done. + Other, +} + +pub struct FileList { + // List of seen files. This is used to resolve links + seen: HashMap, + // List of interesting links + links: Vec, +} + +pub struct MissedFiles(HashMap>); + + +impl FileList { + + /* Read an archive until the end. Accepts two callbacks: + * + * interest_cb: Called on every path in the archive, should return whether the file is + * interesting (i.e. whether we want to know its contents). + * file_cb: Called on every regular file for which interest_cb() showed an interest. + * The callback accepts multiple path names, but this function will only provide one. + * + * Returns a FileList struct that can be used to retrieve all interesting non-regular files. + */ + pub fn read(ent: Option, interest_cb: F, mut file_cb: G) -> Result + where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> + { + let mut fl = FileList { + seen: HashMap::new(), + links: Vec::new(), + }; + + try!(walk(ent, |mut e| { + let path = match e.path() { + Some(x) => x.to_string(), + None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) } + }; + let ft = e.filetype(); + trace!("Archive entry: {:10} {} {:?}", e.size(), path, ft); + + // We ought to throw away the result of the previous entry with the same name and use + // this new entry instead, but fuck it. This case is too rare, so let's just warn.
+ if let Some(_) = fl.seen.get(&path) { + warn!("Duplicate file entry: {}", path); + return Ok(true); + } + + let et = match ft { + FileType::File => { + if interest_cb(&path) { + let pathv = [&path as &str]; + try!(file_cb(&pathv[..], &mut e)); + EntryType::Handled + } else { + EntryType::Regular + } + }, + FileType::Link(l) => { + if interest_cb(&path) { + fl.links.push(path.clone()); + } + EntryType::Link(l) + }, + FileType::Directory => EntryType::Directory, + FileType::Other => EntryType::Other, + }; + + fl.seen.insert(path, et); + Ok(true) + })); + Ok(fl) + } + + + // This is basically realpath(), using the virtual filesystem in self.seen. + // This method is not particularly efficient, it allocates like crazy. + fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec)> { + if depth < 1 { + warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path); + return None + } + + // Remove filename from the base + let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None }; + + let comp : Vec<&str> = + if path.starts_with('/') { path.split('/').collect() } + else { basedir.split('/').chain(path.split('/')).collect() }; + + let mut dest = Vec::new(); + + for (i, &c) in comp.iter().enumerate() { + if c == "" || c == "." { + continue; + } + if c == ".." { + if dest.len() > 1 { + dest.pop(); + } + continue; + } + dest.push(c.to_string()); + let curpath = dest.join("/"); + match self.seen.get(&curpath) { + + // If it's a directory, we're good + Some(&EntryType::Directory) => (), + + // If it's a file or man page, it must be the last item. + Some(& ref x@ EntryType::Regular) | + Some(& ref x@ EntryType::Handled) => return + if i == comp.len()-1 { + Some((x.clone(), dest)) + } else { + warn!("Unresolved link: {} -> {}; Non-directory component", base, path); + None + }, + + // Links... 
Ugh + Some(&EntryType::Link(ref d)) => { + match self.resolve_link(&curpath, &d, depth-1) { + // Same as above, with dirs we can continue, files have to be last + Some((EntryType::Directory, d)) => dest = d, + x@Some((EntryType::Regular, _)) | + x@Some((EntryType::Handled, _)) => return + if i == comp.len()-1 { x } + else { + warn!("Unresolved link: {} -> {}; Non-directory link component", base, path); + None + }, + _ => return None, + } + }, + + // Don't care about anything else, just stop. + _ => { + warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path); + return None + } + } + } + Some((EntryType::Directory, dest)) + } + + /* Calls cb() on every 'interesting' link to a file that has already been passed to a file_cb() + * in FileList::read(). + * If there are any interesting links that have not yet been passed to file_cb(), a MissedFiles + * struct is returned that can be used to retrieve those files by re-reading the archive. + */ + pub fn links(self, mut cb: F) -> Option where F: FnMut(&str, &str) { + let mut missed = HashMap::new(); + + for p in self.links.iter() { + let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() }; + + match self.resolve_link(&p, dest, 32) { + Some((EntryType::Handled, d)) => { + let dstr = d.join("/"); + cb(&p, &dstr); + }, + Some((EntryType::Regular, d)) => { + let dstr = d.join("/"); + missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); + } + _ => (), + } + } + + if missed.len() > 0 { + Some(MissedFiles(missed)) + } else { + None + } + } +} + + +impl MissedFiles { + /* Reads the archive again and calls file_cb() on every interesting file that was missed during + * the first read of the archive (using FileList::{read,links}). file_cb is exactly the same as + * in FileList::read, but this time it can actually get multiple paths as first argument; which + * happens when multiple interesting links point to the same file. 
*/ + pub fn read(mut self, ent: Option, mut file_cb: G) -> Result<()> + where G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> + { + walk(ent, |mut e| { + if let Some(f) = e.path().and_then(|p| self.0.remove(p)) { + let v: Vec<&str> = f.iter().map(|x| x as &str).collect(); + try!(file_cb(&v, &mut e)) + } + Ok(self.0.len() > 0) + }) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use archive::Archive; + use std::io::Read; + use std::fs::File; + + fn test_read() -> FileList { + let mut f = File::open("tests/testarchive.tar.xz").unwrap(); + let arch = Archive::open_archive(&mut f).unwrap(); + let mut cnt = 0; + FileList::read(arch, + |p| p.starts_with("man/man"), + |p,e| { + assert_eq!(cnt, 0); + cnt += 1; + assert_eq!(p, &["man/man3/helloworld.3"][..]); + assert_eq!(e.size(), 12); + + let mut cont = String::new(); + e.read_to_string(&mut cont).unwrap(); + assert_eq!(&cont, "Hello World\n"); + Ok(()) + } + ).unwrap() + } + + fn test_resolve_links(r: &FileList) { + let res = |p| { + if let Some(&EntryType::Link(ref l)) = r.seen.get(p) { + r.resolve_link(p, &l, 5) + } else { + panic!("Not found or not a link: {}", p); + } + }; + let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()])); + + assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()]))); + assert_eq!(res("man/man6/hardlink.6"), helloworld); + assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); + assert_eq!(res("man/man6/symlinkafter.6"), helloworld); + + assert_eq!(res("man/man1/badsymlink1.1"), None); + assert_eq!(res("man/man1/badsymlink2.1"), None); + assert_eq!(res("man/man1/badsymlink3.1"), None); + assert_eq!(res("man/man1/badsymlink4.1"), None); + assert_eq!(res("man/man1/badsymlink5.1"), None); + + assert_eq!(res("man/man1/doublesymlink1.1"), helloworld); + assert_eq!(res("man/man1/doublesymlink2.1"), helloworld); + assert_eq!(res("man/man1/triplesymlink.1"), helloworld); + 
assert_eq!(res("man/man1/infinitesymlink.1"), None); + } + + fn test_links(r: FileList) -> Option { + let mut links = Vec::new(); + let missed = r.links(|p,d| links.push((p.to_string(), d.to_string()))); + links.sort(); + + { + let mut res = |p:&str| { + let r = links.remove(0); + assert_eq!(r.0, p.to_string()); + assert_eq!(r.1, "man/man3/helloworld.3".to_string()); + }; + res("man/man1/doublesymlink1.1"); + res("man/man1/doublesymlink2.1"); + res("man/man1/symlinkbefore.1"); + res("man/man1/triplesymlink.1"); + res("man/man6/hardlink.6"); + res("man/man6/symlinkafter.6"); + } + assert_eq!(links.len(), 0); + missed + } + + fn test_reread(r: MissedFiles) { + let mut f = File::open("tests/testarchive.tar.xz").unwrap(); + let ent = Archive::open_archive(&mut f).unwrap(); + let mut files = Vec::new(); + r.read(ent, + |p,e| { + let mut cont = String::new(); + e.read_to_string(&mut cont).unwrap(); + files.extend(p.iter().map(|x| (x.to_string(), cont.clone()) )); + Ok(()) + } + ).unwrap(); + files.sort(); + + { + let mut res = |a:&str, b:&str| { + let r = files.remove(0); + assert_eq!(&r.0, a); + assert_eq!(&r.1, b); + }; + res("man/man3/needreread.3", "Potentially interesting file\n"); + res("man/man6/needreread.6", "Potentially interesting file\n"); + } + assert_eq!(files.len(), 0); + } + + #[test] + fn test_reader() { + //use env_logger; + //env_logger::init().unwrap(); + + let r = test_read(); + test_resolve_links(&r); + let l = test_links(r).unwrap(); + test_reread(l); + } +} diff --git a/indexer/src/main.rs b/indexer/src/main.rs index d0eb4e2..5661568 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -1,97 +1,15 @@ #[macro_use] extern crate log; +#[macro_use] extern crate lazy_static; extern crate env_logger; -extern crate libarchive; extern crate regex; - -use regex::Regex; +extern crate libarchive3_sys; +extern crate libc; mod archive; - - -// Checks a path for a man page candidate. 
Returns None if it doesn't seem like a man page -// location, otherwise Some((manPageName, Section, Locale)). -fn parse_path(path: &str) -> Option<(&str, &str, &str)> { - // Roughly: man[/locale]/man1/manpage.section[.compression]+ - // TODO: lazy_static - let re = Regex::new(r"(?x) - man - (?: / ([^/]+) )? # Optional locale - /man[a-z0-9]/ # Subdir - ([^/]+?) # Man page name (non-greedy) - \. ([^/\.]+) # Section - (?: \. (?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions - ").unwrap(); - - let cap = match re.captures(path) { Some(x) => x, None => return None }; - let locale = cap.at(1).unwrap_or(""); - let name = cap.at(2).unwrap(); - let section = cap.at(3).unwrap(); - - // Not everything matching the regex is necessarily a man page, exclude some special cases. - match (name, section, locale) { - // Files that totally aren't man pages - ("Makefile", "in", _) | - ("Makefile", "am", _) | - (".cvsignore", _, _) | - (_, "gz", _) | - (_, "lzma", _) | - (_, "bz2", _) | - (_, "xz", _) | - (_, "html", _) => None, - // Some weird directories that happen to match the locale - (n, s, "5man") | - (n, s, "c") | - (n, s, "man1") | - (n, s, "man2") | - (n, s, "man3") | - (n, s, "man4") | - (n, s, "man5") | - (n, s, "man6") | - (n, s, "man7") | - (n, s, "man8") | - (n, s, "Man-Part1") | - (n, s, "Man-Part2") => Some((n, s, "")), - // Nothing special! 
- x => Some(x) - } -} - +mod archread; +mod man; fn main() { env_logger::init().unwrap(); info!("Hello, world!"); } - - -#[test] -fn test_parse_path() { - // Generic tests - assert_eq!(parse_path("/"), None); - assert_eq!(parse_path("/man1/ncdu.1"), None); - assert_eq!(parse_path("/man/man?/ncdu.1"), None); - assert_eq!(parse_path("/man/man1/ncdu.1"), Some(("ncdu", "1", ""))); - assert_eq!(parse_path("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz"), Some(("ncdu", "1", ""))); // This stuff happens - assert_eq!(parse_path("/man/en_US.UTF-8/man1/ncdu.1"), Some(("ncdu", "1", "en_US.UTF-8"))); - - // Special cases - assert_eq!(parse_path("/usr/share/man/man1/INDEX"), None); - assert_eq!(parse_path("/usr/share/man/man1/Makefile"), None); - assert_eq!(parse_path("/usr/share/man/man1/Makefile.am"), None); - assert_eq!(parse_path("/usr/share/man/man1/Makefile.in"), None); - assert_eq!(parse_path("/usr/share/man/man1/.cvsignore"), None); - assert_eq!(parse_path("/usr/share/man/man1/.cvsignore.gz"), None); - - // Some actual locations - assert_eq!(parse_path("/usr/local/man/man1/list_audio_tracks.1.gz"), Some(("list_audio_tracks", "1", ""))); - assert_eq!(parse_path("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz"), Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))); - assert_eq!(parse_path("/usr/man/man3/exit.3tk"), Some(("exit", "3tk", ""))); - assert_eq!(parse_path("/usr/local/brlcad/share/man/mann/exit.nged.gz"), Some(("exit", "nged", ""))); - assert_eq!(parse_path("/usr/X11R6/man/man3/intro.3xglut.gz"), Some(("intro", "3xglut", ""))); - assert_eq!(parse_path("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz"), Some(("intro", "3", "ko_KR.eucKR"))); - - assert_eq!(parse_path("/usr/lib/scilab/man/Man-Part1/man1/ans.1"), Some(("ans", "1", ""))); - assert_eq!(parse_path("/heirloom/usr/share/man/5man/man1/chgrp.1.gz"), Some(("chgrp", "1", ""))); - - 
assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); - assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None); -} diff --git a/indexer/src/man.rs b/indexer/src/man.rs new file mode 100644 index 0000000..b268fe8 --- /dev/null +++ b/indexer/src/man.rs @@ -0,0 +1,85 @@ +use regex::Regex; + + +// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page +// location, otherwise Some((manPageName, Section, Locale)). +fn parse_path(path: &str) -> Option<(&str, &str, &str)> { + // Roughly: man[/locale]/man1/manpage.section[.compression]+ + lazy_static! { + static ref RE: Regex = Regex::new(r"(?x) + man + (?: / ([^/]+) )? # Optional locale + /man[a-z0-9]/ # Subdir + ([^/]+?) # Man page name (non-greedy) + \. ([^/\.]+) # Section + (?: \. (?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions + ").unwrap(); + } + + let cap = match RE.captures(path) { Some(x) => x, None => return None }; + let locale = cap.at(1).unwrap_or(""); + let name = cap.at(2).unwrap(); + let section = cap.at(3).unwrap(); + + // Not everything matching the regex is necessarily a man page, exclude some special cases. + match (name, section, locale) { + // Files that totally aren't man pages + ("Makefile", "in", _) | + ("Makefile", "am", _) | + (".cvsignore", _, _) | + (_, "gz", _) | + (_, "lzma", _) | + (_, "bz2", _) | + (_, "xz", _) | + (_, "html", _) => None, + // Some weird directories that happen to match the locale + (n, s, "5man") | + (n, s, "c") | + (n, s, "man1") | + (n, s, "man2") | + (n, s, "man3") | + (n, s, "man4") | + (n, s, "man5") | + (n, s, "man6") | + (n, s, "man7") | + (n, s, "man8") | + (n, s, "Man-Part1") | + (n, s, "Man-Part2") => Some((n, s, "")), + // Nothing special! 
+ x => Some(x) + } +} + + +#[test] +fn test_parse_path() { + // Generic tests + assert_eq!(parse_path("/"), None); + assert_eq!(parse_path("/man1/ncdu.1"), None); + assert_eq!(parse_path("/man/man?/ncdu.1"), None); + assert_eq!(parse_path("/man/man1/ncdu.1"), Some(("ncdu", "1", ""))); + assert_eq!(parse_path("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz"), Some(("ncdu", "1", ""))); // This stuff happens + assert_eq!(parse_path("/man/en_US.UTF-8/man1/ncdu.1"), Some(("ncdu", "1", "en_US.UTF-8"))); + + // Special cases + assert_eq!(parse_path("/usr/share/man/man1/INDEX"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.am"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.in"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore.gz"), None); + + // Some actual locations + assert_eq!(parse_path("/usr/local/man/man1/list_audio_tracks.1.gz"), Some(("list_audio_tracks", "1", ""))); + assert_eq!(parse_path("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz"), Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))); + assert_eq!(parse_path("/usr/man/man3/exit.3tk"), Some(("exit", "3tk", ""))); + assert_eq!(parse_path("/usr/local/brlcad/share/man/mann/exit.nged.gz"), Some(("exit", "nged", ""))); + assert_eq!(parse_path("/usr/X11R6/man/man3/intro.3xglut.gz"), Some(("intro", "3xglut", ""))); + assert_eq!(parse_path("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz"), Some(("intro", "3", "ko_KR.eucKR"))); + + assert_eq!(parse_path("/usr/lib/scilab/man/Man-Part1/man1/ans.1"), Some(("ans", "1", ""))); + assert_eq!(parse_path("/heirloom/usr/share/man/5man/man1/chgrp.1.gz"), Some(("chgrp", "1", ""))); + + assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); + assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), 
None); +} diff --git a/indexer/tests/mktar.sh b/indexer/tests/mktar.sh index 3f5ba4e..9f8844d 100755 --- a/indexer/tests/mktar.sh +++ b/indexer/tests/mktar.sh @@ -4,6 +4,19 @@ # way. The tests will fail quite badly if hardlink.6 is considered the # "original" version. + +mkdir simple +echo Hi >simple/file +ln -s file simple/link +ln simple/file simple/hardlink +mkfifo simple/fifo +badfn=`echo 'Héllö.txt' | iconv -t ISO-8859-1` +touch $badfn +tar -czf simpletest.tar.gz simple $badfn +rm -rf $badfn simple + + + mkdir man cd man diff --git a/indexer/tests/simpletest.tar.gz b/indexer/tests/simpletest.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..06a535cdc1f5a295ca3254f00ae7728166efebff GIT binary patch literal 247 zcmVkvZ6|;hx|Jj1u2vU@$3^k}Fw}GD1G!UW0 z;3nbuUgp7pd-2R%EwjTZ#2ZFQ)V#ILD!mo`^@T|-gz>^iS?`n5Yr(iB?W%Jwt8KwK zyXHk2hxhx&p8u(`%|FRPOje@_*ctO7e?7=wS*IBnWF7S^JpU+L&D{dbsJs6mJMPJ9 zJmip#|K2FHLeZ@&mR%3ep%6i4P xBkQPV;rU0m5bo>hQW4hpBL8MP{{H{~00000000000000cdH@H84l)2J00691c?tjk literal 0 HcmV?d00001 diff --git a/indexer/tests/testarchive.tar.xz b/indexer/tests/testarchive.tar.xz index 0fe9760170b551b0ef3438c652ae8f7a4f9f4ac2..55b69f76ca05b8ac257a26ca0598ac6560c880ef 100644 GIT binary patch delta 577 zcmV-H0>1s|1n2~i904ei9U6bcUXsFzs4O9PO_a20_F!?Bihx;Fu@F9s3ri%53V?Y2 zoess^YJ1$v)lBFR-xDhG*hv3dQ@qt-)_kn250?xJkOHG_G=jU&%u0(RFN z{U}9mYEXk1*SbX}TDOl_b*YnIj5DPQZ%@c@+;2jClq>t{RCGS6zJ)S>k56E`WEYNA zP0`sjs;0i_7OiV+v!H)0zoG?K(0z%91Ym2f97zjQ^Rdym)@TaSo{xA2wKvDuo2*N< z%e@vlTQaDqV$1U&++S!nmvGro(Y`p`KpEX@ds;ts^5d3QMx~Z*_?E9{AAl4P8*k$38pzr|GAPif83mu$V>i?_@Dv9OJ-UB_AY;4^fC$+y`>9^R z49|VtEnKHoi*E368ac59T6P-`3R}ec<=nR6I$)(SwRbj?rixk?&Q8-9-RF4eNmSg2 z663^ss$*tXHNAo@8yOyBkASE&QX!_g_eZL5kz%?d$|7bzfA_}WcIHM15qHC3n0?ng z9cxHui7)&*6puBt^Ta8g#l`S(4(D`T5ol>m!Z!c_H06BdkU$A+00G1VfS>^YrSz7O PvBYQl0ssI200dcD;^h>~ delta 577 zcmV-H0>1s|1n2~i904Vf9U6b63;X|?X!0~%n{l=1_BoNbZEHx-KVzh^vc>!oKR=(){6=Y^HJx;yN?*l z-xK2KnMgv=5C0aog>64Z%DZvc 
zFvb2d4%YV;FU5=(Ni%iPf)ERYjOngtEiGNzM6i#4mYpFDY%lksito(PE5hTP^3_wd z)M|k3C0J z1^)Gb3}3`Im%**uwIY8Z!EOn3P82M}8M0BdgGmdk66r0kR$3puB5i9Pn5V-!M%^Mv z)$&An){@ARJOz<30t^L{s^-FY-RJUIX?l!gRN*SA=b6=Rq>STl^ZN#WHDpToOK09b zlXkUoD_{ap$1$X$G4BRNjxS9d0`)j~4|c{#6lZ-1_{^-vmLh*Hqn14)Y5#AkoXaZw zL{?lkKDZ~>G7$ANlUilzW>cOW(2k(KEttbOlUS9;E>b*;Xzod^qJNiQaY zaHKOHbCAI>W8*}*v7r5qOIVwt@0*c@UQRk9Ijj#?<-p2S3)ilT(O-QU^B&-o9mJdy zlOU39(@Jn|UXe8<{meQ#D}BoWGDIttd8%*#00000R}B$w`ofdC00F@SfS>^Y7G+z( PvBYQl0ssI200dcDq7xL# From 0cab7586655c328392c2c5bee437e29728c8d8f9 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 30 Oct 2016 11:06:14 +0100 Subject: [PATCH 3/7] Add support for man page reading & decoding --- indexer/Cargo.lock | 82 ++++++++ indexer/Cargo.toml | 2 + indexer/src/archive.rs | 92 +++++++-- indexer/src/main.rs | 2 + indexer/src/man.rs | 218 +++++++++++++++++++++- indexer/tests/exit.3.gz.lzma | Bin 0 -> 1970 bytes indexer/tests/{mktar.sh => mkarchives.sh} | 11 +- indexer/tests/rawtest.gz.xz.bzip2 | Bin 0 -> 35 bytes indexer/tests/simpletest.tar.gz | Bin 247 -> 248 bytes indexer/tests/testarchive.tar.xz | Bin 616 -> 620 bytes 10 files changed, 394 insertions(+), 13 deletions(-) create mode 100644 indexer/tests/exit.3.gz.lzma rename indexer/tests/{mktar.sh => mkarchives.sh} (88%) create mode 100644 indexer/tests/rawtest.gz.xz.bzip2 diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index dbd3057..1d0ea13 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -2,12 +2,14 @@ name = "indexer" version = "0.1.0" dependencies = [ + "encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.80 
(registry+https://github.com/rust-lang/crates.io-index)", + "ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -18,6 +20,63 @@ dependencies = [ "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "env_logger" version = "0.3.5" @@ -90,6 +149,15 @@ name = "regex-syntax" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "ring" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "thread-id" version = "2.0.0" @@ -107,6 +175,11 @@ dependencies = [ "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "untrusted" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "utf8-ranges" version = "0.1.3" @@ -124,6 +197,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" +"checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +"checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +"checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +"checksum encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = 
"d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +"checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" @@ -134,8 +214,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" "checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" "checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" +"checksum ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c0743ef007bcff4909b107907a410418eb7e5c6ad55b843d70b39f62bfb7112e" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = 
"8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5d9bc0e6e73a10975d1fbff8ac3541e221181b0d8998351600fb5523de634c0d" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 4fd665f..f97465d 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -10,3 +10,5 @@ env_logger = "0.3.5" lazy_static = "0.2.1" libc = "0.2.17" libarchive3-sys = "0.1.2" +encoding = "0.2.33" +ring = "0.5.3" diff --git a/indexer/src/archive.rs b/indexer/src/archive.rs index 17e85c4..c536dcb 100644 --- a/indexer/src/archive.rs +++ b/indexer/src/archive.rs @@ -26,6 +26,7 @@ pub struct Archive<'a> { rd: &'a mut Read, buf: Vec, err: Option, + eof: bool, } @@ -34,6 +35,8 @@ pub struct ArchiveEntry<'a> { e: *mut ffi::Struct_archive_entry, } +pub struct RawEntry<'a>(Box>); + #[derive(Debug,PartialEq,Eq)] pub enum FileType { @@ -65,7 +68,7 @@ impl<'a> Archive<'a> { let bufsize = 64*1024; let mut buf = Vec::with_capacity(bufsize); unsafe { buf.set_len(bufsize) }; - let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None }); + let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None, eof: false }); let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void; let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) }; @@ -76,10 +79,18 @@ impl<'a> Archive<'a> { } fn error(&mut self) -> Error { - // TODO: Do something with the description - self.err.take().unwrap_or_else(|| - 
Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) }) - ) + self.err.take().unwrap_or_else(|| { + let err = Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) }); + let desc = unsafe { ffi::archive_error_string(self.a) }; + if desc.is_null() { + return err; + } + if let Ok(s) = str::from_utf8(unsafe { CStr::from_ptr(desc) }.to_bytes()) { + Error::new(err.kind(), s) + } else { + err + } + }) } fn entry(self: Box) -> Result>> { @@ -87,6 +98,7 @@ impl<'a> Archive<'a> { a: self, e: ptr::null_mut() }; + ent.a.eof = false; let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) }; match res { ffi::ARCHIVE_EOF => Ok(None), @@ -96,9 +108,15 @@ impl<'a> Archive<'a> { } fn read(&mut self, buf: &mut [u8]) -> Result { + // libarchive tends to throw an error if you try to read after an EOF; handle that case + // here. + if self.eof { + return Ok(0); + } let cbuf = buf.as_mut_ptr() as *mut c_void; let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) }; if n >= 0 { + self.eof = n == 0; Ok(n as usize) } else { Err(self.error()) @@ -114,6 +132,27 @@ impl<'a> Archive<'a> { }; try!(Self::new(rd, a)).entry() } + + pub fn open_raw(rd: &mut Read) -> Result { + let a = unsafe { + let a = ffi::archive_read_new(); + ffi::archive_read_support_filter_all(a); + ffi::archive_read_support_format_raw(a); + ffi::archive_read_support_format_empty(a); + a + }; + let mut a = try!(Self::new(rd, a)); + let mut e: *mut ffi::Struct_archive_entry = ptr::null_mut(); + let res = unsafe { ffi::archive_read_next_header(a.a, &mut e) }; + match res { + ffi::ARCHIVE_FATAL => Err(a.error()), + ffi::ARCHIVE_EOF => { + a.eof = true; + Ok(RawEntry(a)) + }, + _ => Ok(RawEntry(a)) + } + } } @@ -197,6 +236,13 @@ impl<'a> Read for ArchiveEntry<'a> { } +impl<'a> Read for RawEntry<'a> { + fn read(&mut self, buf: &mut [u8]) -> Result { + self.0.read(buf) + } +} + + // We can't provide an Iterator object for ArchiveEntries because Rust doesn't support streaming // iterators. 
Let's instead provide a walk function for convenience. // cb should return Ok(true) to continue, Ok(false) to break @@ -223,22 +269,28 @@ mod tests { use std::fs::File; #[test] - fn invalid_archive() { + fn invalid() { let mut r = std::io::repeat(0x0a).take(64*1024); let ent = Archive::open_archive(&mut r); assert!(ent.is_err()); } #[test] - fn zerolength_archive() { + fn zerolength() { let mut r = std::io::empty(); - let ent = Archive::open_archive(&mut r); - // I expected an error here rather than None, whatever. - assert!(ent.unwrap().is_none()); + { + let ent = Archive::open_archive(&mut r); + assert!(ent.unwrap().is_none()); + } + { + let mut ent = Archive::open_raw(&mut r).unwrap(); + let mut v = Vec::new(); + assert_eq!(ent.read_to_end(&mut v).unwrap(), 0); + } } #[test] - fn read() { + fn archive() { let mut f = File::open("tests/simpletest.tar.gz").unwrap(); let mut ent = Archive::open_archive(&mut f).unwrap().unwrap(); @@ -270,4 +322,22 @@ mod tests { assert!(ent.next().unwrap().is_none()); } + + #[test] + fn raw() { + let mut f = File::open("tests/rawtest.gz.xz.bzip2").unwrap(); + let mut r = Archive::open_raw(&mut f).unwrap(); + let mut c = String::new(); + r.read_to_string(&mut c).unwrap(); + assert_eq!(&c, "File contents!\n"); + } + + #[test] + fn raw_passthrough() { + let mut r = std::io::Cursor::new(&b"This is an uncompressed text file"[..]); + let mut ent = Archive::open_raw(&mut r).unwrap(); + let mut s = String::new(); + ent.read_to_string(&mut s).unwrap(); + assert_eq!(&s, "This is an uncompressed text file"); + } } diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 5661568..bcb5ee8 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -4,6 +4,8 @@ extern crate env_logger; extern crate regex; extern crate libarchive3_sys; extern crate libc; +extern crate ring; +extern crate encoding; mod archive; mod archread; diff --git a/indexer/src/man.rs b/indexer/src/man.rs index b268fe8..56a45b3 100644 --- a/indexer/src/man.rs +++ 
b/indexer/src/man.rs @@ -1,4 +1,19 @@ +use std::str; +use std::io; +use std::io::Read; +use regex::bytes; use regex::Regex; +use encoding; +use encoding::{all,EncodingRef}; +use encoding::label::encoding_from_whatwg_label; +use ring::digest; + +use archive::Archive; + +// Anything larger than this just isn't a man page. I hope. +const MAX_MAN_SIZE: u64 = 20*1024*1024; +// I've also not seen valid man pages smaller than this +const MIN_MAN_SIZE: u64 = 9; // Checks a path for a man page candidate. Returns None if it doesn't seem like a man page @@ -24,9 +39,9 @@ fn parse_path(path: &str) -> Option<(&str, &str, &str)> { // Not everything matching the regex is necessarily a man page, exclude some special cases. match (name, section, locale) { // Files that totally aren't man pages - ("Makefile", "in", _) | ("Makefile", "am", _) | (".cvsignore", _, _) | + (_, "in", _) | (_, "gz", _) | (_, "lzma", _) | (_, "bz2", _) | @@ -51,6 +66,165 @@ fn parse_path(path: &str) -> Option<(&str, &str, &str)> { } +// Convenient wrapper for archread's interest_cb +pub fn ismanpath(path: &str) -> bool { + parse_path(path).is_some() +} + + +fn validate(data: &Vec) -> Option<&'static str> { + lazy_static! { + static ref HTML: bytes::Regex = bytes::Regex::new(r"^\s*<(?:html|head|!DOCTYPE)").unwrap(); + } + + if data.len() >= MAX_MAN_SIZE as usize { + Some("File too large") + } else if data.len() < MIN_MAN_SIZE as usize { + Some("File too small") + } else if &data[..] == &b".so man3/\n"[..] { + Some("Contents: '.so man3/'") + } else if &data[..] == &b"timestamp\n"[..] { + Some("Contents: 'timestamp'") + } else if HTML.is_match(&data) { + Some("Looks like an HTML file") + } else { + None + } +} + + +// Look for 'coding:' indications in the file header, a la preconv(1). +fn codec_from_tag(data: &Vec) -> Option { + lazy_static! { + // According to the emacs docs the tag should be on the first line; according to preconv(1) + // it should be on the first or second line. 
I've also seen some files with the tag on the + // last line. I've not seen the tag itself used in a different context, so just get it from + // anywhere... + static ref TAG: bytes::Regex = bytes::Regex::new(r"-\*-.*coding:\s*(?u:([^\s;]+)).*-\*").unwrap(); + } + let cap = match TAG.captures(&data) { Some(x) => x, None => return None }; + let tag = str::from_utf8(cap.at(1).unwrap()).unwrap().to_lowercase(); + + match &tag[..] { + // Deny some common UTF-8-compatible encodings. These tags are obviously incorrect. + "us-ascii" | "ascii" | "utf8" | "utf-8" | "utf-8-unix" => None, + + // latin-1 isn't in the whatwg spec under that name + "latin-1" => Some(all::WINDOWS_1252), + + // Waaaaaaaaah we can't decode this :( + "armscii-8" => None, + + // Anything else should be found by its whatwg label. + x => match encoding_from_whatwg_label(x) { + Some(x) => Some(x), + None => { warn!("Unknown encoding in emacs tag: {}", x); None }, + } + } +} + + +fn codec_from_path(path: &str) -> Option { + let locale = match parse_path(path) { + Some((_,_,l)) if l != "" => l.to_lowercase(), + _ => return None, + }; + + lazy_static! { + static ref RE: Regex = Regex::new(r"^(?x) + ([a-z]+) # primary language + (?:_ ([a-z]+))? # secondary language + (?:@ [a-z]+)? # script (potentially useful, but uncommon and not currently used) + (?:\. ([^\.@]+))? # encoding (FUCKING USEFUL) + $").unwrap(); + } + + let cap = match RE.captures(&locale) { Some(x) => x, None => return None }; + let lang = cap.at(1).unwrap(); + let seclang = cap.at(2); + let enc = cap.at(3); + + // Try to do something with the encoding tag + match (lang, enc) { + (_, Some("eucjp")) | + (_, Some("ujis")) | // Not sure about this one, but it seems to come out alright + ("ja", Some("euc")) => return Some(all::EUC_JP), + + (_, Some("euckr")) => return Some(all::WINDOWS_949), + + ("ja", Some("jis7")) | + ("ja", Some("pck")) => return None, /* WAT? 
TODO: DO SOMETHING WITH THESE */ + + (_, Some(x)) => match encoding_from_whatwg_label(x) { + Some(x) => return Some(x), + _ => { warn!("Unknown encoding in locale: {}", x) }, + }, + _ => {}, + }; + + // Fall back to language + match (lang, seclang) { + ("pl", _) | + ("cs", _) | + ("hr", _) | + ("hu", _) | + ("sl", _) | + ("sk", _) => Some(all::ISO_8859_2), + ("bg", _) | + ("be", _) | + ("uk", _) => Some(all::ISO_8859_5), + ("el", _) => Some(all::ISO_8859_7), + ("et", _) => Some(all::ISO_8859_15), + ("tr", _) => Some(all::WINDOWS_1254), + ("ru", _) => Some(all::KOI8_R), + ("ja", _) | + ("jp", _) => Some(all::EUC_JP), // Tricky; but JIS is certainly less common + ("zh", Some("cn")) => Some(all::GBK), // These are based purely on what I've observed, + ("zh", _) => Some(all::BIG5_2003), // perhaps some heuristics based on contents can do better + ("ko", _) => Some(all::WINDOWS_949), + (_, _) => None, + } +} + + +// Decompresses / decodes a man page and returns its SHA-1 hash, encoding name, and UTF-8 contents. +pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'static str,String)> { + let mut decomp = try!(Archive::open_raw(ent)).take(MAX_MAN_SIZE+1); + let mut data = Vec::new(); + try!(decomp.read_to_end(&mut data)); + + if let Some(e) = validate(&data) { + return Err(io::Error::new(io::ErrorKind::InvalidData, e)); + } + + let dig = digest::digest(&digest::SHA1, &data); + + // TODO: Handle BOM? UTF-16? + // If it passes as UTF-8, then just consider it UTF-8. + if let Ok(_) = str::from_utf8(&data) { + return Ok((dig, "utf8", unsafe { String::from_utf8_unchecked(data) } )); + } + // Otherwise, look for a coding tag in the contents + if let Some(e) = codec_from_tag(&data) { + if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) { + return Ok((dig, e.name(), s)); + } + } + // If that fails as well, look for clues in the file path. 
+ for path in paths { + if let Some(e) = codec_from_path(path) { + if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) { + return Ok((dig, e.name(), s)); + } + } + } + // If all else fails, use a lossy iso-8859-1 + Ok((dig, "iso-8859-1", (all::ISO_8859_1 as EncodingRef).decode(&data, encoding::DecoderTrap::Ignore).unwrap() )) +} + + + + #[test] fn test_parse_path() { // Generic tests @@ -83,3 +257,45 @@ fn test_parse_path() { assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None); } + + +#[test] +fn test_codec_from_path() { + let t = |p,n| { + assert_eq!(codec_from_path(p).unwrap().name(), n); + }; + t("man/de_DE.ISO8859-15/man1/scribus.1.gz", "iso-8859-15"); + t("man/de_DE.ISO_8859-1/man1/scribus.1.gz", "windows-1252"); + t("man/ja.UTF-8/man1/test.1", "utf-8"); + t("man/ja_JP/man1/test.1", "euc-jp"); + t("man/ja_JP.EUC/man1/test.1", "euc-jp"); + t("man/ja_JP.SJIS/man1/test.1", "windows-31j"); + t("man/jp.eucJP/man1/test.1", "euc-jp"); + t("man/jp/man1/test.1", "euc-jp"); + t("man/lt.ISO8859-13/man1/test.1", "iso-8859-13"); + t("man/ru/man1/test.1", "koi8-r"); + t("man/ru_RU@Cyr/man1/test.1", "koi8-r"); + t("man/zh_CN/man1/test.1", "gbk"); + t("man/zh_TW/man1/test.1", "big5-2003"); +} + + +#[test] +fn test_decode_zh() { + use std::fs::File; + use ring::test::from_hex; + + // cat exit.1.gz | lzma -d | gzip -d | sha1sum + let filehash = from_hex("cdf9b3e8d96a83c908eb0a0c277485e2f3eebe87").unwrap(); + // cat exit.1.gz | lzma -d | gzip -d | iconv -f gbk -t utf8 | sha1sum + let utf8hash = from_hex("47f3e441137b207c0abdc38adac692298da4927a").unwrap(); + + let mut f = File::open("tests/exit.3.gz.lzma").unwrap(); + let (dig, enc, s) = decode(&["bullshit", "/usr/share/man/zh_CN/man3/exit.3.gz"][..], &mut f).unwrap(); + + assert_eq!(dig.as_ref(), &filehash[..]); + assert_eq!(enc, "gbk"); + + let utf8dig = digest::digest(&digest::SHA1, s.as_bytes()); + 
assert_eq!(utf8dig.as_ref(), &utf8hash[..]); +} diff --git a/indexer/tests/exit.3.gz.lzma b/indexer/tests/exit.3.gz.lzma new file mode 100644 index 0000000000000000000000000000000000000000..7c84e6aa5a44c01b7106a5a0d2730216a95a1e98 GIT binary patch literal 1970 zcmV;j2Tk~0004jh|NsC0|NsC001u<4_yZ||V@;Sg2wDj?>szYj(xOW}Vch82$Y~u$ zjYL;`1p&cVr{I`sKrcxOZ%A)1C1F@zIrpYVZX)rs0J#2mAj^4j>Sl>0>S2i6A!c9vjDu2zF~F&n$$+QVGp_xowyZj%S)Ay z-HUhdC8A>iQ7tgS`y!zQA{0EKa{cSx4xvCt#CKZ1@x9er1-nAHog6EX4|Riu9h(YI z+h${fYkX2cBpu!4K|R-u4nI3^xMC27%7L_yL+5yflLhnB`a>AhpeRKVd6ecWzDq|f zrAtb(S})kdtx0ZNJuk~76YNs3VtOkowSb2ybckcyV6hjyA`fue198k@XNbYUW=v!?2^htr zk>epthPRGfzJ4=rBrrU`SZ@{(!C+Uz#GsBIIN)ka??0Ud!KD;9IVDM}rjVhHzpFgD z#n=kFB(KZszKgn-al(QAB5_o!K^)`C5D86qaRZ&wFMXGgYSUSYsV}4c0BSE^fILR~ z9HY9cXx#I_Cj)7$ZSB$Cy7(ITWiEG@Xu61WFQhyGKnIQ^L3^jXT!foRhD{Crg+;>p z0#FFwpnQFEtbN0Qtzv{f-1krHSn3a5u^sW+24`1n8nUDWI11(!6I1pr9H4NhoaB=p zRV|>i-QCitHX`KDl+2MMTP>9ruIU1Awi#l=y^sX{|96CHq#W69|4J(hx`Qws3@C_o z%%)Kc7O8~JmS~}5c1;`=@LkQ4zst(0G^~ZaDGUO-%Z|pxEi~d`10hwHfFO zp{h8Uc(eL9DYjO_%Jr~PLsbPvUVz8UlQ2j7HUmfqjqNL00gj54g<_eg zV2=$f9D~#Or+6Zc^yCK5I}Q+wm`K`qd4PeRrm<8!07AM5^IIlXAm+zw2VT{IOM~f6 zB@>pH!|`1PPN$9?=e}zrg^+ji^|Rm4i$xosY!G(=D0C=!cv*~RXPD=n;M|n2KQ|2u zs$owGj~yfJ(W;@vkOd);akANrTrOa!;2YZhbK)hc4J*nSIgOSEz^h-TuHs)hKHXNY zn}e>HQRbFY<4Ci+Qi_iF5jdI_t+8k7bk$EzVBg+Bt#Wy!3f%N1imR*5`vEoJgd9RZ zNQ;W!aYPa6JCS3=+)9@WaZ|rj$U?CKKggVv5x^24a6hf$8He|?n?F|-57KtomSaFt z{aDCtdMlK!HstCR7IRcWEi_z~YTb&}y~;69)9^yJG2ZMO?UUAOs%b54w08TkU z%K_d)gnj19%g|j=L@MYY+!WAAD6D4aA%vv;_D^9E8VnP1+yEFq($RKSi|~2zk0sZM zS<>z7h(SL)&XYIydMtkVje{u_LOO9P`F+7D`qf=KeUGsmK&tiD$0e{djfPy2R5&3n z99C3_--m4jWj>DgsLrW{&6ik zE==JYrU-;&5G2cm*w53cLZ1gV1%Oc!Y8y)BIfzd%02hord?xu(itB7RZLm~P^`3?; z@PLXF#;-t>R2`C;io!!~ZcRk9l2XAl+^d3z7xmn_cIg1PuZsWN#tkt z0{t*67=dgY6c;t|z10LOhM<)pzo5n^Y_P#7lM$=ttni}nu*Uvz1s#~zQV_nmbEwY= z3?Rqqj%8V~?}b@N=jD3Qseo{X9oW} z7q{_Mz*b)l#jnFa_t2W1#4e*CV}Ep^L^{cs^k5(MV5RDfm}C^JSjaAsWS~XuBM$q~ zwM!q(j!xF_$yi7aJ6n>B&bfX%&P}kkQgE2wo0z9RlZMT_fy)lM&=TDL|Mmo+ 
E0EY0t$p8QV literal 0 HcmV?d00001 diff --git a/indexer/tests/mktar.sh b/indexer/tests/mkarchives.sh similarity index 88% rename from indexer/tests/mktar.sh rename to indexer/tests/mkarchives.sh index 9f8844d..169f2bd 100755 --- a/indexer/tests/mktar.sh +++ b/indexer/tests/mkarchives.sh @@ -1,10 +1,12 @@ #!/bin/sh # The order of inserting the files into the tar is not fully deterministic this -# way. The tests will fail quite badly if hardlink.6 is considered the +# way. The tests will fail quite badly if a hardlink is considered the # "original" version. +# simpletest.tar.gz + mkdir simple echo Hi >simple/file ln -s file simple/link @@ -17,6 +19,13 @@ rm -rf $badfn simple +# rawtest.gz.xz.bzip2 + +echo "File contents!" | gzip | xz | bzip2 >rawtest.gz.xz.bzip2 + + +# testarchive.tar.xz + mkdir man cd man diff --git a/indexer/tests/rawtest.gz.xz.bzip2 b/indexer/tests/rawtest.gz.xz.bzip2 new file mode 100644 index 0000000000000000000000000000000000000000..bc4f2e85dcc775acb097d4476898ccc2d20ceaad GIT binary patch literal 35 rcmb2|=3tnUCKAEGTzvYBXQ;=y^V*(gwKc<@FfpiITUE`^z`y_i*Q5)w literal 0 HcmV?d00001 diff --git a/indexer/tests/simpletest.tar.gz b/indexer/tests/simpletest.tar.gz index 06a535cdc1f5a295ca3254f00ae7728166efebff..409f5ca796b57bff6cbc29ea7b3932e71e5bd26a 100644 GIT binary patch literal 248 zcmVhs{7`rQLw4A) z)p*Q*?Rd6__#l603IBfp000000000000000Q{4gjPIt-xC;$ND_;|wr literal 247 zcmVkvZ6|;hx|Jj1u2vU@$3^k}Fw}GD1G!UW0 z;3nbuUgp7pd-2R%EwjTZ#2ZFQ)V#ILD!mo`^@T|-gz>^iS?`n5Yr(iB?W%Jwt8KwK zyXHk2hxhx&p8u(`%|FRPOje@_*ctO7e?7=wS*IBnWF7S^JpU+L&D{dbsJs6mJMPJ9 zJmip#|K2FHLeZ@&mR%3ep%6i4P xBkQPV;rU0m5bo>hQW4hpBL8MP{{H{~00000000000000cdH@H84l)2J00691c?tjk diff --git a/indexer/tests/testarchive.tar.xz b/indexer/tests/testarchive.tar.xz index 55b69f76ca05b8ac257a26ca0598ac6560c880ef..9892fae93b1a3e7d6a3ed69b5dfeeb8f67143cb5 100644 GIT binary patch delta 581 zcmV-L0=oU^1ndNm904nl9U6b%A<(^aTIQTy)Rvi5@`fweX8+Kxd_<}wbZns^xI_gC 
zPscT6q{l6`<^HZIMuQ_HozpU3V}a4|;w{!!esY3HGl_!dJ9(p?2_!}Gi@=jR8}ZK2 zg!G+JbcTu#*UF0kE_fwvyJ1Z}i^3e8MqwdP!;-@VHW-GUyykBbR0)6mrfYUC*9wH8 zBwxBO(59yLgRrguJkSPudU|T!uaHHvlGO}6@`t~(n%--3QwNv7#*#nJltHkq# z@h6%%ls*r&7m>G1b%;w_X$IO%tT6DQMW{jak%};Zg{XY>sxZ=`SQ&p_Gw=S2Ac`8? zA&^al_HD^xA)9?w za+YxXGfG@nJo$CUP$e{~sl-TkpUW|ha(r4&W6>e2A})>FzO2Y~J^D@kXg_Kbxki;< z&F#Y^fL&VeD0>>;EAUg|TqA)lfhw z!F5PVjM6*1n%OxuwR=jC-l?Wb#c|r7&qgL{O_|ul6X*Z{i^Y T3x)z0vBYQl0ssI200dcDsB;>Y delta 577 zcmV-H0>1t11n2~i904ei9U6bcUXsFzs4O9PO_a20_F!?Bihx;Fu@F9s3ri%53V?Y2 zoess^YJ1$v)lBFR-xDhG*hv3dQ@qt-)_kn250?xJkOHG_G=jU&%u0(RFN z{U}9mYEXk1*SbX}TDOl_b*YnIj5DPQZ%@c@+;2jClq>t{RCGS6zJ)S>k56E`WEYNA zP0`sjs;0i_7OiV+v!H)0zoG?K(0z%91Ym2f97zjQ^Rdym)@TaSo{xA2wKvDuo2*N< z%e@vlTQaDqV$1U&++S!nmvGro(Y`p`KpEX@ds;ts^5d3QMx~Z*_?E9{AAl4P8*k$38pzr|GAPif83mu$V>i?_@Dv9OJ-UB_AY;4^fC$+y`>9^R z49|VtEnKHoi*E368ac59T6P-`3R}ec<=nR6I$)(SwRbj?rixk?&Q8-9-RF4eNmSg2 z663^ss$*tXHNAo@8yOyBkASE&QX!_g_eZL5kz%?d$|7bzfA_}WcIHM15qHC3n0?ng z9cxHui7)&*6puBt^Ta8g#l`S(4(D`T5ol>m!Z!c_H06BdkU$A+00G1VfS>^YrSz7O PvBYQl0ssI200dcD>**BG From aff68205b0f2a6fabcd4e77ddbc72eb19fcf6cdc Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 5 Nov 2016 10:22:31 +0100 Subject: [PATCH 4/7] Add postgres package indexing + cli options --- indexer/Cargo.lock | 207 +++++++++++++++++++++++++++++++++++++------- indexer/Cargo.toml | 4 +- indexer/src/main.rs | 58 ++++++++++++- indexer/src/man.rs | 6 +- indexer/src/pkg.rs | 139 +++++++++++++++++++++++++++++ 5 files changed, 376 insertions(+), 38 deletions(-) create mode 100644 indexer/src/pkg.rs diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index 1d0ea13..de07942 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -2,12 +2,14 @@ name = "indexer" version = "0.1.0" dependencies = [ - "encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)", "env_logger 
0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", "ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -21,61 +23,102 @@ dependencies = [ ] [[package]] -name = "encoding" -version = "0.2.33" +name = "ansi_term" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "bitflags" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "bufstream" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "byteorder" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "clap" +version = "2.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", - "encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", - "encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", - "encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", - "encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + 
"strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "term_size 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-segmentation 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding" +version = "0.3.0-dev" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" +dependencies = [ + "encoding-index-japanese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)", + "encoding-index-korean 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)", + "encoding-index-simpchinese 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)", + "encoding-index-singlebyte 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)", + "encoding-index-tradchinese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)", + "encoding-types 0.2.0 (git+https://github.com/lifthrasiir/rust-encoding)", ] [[package]] name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" +version = "1.20141219.6" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" dependencies = [ - "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)", ] [[package]] name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" +version = "1.20141219.6" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" dependencies = [ - "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_index_tests 0.1.5 
(git+https://github.com/lifthrasiir/rust-encoding)", ] [[package]] name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" +version = "1.20160120.0" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" dependencies = [ - "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)", ] [[package]] name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" +version = "1.20160120.0" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" dependencies = [ - "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)", ] [[package]] name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" +version = "1.20141219.6" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" dependencies = [ - "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)", ] +[[package]] +name = "encoding-types" +version = "0.2.0" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" + [[package]] name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" +version = "0.1.5" +source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c" [[package]] name = "env_logger" @@ -86,6 +129,16 @@ dependencies = [ "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name 
= "fallible-iterator" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "hex" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "kernel32-sys" version = "0.2.2" @@ -119,6 +172,11 @@ name = "log" version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "md5" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "memchr" version = "0.1.11" @@ -127,11 +185,48 @@ dependencies = [ "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "phf" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "phf_shared 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "phf_shared" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "pkg-config" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "postgres" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "fallible-iterator 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "phf 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)", + "postgres-protocol 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "postgres-protocol" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fallible-iterator 0.1.3 
(registry+https://github.com/rust-lang/crates.io-index)", + "hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "regex" version = "0.1.80" @@ -158,6 +253,21 @@ dependencies = [ "untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "strsim" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "term_size" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "thread-id" version = "2.0.0" @@ -175,6 +285,16 @@ dependencies = [ "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "unicode-segmentation" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-width" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "untrusted" version = "0.3.1" @@ -185,6 +305,11 @@ name = "utf8-ranges" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "vec_map" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.2.8" @@ -197,27 +322,45 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" -"checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = 
"6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -"checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -"checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -"checksum encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -"checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" -"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" +"checksum ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "23ac7c30002a5accbf7e8987d0632fa6de155b7c3d39d0067317a391e00a2ef6" +"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d" +"checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074" +"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" +"checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430" +"checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "" +"checksum encoding-index-japanese 
1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "" +"checksum encoding-index-korean 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "" +"checksum encoding-index-simpchinese 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)" = "" +"checksum encoding-index-singlebyte 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)" = "" +"checksum encoding-index-tradchinese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "" +"checksum encoding-types 0.2.0 (git+https://github.com/lifthrasiir/rust-encoding)" = "" +"checksum encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)" = "" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" +"checksum fallible-iterator 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "5d48ab1bc11a086628e8cc0cc2c2dc200b884ac05c4b48fb71d6036b6999ff1d" +"checksum hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d6a22814455d41612f41161581c2883c0c6a1c41852729b17d5ed88f01e153aa" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" "checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7" "checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8" "checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" +"checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" +"checksum phf 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)" = "17896951e179a6cbed7d3519b3078ac6c03a347d3e9cf8f303c8a1a73c5a3e44" +"checksum phf_shared 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6c14aac1140c2b06b41477096f249416b17c893d56386a892ac657edfdffba" "checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" +"checksum postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7ef92468927003a037e175b54320319e358886865899b37f7318837a646a9fd" +"checksum postgres-protocol 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7e2fc3d800dacc2dd749b690ad15b9b78bc04c26c3f0525cbe163436559bc3fc" "checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" "checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" "checksum ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c0743ef007bcff4909b107907a410418eb7e5c6ad55b843d70b39f62bfb7112e" +"checksum strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "50c069df92e4b01425a8bf3576d5d417943a6a7272fbabaf5bd80b1aaa76442e" +"checksum term_size 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3f7f5f3f71b0040cecc71af239414c23fd3c73570f5ff54cf50e03cef637f2a0" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = 
"8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum unicode-segmentation 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b905d0fc2a1f0befd86b0e72e31d1787944efef9d38b9358a9e92a69757f7e3b" +"checksum unicode-width 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2d6722facc10989f63ee0e20a83cd4e1714a9ae11529403ac7e0afd069abc39e" "checksum untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5d9bc0e6e73a10975d1fbff8ac3541e221181b0d8998351600fb5523de634c0d" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" +"checksum vec_map 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cac5efe5cb0fa14ec2f84f83c701c562ee63f6dcc680861b21d65c682adfb05f" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index f97465d..5c828ba 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -10,5 +10,7 @@ env_logger = "0.3.5" lazy_static = "0.2.1" libc = "0.2.17" libarchive3-sys = "0.1.2" -encoding = "0.2.33" +encoding = { git = "https://github.com/lifthrasiir/rust-encoding", features = ["no-optimized-legacy-encoding"] } ring = "0.5.3" +postgres = "0.12.0" +clap = "2.16.3" diff --git a/indexer/src/main.rs b/indexer/src/main.rs index bcb5ee8..5649528 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -1,17 +1,71 @@ #[macro_use] extern crate log; #[macro_use] extern crate lazy_static; +#[macro_use] extern crate clap; extern crate env_logger; extern crate regex; extern crate libarchive3_sys; extern crate libc; extern crate ring; extern crate encoding; +extern crate postgres; mod 
archive; mod archread; mod man; +mod pkg; + + +// Convenience function to get a system id by short-name. Panics if the system doesn't exist. +fn sysbyshort(conn: &postgres::GenericConnection, short: &str) -> i32 { + let r = conn.query("SELECT id FROM systems WHERE short = $1", &[&short]).unwrap(); + if r.is_empty() { + panic!("Invalid system: {}", short); + } + r.get(0).get(0) +} + fn main() { - env_logger::init().unwrap(); - info!("Hello, world!"); + let arg = clap_app!(indexer => + (about: "Manned.org man page indexer") + (@arg v: -v +multiple "Increase verbosity") + (@arg host: -h +required +takes_value "PostgreSQL connection string") + (@subcommand pkg => + (about: "Index a single package") + (@arg sys: --sys +required +takes_value "System short-name") + (@arg cat: --cat +required +takes_value "Package category") + (@arg pkg: --pkg +required +takes_value "Package name") + (@arg ver: --ver +required +takes_value "Package version") + (@arg date: --date +required +takes_value "Package release date") + (@arg FILE: +required "Package file") + ) + ).get_matches(); + + let verbose = arg.occurrences_of("v"); + env_logger::LogBuilder::new() + .filter(Some("indexer"), match verbose { + 0 => log::LogLevelFilter::Warn, + 1 => log::LogLevelFilter::Info, + 2 => log::LogLevelFilter::Debug, + _ => log::LogLevelFilter::Trace, + }) + .filter(Some("postgres"), if verbose >= 4 { log::LogLevelFilter::Trace } else { log::LogLevelFilter::Info }) + .init().unwrap(); + + let db = match postgres::Connection::connect(arg.value_of("host").unwrap(), postgres::TlsMode::None) { + Ok(x) => x, + Err(x) => { error!("Can't connect to postgres: {}", x); return }, + }; + debug!("Connected to database"); + + if let Some(matches) = arg.subcommand_matches("pkg") { + pkg::pkg(&db, pkg::PkgOpt { + sys: sysbyshort(&db, matches.value_of("sys").unwrap()), + cat: matches.value_of("cat").unwrap(), + pkg: matches.value_of("pkg").unwrap(), + ver: matches.value_of("ver").unwrap(), + date: 
matches.value_of("date").unwrap(), + file: matches.value_of("FILE").unwrap() + }); + } } diff --git a/indexer/src/man.rs b/indexer/src/man.rs index 56a45b3..9bcb2bf 100644 --- a/indexer/src/man.rs +++ b/indexer/src/man.rs @@ -18,7 +18,7 @@ const MIN_MAN_SIZE: u64 = 9; // Checks a path for a man page candidate. Returns None if it doesn't seem like a man page // location, otherwise Some((manPageName, Section, Locale)). -fn parse_path(path: &str) -> Option<(&str, &str, &str)> { +pub fn parse_path(path: &str) -> Option<(&str, &str, &str)> { // Roughly: man[/locale]/man1/manpage.section[.compression]+ lazy_static! { static ref RE: Regex = Regex::new(r"(?x) @@ -112,8 +112,8 @@ fn codec_from_tag(data: &Vec) -> Option { // latin-1 isn't in the whatwg spec under that name "latin-1" => Some(all::WINDOWS_1252), - // Waaaaaaaaah we can't decode this :( - "armscii-8" => None, + // armscii isn't in the whatwg spec at all + "armscii-8" => Some(all::ARMSCII_8), // Anything else should be found by its whatwg label. x => match encoding_from_whatwg_label(x) { diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs new file mode 100644 index 0000000..a2c0c73 --- /dev/null +++ b/indexer/src/pkg.rs @@ -0,0 +1,139 @@ +use std; +use std::io::Read; +use postgres; + +use archive; +use archread; +use man; +use archive::Archive; + +pub struct PkgOpt<'a> { + pub sys: i32, + pub cat: &'a str, + pub pkg: &'a str, + pub ver: &'a str, + pub date: &'a str, + pub file: &'a str +} + + +fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option { + // The ON CONFLICT .. DO UPDATE is used instead of DO NOTHING because in that case the + // RETURNING clause wouldn't give us a package id. 
+ let q = "INSERT INTO packages (system, category, name) VALUES($1, $2, $3) + ON CONFLICT ON CONSTRAINT packages_system_name_category_key DO UPDATE SET name=$3 RETURNING id"; + let pkgid: i32 = match tr.query(q, &[&opt.sys, &opt.cat, &opt.pkg]) { + Err(e) => { + error!("Can't insert package in database: {}", e); + return None; + }, + Ok(r) => r.get(0).get(0), + }; + + // TODO: option to overwrite an existing package version + let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id"; + let verid: i32 = match tr.query(q, &[&pkgid, &opt.ver, &opt.date]) { + Err(e) => { + error!("Can't insert package version in database: {}", e); + return None; + }, + Ok(r) => r.get(0).get(0), + }; + trace!("Package pkgid {} verid {}", pkgid, verid); + Some(verid) +} + + +fn insert_man_row(tr: &postgres::GenericConnection, verid: i32, path: &str, hash: &[u8]) { + // TODO: Store 'encoding' in the database + let (name, sect, locale) = man::parse_path(path).unwrap(); + if let Err(e) = tr.execute( + "INSERT INTO man (package, name, filename, locale, hash, section) VALUES ($1, $2, '/'||$3, $4, $5, $6)", + &[&verid, &name, &path, &locale, &hash, §] + ) { + // I think this can only happen if archread gives us the same file twice, which really + // shouldn't happen. But I'd rather continue with an error logged than panic. + error!("Can't insert verid {} fn {}: {}", verid, path, e); + } +} + + +fn insert_man(tr: &postgres::GenericConnection, verid: i32, paths: &[&str], ent: &mut Read) { + let (dig, enc, cont) = match man::decode(paths, ent) { + Err(e) => { error!("Error decoding {}: {}", paths[0], e); return }, + Ok(x) => x, + }; + + // TODO: Overwrite entry if the contents are different? It's possible that earlier decoding + // implementations didn't properly detect the encoding. (On the other hand, due to differences + // in filenames it's also possible that THIS decoding step went wrong. 
Ugh) + tr.execute( + "INSERT INTO contents (hash, content) VALUES($1, $2) ON CONFLICT (hash) DO NOTHING", + &[&dig.as_ref(), &cont] + ).unwrap(); + + for path in paths { + insert_man_row(tr, verid, path, dig.as_ref()); + debug!("Inserted man page: {} ({})", path, enc); + } +} + + +fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &str) { + let hash = tr.query("SELECT hash FROM man WHERE package = $1 AND filename = '/'||$2", &[&verid, &dest]).unwrap(); + if hash.is_empty() { /* Can happen if man::decode() failed previously. */ + error!("Link to unindexed man page: {} -> {}", src, dest); + return; + } + let hash: Vec = hash.get(0).get(0); + insert_man_row(tr, verid, src, &hash); + debug!("Inserted man link: {} -> {}", src, dest); +} + + +fn with_pkg(file: &str, cb: F) -> std::io::Result + where F: FnOnce(Option) -> std::io::Result +{ + // TODO: Support streaming from URLs + // TODO: How does .deb support fit into this? (Or anything else with metadata) + let mut f = try!(std::fs::File::open(file)); + let ent = try!(Archive::open_archive(&mut f)); + cb(ent) +} + + +fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> { + let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| { + insert_man(tr, verid, paths, ent); + Ok(()) /* Don't propagate errors, continue handling other man pages */ + }; + + let missed = try!( + with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) }) + ).links(|src, dest| { insert_link(tr, verid, src, dest) }); + + if let Some(missed) = missed { + warn!("Some links were missed, reading package again"); + try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) })) + } + Ok(()) +} + + +pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) { + info!("Handling pkg: {} / {} / {} - {} @ {} in {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file); + + let tr = conn.transaction().unwrap(); + tr.set_rollback(); + + let verid = 
match insert_pkg(&tr, &opt) { Some(x) => x, None => return }; + + match index_pkg(&tr, &opt, verid) { + Err(e) => error!("Error reading package: {}", e), + Ok(_) => tr.set_commit() + } + + if let Err(e) = tr.finish() { + error!("Error finishing transaction: {}", e); + } +} From 35fab522d6b36c4a151d51a0e79e2650b20d29d6 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 6 Nov 2016 09:21:53 +0100 Subject: [PATCH 5/7] Indexer: Support HTTP fetching + misc improvements --- indexer/Cargo.lock | 289 ++++++++++++++++++++++++++++++++++++++++++++ indexer/Cargo.toml | 1 + indexer/src/main.rs | 10 +- indexer/src/pkg.rs | 69 +++++++---- 4 files changed, 345 insertions(+), 24 deletions(-) diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index de07942..501d8d3 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -5,6 +5,7 @@ dependencies = [ "clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)", "encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "hyper 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", @@ -57,6 +58,17 @@ dependencies = [ "vec_map 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "cookie" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", + "url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "encoding" version = "0.3.0-dev" @@ -134,11 +146,70 @@ 
name = "fallible-iterator" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "gcc" +version = "0.3.38" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "gdi32-sys" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "hex" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "hpack" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "httparse" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "hyper" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", + "httparse 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)", + "openssl-verify 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)", + "solicit 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", + "traitobject 0.0.1 
(registry+https://github.com/rust-lang/crates.io-index)", + "typeable 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "unicase 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "idna" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-bidi 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "kernel32-sys" version = "0.2.2" @@ -148,6 +219,11 @@ dependencies = [ "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "language-tags" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "lazy_static" version = "0.2.1" @@ -167,11 +243,24 @@ name = "libc" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "libressl-pnacl-sys" +version = "2.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "pnacl-build-helper 1.4.10 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "log" version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "matches" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "md5" version = "0.2.1" @@ -185,6 +274,65 @@ dependencies = [ "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "mime" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num_cpus" +version = 
"1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "openssl" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "gcc 0.3.38 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + "openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)", + "openssl-sys-extras 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "openssl-sys" +version = "0.7.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "gdi32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + "libressl-pnacl-sys 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", + "user32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "openssl-sys-extras" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "gcc 0.3.38 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + "openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "openssl-verify" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "phf" version = "0.7.15" @@ -203,6 +351,14 @@ name = "pkg-config" 
version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "pnacl-build-helper" +version = "1.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "postgres" version = "0.12.0" @@ -227,6 +383,14 @@ dependencies = [ "md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rand" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "regex" version = "0.1.80" @@ -253,11 +417,46 @@ dependencies = [ "untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rustc-serialize" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "rustc_version" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.1.20 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "solicit" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "hpack 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "strsim" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "tempdir" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "term_size" version = "0.2.1" @@ -285,6 +484,47 @@ dependencies = [ 
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "time" +version = "0.1.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "traitobject" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "typeable" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicase" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "unicode-bidi" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "unicode-normalization" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "unicode-segmentation" version = "0.1.2" @@ -300,6 +540,24 @@ name = "untrusted" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "url" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "idna 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "user32-sys" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-build 0.1.1 
(registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "utf8-ranges" version = "0.1.3" @@ -327,6 +585,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074" "checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" "checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430" +"checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626" "checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "" "checksum encoding-index-japanese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "" "checksum encoding-index-korean 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "" @@ -337,29 +596,59 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)" = "" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum fallible-iterator 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "5d48ab1bc11a086628e8cc0cc2c2dc200b884ac05c4b48fb71d6036b6999ff1d" +"checksum gcc 0.3.38 (registry+https://github.com/rust-lang/crates.io-index)" = "553f11439bdefe755bf366b264820f1da70f3aaf3924e594b886beb9c831bcf5" +"checksum gdi32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0912515a8ff24ba900422ecda800b52f4016a56251922d397c576bf92c690518" "checksum hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"d6a22814455d41612f41161581c2883c0c6a1c41852729b17d5ed88f01e153aa" +"checksum hpack 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3d2da7d3a34cf6406d9d700111b8eafafe9a251de41ae71d8052748259343b58" +"checksum httparse 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "46534074dbb80b070d60a5cb8ecadd8963a00a438ae1a95268850a7ef73b67ae" +"checksum hyper 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "edd47c66782933e546a32ae89ca3c49263b2ba9bc29f3a0d5c52fff48e0ac67c" +"checksum idna 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1053236e00ce4f668aeca4a769a09b3bf5a682d802abd6f3cb39374f6b162c11" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +"checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a" "checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" "checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7" "checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8" +"checksum libressl-pnacl-sys 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "cbc058951ab6a3ef35ca16462d7642c4867e6403520811f28537a4e2f2db3e71" "checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" +"checksum matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "efd7622e3022e1a6eaa602c4cea8912254e5582c9c692e9167714182244801b1" "checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" +"checksum mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5c93a4bd787ddc6e7833c519b73a50883deb5863d76d9b71eb8216fb7f94e66" +"checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad" +"checksum openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "c4117b6244aac42ed0150a6019b4d953d28247c5dd6ae6f46ae469b5f2318733" +"checksum openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)" = "89c47ee94c352eea9ddaf8e364be7f978a3bb6d66d73176572484238dd5a5c3f" +"checksum openssl-sys-extras 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "11c5e1dba7d3d03d80f045bf0d60111dc69213b67651e7c889527a3badabb9fa" +"checksum openssl-verify 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3ed86cce894f6b0ed4572e21eb34026f1dc8869cb9ee3869029131bc8c3feb2d" "checksum phf 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)" = "17896951e179a6cbed7d3519b3078ac6c03a347d3e9cf8f303c8a1a73c5a3e44" "checksum phf_shared 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6c14aac1140c2b06b41477096f249416b17c893d56386a892ac657edfdffba" "checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" +"checksum pnacl-build-helper 1.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "61c9231d31aea845007443d62fcbb58bb6949ab9c18081ee1e09920e0cf1118b" "checksum postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7ef92468927003a037e175b54320319e358886865899b37f7318837a646a9fd" "checksum postgres-protocol 0.1.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "7e2fc3d800dacc2dd749b690ad15b9b78bc04c26c3f0525cbe163436559bc3fc" +"checksum rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "2791d88c6defac799c3f20d74f094ca33b9332612d9aef9078519c82e4fe04a5" "checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" "checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" "checksum ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c0743ef007bcff4909b107907a410418eb7e5c6ad55b843d70b39f62bfb7112e" +"checksum rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)" = "6159e4e6e559c81bd706afe9c8fd68f547d3e851ce12e76b1de7914bab61691b" +"checksum rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" +"checksum semver 0.1.20 (registry+https://github.com/rust-lang/crates.io-index)" = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" +"checksum solicit 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "172382bac9424588d7840732b250faeeef88942e37b6e35317dce98cafdd75b2" "checksum strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "50c069df92e4b01425a8bf3576d5d417943a6a7272fbabaf5bd80b1aaa76442e" +"checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6" "checksum term_size 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3f7f5f3f71b0040cecc71af239414c23fd3c73570f5ff54cf50e03cef637f2a0" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread_local 0.2.7 
(registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)" = "3c7ec6d62a20df54e07ab3b78b9a3932972f4b7981de295563686849eb3989af" +"checksum traitobject 0.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "07eaeb7689bb7fca7ce15628319635758eda769fed481ecfe6686ddef2600616" +"checksum typeable 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1410f6f91f21d1612654e7cc69193b0334f909dcf2c790c4826254fbb86f8887" +"checksum unicase 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "13a5906ca2b98c799f4b1ab4557b76367ebd6ae5ef14930ec841c74aed5f3764" +"checksum unicode-bidi 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c1f7ceb96afdfeedee42bade65a0d585a6a0106f681b6749c8ff4daa8df30b3f" +"checksum unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "26643a2f83bac55f1976fb716c10234485f9202dcd65cfbdf9da49867b271172" "checksum unicode-segmentation 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b905d0fc2a1f0befd86b0e72e31d1787944efef9d38b9358a9e92a69757f7e3b" "checksum unicode-width 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2d6722facc10989f63ee0e20a83cd4e1714a9ae11529403ac7e0afd069abc39e" "checksum untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5d9bc0e6e73a10975d1fbff8ac3541e221181b0d8998351600fb5523de634c0d" +"checksum url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "48ccf7bd87a81b769cf84ad556e034541fb90e1cd6d4bc375c822ed9500cd9d7" +"checksum user32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ef4711d107b21b410a3a974b1204d9accc8b10dad75d8324b5d755de1617d47" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" "checksum vec_map 
0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cac5efe5cb0fa14ec2f84f83c701c562ee63f6dcc680861b21d65c682adfb05f" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 5c828ba..d8ce4a0 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -14,3 +14,4 @@ encoding = { git = "https://github.com/lifthrasiir/rust-encoding", features = [" ring = "0.5.3" postgres = "0.12.0" clap = "2.16.3" +hyper = "0.9.11" diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 5649528..49c7079 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -8,6 +8,7 @@ extern crate libc; extern crate ring; extern crate encoding; extern crate postgres; +extern crate hyper; mod archive; mod archread; @@ -29,9 +30,9 @@ fn main() { let arg = clap_app!(indexer => (about: "Manned.org man page indexer") (@arg v: -v +multiple "Increase verbosity") - (@arg host: -h +required +takes_value "PostgreSQL connection string") (@subcommand pkg => (about: "Index a single package") + (@arg force: --force "Overwrite existing indexed package") (@arg sys: --sys +required +takes_value "System short-name") (@arg cat: --cat +required +takes_value "Package category") (@arg pkg: --pkg +required +takes_value "Package name") @@ -52,7 +53,11 @@ fn main() { .filter(Some("postgres"), if verbose >= 4 { log::LogLevelFilter::Trace } else { log::LogLevelFilter::Info }) .init().unwrap(); - let db = match postgres::Connection::connect(arg.value_of("host").unwrap(), postgres::TlsMode::None) { + let dbhost = match std::env::var("MANNED_PG") { + Ok(x) => x, + Err(_) => { error!("MANNED_PG not set."); return } + }; + let db = match postgres::Connection::connect(&dbhost[..], postgres::TlsMode::None) { Ok(x) => x, Err(x) => { error!("Can't connect to postgres: {}", x); return }, }; @@ -60,6 +65,7 @@ fn main() { if let Some(matches) = 
arg.subcommand_matches("pkg") { pkg::pkg(&db, pkg::PkgOpt { + force: matches.is_present("force"), sys: sysbyshort(&db, matches.value_of("sys").unwrap()), cat: matches.value_of("cat").unwrap(), pkg: matches.value_of("pkg").unwrap(), diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs index a2c0c73..4120988 100644 --- a/indexer/src/pkg.rs +++ b/indexer/src/pkg.rs @@ -1,6 +1,7 @@ use std; use std::io::Read; use postgres; +use hyper; use archive; use archread; @@ -8,11 +9,12 @@ use man; use archive::Archive; pub struct PkgOpt<'a> { + pub force: bool, pub sys: i32, pub cat: &'a str, pub pkg: &'a str, pub ver: &'a str, - pub date: &'a str, + pub date: &'a str, // TODO: Option to extract date from package metadata itself pub file: &'a str } @@ -30,17 +32,26 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option r.get(0).get(0), }; - // TODO: option to overwrite an existing package version - let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id"; - let verid: i32 = match tr.query(q, &[&pkgid, &opt.ver, &opt.date]) { - Err(e) => { - error!("Can't insert package version in database: {}", e); - return None; - }, - Ok(r) => r.get(0).get(0), - }; - trace!("Package pkgid {} verid {}", pkgid, verid); - Some(verid) + let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2 AND released = $3::text::date"; + let res = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap(); + + let verid : i32; + if res.is_empty() { + let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id"; + verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0); + trace!("New package pkgid {} verid {}", pkgid, verid); + Some(verid) + + } else if opt.force { + verid = res.get(0).get(0); + trace!("Overwriting package pkgid {} verid {}", pkgid, verid); + tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap(); + Some(verid) + + } 
else { + info!("Package already in database, pkgid {} verid {}", pkgid, res.get(0).get::(0)); + None + } } @@ -50,7 +61,7 @@ fn insert_man_row(tr: &postgres::GenericConnection, verid: i32, path: &str, hash if let Err(e) = tr.execute( "INSERT INTO man (package, name, filename, locale, hash, section) VALUES ($1, $2, '/'||$3, $4, $5, $6)", &[&verid, &name, &path, &locale, &hash, §] - ) { + ) { // I think this can only happen if archread gives us the same file twice, which really // shouldn't happen. But I'd rather continue with an error logged than panic. error!("Can't insert verid {} fn {}: {}", verid, path, e); @@ -64,11 +75,12 @@ fn insert_man(tr: &postgres::GenericConnection, verid: i32, paths: &[&str], ent: Ok(x) => x, }; - // TODO: Overwrite entry if the contents are different? It's possible that earlier decoding + // Overwrite entry if the contents are different. It's possible that earlier decoding // implementations didn't properly detect the encoding. (On the other hand, due to differences - // in filenames it's also possible that THIS decoding step went wrong. Ugh) + // in filenames it's also possible that THIS decoding step went wrong, but that's slightly less + // likely) tr.execute( - "INSERT INTO contents (hash, content) VALUES($1, $2) ON CONFLICT (hash) DO NOTHING", + "INSERT INTO contents (hash, content) VALUES($1, $2) ON CONFLICT (hash) DO UPDATE SET content = $2", &[&dig.as_ref(), &cont] ).unwrap(); @@ -94,11 +106,24 @@ fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &s fn with_pkg(file: &str, cb: F) -> std::io::Result where F: FnOnce(Option) -> std::io::Result { - // TODO: Support streaming from URLs - // TODO: How does .deb support fit into this? 
(Or anything else with metadata) - let mut f = try!(std::fs::File::open(file)); - let ent = try!(Archive::open_archive(&mut f)); - cb(ent) + // TODO: .deb support + + if file.starts_with("http://") || file.starts_with("https://") { + let mut res = try!( + hyper::Client::new().get(file).send() + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("Hyper: {}", e))) + ); + if !res.status.is_success() { + return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("HTTP: {}", res.status) )); + } + let ent = try!(Archive::open_archive(&mut res)); + cb(ent) + + } else { + let mut res = try!(std::fs::File::open(file)); + let ent = try!(Archive::open_archive(&mut res)); + cb(ent) + } } @@ -121,7 +146,7 @@ fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std: pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) { - info!("Handling pkg: {} / {} / {} - {} @ {} in {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file); + info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file); let tr = conn.transaction().unwrap(); tr.set_rollback(); From 1ca43665a19453b128ab7a29009032f93b4d268a Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 6 Nov 2016 13:34:22 +0100 Subject: [PATCH 6/7] indexer: Add file caching + Arch Linux indexing --- indexer/Cargo.lock | 48 +++++++++++++++ indexer/Cargo.toml | 2 + indexer/src/main.rs | 20 ++++++- indexer/src/open.rs | 82 +++++++++++++++++++++++++ indexer/src/pkg.rs | 54 +++++------------ indexer/src/sys_arch.rs | 128 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 295 insertions(+), 39 deletions(-) create mode 100644 indexer/src/open.rs create mode 100644 indexer/src/sys_arch.rs diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index 501d8d3..5b22fa7 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -2,6 +2,7 @@ name = "indexer" version = "0.1.0" dependencies = [ + "chrono 0.2.25 
(registry+https://github.com/rust-lang/crates.io-index)", "clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)", "encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -13,6 +14,7 @@ dependencies = [ "postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", "ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -43,6 +45,15 @@ name = "byteorder" version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "chrono" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "clap" version = "2.17.1" @@ -282,6 +293,38 @@ dependencies = [ "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "num" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)", + "num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-integer" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-iter" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 
0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-traits" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "num_cpus" version = "1.1.0" @@ -584,6 +627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d" "checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074" "checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" +"checksum chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)" = "9213f7cd7c27e95c2b57c49f0e69b1ea65b27138da84a170133fd21b07659c00" "checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430" "checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626" "checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "" @@ -614,6 +658,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" "checksum mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5c93a4bd787ddc6e7833c519b73a50883deb5863d76d9b71eb8216fb7f94e66" +"checksum num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "bde7c03b09e7c6a301ee81f6ddf66d7a28ec305699e3d3b056d2fc56470e3120" 
+"checksum num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "fb24d9bfb3f222010df27995441ded1e954f8f69cd35021f6bef02ca9552fb92" +"checksum num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "287a1c9969a847055e1122ec0ea7a5c5d6f72aad97934e131c83d5c08ab4e45c" +"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c" "checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad" "checksum openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "c4117b6244aac42ed0150a6019b4d953d28247c5dd6ae6f46ae469b5f2318733" "checksum openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)" = "89c47ee94c352eea9ddaf8e364be7f978a3bb6d66d73176572484238dd5a5c3f" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index d8ce4a0..a15db65 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -15,3 +15,5 @@ ring = "0.5.3" postgres = "0.12.0" clap = "2.16.3" hyper = "0.9.11" +url = "1.2.3" +chrono = "0.2.25" diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 49c7079..1083559 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -9,11 +9,15 @@ extern crate ring; extern crate encoding; extern crate postgres; extern crate hyper; +extern crate url; +extern crate chrono; mod archive; mod archread; mod man; +mod open; mod pkg; +mod sys_arch; // Convenience function to get a system id by short-name. Panics if the system doesn't exist. 
@@ -40,6 +44,12 @@ fn main() { (@arg date: --date +required +takes_value "Package release date") (@arg FILE: +required "Package file") ) + (@subcommand arch => + (about: "Index an Arch Linux repository") + (@arg sys: --sys +required +takes_value "System short-name") + (@arg mirror: --mirror +required +takes_value "Mirror URL") + (@arg repo: --repo +required +takes_value "Repository name") + ) ).get_matches(); let verbose = arg.occurrences_of("v"); @@ -71,7 +81,15 @@ fn main() { pkg: matches.value_of("pkg").unwrap(), ver: matches.value_of("ver").unwrap(), date: matches.value_of("date").unwrap(), - file: matches.value_of("FILE").unwrap() + file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true}, }); } + + if let Some(matches) = arg.subcommand_matches("arch") { + sys_arch::sync(&db, + sysbyshort(&db, matches.value_of("sys").unwrap()), + matches.value_of("mirror").unwrap(), + matches.value_of("repo").unwrap() + ); + } } diff --git a/indexer/src/open.rs b/indexer/src/open.rs new file mode 100644 index 0000000..6919fc4 --- /dev/null +++ b/indexer/src/open.rs @@ -0,0 +1,82 @@ +use std::io::{Read,Result,Error,ErrorKind,copy}; +use std::fs::{File,create_dir_all,metadata}; +use std::hash::{Hash,Hasher,SipHasher}; +use std::time::{Duration,SystemTime}; +use url::Url; +use hyper; + + +const CACHE_PATH: &'static str = "/var/tmp/manned-indexer"; +const CACHE_TIME: u64 = 24*3600; + + +pub struct Path<'a> { + pub path: &'a str, + pub cache: bool, + pub canbelocal: bool, +} + + +fn cache_fn(url: &Url) -> String { + let name = url.path_segments().unwrap().last().unwrap(); + let name = if name == "" { "index" } else { name }; + + let mut hash = SipHasher::new(); + url.hash(&mut hash); + format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash.finish()) +} + + +fn fetch(url: &str) -> Result> { + let res = try!(hyper::Client::new() + .get(url) + .header(hyper::header::UserAgent("Man page crawler (info@manned.org; 
https://manned.org/)".to_owned())) + .send() + .map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e))) + ); + if !res.status.is_success() { + return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) )); + } + Ok(Box::new(res) as Box) +} + + +fn file(path: &str) -> Result> { + Ok(Box::new(try!(File::open(path))) as Box) +} + + +impl<'a> Path<'a> { + pub fn open(&self) -> Result> { + if let Ok(url) = Url::parse(self.path) { + if url.scheme() != "http" && url.scheme() != "https" { + return Err(Error::new(ErrorKind::Other, "Invalid scheme")); + } + + if self.cache { + let cfn = cache_fn(&url); + if let Ok(m) = metadata(&cfn) { + if m.modified().unwrap() > SystemTime::now() - Duration::from_secs(CACHE_TIME) { + return file(&cfn); + } + } + try!(create_dir_all(CACHE_PATH)); + { + let mut rd = try!(fetch(url.as_str())); + let mut wr = try!(File::create(&cfn)); + try!(copy(&mut rd, &mut wr)); + } + file(&cfn) + + } else { + fetch(url.as_str()) + } + + } else if self.canbelocal { + file(self.path) + + } else { + Err(Error::new(ErrorKind::Other, "Invalid URL")) + } + } +} diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs index 4120988..4d3379d 100644 --- a/indexer/src/pkg.rs +++ b/indexer/src/pkg.rs @@ -1,12 +1,11 @@ use std; use std::io::Read; use postgres; -use hyper; -use archive; +use open; use archread; use man; -use archive::Archive; +use archive::{Archive,ArchiveEntry}; pub struct PkgOpt<'a> { pub force: bool, @@ -15,7 +14,7 @@ pub struct PkgOpt<'a> { pub pkg: &'a str, pub ver: &'a str, pub date: &'a str, // TODO: Option to extract date from package metadata itself - pub file: &'a str + pub file: open::Path<'a> } @@ -32,19 +31,19 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option r.get(0).get(0), }; - let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2 AND released = $3::text::date"; - let res = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap(); + let q = "SELECT id FROM 
package_versions WHERE package = $1 AND version = $2"; + let res = tr.query(q, &[&pkgid, &opt.ver]).unwrap(); let verid : i32; if res.is_empty() { let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id"; verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0); - trace!("New package pkgid {} verid {}", pkgid, verid); + info!("New package pkgid {} verid {}", pkgid, verid); Some(verid) } else if opt.force { verid = res.get(0).get(0); - trace!("Overwriting package pkgid {} verid {}", pkgid, verid); + info!("Overwriting package pkgid {} verid {}", pkgid, verid); tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap(); Some(verid) @@ -103,50 +102,29 @@ fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &s } -fn with_pkg(file: &str, cb: F) -> std::io::Result - where F: FnOnce(Option) -> std::io::Result -{ - // TODO: .deb support - - if file.starts_with("http://") || file.starts_with("https://") { - let mut res = try!( - hyper::Client::new().get(file).send() - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("Hyper: {}", e))) - ); - if !res.status.is_success() { - return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("HTTP: {}", res.status) )); - } - let ent = try!(Archive::open_archive(&mut res)); - cb(ent) - - } else { - let mut res = try!(std::fs::File::open(file)); - let ent = try!(Archive::open_archive(&mut res)); - cb(ent) - } -} - - fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> { - let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| { + let indexfunc = |paths: &[&str], ent: &mut ArchiveEntry| { insert_man(tr, verid, paths, ent); Ok(()) /* Don't propagate errors, continue handling other man pages */ }; - let missed = try!( - with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) }) - ).links(|src, dest| { insert_link(tr, verid, src, 
dest) }); + let mut rd = try!(opt.file.open()); + let missed = try!(archread::FileList::read( + try!(Archive::open_archive(&mut rd)), + man::ismanpath, &indexfunc)) + .links(|src, dest| { insert_link(tr, verid, src, dest) }); if let Some(missed) = missed { warn!("Some links were missed, reading package again"); - try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) })) + let mut rd = try!(opt.file.open()); + try!(missed.read(try!(Archive::open_archive(&mut rd)), indexfunc)); } Ok(()) } pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) { - info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file); + info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path); let tr = conn.transaction().unwrap(); tr.set_rollback(); diff --git a/indexer/src/sys_arch.rs b/indexer/src/sys_arch.rs new file mode 100644 index 0000000..7a0bf1f --- /dev/null +++ b/indexer/src/sys_arch.rs @@ -0,0 +1,128 @@ +use std::str::FromStr; +use std::io::{Read,BufRead,BufReader,Result}; +use regex::Regex; +use chrono::NaiveDateTime; +use postgres; + +use archive; +use open; +use man; +use pkg; + + +struct Meta { + filename: String, + name: String, + version: String, + date: String, +} + + +fn read_files(lst: T) -> Result { + let rd = BufReader::new(lst); + for line in rd.lines() { + let line = try!(line); + if man::ismanpath(&line) { + return Ok(true); + } + } + Ok(false) +} + + +fn read_desc(rd: &mut archive::ArchiveEntry) -> Result> { + let mut data = String::new(); + try!(rd.take(64*1024).read_to_string(&mut data)); + + let path = rd.path().unwrap(); + lazy_static! 
{ + static ref RE: Regex = Regex::new(r"\s*%([^%]+)%\s*\n\s*([^\n]+)\s*\n").unwrap(); + } + + let mut filename = None; + let mut name = None; + let mut version = None; + let mut builddate = None; + + for kv in RE.captures_iter(&data) { + let key = kv.at(1).unwrap(); + let val = kv.at(2).unwrap(); + trace!("{}: {} = {}", path, key, val); + match key { + "FILENAME" => filename = Some(val), + "NAME" => name = Some(val), + "VERSION" => version = Some(val), + "BUILDDATE" => builddate = i64::from_str(val).ok(), + _ => {}, + } + } + + if filename.is_some() && name.is_some() && version.is_some() && builddate.is_some() { + Ok(Some(Meta { + filename: filename.unwrap().to_string(), + name: name.unwrap().to_string(), + version: version.unwrap().to_string(), + date: NaiveDateTime::from_timestamp(builddate.unwrap(), 0).format("%Y-%m-%d").to_string(), + })) + } else { + warn!("Metadata missing from package description: {}", path); + Ok(None) + } +} + + +// TODO: Switch to x86_64 instead of i686 +pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) { + info!("Reading packages from {} {}", mirror, repo); + + let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo); + let path = open::Path{ path: &path, cache: true, canbelocal: false }; + let mut index = match path.open() { + Err(e) => { error!("Can't read package index: {}", e); return }, + Ok(x) => x, + }; + + let ent = match archive::Archive::open_archive(&mut index) { + Err(e) => { error!("Can't read package index: {}", e); return }, + Ok(x) => x, + }; + + let mut hasman = false; + let mut meta = None; + let r = archive::walk(ent, |x| { + if x.filetype() == archive::FileType::Directory { + hasman = false; + meta = None; + } else if x.path().unwrap().ends_with("/files") { + hasman = try!(read_files(x)); + } else if x.path().unwrap().ends_with("/desc") { + meta = try!(read_desc(x)); + } + + if hasman && meta.is_some() { + hasman = false; + let m = meta.take().unwrap(); + + let p = 
format!("{}/{}/os/i686/{}", mirror, repo, m.filename); + pkg::pkg(pg, pkg::PkgOpt{ + force: false, + sys: sys, + cat: repo, + pkg: &m.name, + ver: &m.version, + date: &m.date, + file: open::Path{ + path: &p, + cache: false, + canbelocal: false, + }, + }); + } + + Ok(true) + }); + + if let Err(e) = r { + error!("Error reading package index: {}", e); + } +} From 5e39af459f25af2f50ef2ef8b4d76e455bb9a084 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 6 Nov 2016 15:26:20 +0100 Subject: [PATCH 7/7] Replace old Arch Linux scripts with new indexer --- Makefile | 11 ++++++-- util/arch.sh | 76 --------------------------------------------------- util/cron.sh | 2 +- util/index.sh | 22 +++++++++++++++ util/indexer | 1 + 5 files changed, 33 insertions(+), 79 deletions(-) delete mode 100755 util/arch.sh create mode 100755 util/index.sh create mode 120000 util/indexer diff --git a/Makefile b/Makefile index b1169e5..259b15e 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ -.PHONY: ManUtils +.PHONY: ManUtils indexer clean + +all: ManUtils indexer ManUtils: lib/ManUtils/Build cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst @@ -6,7 +8,12 @@ ManUtils: lib/ManUtils/Build lib/ManUtils/Build: lib/ManUtils/Build.PL cd lib/ManUtils && perl Build.PL +indexer: indexer/target/release/indexer + +indexer/target/release/indexer: indexer/Cargo.toml indexer/src/*.rs + cd indexer && cargo build --release + clean: cd lib/ManUtils && ./Build distclean rm -rf lib/ManUtils/inst - + cd indexer && cargo clean diff --git a/util/arch.sh b/util/arch.sh deleted file mode 100755 index 57ff957..0000000 --- a/util/arch.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -# Usage: ./arch.sh -# Synchronises the database with an Arch mirror, fetching any packages that -# aren't yet in the database and may have man pages. - -MIRROR=http://ftp.nluug.nl/pub/os/Linux/distr/archlinux -REPOS="core extra community" -DEBUG=false -SYSID=1 - -. 
./common.sh - - -checkpkg() { - REPO=$1 - FN=$2 - D="$TMP/$REPO/$FN" - if [ ! \( -d "$D" -a -f "$D/files" -a -f "$D/desc" \) ]; then - echo "===> $FN" - echo "Invalid item, ignoring" - return - fi - grep -q /man/ "$D/files" - if [ "$?" -ne 0 ]; then - $DEBUG && echo "===> $FN" - $DEBUG && echo "No mans" - return - fi - - # Somewhat inefficient description parsing - FILENAME=`grep -A 1 '%FILENAME%' "$D/desc" | tail -n 1` - NAME=`grep -A 1 '%NAME%' "$D/desc" | tail -n 1` - VERSION=`grep -A 1 '%VERSION%' "$D/desc" | tail -n 1` - BUILDDATE=`grep -A 1 '%BUILDDATE%' "$D/desc" | tail -n 1` - if [ -z "$FILENAME" -o -z "$NAME" -o -z "$VERSION" -o -z "$BUILDDATE" ]; then - echo "===> $FN" - echo "Invalid/missing description info" - return - fi - BUILDDATE=`date -d "@$BUILDDATE" '+%F'` - - add_pkginfo $SYSID "$REPO" "$NAME" "$VERSION" "$BUILDDATE" - if [ "$?" -eq 0 ]; then - $DEBUG && echo "===> $FN" - $DEBUG && echo "Already up-to-date" - return - fi - - echo "===> $FN" - F="$TMP/$REPO/$FILENAME" - $CURL "$MIRROR/$REPO/os/i686/$FILENAME" -o "$F" || return - add_tar "$F" "$PKGID" - rm -f "$F" -} - - -syncrepo() { - REPO=$1 - F="$TMP/$REPO/repo.tar.gz" - echo "============ $MIRROR $REPO" - $CURL "$MIRROR/$REPO/os/i686/$REPO.files.tar.gz" -o "$F" || return 1 - tar -C "$TMP/$REPO" -xf "$F" || return 1 - rm -f "$F" - for fn in "$TMP/$REPO"/*; do - checkpkg "$REPO" `basename "$fn"` - done -} - - -for r in $REPOS; do - mkdir "$TMP/$r" - syncrepo $r - rm -rf "$TMP/$r" -done - diff --git a/util/cron.sh b/util/cron.sh index 47fdc4b..ae3f553 100755 --- a/util/cron.sh +++ b/util/cron.sh @@ -2,7 +2,7 @@ . 
./common.sh -./arch.sh +./index.sh daily ./deb.sh ubuntu_active ./deb.sh debian_active echo "============ Updating SQL indices" diff --git a/util/index.sh b/util/index.sh new file mode 100755 index 0000000..87b355b --- /dev/null +++ b/util/index.sh @@ -0,0 +1,22 @@ +if test -f .config; then + source .config +fi + +INDEX="./indexer -vv" + +set -x + +arch() { + local MIRROR=http://ftp.nluug.nl/pub/os/Linux/distr/archlinux + local REPOS="core extra community" + for REPO in $REPOS; do + $INDEX arch --sys arch --mirror $MIRROR --repo $REPO + done +} + + +daily() { + arch +} + +$@ diff --git a/util/indexer b/util/indexer new file mode 120000 index 0000000..a15109f --- /dev/null +++ b/util/indexer @@ -0,0 +1 @@ +../indexer/target/release/indexer \ No newline at end of file