From 022e9acc4f7e42b807bb63e021655cc79ce9f398 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 22 Oct 2016 14:54:35 +0200 Subject: [PATCH] WIP: Rewritten man page indexer in Rust Currently just figuring out how to read archives. Turns out to not be as simple as I had expected. --- .gitignore | 2 + indexer/Cargo.lock | 143 ++++++++++++ indexer/Cargo.toml | 10 + indexer/src/archive.rs | 368 +++++++++++++++++++++++++++++++ indexer/src/main.rs | 97 ++++++++ indexer/tests/mktar.sh | 49 ++++ indexer/tests/testarchive.tar.xz | Bin 0 -> 616 bytes 7 files changed, 669 insertions(+) create mode 100644 indexer/Cargo.lock create mode 100644 indexer/Cargo.toml create mode 100644 indexer/src/archive.rs create mode 100644 indexer/src/main.rs create mode 100755 indexer/tests/mktar.sh create mode 100644 indexer/tests/testarchive.tar.xz diff --git a/.gitignore b/.gitignore index 0da3d84..a80d19e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ !/lib/ManUtils/Build.PL !/lib/ManUtils/ManUtils.pm !/lib/ManUtils/ManUtils.xs +indexer/target + diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock new file mode 100644 index 0000000..53c701d --- /dev/null +++ b/indexer/Cargo.lock @@ -0,0 +1,143 @@ +[root] +name = "indexer" +version = "0.1.0" +dependencies = [ + "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "aho-corasick" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "env_logger" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 
0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libarchive" +version = "0.1.1" +source = "git+https://github.com/17dec/libarchive-rust#3f723cf0064561f21f0cebbd534a75076e6dbcaa" +dependencies = [ + "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libarchive3-sys" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libc" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "log" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memchr" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "pkg-config" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "regex" +version = "0.1.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.2.7 
(registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "thread-id" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-ranges" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" +"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" +"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +"checksum libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)" = "" +"checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7" +"checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = 
"044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8" +"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" +"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" +"checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" +"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" +"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11" +"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" +"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" +"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" +"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml new file mode 100644 index 0000000..7b1b918 --- /dev/null +++ b/indexer/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "indexer" +version = "0.1.0" +authors = ["Yorhel "] + +[dependencies] +regex = "0.1.77" +log = "0.3.6" +env_logger = "0.3.5" +libarchive = { git = "https://github.com/17dec/libarchive-rust" } diff --git a/indexer/src/archive.rs 
b/indexer/src/archive.rs new file mode 100644 index 0000000..f1b9403 --- /dev/null +++ b/indexer/src/archive.rs @@ -0,0 +1,368 @@ +use std::path::Path; +use std::collections::HashMap; +use libarchive::reader::Reader as ArchiveReader; +use libarchive::reader::{FileReader,Builder}; +use libarchive::archive::{Entry,FileType,ReadFormat,ReadFilter}; +use libarchive::error::ArchiveResult; + + +pub fn open_file>(path: T) -> ArchiveResult { + let mut builder = Builder::new(); + try!(builder.support_format(ReadFormat::All)); + try!(builder.support_filter(ReadFilter::All)); + builder.open_file(path) +} + + +#[derive(Clone,Debug,PartialEq,Eq)] +pub enum EntryType { + // Regular file that has been handled/indexed + Handled, + // Regular file that hasn't been handled because the caller wasn't interested in it. Could + // still be an interesting file if it is referenced from an interesting path. + Regular, + // Link to another file (interesting or not is irrelevant) + Link(String), + // Directory; need this information when resolving links + Directory, + // Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves + // to this we know we're done. + Other, +} + + +/* + * I had hoped that reading man pages from an archive would just be a simple: + * + * 1. Walk through all files in the archive in a streaming fashion + * 2. Parse/index man pages + * + * But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to... + * + * 1. Walk through all entries in the archive in a streaming fashion + * 2. Parse/index regular file man pages + * 3. Keep track of all paths in the archive + * 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file + * 5. Read the entire damn archive again if one of the links resolved to a file that was not + * recognized as a man page in step (2). Luckily, this isn't very common. 
+ * + * And this doesn't even cover the problem of duplicate entries in a tar, which is also quite + * annoying to handle. + * + * What annoys me the most about all of this is that it's not possible to stream an archive from + * the network and read/index the entire thing in a single step. Now we have to buffer packages to + * disk in order to be able to read the archive a second time. + * + * (Note that it is possible to resolve links while walking through the entries, which will allow + * us to match files found later in the archive against links found earlier, thus potentially + * saving the need to read the archive a second time. This is merely a performance improvement for + * an uncommon case, and it certainly won't simplify the code) + * + * (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the + * need for a second archive read, but that's going to significantly slow down the common case in + * order to handle a rare case. It's possible to further optimize this using some heuristics to + * determine whether a file is potentially a man page, but that's both complex and may not even + * save much) + * + * (* So apparently some man pages are close to 10MB...) + */ +pub struct Reader { + // List of seen files. This is used to resolve links + seen: HashMap, + // List of interesting links + links: Vec, + // List of files we have to read in a second walk through the archive + missedfiles: HashMap>, +} + + +// Generalized API: +// 1. Read once +// reader.read(file, interest_cb, file_cb) -> Error +// file: A libarchive::Reader +// interest_cb(path) -> bool +// Called on every file/link name, should return whether it's a file the caller is interested +// in. (e.g. parse_path(), but also +DESC and other metadata). +// file_cb(path, reader, entry) -> Error +// Called on every interesting (actual) file, given the (normalized?) path, the +// libarchive::Reader and a ReaderEntry +// +// 2. 
Read links +// reader.links(link_cb) -> Error +// link_cb(path, dest) -> Error +// Called on every link which has as 'dest' a file path that has already been given to +// file_cb() before. +// +// 3. (Optionally) read a second time +// if reader.need_reread() { +// reader.reread(file, file_cb) +// } +impl Reader { + pub fn new() -> Reader { + Reader { + seen: HashMap::new(), + links: Vec::new(), + missedfiles: HashMap::new(), + } + } + + // Convenience function to read the path/type/link from the next header. + fn read_header(rd: &mut ArchiveReader) -> Option<(String, EntryType)> { + let ent = match rd.next_header() { + Some(x) => x, + None => return None, + }; + let path = ent.pathname().trim_left_matches('/').trim_left_matches("./").trim_right_matches('/').to_string(); + + // Hard links are apparently relative to the root of the archive. + let link = ent.hardlink().map(|x| format!("/{}", x)) + .or(ent.symlink().map(str::to_string)); + + let(fts, ret) = match ent.filetype() { + FileType::BlockDevice => ("blk", EntryType::Other), + FileType::SymbolicLink => ("sym", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), + FileType::Socket => ("sck", EntryType::Other), + FileType::CharacterDevice => ("chr", EntryType::Other), + FileType::Directory => ("dir", EntryType::Directory), + FileType::NamedPipe => ("fif", EntryType::Other), + FileType::Mount => ("mnt", EntryType::Other), + FileType::RegularFile => ("reg", EntryType::Regular), + FileType::Unknown => ("unk", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), + }; + + trace!("Archive entry: {}{:10} bytes, path={:?} type={:?}", fts, ent.size(), path, ret); + Some((path, ret)) + } + + pub fn read(&mut self, rd: &mut ArchiveReader, interest_cb: F, mut file_cb: G) -> ArchiveResult<()> + where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> + { + while let Some((path, t)) = Self::read_header(rd) { + // We ought to throw away the result of the previous 
entry with the same name and use + // this new entry instead, but fuck it. This case is too rare, so let's just warn! it. + if let Some(_) = self.seen.get(&path) { + warn!("Duplicate file entry: {}", path); + continue; + } + + let mut newt = t; + match newt { + EntryType::Regular if interest_cb(&path) => { + let pathv = [&path as &str]; + try!(file_cb(&pathv[..], rd)); + newt = EntryType::Handled + }, + EntryType::Link(_) if interest_cb(&path) => { + self.links.push(path.clone()); + }, + _ => () + }; + self.seen.insert(path, newt); + } + Ok(()) + } + + // This is basically realpath(), using the virtual filesystem in self.seen. + // This method is not particularly efficient, it allocates like crazy. + fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec)> { + if depth < 1 { + warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path); + return None + } + + // Remove filename from the base + let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None }; + + let comp : Vec<&str> = + if path.starts_with('/') { path.split('/').collect() } + else { basedir.split('/').chain(path.split('/')).collect() }; + + let mut dest = Vec::new(); + + for (i, &c) in comp.iter().enumerate() { + if c == "" || c == "." { + continue; + } + if c == ".." { + if dest.len() > 1 { + dest.pop(); + } + continue; + } + dest.push(c.to_string()); + let curpath = dest.join("/"); + match self.seen.get(&curpath) { + + // If it's a directory, we're good + Some(&EntryType::Directory) => (), + + // If it's a file or man page, it must be the last item. + Some(& ref x@ EntryType::Regular) | + Some(& ref x@ EntryType::Handled) => return + if i == comp.len()-1 { + Some((x.clone(), dest)) + } else { + warn!("Unresolved link: {} -> {}; Non-directory component", base, path); + None + }, + + // Links... 
Ugh + Some(&EntryType::Link(ref d)) => { + match self.resolve_link(&curpath, &d, depth-1) { + // Same as above, with dirs we can continue, files have to be last + Some((EntryType::Directory, d)) => dest = d, + x@Some((EntryType::Regular, _)) | + x@Some((EntryType::Handled, _)) => return + if i == comp.len()-1 { x } + else { + warn!("Unresolved link: {} -> {}; Non-directory link component", base, path); + None + }, + _ => return None, + } + }, + + // Don't care about anything else, just stop. + _ => { + warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path); + return None + } + } + } + Some((EntryType::Directory, dest)) + } + + pub fn links(&mut self, mut cb: F) where F: FnMut(&str, &str) { + for p in self.links.iter() { + let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() }; + + match self.resolve_link(&p, dest, 32) { + Some((EntryType::Handled, d)) => { + let dstr = d.join("/"); + cb(&p, &dstr) + }, + Some((EntryType::Regular, d)) => { + let dstr = d.join("/"); + self.missedfiles.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); + } + _ => {}, + } + } + // We can reclaim this memory early. 
+ self.links = Vec::new(); + self.seen = HashMap::new(); + } + + pub fn need_reread(&self) -> bool { + self.missedfiles.len() > 0 + } + + pub fn reread(&mut self, rd: &mut ArchiveReader, mut file_cb: G) -> ArchiveResult<()> + where G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> + { + while let Some((path, _)) = Self::read_header(rd) { + if let Some(f) = self.missedfiles.remove(&path) { + let v: Vec<&str> = f.iter().map(|x| x as &str).collect(); + try!(file_cb(&v, rd)) + } + if self.missedfiles.len() < 1 { + break; + } + } + Ok(()) + } +} + + + + +#[cfg(test)] +mod tests { + use super::*; + use env_logger; + + fn test_read(r: &mut Reader) { + let mut f = open_file("tests/testarchive.tar.xz").unwrap(); + let mut files = Vec::new(); + r.read(&mut f, + |p| p.starts_with("man/man"), + |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } + ).unwrap(); + assert_eq!(files, vec!["man/man3/helloworld.3".to_string()]); + } + + fn test_resolve_links(r: &mut Reader) { + let res = |p| { + if let Some(&EntryType::Link(ref l)) = r.seen.get(p) { + r.resolve_link(p, &l, 5) + } else { + panic!("Not found or not a link: {}", p); + } + }; + let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()])); + + assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()]))); + assert_eq!(res("man/man6/hardlink.6"), helloworld); + assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); + assert_eq!(res("man/man6/symlinkafter.6"), helloworld); + + assert_eq!(res("man/man1/badsymlink1.1"), None); + assert_eq!(res("man/man1/badsymlink2.1"), None); + assert_eq!(res("man/man1/badsymlink3.1"), None); + assert_eq!(res("man/man1/badsymlink4.1"), None); + assert_eq!(res("man/man1/badsymlink5.1"), None); + + assert_eq!(res("man/man1/doublesymlink1.1"), helloworld); + assert_eq!(res("man/man1/doublesymlink2.1"), helloworld); + assert_eq!(res("man/man1/triplesymlink.1"), helloworld); + 
assert_eq!(res("man/man1/infinitesymlink.1"), None); + } + + fn test_links(r: &mut Reader) { + let mut links = Vec::new(); + r.links(|p,d| links.push((p.to_string(), d.to_string()))); + links.sort(); + + { + let mut res = |p:&str| { + let r = links.remove(0); + assert_eq!(r.0, p.to_string()); + assert_eq!(r.1, "man/man3/helloworld.3".to_string()); + }; + res("man/man1/doublesymlink1.1"); + res("man/man1/doublesymlink2.1"); + res("man/man1/symlinkbefore.1"); + res("man/man1/triplesymlink.1"); + res("man/man6/hardlink.6"); + res("man/man6/symlinkafter.6"); + } + assert_eq!(links.len(), 0); + } + + fn test_reread(r: &mut Reader) { + assert!(r.need_reread()); + + let mut f = open_file("tests/testarchive.tar.xz").unwrap(); + let mut files = Vec::new(); + r.reread(&mut f, + |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } + ).unwrap(); + + files.sort(); + assert_eq!(files, vec![ + "man/man3/needreread.3".to_string(), + "man/man6/needreread.6".to_string() + ]); + } + + #[test] + fn test_reader() { + env_logger::init().unwrap(); + + let mut r = Reader::new(); + test_read(&mut r); + test_resolve_links(&mut r); + test_links(&mut r); + test_reread(&mut r); + } +} diff --git a/indexer/src/main.rs b/indexer/src/main.rs new file mode 100644 index 0000000..d0eb4e2 --- /dev/null +++ b/indexer/src/main.rs @@ -0,0 +1,97 @@ +#[macro_use] extern crate log; +extern crate env_logger; +extern crate libarchive; +extern crate regex; + +use regex::Regex; + +mod archive; + + +// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page +// location, otherwise Some((manPageName, Section, Locale)). +fn parse_path(path: &str) -> Option<(&str, &str, &str)> { + // Roughly: man[/locale]/man1/manpage.section[.compression]+ + // TODO: lazy_static + let re = Regex::new(r"(?x) + man + (?: / ([^/]+) )? # Optional locale + /man[a-z0-9]/ # Subdir + ([^/]+?) # Man page name (non-greedy) + \. ([^/\.]+) # Section + (?: \. 
(?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions + ").unwrap(); + + let cap = match re.captures(path) { Some(x) => x, None => return None }; + let locale = cap.at(1).unwrap_or(""); + let name = cap.at(2).unwrap(); + let section = cap.at(3).unwrap(); + + // Not everything matching the regex is necessarily a man page, exclude some special cases. + match (name, section, locale) { + // Files that totally aren't man pages + ("Makefile", "in", _) | + ("Makefile", "am", _) | + (".cvsignore", _, _) | + (_, "gz", _) | + (_, "lzma", _) | + (_, "bz2", _) | + (_, "xz", _) | + (_, "html", _) => None, + // Some weird directories that happen to match the locale + (n, s, "5man") | + (n, s, "c") | + (n, s, "man1") | + (n, s, "man2") | + (n, s, "man3") | + (n, s, "man4") | + (n, s, "man5") | + (n, s, "man6") | + (n, s, "man7") | + (n, s, "man8") | + (n, s, "Man-Part1") | + (n, s, "Man-Part2") => Some((n, s, "")), + // Nothing special! + x => Some(x) + } +} + + +fn main() { + env_logger::init().unwrap(); + info!("Hello, world!"); +} + + +#[test] +fn test_parse_path() { + // Generic tests + assert_eq!(parse_path("/"), None); + assert_eq!(parse_path("/man1/ncdu.1"), None); + assert_eq!(parse_path("/man/man?/ncdu.1"), None); + assert_eq!(parse_path("/man/man1/ncdu.1"), Some(("ncdu", "1", ""))); + assert_eq!(parse_path("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz"), Some(("ncdu", "1", ""))); // This stuff happens + assert_eq!(parse_path("/man/en_US.UTF-8/man1/ncdu.1"), Some(("ncdu", "1", "en_US.UTF-8"))); + + // Special cases + assert_eq!(parse_path("/usr/share/man/man1/INDEX"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.am"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.in"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore.gz"), None); + + // Some actual locations + 
assert_eq!(parse_path("/usr/local/man/man1/list_audio_tracks.1.gz"), Some(("list_audio_tracks", "1", ""))); + assert_eq!(parse_path("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz"), Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))); + assert_eq!(parse_path("/usr/man/man3/exit.3tk"), Some(("exit", "3tk", ""))); + assert_eq!(parse_path("/usr/local/brlcad/share/man/mann/exit.nged.gz"), Some(("exit", "nged", ""))); + assert_eq!(parse_path("/usr/X11R6/man/man3/intro.3xglut.gz"), Some(("intro", "3xglut", ""))); + assert_eq!(parse_path("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz"), Some(("intro", "3", "ko_KR.eucKR"))); + + assert_eq!(parse_path("/usr/lib/scilab/man/Man-Part1/man1/ans.1"), Some(("ans", "1", ""))); + assert_eq!(parse_path("/heirloom/usr/share/man/5man/man1/chgrp.1.gz"), Some(("chgrp", "1", ""))); + + assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); + assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None); +} diff --git a/indexer/tests/mktar.sh b/indexer/tests/mktar.sh new file mode 100755 index 0000000..3f5ba4e --- /dev/null +++ b/indexer/tests/mktar.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +# The order of inserting the files into the tar is not fully deterministic this +# way. The tests will fail quite badly if hardlink.6 is considered the +# "original" version. + +mkdir man +cd man + +mkdir man1 +mkdir man3 +mkdir man6 +ln -s man3 mans + +echo 'Hello World' >man3/helloworld.3 +echo 'Not a very interesting file' >notinteresting +echo 'Potentially interesting file' >possiblyinteresting + +ln man3/helloworld.3 man6/hardlink.6 + +ln -s ../man3/helloworld.3 man1/symlinkbefore.1 +ln -s ../man3/helloworld.3 man6/symlinkafter.6 + +ln -s notadir/../../man3/helloworld.3 man1/badsymlink1.1 +ln -s man3/helloworld.3 man1/badsymlink2.1 +ln -s ../man3/helloworld.3/. 
man1/badsymlink3.1 +ln -s ../man3/helloworld.3/../helloworld.3 man1/badsymlink4.1 +ln -s ../man1/symlinkbefore.1/../../man1/helloworld.3 man1/badsymlink5.1 + +ln -s symlinkbefore.1 man1/doublesymlink1.1 +ln -s ../mans/helloworld.3 man1/doublesymlink2.1 +ln -s ../mans/../man1/symlinkbefore.1 man1/triplesymlink.1 +ln -s infinitesymlink.1 man1/infinitesymlink.1 + +ln -s ../possiblyinteresting man3/needreread.3 +ln -s ../possiblyinteresting man6/needreread.6 + +cd .. +rm -f testarchive.tar +tar -cf testarchive.tar man/ +rm -r man/ + +mkdir man +echo 'Overwritten file' >man/possiblyinteresting +tar -rf testarchive.tar man/ +rm -r man/ + +rm -f testarchive.tar.xz +xz testarchive.tar diff --git a/indexer/tests/testarchive.tar.xz b/indexer/tests/testarchive.tar.xz new file mode 100644 index 0000000000000000000000000000000000000000..0fe9760170b551b0ef3438c652ae8f7a4f9f4ac2 GIT binary patch literal 616 zcmV-u0+;>$H+ooF000E$*0e?f03iVu0001VFXf})PyYfXT>v(iN*x^x(XuqR&o7_G zqKJ_QaG??>54EHV`~R9~@-$qVakc37Igz++Ye>*PW3Fd=eOl$6n&`e4NsNE<4=t6w zS`9CCx}(t#Z>BrXMfF;+wTH0tnp;8i%GyqxP-pJeiwX<#QRl_Gj~L6}6XNKZNJ7vL z{}#7}Z9hiJyK(!$u%0Y%`5$*;J#I6T{n#=SG@DD= z&*>1DD{ItdQr-Pnb#7GiLBoULZnFe}81bZ@LG%E_)BNrK9hE}aw}j0P{%Q( zqA~9VMUF2`90K(?c@K8RNEBy%2>8sb#+D*3qn14)Y5#AkoXaZwL{?lkKDZ~>G7$AN zlUilzW>cOW(2k(KEttbOlUS9;E>b*;Xzod^qJNiQaYaHKOHbCAI>W8*}* zv7r5qOIVwt@0*c@UQRk9Ijj#?<-p2S3)ilT(O-QU^B&-o9mJdylOU39(@Jn|UXdjI z%sM(NeaitdL@Skfs&D`R000124H0kp!jrlH0l@@-paB3DWn00q#Ao{g000001X)_j CkRw