Use libarchive3-sys crate directly + improve archread API

This all should offer a more convenient and robust interface to handle
all sorts of archives.
This commit is contained in:
Yorhel 2016-10-26 18:26:06 +02:00
parent 022e9acc4f
commit c8bb4da246
9 changed files with 732 additions and 448 deletions

30
indexer/Cargo.lock generated
View file

@ -3,9 +3,11 @@ name = "indexer"
version = "0.1.0"
dependencies = [
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)",
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -22,7 +24,7 @@ version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -35,13 +37,9 @@ dependencies = [
]
[[package]]
name = "libarchive"
version = "0.1.1"
source = "git+https://github.com/17dec/libarchive-rust#3f723cf0064561f21f0cebbd534a75076e6dbcaa"
dependencies = [
"libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
name = "lazy_static"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libarchive3-sys"
@ -77,19 +75,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "regex"
version = "0.1.77"
version = "0.1.80"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.3.7"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -128,14 +126,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)" = "<none>"
"checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f"
"checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7"
"checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8"
"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054"
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
"checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa"
"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665"
"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11"
"checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f"
"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957"
"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03"
"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"

View file

@ -7,4 +7,6 @@ authors = ["Yorhel <git@yorhel.nl>"]
regex = "0.1.77"
log = "0.3.6"
env_logger = "0.3.5"
libarchive = { git = "https://github.com/17dec/libarchive-rust" }
lazy_static = "0.2.1"
libc = "0.2.17"
libarchive3-sys = "0.1.2"

View file

@ -1,368 +1,273 @@
use std::path::Path;
use std::collections::HashMap;
use libarchive::reader::Reader as ArchiveReader;
use libarchive::reader::{FileReader,Builder};
use libarchive::archive::{Entry,FileType,ReadFormat,ReadFilter};
use libarchive::error::ArchiveResult;
use std::str;
use std::ptr;
use std::error::Error as ErrorTrait;
use std::io::{Result,Error,Read};
use std::ffi::{CStr,CString};
use libc::{c_void,ssize_t};
use libarchive3_sys::ffi;
pub fn open_file<T: AsRef<Path>>(path: T) -> ArchiveResult<FileReader> {
let mut builder = Builder::new();
try!(builder.support_format(ReadFormat::All));
try!(builder.support_filter(ReadFilter::All));
builder.open_file(path)
}
#[derive(Clone,Debug,PartialEq,Eq)]
pub enum EntryType {
// Regular file that has been handled/indexed
Handled,
// Regular file that hasn't been handled because the caller wasn't interested in it. Could
// still be an interesting file if it is referenced from an interesting path.
Regular,
// Link to another file (interesting or not is irrelevant)
Link(String),
// Directory; need this information when resolving links
Directory,
// Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves
// to this we know we're done.
Other,
}
/*
* I had hoped that reading man pages from an archive would just be a simple:
/* This is a safe, limited and opinionated wrapper around the libarchive C bindings.
* I initially used the libarchive crate, but it has several issues. Some of which are not fixable
* without a complete rewrite.
* - Panics on non-UTF8 path names
* - Panics on hard links (PR #6)
* - API is far too flexible, easy to misuse and get panics/segfaults
* - Impossible to correctly read files from an archive (issue #7)
* - Does not provide a convenient Read interface for files
*
* 1. Walk through all files in the archive in a streaming fashion
* 2. Parse/index man pages
*
* But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to...
*
* 1. Walk through all entries in the archive in a streaming fashion
* 2. Parse/index regular file man pages
* 3. Keep track of all paths in the archive
* 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file
* 5. Read the entire damn archive again if one of the links resolved to a file that was not
* recognized as a man page in step (2). Luckily, this isn't very common.
*
* And this doesn't even cover the problem of duplicate entries in a tar, which is also quite
* annoying to handle.
*
* What annoys me the most about all of this is that it's not possible to stream an archive from
* the network and read/index the entire thing in a single step. Now we have to buffer packages to
* disk in order to be able to read the archive a second time.
*
* (Note that it is possible to resolve links while walking through the entries, which will allow
* us to match files found later in the archive against links found earlier, thus potentially
* saving the need to read the archive a second time. This is merely a performance improvement for
* an uncommon case, and it certainly won't simplify the code)
*
* (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the
* need for a second archive read, but that's going to significantly slow down the common case in
* order to handle a rare case. It's possible to further optimize this using some heuristics to
* determine whether a file is potentially a man page, but that's both complex and may not even
* save much)
*
* (* So apparently some man pages are close to 10MB...)
* Barring any unexpected behaviour or bugs in libarchive, the API below should not panic or
* segfault for any archive or usage pattern.
*/
pub struct Reader {
// List of seen files. This is used to resolve links
seen: HashMap<String, EntryType>,
// List of interesting links
links: Vec<String>,
// List of files we have to read in a second walk through the archive
missedfiles: HashMap<String, Vec<String>>,
// Streaming archive reader backed by libarchive. Owns the raw archive handle
// and pulls (compressed) bytes from a caller-supplied `Read` via a C callback.
pub struct Archive<'a> {
// Raw libarchive handle; created by archive_read_new() and freed in Drop.
a: *mut ffi::Struct_archive,
// Source of the raw archive bytes.
rd: &'a mut Read,
// Scratch buffer that the read callback fills and hands to libarchive.
buf: Vec<u8>,
// Last I/O error reported by the read callback, kept so error() can return
// the original std::io::Error instead of reconstructing one from an errno.
err: Option<Error>,
}
// Generalized API:
// 1. Read once
// reader.read(file, interest_cb, file_cb) -> Error
// file: A libarchive::Reader
// interest_cb(path) -> bool
// Called on every file/link name, should return whether it's a file the caller is interested
// in. (e.g. parse_path(), but also +DESC and other metadata).
// file_cb(path, reader, entry) -> Error
// Called on every interesting (actual) file, given the (normalized?) path, the
// libarchive::Reader and a ReaderEntry
//
// 2. Read links
// reader.links(link_cb) -> Error
// link_cb(path, dest) -> Error
// Called on every link which has as 'dest' a file path that has already been given to
// file_cb() before.
//
// 3. (Optionally) read a second time
// if reader.need_reread() {
// reader.reread(file, file_cb)
// }
impl Reader {
pub fn new() -> Reader {
Reader {
seen: HashMap::new(),
links: Vec::new(),
missedfiles: HashMap::new(),
// A single entry (header + file data) in an archive. It owns the Archive, so
// reading is strictly sequential: obtaining the next entry consumes this one.
pub struct ArchiveEntry<'a> {
// The archive this entry was read from; moved into the next entry by next().
a: Box<Archive<'a>>,
// Raw libarchive entry pointer, managed by libarchive (no explicit free here);
// presumably only valid until the next header is read — as enforced by next() taking self.
e: *mut ffi::Struct_archive_entry,
}
// Simplified classification of an archive entry as exposed to callers.
#[derive(Debug,PartialEq,Eq)]
pub enum FileType {
File,
Directory,
// Symlink or hardlink; the String is the target path (hardlink targets get a
// leading '/' so both kinds can share the same resolution logic).
Link(String),
Other, // Also includes Link(<non-utf8-path>)
}
unsafe extern "C" fn archive_read_cb(_: *mut ffi::Struct_archive, data: *mut c_void, buf: *mut *const c_void) -> ssize_t {
let arch: &mut Archive = &mut *(data as *mut Archive);
*buf = arch.buf.as_mut_ptr() as *mut c_void;
match arch.rd.read(&mut arch.buf[..]) {
Ok(s) => s as ssize_t,
Err(e) => {
let desc = CString::new(e.description()).unwrap();
let fmt = CString::new("%s").unwrap();
ffi::archive_set_error(arch.a, e.raw_os_error().unwrap_or(0), fmt.as_ptr(), desc.as_ptr());
arch.err = Some(e);
-1
}
}
// Convenience function to read the path/type/link from the next header.
fn read_header(rd: &mut ArchiveReader) -> Option<(String, EntryType)> {
let ent = match rd.next_header() {
Some(x) => x,
None => return None,
};
let path = ent.pathname().trim_left_matches('/').trim_left_matches("./").trim_right_matches('/').to_string();
// Hard links are apparently relative to the root of the archive.
let link = ent.hardlink().map(|x| format!("/{}", x))
.or(ent.symlink().map(str::to_string));
let(fts, ret) = match ent.filetype() {
FileType::BlockDevice => ("blk", EntryType::Other),
FileType::SymbolicLink => ("sym", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }),
FileType::Socket => ("sck", EntryType::Other),
FileType::CharacterDevice => ("chr", EntryType::Other),
FileType::Directory => ("dir", EntryType::Directory),
FileType::NamedPipe => ("fif", EntryType::Other),
FileType::Mount => ("mnt", EntryType::Other),
FileType::RegularFile => ("reg", EntryType::Regular),
FileType::Unknown => ("unk", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }),
};
trace!("Archive entry: {}{:10} bytes, path={:?} type={:?}", fts, ent.size(), path, ret);
Some((path, ret))
}
pub fn read<F,G>(&mut self, rd: &mut ArchiveReader, interest_cb: F, mut file_cb: G) -> ArchiveResult<()>
where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()>
{
while let Some((path, t)) = Self::read_header(rd) {
// We ought to throw away the result of the previous entry with the same name and use
// this new entry instead, but fuck it. This case is too rare, so let's just warn! it.
if let Some(_) = self.seen.get(&path) {
warn!("Duplicate file entry: {}", path);
continue;
}
let mut newt = t;
match newt {
EntryType::Regular if interest_cb(&path) => {
let pathv = [&path as &str];
try!(file_cb(&pathv[..], rd));
newt = EntryType::Handled
},
EntryType::Link(_) if interest_cb(&path) => {
self.links.push(path.clone());
},
_ => ()
};
self.seen.insert(path, newt);
}
Ok(())
}
// This is basically realpath(), using the virtual filesystem in self.seen.
// This method is not particularly efficient, it allocates like crazy.
fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec<String>)> {
if depth < 1 {
warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path);
return None
}
// Remove filename from the base
let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None };
let comp : Vec<&str> =
if path.starts_with('/') { path.split('/').collect() }
else { basedir.split('/').chain(path.split('/')).collect() };
let mut dest = Vec::new();
for (i, &c) in comp.iter().enumerate() {
if c == "" || c == "." {
continue;
}
if c == ".." {
if dest.len() > 1 {
dest.pop();
}
continue;
}
dest.push(c.to_string());
let curpath = dest.join("/");
match self.seen.get(&curpath) {
// If it's a directory, we're good
Some(&EntryType::Directory) => (),
// If it's a file or man page, it must be the last item.
Some(& ref x@ EntryType::Regular) |
Some(& ref x@ EntryType::Handled) => return
if i == comp.len()-1 {
Some((x.clone(), dest))
} else {
warn!("Unresolved link: {} -> {}; Non-directory component", base, path);
None
},
// Links... Ugh
Some(&EntryType::Link(ref d)) => {
match self.resolve_link(&curpath, &d, depth-1) {
// Same as above, with dirs we can continue, files have to be last
Some((EntryType::Directory, d)) => dest = d,
x@Some((EntryType::Regular, _)) |
x@Some((EntryType::Handled, _)) => return
if i == comp.len()-1 { x }
else {
warn!("Unresolved link: {} -> {}; Non-directory link component", base, path);
None
},
_ => return None,
}
},
// Don't care about anything else, just stop.
_ => {
warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path);
return None
}
}
}
Some((EntryType::Directory, dest))
}
pub fn links<F>(&mut self, mut cb: F) where F: FnMut(&str, &str) {
for p in self.links.iter() {
let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() };
match self.resolve_link(&p, dest, 32) {
Some((EntryType::Handled, d)) => {
let dstr = d.join("/");
cb(&p, &dstr)
},
Some((EntryType::Regular, d)) => {
let dstr = d.join("/");
self.missedfiles.entry(dstr).or_insert_with(Vec::new).push(p.to_string());
}
_ => {},
}
}
// We can reclaim this memory early.
self.links = Vec::new();
self.seen = HashMap::new();
}
pub fn need_reread(&self) -> bool {
self.missedfiles.len() > 0
}
pub fn reread<G>(&mut self, rd: &mut ArchiveReader, mut file_cb: G) -> ArchiveResult<()>
where G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()>
{
while let Some((path, _)) = Self::read_header(rd) {
if let Some(f) = self.missedfiles.remove(&path) {
let v: Vec<&str> = f.iter().map(|x| x as &str).collect();
try!(file_cb(&v, rd))
}
if self.missedfiles.len() < 1 {
break;
}
}
Ok(())
}
}
impl<'a> Archive<'a> {
// Allocates the read buffer, registers the read callback and opens the archive.
// The Archive is boxed so that the raw pointer handed to libarchive as callback
// data keeps pointing at the same struct even when the box itself is moved.
fn new(rd: &mut Read, a: *mut ffi::Struct_archive) -> Result<Box<Archive>> {
let bufsize = 64*1024;
let mut buf = Vec::with_capacity(bufsize);
// The buffer contents are uninitialized here, but archive_read_cb() always
// fills it via rd.read() before libarchive consumes any bytes from it.
unsafe { buf.set_len(bufsize) };
let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None });
// Stable address of the boxed Archive, passed back to archive_read_cb().
let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void;
let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) };
if r == ffi::ARCHIVE_FATAL {
return Err(ret.error());
}
Ok(ret)
}
// Converts the most recent failure into a std::io::Error, preferring the error
// captured by the read callback over libarchive's errno.
fn error(&mut self) -> Error {
// TODO: Do something with the description
self.err.take().unwrap_or_else(||
Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) })
)
}
// Reads the next entry header. Consumes the boxed Archive so the returned
// ArchiveEntry is the only handle; returns Ok(None) at end-of-archive.
fn entry(self: Box<Self>) -> Result<Option<ArchiveEntry<'a>>> {
let mut ent = ArchiveEntry {
a: self,
e: ptr::null_mut()
};
let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) };
match res {
ffi::ARCHIVE_EOF => Ok(None),
ffi::ARCHIVE_FATAL => Err(ent.a.error()),
// Non-fatal warnings are ignored; the entry is returned as usual.
_ => Ok(Some(ent))
}
}
// Reads file data of the current entry into buf; a negative return value from
// libarchive is mapped to an Error.
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
let cbuf = buf.as_mut_ptr() as *mut c_void;
let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) };
if n >= 0 {
Ok(n as usize)
} else {
Err(self.error())
}
}
// Opens an archive read from `rd` with all libarchive formats and filters
// enabled, and returns its first entry (None if the archive is empty).
pub fn open_archive(rd: &mut Read) -> Result<Option<ArchiveEntry>> {
let a = unsafe {
let a = ffi::archive_read_new();
ffi::archive_read_support_filter_all(a);
ffi::archive_read_support_format_all(a);
a
};
try!(Self::new(rd, a)).entry()
}
}
impl<'a> Drop for Archive<'a> {
fn drop(&mut self) {
// SAFETY: `a` was obtained from archive_read_new() and is not used after
// this point; archive_read_free() releases the handle and its resources.
unsafe {
ffi::archive_read_free(self.a);
}
}
}
impl<'a> ArchiveEntry<'a> {
// Advances to the next entry, consuming this one. Returns Ok(None) at EOF.
pub fn next(self) -> Result<Option<ArchiveEntry<'a>>> {
self.a.entry()
}
// Returns the entry's (lightly normalized) path name.
// Returns None on a NULL pathname (when does that even happen?) or on invalid UTF-8.
pub fn path(&self) -> Option<&str> {
let c_str: &CStr = unsafe {
let ptr = ffi::archive_entry_pathname(self.e);
if ptr.is_null() {
return None;
}
CStr::from_ptr(ptr)
};
str::from_utf8(c_str.to_bytes()).ok()
// Perform some simple opinionated normalization. Full normalization might be better,
// but also slower and more complex. This solution covers the most important cases.
.map(|s| s.trim_left_matches('/').trim_left_matches("./").trim_right_matches('/'))
}
// Size of the entry's file data in bytes, as reported by the archive header.
pub fn size(&self) -> usize {
unsafe { ffi::archive_entry_size(self.e) as usize }
}
// Symlink target, or None if this entry is not a symlink or the target is
// not valid UTF-8.
fn symlink(&self) -> Option<String> {
let c_str: &CStr = unsafe {
let ptr = ffi::archive_entry_symlink(self.e);
if ptr.is_null() {
return None;
}
CStr::from_ptr(ptr)
};
str::from_utf8(c_str.to_bytes()).map(str::to_string).ok()
}
// Hardlink target (with a '/' prepended), or None if this entry is not a
// hardlink or the target is not valid UTF-8.
fn hardlink(&self) -> Option<String> {
let c_str: &CStr = unsafe {
let ptr = ffi::archive_entry_hardlink(self.e);
if ptr.is_null() {
return None;
}
CStr::from_ptr(ptr)
};
// Hard links have the same name as an earlier pathname(), and those typically don't have a
// preceding slash. Add this slash here so that the same resolution logic can be used for
// both hardlinks and symlinks. I really don't care about the difference between these two.
str::from_utf8(c_str.to_bytes()).map(|p| format!("/{}", p)).ok()
}
// Maps the entry to the simplified FileType enum.
pub fn filetype(&self) -> FileType {
// If it has a symlink/hardlink path, then just consider it a link regardless of what
// _filetype() says.
if let Some(l) = self.symlink().or(self.hardlink()) {
return FileType::Link(l);
}
match unsafe { ffi::archive_entry_filetype(self.e) } {
ffi::AE_IFDIR => FileType::Directory,
ffi::AE_IFREG => FileType::File,
// Chardevs, sockets, fifos, etc. — and links with non-UTF8 targets above.
_ => FileType::Other,
}
}
}
// Standard Read interface over the current entry's file data, so callers can
// use read_to_string() etc. directly on an entry.
impl<'a> Read for ArchiveEntry<'a> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
self.a.read(buf)
}
}
// We can't provide an Iterator object for ArchiveEntries because Rust doesn't support streaming
// iterators. Let's instead provide a walk function for convenience.
// cb should return Ok(true) to continue, Ok(false) to break.
// Errors from cb or from reading the next header are propagated to the caller.
pub fn walk<F>(ent: Option<ArchiveEntry>, mut cb: F) -> Result<()>
where F: FnMut(&mut ArchiveEntry) -> Result<bool>
{
let mut ent = ent;
while let Some(mut e) = ent {
if !try!(cb(&mut e)) {
break;
}
// next() consumes the entry and yields the following one (None at EOF).
ent = try!(e.next());
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use env_logger;
use std;
use std::io::Read;
use std::fs::File;
fn test_read(r: &mut Reader) {
let mut f = open_file("tests/testarchive.tar.xz").unwrap();
let mut files = Vec::new();
r.read(&mut f,
|p| p.starts_with("man/man"),
|p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) }
).unwrap();
assert_eq!(files, vec!["man/man3/helloworld.3".to_string()]);
}
fn test_resolve_links(r: &mut Reader) {
let res = |p| {
if let Some(&EntryType::Link(ref l)) = r.seen.get(p) {
r.resolve_link(p, &l, 5)
} else {
panic!("Not found or not a link: {}", p);
}
};
let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()]));
assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()])));
assert_eq!(res("man/man6/hardlink.6"), helloworld);
assert_eq!(res("man/man1/symlinkbefore.1"), helloworld);
assert_eq!(res("man/man6/symlinkafter.6"), helloworld);
assert_eq!(res("man/man1/badsymlink1.1"), None);
assert_eq!(res("man/man1/badsymlink2.1"), None);
assert_eq!(res("man/man1/badsymlink3.1"), None);
assert_eq!(res("man/man1/badsymlink4.1"), None);
assert_eq!(res("man/man1/badsymlink5.1"), None);
assert_eq!(res("man/man1/doublesymlink1.1"), helloworld);
assert_eq!(res("man/man1/doublesymlink2.1"), helloworld);
assert_eq!(res("man/man1/triplesymlink.1"), helloworld);
assert_eq!(res("man/man1/infinitesymlink.1"), None);
}
fn test_links(r: &mut Reader) {
let mut links = Vec::new();
r.links(|p,d| links.push((p.to_string(), d.to_string())));
links.sort();
{
let mut res = |p:&str| {
let r = links.remove(0);
assert_eq!(r.0, p.to_string());
assert_eq!(r.1, "man/man3/helloworld.3".to_string());
};
res("man/man1/doublesymlink1.1");
res("man/man1/doublesymlink2.1");
res("man/man1/symlinkbefore.1");
res("man/man1/triplesymlink.1");
res("man/man6/hardlink.6");
res("man/man6/symlinkafter.6");
}
assert_eq!(links.len(), 0);
}
fn test_reread(r: &mut Reader) {
assert!(r.need_reread());
let mut f = open_file("tests/testarchive.tar.xz").unwrap();
let mut files = Vec::new();
r.reread(&mut f,
|p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) }
).unwrap();
files.sort();
assert_eq!(files, vec![
"man/man3/needreread.3".to_string(),
"man/man6/needreread.6".to_string()
]);
#[test]
fn invalid_archive() {
let mut r = std::io::repeat(0x0a).take(64*1024);
let ent = Archive::open_archive(&mut r);
assert!(ent.is_err());
}
#[test]
fn test_reader() {
env_logger::init().unwrap();
fn zerolength_archive() {
let mut r = std::io::empty();
let ent = Archive::open_archive(&mut r);
// I expected an error here rather than None, whatever.
assert!(ent.unwrap().is_none());
}
let mut r = Reader::new();
test_read(&mut r);
test_resolve_links(&mut r);
test_links(&mut r);
test_reread(&mut r);
#[test]
fn read() {
let mut f = File::open("tests/simpletest.tar.gz").unwrap();
let mut ent = Archive::open_archive(&mut f).unwrap().unwrap();
let t = |e:&mut ArchiveEntry, path, size, ft, cont| {
assert_eq!(e.path(), path);
assert_eq!(e.size(), size);
assert_eq!(e.filetype(), ft);
let mut contents = String::new();
assert_eq!(e.read_to_string(&mut contents).unwrap(), size);
assert_eq!(&contents, cont);
};
t(&mut ent, Some("simple"), 0, FileType::Directory, "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/file"), 3, FileType::File, "Hi\n");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/link"), 0, FileType::Link("file".to_string()), "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/hardlink"), 0, FileType::Link("/simple/file".to_string()), "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/fifo"), 0, FileType::Other, "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, None, 0, FileType::File, "");
assert!(ent.next().unwrap().is_none());
}
}

363
indexer/src/archread.rs Normal file
View file

@ -0,0 +1,363 @@
use std::io::Result;
use std::collections::HashMap;
use archive::{walk,ArchiveEntry,FileType};
/* I had hoped that reading man pages from an archive would just be a simple:
*
* 1. Walk through all files in the archive in a streaming fashion
* 2. Parse/index man pages
*
* But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to...
*
* 1. Walk through all entries in the archive in a streaming fashion
* 2. Parse/index regular file man pages
* 3. Keep track of all paths in the archive
* 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file
* 5. Read the entire damn archive again if one of the links resolved to a file that was not
* recognized as a man page in step (2). Luckily, this isn't very common.
*
* And this doesn't even cover the problem of duplicate entries in a tar, which is also quite
* annoying to handle.
*
* What annoys me the most about all of this is that it's not possible to stream an archive from
* the network and read/index the entire thing in a single step. Now we either have to buffer
* packages to disk or redownload the archive in order to be able to follow all links to man pages.
*
* (Note that it is possible to resolve links while walking through the entries, which will allow
* us to match files found later in the archive against links found earlier, thus potentially
* saving the need to read the archive a second time. This is merely a performance improvement for
* an uncommon case, and it certainly won't simplify the code)
*
* (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the
* need for a second archive read, but that's going to significantly slow down the common case in
* order to handle a rare case. It's possible to further optimize this using some heuristics to
* determine whether a file is potentially a man page, but that's both complex and may not even
* save much)
*
* (* So apparently some man pages are close to 10MB...)
*/
// Classification of an archive path as recorded in FileList::seen; used by the
// link-resolution logic to decide whether a link target is usable.
#[derive(Clone,Debug,PartialEq,Eq)]
pub enum EntryType {
// Regular file that has been handled/indexed
Handled,
// Regular file that hasn't been handled because the caller wasn't interested in it. Could
// still be an interesting file if it is referenced from an interesting path.
Regular,
// Link to another file (interesting or not is irrelevant)
Link(String),
// Directory; need this information when resolving links
Directory,
// Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to
// this we know we're done.
Other,
}
// Result of the first pass over an archive: every path seen (with its type)
// plus the links the caller flagged as interesting. Consumed by links().
pub struct FileList {
// List of seen files. This is used to resolve links
seen: HashMap<String, EntryType>,
// List of interesting links
links: Vec<String>,
}
// Maps a resolved file path to the interesting link paths pointing at it, for
// files that were skipped in the first pass and require re-reading the archive.
pub struct MissedFiles(HashMap<String, Vec<String>>);
impl FileList {
/* Read an archive until the end. Accepts two callbacks:
*
* interest_cb: Called on every path in the archive, should return whether the file is
* interesting (i.e. whether we want to know its contents).
* file_cb: Called on every regular file for which interest_cb() showed an interest.
* The callback accepts multiple path names, but this function will only provide one.
*
* Returns a FileList struct that can be used to retrieve all interesting non-regular files.
*/
pub fn read<F,G>(ent: Option<ArchiveEntry>, interest_cb: F, mut file_cb: G) -> Result<FileList>
where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()>
{
let mut fl = FileList {
seen: HashMap::new(),
links: Vec::new(),
};
try!(walk(ent, |mut e| {
let path = match e.path() {
// Entries with non-UTF8 names are skipped but the walk continues.
Some(x) => x.to_string(),
None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) }
};
let ft = e.filetype();
trace!("Archive entry: {:10} {} {:?}", e.size(), path, ft);
// We ought to throw away the result of the previous entry with the same name and use
// this new entry instead, but fuck it. This case is too rare, so let's just warn.
if let Some(_) = fl.seen.get(&path) {
warn!("Duplicate file entry: {}", path);
return Ok(true);
}
let et = match ft {
FileType::File => {
if interest_cb(&path) {
let pathv = [&path as &str];
try!(file_cb(&pathv[..], &mut e));
EntryType::Handled
} else {
EntryType::Regular
}
},
FileType::Link(l) => {
// Only record the link as interesting; its target is resolved
// later in links(), when all paths are known.
if interest_cb(&path) {
fl.links.push(path.clone());
}
EntryType::Link(l)
},
FileType::Directory => EntryType::Directory,
FileType::Other => EntryType::Other,
};
fl.seen.insert(path, et);
Ok(true)
}));
Ok(fl)
}
// This is basically realpath(), using the virtual filesystem in self.seen.
// This method is not particularly efficient, it allocates like crazy.
// Returns the type and path components of the final target, or None if the
// link is broken, too deep, or passes through a non-directory component.
fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec<String>)> {
if depth < 1 {
warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path);
return None
}
// Remove filename from the base
let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None };
// Absolute targets are resolved from the archive root, relative ones
// from the link's own directory.
let comp : Vec<&str> =
if path.starts_with('/') { path.split('/').collect() }
else { basedir.split('/').chain(path.split('/')).collect() };
let mut dest = Vec::new();
for (i, &c) in comp.iter().enumerate() {
// Empty components (from "//" or a leading '/') and "." are no-ops.
if c == "" || c == "." {
continue;
}
if c == ".." {
if dest.len() > 1 {
dest.pop();
}
continue;
}
dest.push(c.to_string());
let curpath = dest.join("/");
match self.seen.get(&curpath) {
// If it's a directory, we're good
Some(&EntryType::Directory) => (),
// If it's a file or man page, it must be the last item.
Some(& ref x@ EntryType::Regular) |
Some(& ref x@ EntryType::Handled) => return
if i == comp.len()-1 {
Some((x.clone(), dest))
} else {
warn!("Unresolved link: {} -> {}; Non-directory component", base, path);
None
},
// Links... Ugh
Some(&EntryType::Link(ref d)) => {
// Recurse to resolve the intermediate link, burning one depth unit.
match self.resolve_link(&curpath, &d, depth-1) {
// Same as above, with dirs we can continue, files have to be last
Some((EntryType::Directory, d)) => dest = d,
x@Some((EntryType::Regular, _)) |
x@Some((EntryType::Handled, _)) => return
if i == comp.len()-1 { x }
else {
warn!("Unresolved link: {} -> {}; Non-directory link component", base, path);
None
},
_ => return None,
}
},
// Don't care about anything else, just stop.
_ => {
warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path);
return None
}
}
}
// All components resolved to directories (or the path was empty).
Some((EntryType::Directory, dest))
}
/* Calls cb() on every 'interesting' link to a file that has already been passed to a file_cb()
* in FileList::read().
* If there are any interesting links that have not yet been passed to file_cb(), a MissedFiles
* struct is returned that can be used to retrieve those files by re-reading the archive.
*/
pub fn links<F>(self, mut cb: F) -> Option<MissedFiles> where F: FnMut(&str, &str) {
let mut missed = HashMap::new();
for p in self.links.iter() {
// Every path in self.links was inserted as a Link in read(), hence unreachable!().
let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() };
match self.resolve_link(&p, dest, 32) {
Some((EntryType::Handled, d)) => {
let dstr = d.join("/");
cb(&p, &dstr);
},
// Target exists but wasn't handled in the first pass; record it for a re-read.
Some((EntryType::Regular, d)) => {
let dstr = d.join("/");
missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string());
}
// Broken/uninteresting links are silently dropped (already warned about).
_ => (),
}
}
if missed.len() > 0 {
Some(MissedFiles(missed))
} else {
None
}
}
}
impl MissedFiles {
/* Reads the archive again and calls file_cb() on every interesting file that was missed during
* the first read of the archive (using FileList::{read,links}). file_cb is exactly the same as
* in FileList::read, but this time it can actually get multiple paths as first argument; which
* happens when multiple interesting links point to the same file. */
pub fn read<G>(mut self, ent: Option<ArchiveEntry>, mut file_cb: G) -> Result<()>
where G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()>
{
walk(ent, |mut e| {
// Entries with unreadable paths or no pending links are simply skipped.
if let Some(f) = e.path().and_then(|p| self.0.remove(p)) {
let v: Vec<&str> = f.iter().map(|x| x as &str).collect();
try!(file_cb(&v, &mut e))
}
// Stop walking as soon as every missed file has been delivered.
Ok(self.0.len() > 0)
})
}
}
// Integration tests for FileList/MissedFiles, driven by the fixed archive in
// tests/testarchive.tar.xz (see the fixture-generation script for its layout).
#[cfg(test)]
mod tests {
    use super::*;
    use archive::Archive;
    use std::io::Read;
    use std::fs::File;

    // First pass over the archive: only "man/man*" paths are interesting, and
    // exactly one regular file (helloworld.3) must reach the file callback.
    fn test_read() -> FileList {
        let mut f = File::open("tests/testarchive.tar.xz").unwrap();
        let arch = Archive::open_archive(&mut f).unwrap();
        let mut cnt = 0;
        FileList::read(arch,
            |p| p.starts_with("man/man"),
            |p,e| {
                // The file callback must fire exactly once.
                assert_eq!(cnt, 0);
                cnt += 1;
                assert_eq!(p, &["man/man3/helloworld.3"][..]);
                assert_eq!(e.size(), 12);
                let mut cont = String::new();
                e.read_to_string(&mut cont).unwrap();
                assert_eq!(&cont, "Hello World\n");
                Ok(())
            }
        ).unwrap()
    }

    // Exercises resolve_link() directly on each link entry in the archive:
    // good links (hard, sym, chained) resolve to helloworld.3; broken,
    // out-of-tree and cyclic links resolve to None.
    fn test_resolve_links(r: &FileList) {
        let res = |p| {
            if let Some(&EntryType::Link(ref l)) = r.seen.get(p) {
                r.resolve_link(p, &l, 5)
            } else {
                panic!("Not found or not a link: {}", p);
            }
        };
        let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()]));
        assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()])));
        assert_eq!(res("man/man6/hardlink.6"), helloworld);
        assert_eq!(res("man/man1/symlinkbefore.1"), helloworld);
        assert_eq!(res("man/man6/symlinkafter.6"), helloworld);
        assert_eq!(res("man/man1/badsymlink1.1"), None);
        assert_eq!(res("man/man1/badsymlink2.1"), None);
        assert_eq!(res("man/man1/badsymlink3.1"), None);
        assert_eq!(res("man/man1/badsymlink4.1"), None);
        assert_eq!(res("man/man1/badsymlink5.1"), None);
        assert_eq!(res("man/man1/doublesymlink1.1"), helloworld);
        assert_eq!(res("man/man1/doublesymlink2.1"), helloworld);
        assert_eq!(res("man/man1/triplesymlink.1"), helloworld);
        assert_eq!(res("man/man1/infinitesymlink.1"), None);
    }

    // links() must report every link that resolves to the already-handled
    // helloworld.3, and return the MissedFiles set for the rest.
    fn test_links(r: FileList) -> Option<MissedFiles> {
        let mut links = Vec::new();
        let missed = r.links(|p,d| links.push((p.to_string(), d.to_string())));
        links.sort();
        {
            // Pops the front of the sorted list, so the res() calls below must
            // be in lexicographic order of the link path.
            let mut res = |p:&str| {
                let r = links.remove(0);
                assert_eq!(r.0, p.to_string());
                assert_eq!(r.1, "man/man3/helloworld.3".to_string());
            };
            res("man/man1/doublesymlink1.1");
            res("man/man1/doublesymlink2.1");
            res("man/man1/symlinkbefore.1");
            res("man/man1/triplesymlink.1");
            res("man/man6/hardlink.6");
            res("man/man6/symlinkafter.6");
        }
        assert_eq!(links.len(), 0);
        missed
    }

    // Second pass: MissedFiles::read() must deliver the contents of the two
    // needreread files that interesting links pointed at but the first pass skipped.
    fn test_reread(r: MissedFiles) {
        let mut f = File::open("tests/testarchive.tar.xz").unwrap();
        let ent = Archive::open_archive(&mut f).unwrap();
        let mut files = Vec::new();
        r.read(ent,
            |p,e| {
                let mut cont = String::new();
                e.read_to_string(&mut cont).unwrap();
                files.extend(p.iter().map(|x| (x.to_string(), cont.clone()) ));
                Ok(())
            }
        ).unwrap();
        files.sort();
        {
            // Same pop-front pattern as in test_links(): keep calls sorted.
            let mut res = |a:&str, b:&str| {
                let r = files.remove(0);
                assert_eq!(&r.0, a);
                assert_eq!(&r.1, b);
            };
            res("man/man3/needreread.3", "Potentially interesting file\n");
            res("man/man6/needreread.6", "Potentially interesting file\n");
        }
        assert_eq!(files.len(), 0);
    }

    // Drives the full read -> resolve -> links -> re-read pipeline in order;
    // the helpers pass state (FileList, MissedFiles) along by value.
    #[test]
    fn test_reader() {
        //use env_logger;
        //env_logger::init().unwrap();
        let r = test_read();
        test_resolve_links(&r);
        let l = test_links(r).unwrap();
        test_reread(l);
    }
}

View file

@ -1,97 +1,15 @@
#[macro_use] extern crate log;
#[macro_use] extern crate lazy_static;
extern crate env_logger;
extern crate libarchive;
extern crate regex;
use regex::Regex;
extern crate libarchive3_sys;
extern crate libc;
mod archive;
// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page
// location, otherwise Some((manPageName, Section, Locale)).
fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
// Roughly: man[/locale]/man1/manpage.section[.compression]+
// TODO: lazy_static
let re = Regex::new(r"(?x)
man
(?: / ([^/]+) )? # Optional locale
/man[a-z0-9]/ # Subdir
([^/]+?) # Man page name (non-greedy)
\. ([^/\.]+) # Section
(?: \. (?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions
").unwrap();
let cap = match re.captures(path) { Some(x) => x, None => return None };
let locale = cap.at(1).unwrap_or("");
let name = cap.at(2).unwrap();
let section = cap.at(3).unwrap();
// Not everything matching the regex is necessarily a man page, exclude some special cases.
match (name, section, locale) {
// Files that totally aren't man pages
("Makefile", "in", _) |
("Makefile", "am", _) |
(".cvsignore", _, _) |
(_, "gz", _) |
(_, "lzma", _) |
(_, "bz2", _) |
(_, "xz", _) |
(_, "html", _) => None,
// Some weird directories that happen to match the locale
(n, s, "5man") |
(n, s, "c") |
(n, s, "man1") |
(n, s, "man2") |
(n, s, "man3") |
(n, s, "man4") |
(n, s, "man5") |
(n, s, "man6") |
(n, s, "man7") |
(n, s, "man8") |
(n, s, "Man-Part1") |
(n, s, "Man-Part2") => Some((n, s, "")),
// Nothing special!
x => Some(x)
}
}
mod archread;
mod man;
// Entry point: initialise logging, then (for now) just log a startup message.
fn main() {
    // init() only fails if a global logger was already installed; that would be a bug here.
    env_logger::init().unwrap();
    info!("Hello, world!");
}
// Checks parse_path() against hand-picked paths: generic hits and misses,
// the hard-coded exclusions, and paths seen in real-world packages.
#[test]
fn test_parse_path() {
    // Generic tests
    assert_eq!(parse_path("/"), None);
    assert_eq!(parse_path("/man1/ncdu.1"), None);
    assert_eq!(parse_path("/man/man?/ncdu.1"), None);
    assert_eq!(parse_path("/man/man1/ncdu.1"), Some(("ncdu", "1", "")));
    assert_eq!(parse_path("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz"), Some(("ncdu", "1", ""))); // This stuff happens
    assert_eq!(parse_path("/man/en_US.UTF-8/man1/ncdu.1"), Some(("ncdu", "1", "en_US.UTF-8")));
    // Special cases
    assert_eq!(parse_path("/usr/share/man/man1/INDEX"), None);
    assert_eq!(parse_path("/usr/share/man/man1/Makefile"), None);
    assert_eq!(parse_path("/usr/share/man/man1/Makefile.am"), None);
    assert_eq!(parse_path("/usr/share/man/man1/Makefile.in"), None);
    assert_eq!(parse_path("/usr/share/man/man1/.cvsignore"), None);
    assert_eq!(parse_path("/usr/share/man/man1/.cvsignore.gz"), None);
    // Some actual locations
    assert_eq!(parse_path("/usr/local/man/man1/list_audio_tracks.1.gz"), Some(("list_audio_tracks", "1", "")));
    assert_eq!(parse_path("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz"), Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", "")));
    assert_eq!(parse_path("/usr/man/man3/exit.3tk"), Some(("exit", "3tk", "")));
    assert_eq!(parse_path("/usr/local/brlcad/share/man/mann/exit.nged.gz"), Some(("exit", "nged", "")));
    assert_eq!(parse_path("/usr/X11R6/man/man3/intro.3xglut.gz"), Some(("intro", "3xglut", "")));
    assert_eq!(parse_path("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz"), Some(("intro", "3", "ko_KR.eucKR")));
    assert_eq!(parse_path("/usr/lib/scilab/man/Man-Part1/man1/ans.1"), Some(("ans", "1", "")));
    assert_eq!(parse_path("/heirloom/usr/share/man/5man/man1/chgrp.1.gz"), Some(("chgrp", "1", "")));
    assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None);
    assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None);
}

85
indexer/src/man.rs Normal file
View file

@ -0,0 +1,85 @@
use regex::Regex;
// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page
// location, otherwise Some((manPageName, Section, Locale)).
fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
    // Roughly: man[/locale]/man1/manpage.section[.compression]+
    lazy_static! {
        static ref RE: Regex = Regex::new(r"(?x)
        man
        (?: / ([^/]+) )?                 # Optional locale
        /man[a-z0-9]/                    # Subdir
        ([^/]+?)                         # Man page name (non-greedy)
        \. ([^/\.]+)                     # Section
        (?: \. (?: gz|lzma|bz2|xz ))* $  # Any number of compression extensions
    ").unwrap();
    }
    let cap = match RE.captures(path) { Some(c) => c, None => return None };
    // Group 1 (locale) is optional; groups 2 and 3 always exist on a match.
    let locale = cap.at(1).unwrap_or("");
    let name = cap.at(2).unwrap();
    let section = cap.at(3).unwrap();

    // Not everything matching the regex is necessarily a man page: reject
    // build files and files whose "section" is a compression/html extension.
    let not_a_page = match (name, section) {
        ("Makefile", "in") |
        ("Makefile", "am") => true,
        (".cvsignore", _) => true,
        (_, "gz") | (_, "lzma") | (_, "bz2") | (_, "xz") | (_, "html") => true,
        _ => false,
    };
    if not_a_page {
        return None
    }

    // Some weird directory names happen to land in the locale capture group;
    // treat those as "no locale".
    const FAKE_LOCALES: [&'static str; 12] = [
        "5man", "c",
        "man1", "man2", "man3", "man4", "man5", "man6", "man7", "man8",
        "Man-Part1", "Man-Part2",
    ];
    if FAKE_LOCALES.contains(&locale) {
        Some((name, section, ""))
    } else {
        // Nothing special!
        Some((name, section, locale))
    }
}
// Table-driven check of parse_path(): generic hits and misses, the
// hard-coded exclusions, and paths observed in real-world packages.
#[test]
fn test_parse_path() {
    let cases: &[(&str, Option<(&str, &str, &str)>)] = &[
        // Generic tests
        ("/", None),
        ("/man1/ncdu.1", None),
        ("/man/man?/ncdu.1", None),
        ("/man/man1/ncdu.1", Some(("ncdu", "1", ""))),
        ("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz", Some(("ncdu", "1", ""))), // This stuff happens
        ("/man/en_US.UTF-8/man1/ncdu.1", Some(("ncdu", "1", "en_US.UTF-8"))),
        // Special cases
        ("/usr/share/man/man1/INDEX", None),
        ("/usr/share/man/man1/Makefile", None),
        ("/usr/share/man/man1/Makefile.am", None),
        ("/usr/share/man/man1/Makefile.in", None),
        ("/usr/share/man/man1/.cvsignore", None),
        ("/usr/share/man/man1/.cvsignore.gz", None),
        // Some actual locations
        ("/usr/local/man/man1/list_audio_tracks.1.gz", Some(("list_audio_tracks", "1", ""))),
        ("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz", Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))),
        ("/usr/man/man3/exit.3tk", Some(("exit", "3tk", ""))),
        ("/usr/local/brlcad/share/man/mann/exit.nged.gz", Some(("exit", "nged", ""))),
        ("/usr/X11R6/man/man3/intro.3xglut.gz", Some(("intro", "3xglut", ""))),
        ("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz", Some(("intro", "3", "ko_KR.eucKR"))),
        ("/usr/lib/scilab/man/Man-Part1/man1/ans.1", Some(("ans", "1", ""))),
        ("/heirloom/usr/share/man/5man/man1/chgrp.1.gz", Some(("chgrp", "1", ""))),
        ("/usr/local/plan9/man/man8/index.html", None),
        ("/usr/local/share/doc/gmt/html/man/grdpaste.html", None),
    ];
    for &(input, expected) in cases {
        assert_eq!(parse_path(input), expected);
    }
}

View file

@ -4,6 +4,19 @@
# way. The tests will fail quite badly if hardlink.6 is considered the
# "original" version.
# Build simpletest.tar.gz: a regular file, a symlink, a hardlink, a fifo, and
# a file whose name is deliberately NOT valid UTF-8 (ISO-8859-1 encoded), to
# exercise the special-file and bad-filename paths in the archive reader.
mkdir simple
echo Hi >simple/file
ln -s file simple/link
ln simple/file simple/hardlink
mkfifo simple/fifo
# iconv converts the UTF-8 literal to Latin-1; the command substitution strips
# the trailing newline. Quote every expansion so the odd bytes survive intact.
badfn=`echo 'Héllö.txt' | iconv -t ISO-8859-1`
touch "$badfn"
tar -czf simpletest.tar.gz simple "$badfn"
rm -rf "$badfn" simple
mkdir man
cd man

Binary file not shown.

Binary file not shown.