use std::io::Result; use std::collections::HashMap; use archive::{walk,ArchiveEntry,FileType}; /* I had hoped that reading man pages from an archive would just be a simple: * * 1. Walk through all files in the archive in a streaming fashion * 2. Parse/index man pages * * But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to... * * 1. Walk through all entries in the archive in a streaming fashion * 2. Parse/index regular file man pages * 3. Keep track of all paths in the archive * 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file * 5. Read the entire damn archive again if one of the links resolved to a file that was not * recognized as a man page in step (2). Luckily, this isn't very common. * * And this doesn't even cover the problem of duplicate entries in a tar, which is also quite * annoying to handle. * * What annoys me the most about all of this is that it's not possible to stream an archive from * the network and read/index the entire thing in a single step. Now we either have to buffer * packages to disk or redownload the archive in order to be able to follow all links to man pages. * * (Note that it is possible to resolve links while walking through the entries, which will allow * us to match files found later in the archive against links found earlier, thus potentially * saving the need to read the archive a second time. This is merely a performance improvement for * an uncommon case, and it certainly won't simplify the code) * * (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the * need for a second archive read, but that's going to significantly slow down the common case in * order to handle a rare case. It's possible to further optimize this using some heuristics to * determine whether a file is potentially a man page, but that's both complex and may not even * save much) * * (* So apparently some man pages are close to 10MB...) */ /* How links are handled by libarchive and how we deal with that: * * symlinks: * Simply seen as a file entry containing a path that can be resolved to another file entry. Path * can be absolute or relative. Symlink can come before or after the referenced file. * (Handling of this is described above) * * tar hardlinks: * The first occurrence of the file is seen as a regular file. For any future occurrences, * libarchive will give us the path of the first occurrence in archive_entry_hardlink(), which * archive.rs will then merge into a Link() entry. * Basically, hard links can be handled in the exact same way as symlinks. * * cpio hardlinks (old): Hard links are duplicated. No special handling needed. * * cpio hardlinks (new): * A. The first occurrence of the file is seen as a regular file with size=0, nlinks>1 * B. Any later occurrences of the file are seen as a hardlink to the first occurrence. * C. The last occurrence of the file is also seen as a hardlink to the first occurrence, but * actually has the correct file size and body. * How to handle: * - Entry (A) is inserted in the FileList as an 'Hardlink' * - Entry (B) is inserted as a normal 'Link' to (A) * - Entry (C), detected by having size>0, and handled as follows: * - The entry itself is inserted as a normal 'Link' to (A) * - If (A) is interesting, the contents of the entry are processed with (A)'s path, and (A)'s * entry is changed to 'Handled' * - Then you end up with one 'Handled' entry and several 'Link' entries pointing to it. The link * resolution mechanism will handle the rest. * Note that this will not work if (A)'s path itself is not interesting. In that case the archive * will be read a second time, but all entries pointing to (A), either through hardlinks or * symlinks pointing to hardlinks, will resolve to the empty file entry. I strongly doubt there * is a package archive like that. */ #[derive(Clone,Debug,PartialEq,Eq)] pub enum EntryType { // Regular file that has been handled/indexed Handled, // Regular file that hasn't been handled because the caller wasn't interested in it. Could // still be an interesting file if it is referenced from an interesting path. Regular, // Link to another file (interesting or not is irrelevant) Link(String), // A hardlink for which we don't have the file contents (yet) Hardlink, // Directory; need this information when resolving links Directory, // Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to // this we know we're done. Other, } pub struct FileList { // List of seen files. This is used to resolve links seen: HashMap, // List of interesting links links: Vec, } pub struct MissedFiles(HashMap>); impl FileList { /* Read an archive until the end. Accepts two callbacks: * * interest_cb: Called on every path in the archive, should return whether the file is * interesting (i.e. whether we want to know its contents). * entry_cb: Called once on every entry in the archive. * file_cb: Called on every regular file for which interest_cb() showed an interest. * The callback accepts multiple path names, but this function will only provide one. * * Returns a FileList struct that can be used to retreive all interesting non-regular files. */ pub fn read(ent: Option, interest_cb: F, mut entry_cb: G, mut file_cb: H) -> Result where F: Fn(&str) -> bool, G: FnMut(&ArchiveEntry), H: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> { let mut fl = FileList { seen: HashMap::new(), links: Vec::new(), }; try!(walk(ent, |mut e| { let path = match e.path() { Some(x) => x.to_string(), None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) } }; let ft = e.filetype(); trace!("Archive entry: {:10} #{} {} {:?}", e.size(), e.nlink(), path, ft); // We ought to throw away the result of the previous entry with the same name and use // this new entry instead, but fuck it. This case is too rare, so let's just warn. if let Some(_) = fl.seen.get(&path) { warn!("Duplicate file entry: {}", path); return Ok(true); } entry_cb(&e); let et = match ft { FileType::File => { if e.size() == 0 && e.nlink() > 1 { EntryType::Hardlink } else if interest_cb(&path) { let pathv = [&path as &str]; try!(file_cb(&pathv[..], &mut e)); EntryType::Handled } else { EntryType::Regular } }, FileType::Link(l) => { // cpio hard link for which we have the file contents if e.size() > 0 && e.nlink() > 1 { if let Some((EntryType::Hardlink, ocomp)) = fl.resolve_link(&path, &l, 32) { let opath = ocomp.join("/"); if interest_cb(&opath) { let pathv = [&opath as &str]; try!(file_cb(&pathv[..], &mut e)); *fl.seen.get_mut(&opath).unwrap() = EntryType::Handled; } } } if interest_cb(&path) { fl.links.push(path.clone()); } EntryType::Link(l) }, FileType::Directory => EntryType::Directory, FileType::Other => EntryType::Other, }; fl.seen.insert(path, et); Ok(true) })); Ok(fl) } // This is basically realpath(), using the virtual filesystem in self.seen. // This method is not particularly efficient, it allocates like crazy. fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec)> { if depth < 1 { warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path); return None } // Remove filename from the base let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None }; let comp : Vec<&str> = if path.starts_with('/') { path.split('/').collect() } else { basedir.split('/').chain(path.split('/')).collect() }; let mut dest = Vec::new(); for (i, &c) in comp.iter().enumerate() { if c == "" || c == "." { continue; } if c == ".." { if dest.len() > 1 { dest.pop(); } continue; } dest.push(c.to_string()); let curpath = dest.join("/"); match self.seen.get(&curpath) { // If it's a directory, we're good Some(&EntryType::Directory) => (), // If we didn't find anything but this isn't the last item, then it's possible that // this is a directory for which no explicit entry was created. This happens with // RPM files. Just continue and see if we find anything if we add more components. None if i != comp.len()-1 => (), // If it's a file or man page, it must be the last item. Some(& ref x@ EntryType::Regular) | Some(& ref x@ EntryType::Hardlink) | Some(& ref x@ EntryType::Handled) => return if i == comp.len()-1 { Some((x.clone(), dest)) } else { warn!("Unresolved link: {} -> {}; Non-directory component", base, path); None }, // Links... Ugh Some(&EntryType::Link(ref d)) => { match self.resolve_link(&curpath, &d, depth-1) { // Same as above, with dirs we can continue, files have to be last Some((EntryType::Directory, d)) => dest = d, x@Some((EntryType::Regular, _)) | x@Some((EntryType::Hardlink, _)) | x@Some((EntryType::Handled, _)) => return if i == comp.len()-1 { x } else { warn!("Unresolved link: {} -> {}; Non-directory link component", base, path); None }, _ => return None, } }, // Don't care about anything else, just stop. _ => { warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path); return None } } } Some((EntryType::Directory, dest)) } /* Calls cb() on every 'interesting' link to a file that has already been passed to a file_cb() * in FileList::read(). * If there are any interesting links that have not yet been passed to file_cb(), a MissedFiles * struct is returned that can be used to retrieve those files by re-reading the archive. */ pub fn links(self, mut cb: F) -> Option where F: FnMut(&str, &str) { let mut missed = HashMap::new(); for p in self.links.iter() { let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() }; match self.resolve_link(&p, dest, 32) { Some((EntryType::Handled, d)) => { let dstr = d.join("/"); cb(&p, &dstr); }, Some((EntryType::Regular, d)) => { let dstr = d.join("/"); missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); }, Some((EntryType::Hardlink, _)) => error!("Interesting link resolved to uninteresting hardlink: {}-> {}", p, dest), _ => (), } } if missed.len() > 0 { Some(MissedFiles(missed)) } else { None } } } impl MissedFiles { /* Reads the archive again and calls file_cb() on every interesting file that was missed during * the first read of the archive (using FileList::{read,links}). file_cb is exactly the same as * in FileList::read, but this time it can actually get multiple paths as first argument; which * happens when multiple interesting links point to the same file. */ pub fn read(mut self, ent: Option, mut file_cb: G) -> Result<()> where G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> { walk(ent, |mut e| { if let Some(f) = e.path().and_then(|p| self.0.remove(p)) { let v: Vec<&str> = f.iter().map(|x| x as &str).collect(); try!(file_cb(&v, &mut e)) } Ok(self.0.len() > 0) }) } } #[cfg(test)] mod tests { use super::*; use archive::Archive; use std::io::Read; use std::fs::File; fn test_read() -> FileList { let mut f = File::open("tests/testarchive.tar.xz").unwrap(); let arch = Archive::open_archive(&mut f).unwrap(); let mut cnt = 0; FileList::read(arch, |p| p.starts_with("man/man"), |_| (), |p,e| { assert_eq!(cnt, 0); cnt += 1; assert_eq!(p, &["man/man3/helloworld.3"][..]); assert_eq!(e.size(), 12); let mut cont = String::new(); e.read_to_string(&mut cont).unwrap(); assert_eq!(&cont, "Hello World\n"); Ok(()) } ).unwrap() } fn test_resolve_links(r: &FileList) { let res = |p| { if let Some(&EntryType::Link(ref l)) = r.seen.get(p) { r.resolve_link(p, &l, 5) } else { panic!("Not found or not a link: {}", p); } }; let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()])); assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()]))); assert_eq!(res("man/man6/hardlink.6"), helloworld); assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); assert_eq!(res("man/man6/symlinkafter.6"), helloworld); //assert_eq!(res("man/man1/badsymlink1.1"), None); assert_eq!(res("man/man1/badsymlink2.1"), None); assert_eq!(res("man/man1/badsymlink3.1"), None); assert_eq!(res("man/man1/badsymlink4.1"), None); assert_eq!(res("man/man1/badsymlink5.1"), None); assert_eq!(res("man/man1/doublesymlink1.1"), helloworld); assert_eq!(res("man/man1/doublesymlink2.1"), helloworld); assert_eq!(res("man/man1/triplesymlink.1"), helloworld); assert_eq!(res("man/man1/infinitesymlink.1"), None); } fn test_links(r: FileList) -> Option { let mut links = Vec::new(); let missed = r.links(|p,d| links.push((p.to_string(), d.to_string()))); links.sort(); { let mut res = |p:&str| { let r = links.remove(0); assert_eq!(r.0, p.to_string()); assert_eq!(r.1, "man/man3/helloworld.3".to_string()); }; res("man/man1/badsymlink1.1"); res("man/man1/doublesymlink1.1"); res("man/man1/doublesymlink2.1"); res("man/man1/symlinkbefore.1"); res("man/man1/triplesymlink.1"); res("man/man6/hardlink.6"); res("man/man6/symlinkafter.6"); } assert_eq!(links.len(), 0); missed } fn test_reread(r: MissedFiles) { let mut f = File::open("tests/testarchive.tar.xz").unwrap(); let ent = Archive::open_archive(&mut f).unwrap(); let mut files = Vec::new(); r.read(ent, |p,e| { let mut cont = String::new(); e.read_to_string(&mut cont).unwrap(); files.extend(p.iter().map(|x| (x.to_string(), cont.clone()) )); Ok(()) } ).unwrap(); files.sort(); { let mut res = |a:&str, b:&str| { let r = files.remove(0); assert_eq!(&r.0, a); assert_eq!(&r.1, b); }; res("man/man3/needreread.3", "Potentially interesting file\n"); res("man/man6/needreread.6", "Potentially interesting file\n"); } assert_eq!(files.len(), 0); } #[test] fn test_reader() { //use env_logger; //env_logger::init().unwrap(); let r = test_read(); test_resolve_links(&r); let l = test_links(r).unwrap(); test_reread(l); } }