WIP: Rewritten man page indexer in Rust
Currently just figuring out how to read archives. Turns out to not be as simple as I had expected.
This commit is contained in:
parent
965aa9a2f6
commit
022e9acc4f
7 changed files with 669 additions and 0 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -2,3 +2,5 @@
|
||||||
!/lib/ManUtils/Build.PL
|
!/lib/ManUtils/Build.PL
|
||||||
!/lib/ManUtils/ManUtils.pm
|
!/lib/ManUtils/ManUtils.pm
|
||||||
!/lib/ManUtils/ManUtils.xs
|
!/lib/ManUtils/ManUtils.xs
|
||||||
|
indexer/target
|
||||||
|
|
||||||
|
|
|
||||||
143
indexer/Cargo.lock
generated
Normal file
143
indexer/Cargo.lock
generated
Normal file
|
|
@ -0,0 +1,143 @@
|
||||||
|
[root]
|
||||||
|
name = "indexer"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)",
|
||||||
|
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "0.5.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "env_logger"
|
||||||
|
version = "0.3.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "kernel32-sys"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libarchive"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "git+https://github.com/17dec/libarchive-rust#3f723cf0064561f21f0cebbd534a75076e6dbcaa"
|
||||||
|
dependencies = [
|
||||||
|
"libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libarchive3-sys"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libc"
|
||||||
|
version = "0.2.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "log"
|
||||||
|
version = "0.3.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "0.1.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pkg-config"
|
||||||
|
version = "0.3.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "0.1.77"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.3.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thread-id"
|
||||||
|
version = "2.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thread_local"
|
||||||
|
version = "0.2.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8-ranges"
|
||||||
|
version = "0.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi"
|
||||||
|
version = "0.2.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-build"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
|
||||||
|
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
|
||||||
|
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
|
||||||
|
"checksum libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)" = "<none>"
|
||||||
|
"checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7"
|
||||||
|
"checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8"
|
||||||
|
"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054"
|
||||||
|
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
|
||||||
|
"checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa"
|
||||||
|
"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665"
|
||||||
|
"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11"
|
||||||
|
"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03"
|
||||||
|
"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
|
||||||
|
"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"
|
||||||
|
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
|
||||||
|
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
|
||||||
10
indexer/Cargo.toml
Normal file
10
indexer/Cargo.toml
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
[package]
name = "indexer"
version = "0.1.0"
authors = ["Yorhel <git@yorhel.nl>"]

[dependencies]
env_logger = "0.3.5"
# Pulled straight from git rather than from crates.io; the exact commit that
# gets built is the one recorded in Cargo.lock.
libarchive = { git = "https://github.com/17dec/libarchive-rust" }
log = "0.3.6"
regex = "0.1.77"
|
||||||
368
indexer/src/archive.rs
Normal file
368
indexer/src/archive.rs
Normal file
|
|
@ -0,0 +1,368 @@
|
||||||
|
use std::path::Path;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use libarchive::reader::Reader as ArchiveReader;
|
||||||
|
use libarchive::reader::{FileReader,Builder};
|
||||||
|
use libarchive::archive::{Entry,FileType,ReadFormat,ReadFilter};
|
||||||
|
use libarchive::error::ArchiveResult;
|
||||||
|
|
||||||
|
|
||||||
|
pub fn open_file<T: AsRef<Path>>(path: T) -> ArchiveResult<FileReader> {
|
||||||
|
let mut builder = Builder::new();
|
||||||
|
try!(builder.support_format(ReadFormat::All));
|
||||||
|
try!(builder.support_filter(ReadFilter::All));
|
||||||
|
builder.open_file(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Classification of a single archive entry, as stored in Reader::seen.
#[derive(Clone,Debug,PartialEq,Eq)]
pub enum EntryType {
    // A regular file whose contents have already been handed to the caller's file callback.
    Handled,
    // A regular file that was skipped because the caller showed no interest in its path. It can
    // still turn out to be interesting when an interesting link resolves to it.
    Regular,
    // A link to another entry; the payload is the link target. Whether the link itself is
    // interesting is tracked elsewhere.
    Link(String),
    // A directory. Link resolution needs to know which path components are directories.
    Directory,
    // Anything that can never be an interesting file (chardev/socket/etc). A link that resolves
    // to one of these is a dead end.
    Other,
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* I had hoped that reading man pages from an archive would just be a simple:
|
||||||
|
*
|
||||||
|
* 1. Walk through all files in the archive in a streaming fashion
|
||||||
|
* 2. Parse/index man pages
|
||||||
|
*
|
||||||
|
* But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to...
|
||||||
|
*
|
||||||
|
* 1. Walk through all entries in the archive in a streaming fashion
|
||||||
|
* 2. Parse/index regular file man pages
|
||||||
|
* 3. Keep track of all paths in the archive
|
||||||
|
* 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file
|
||||||
|
* 5. Read the entire damn archive again if one of the links resolved to a file that was not
|
||||||
|
* recognized as a man page in step (2). Luckily, this isn't very common.
|
||||||
|
*
|
||||||
|
* And this doesn't even cover the problem of duplicate entries in a tar, which is also quite
|
||||||
|
* annoying to handle.
|
||||||
|
*
|
||||||
|
* What annoys me the most about all of this is that it's not possible to stream an archive from
|
||||||
|
* the network and read/index the entire thing in a single step. Now we have to buffer packages to
|
||||||
|
* disk in order to be able to read the archive a second time.
|
||||||
|
*
|
||||||
|
* (Note that it is possible to resolve links while walking through the entries, which will allow
|
||||||
|
* us to match files found later in the archive against links found earlier, thus potentially
|
||||||
|
* saving the need to read the archive a second time. This is merely a performance improvement for
|
||||||
|
* an uncommon case, and it certainly won't simplify the code)
|
||||||
|
*
|
||||||
|
* (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the
|
||||||
|
* need for a second archive read, but that's going to significantly slow down the common case in
|
||||||
|
* order to handle a rare case. It's possible to further optimize this using some heuristics to
|
||||||
|
* determine whether a file is potentially a man page, but that's both complex and may not even
|
||||||
|
* save much)
|
||||||
|
*
|
||||||
|
* (* So apparently some man pages are close to 10MB...)
|
||||||
|
*/
|
||||||
|
pub struct Reader {
|
||||||
|
// List of seen files. This is used to resolve links
|
||||||
|
seen: HashMap<String, EntryType>,
|
||||||
|
// List of interesting links
|
||||||
|
links: Vec<String>,
|
||||||
|
// List of files we have to read in a second walk through the archive
|
||||||
|
missedfiles: HashMap<String, Vec<String>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Generalized API:
|
||||||
|
// 1. Read once
|
||||||
|
// reader.read(file, interest_cb, file_cb) -> Error
|
||||||
|
// file: A libarchive::Reader
|
||||||
|
// interest_cb(path) -> bool
|
||||||
|
// Called on every file/link name, should return whether it's a file the caller is interested
|
||||||
|
// in. (e.g. parse_path(), but also +DESC and other metadata).
|
||||||
|
// file_cb(paths, reader) -> Error
|
||||||
|
// Called on every interesting (actual) file, given the list of (normalized) paths under
|
||||||
|
// which the file is known and the libarchive::Reader that has just read its header
|
||||||
|
//
|
||||||
|
// 2. Read links
|
||||||
|
// reader.links(link_cb)
|
||||||
|
// link_cb(path, dest)
|
||||||
|
// Called on every link which has as 'dest' a file path that has already been given to
|
||||||
|
// file_cb() before.
|
||||||
|
//
|
||||||
|
// 3. (Optionally) read a second time
|
||||||
|
// if reader.need_reread() {
|
||||||
|
// reader.reread(file, file_cb)
|
||||||
|
// }
|
||||||
|
impl Reader {
|
||||||
|
pub fn new() -> Reader {
|
||||||
|
Reader {
|
||||||
|
seen: HashMap::new(),
|
||||||
|
links: Vec::new(),
|
||||||
|
missedfiles: HashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convenience function to read the path/type/link from the next header.
|
||||||
|
fn read_header(rd: &mut ArchiveReader) -> Option<(String, EntryType)> {
|
||||||
|
let ent = match rd.next_header() {
|
||||||
|
Some(x) => x,
|
||||||
|
None => return None,
|
||||||
|
};
|
||||||
|
let path = ent.pathname().trim_left_matches('/').trim_left_matches("./").trim_right_matches('/').to_string();
|
||||||
|
|
||||||
|
// Hard links are apparently relative to the root of the archive.
|
||||||
|
let link = ent.hardlink().map(|x| format!("/{}", x))
|
||||||
|
.or(ent.symlink().map(str::to_string));
|
||||||
|
|
||||||
|
let(fts, ret) = match ent.filetype() {
|
||||||
|
FileType::BlockDevice => ("blk", EntryType::Other),
|
||||||
|
FileType::SymbolicLink => ("sym", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }),
|
||||||
|
FileType::Socket => ("sck", EntryType::Other),
|
||||||
|
FileType::CharacterDevice => ("chr", EntryType::Other),
|
||||||
|
FileType::Directory => ("dir", EntryType::Directory),
|
||||||
|
FileType::NamedPipe => ("fif", EntryType::Other),
|
||||||
|
FileType::Mount => ("mnt", EntryType::Other),
|
||||||
|
FileType::RegularFile => ("reg", EntryType::Regular),
|
||||||
|
FileType::Unknown => ("unk", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }),
|
||||||
|
};
|
||||||
|
|
||||||
|
trace!("Archive entry: {}{:10} bytes, path={:?} type={:?}", fts, ent.size(), path, ret);
|
||||||
|
Some((path, ret))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn read<F,G>(&mut self, rd: &mut ArchiveReader, interest_cb: F, mut file_cb: G) -> ArchiveResult<()>
|
||||||
|
where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()>
|
||||||
|
{
|
||||||
|
while let Some((path, t)) = Self::read_header(rd) {
|
||||||
|
// We ought to throw away the result of the previous entry with the same name and use
|
||||||
|
// this new entry instead, but fuck it. This case is too rare, so let's just warn! it.
|
||||||
|
if let Some(_) = self.seen.get(&path) {
|
||||||
|
warn!("Duplicate file entry: {}", path);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut newt = t;
|
||||||
|
match newt {
|
||||||
|
EntryType::Regular if interest_cb(&path) => {
|
||||||
|
let pathv = [&path as &str];
|
||||||
|
try!(file_cb(&pathv[..], rd));
|
||||||
|
newt = EntryType::Handled
|
||||||
|
},
|
||||||
|
EntryType::Link(_) if interest_cb(&path) => {
|
||||||
|
self.links.push(path.clone());
|
||||||
|
},
|
||||||
|
_ => ()
|
||||||
|
};
|
||||||
|
self.seen.insert(path, newt);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is basically realpath(), using the virtual filesystem in self.seen.
|
||||||
|
// This method is not particularly efficient, it allocates like crazy.
|
||||||
|
fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec<String>)> {
|
||||||
|
if depth < 1 {
|
||||||
|
warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path);
|
||||||
|
return None
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove filename from the base
|
||||||
|
let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None };
|
||||||
|
|
||||||
|
let comp : Vec<&str> =
|
||||||
|
if path.starts_with('/') { path.split('/').collect() }
|
||||||
|
else { basedir.split('/').chain(path.split('/')).collect() };
|
||||||
|
|
||||||
|
let mut dest = Vec::new();
|
||||||
|
|
||||||
|
for (i, &c) in comp.iter().enumerate() {
|
||||||
|
if c == "" || c == "." {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if c == ".." {
|
||||||
|
if dest.len() > 1 {
|
||||||
|
dest.pop();
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
dest.push(c.to_string());
|
||||||
|
let curpath = dest.join("/");
|
||||||
|
match self.seen.get(&curpath) {
|
||||||
|
|
||||||
|
// If it's a directory, we're good
|
||||||
|
Some(&EntryType::Directory) => (),
|
||||||
|
|
||||||
|
// If it's a file or man page, it must be the last item.
|
||||||
|
Some(& ref x@ EntryType::Regular) |
|
||||||
|
Some(& ref x@ EntryType::Handled) => return
|
||||||
|
if i == comp.len()-1 {
|
||||||
|
Some((x.clone(), dest))
|
||||||
|
} else {
|
||||||
|
warn!("Unresolved link: {} -> {}; Non-directory component", base, path);
|
||||||
|
None
|
||||||
|
},
|
||||||
|
|
||||||
|
// Links... Ugh
|
||||||
|
Some(&EntryType::Link(ref d)) => {
|
||||||
|
match self.resolve_link(&curpath, &d, depth-1) {
|
||||||
|
// Same as above, with dirs we can continue, files have to be last
|
||||||
|
Some((EntryType::Directory, d)) => dest = d,
|
||||||
|
x@Some((EntryType::Regular, _)) |
|
||||||
|
x@Some((EntryType::Handled, _)) => return
|
||||||
|
if i == comp.len()-1 { x }
|
||||||
|
else {
|
||||||
|
warn!("Unresolved link: {} -> {}; Non-directory link component", base, path);
|
||||||
|
None
|
||||||
|
},
|
||||||
|
_ => return None,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
// Don't care about anything else, just stop.
|
||||||
|
_ => {
|
||||||
|
warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path);
|
||||||
|
return None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some((EntryType::Directory, dest))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn links<F>(&mut self, mut cb: F) where F: FnMut(&str, &str) {
|
||||||
|
for p in self.links.iter() {
|
||||||
|
let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() };
|
||||||
|
|
||||||
|
match self.resolve_link(&p, dest, 32) {
|
||||||
|
Some((EntryType::Handled, d)) => {
|
||||||
|
let dstr = d.join("/");
|
||||||
|
cb(&p, &dstr)
|
||||||
|
},
|
||||||
|
Some((EntryType::Regular, d)) => {
|
||||||
|
let dstr = d.join("/");
|
||||||
|
self.missedfiles.entry(dstr).or_insert_with(Vec::new).push(p.to_string());
|
||||||
|
}
|
||||||
|
_ => {},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// We can reclaim this memory early.
|
||||||
|
self.links = Vec::new();
|
||||||
|
self.seen = HashMap::new();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn need_reread(&self) -> bool {
|
||||||
|
self.missedfiles.len() > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn reread<G>(&mut self, rd: &mut ArchiveReader, mut file_cb: G) -> ArchiveResult<()>
|
||||||
|
where G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()>
|
||||||
|
{
|
||||||
|
while let Some((path, _)) = Self::read_header(rd) {
|
||||||
|
if let Some(f) = self.missedfiles.remove(&path) {
|
||||||
|
let v: Vec<&str> = f.iter().map(|x| x as &str).collect();
|
||||||
|
try!(file_cb(&v, rd))
|
||||||
|
}
|
||||||
|
if self.missedfiles.len() < 1 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use env_logger;

    // Fixture generated by tests/mktar.sh
    const ARCHIVE: &'static str = "tests/testarchive.tar.xz";
    // The only file in the fixture that matches the interest filter used below
    const HELLOWORLD: &'static str = "man/man3/helloworld.3";

    // First pass: only the one real man page may reach the file callback.
    fn test_read(r: &mut Reader) {
        let mut archive = open_file(ARCHIVE).unwrap();
        let mut handled: Vec<String> = Vec::new();
        r.read(&mut archive,
            |path| path.starts_with("man/man"),
            |paths, _| {
                for p in paths {
                    handled.push(p.to_string());
                }
                Ok(())
            }
        ).unwrap();
        assert_eq!(handled, vec![HELLOWORLD.to_string()]);
    }

    // Exercises resolve_link() directly on the links stored by test_read().
    fn test_resolve_links(r: &mut Reader) {
        let resolve = |link: &str| {
            match r.seen.get(link) {
                Some(&EntryType::Link(ref target)) => r.resolve_link(link, target, 5),
                _ => panic!("Not found or not a link: {}", link),
            }
        };
        let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()]));

        // Link to a directory
        assert_eq!(resolve("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()])));

        // Plain links to the man page
        for &link in ["man/man6/hardlink.6", "man/man1/symlinkbefore.1", "man/man6/symlinkafter.6"].iter() {
            assert_eq!(resolve(link), helloworld);
        }

        // Links that must not resolve
        for &link in ["man/man1/badsymlink1.1", "man/man1/badsymlink2.1", "man/man1/badsymlink3.1",
                      "man/man1/badsymlink4.1", "man/man1/badsymlink5.1"].iter() {
            assert_eq!(resolve(link), None);
        }

        // Links that go through other links
        for &link in ["man/man1/doublesymlink1.1", "man/man1/doublesymlink2.1", "man/man1/triplesymlink.1"].iter() {
            assert_eq!(resolve(link), helloworld);
        }
        assert_eq!(resolve("man/man1/infinitesymlink.1"), None);
    }

    // links() has to report exactly the links that end up at the handled man page.
    fn test_links(r: &mut Reader) {
        let mut found = Vec::new();
        r.links(|p,d| found.push((p.to_string(), d.to_string())));
        found.sort();

        let expected: Vec<(String, String)> = [
            "man/man1/doublesymlink1.1",
            "man/man1/doublesymlink2.1",
            "man/man1/symlinkbefore.1",
            "man/man1/triplesymlink.1",
            "man/man6/hardlink.6",
            "man/man6/symlinkafter.6",
        ].iter().map(|l| (l.to_string(), HELLOWORLD.to_string())).collect();
        assert_eq!(found, expected);
    }

    // Second pass: the file that was skipped at first is delivered under both of its link names.
    fn test_reread(r: &mut Reader) {
        assert!(r.need_reread());

        let mut archive = open_file(ARCHIVE).unwrap();
        let mut fetched = Vec::new();
        r.reread(&mut archive,
            |paths, _| { fetched.extend(paths.iter().map(|p| p.to_string())); Ok(()) }
        ).unwrap();

        fetched.sort();
        assert_eq!(fetched, vec![
            "man/man3/needreread.3".to_string(),
            "man/man6/needreread.6".to_string()
        ]);
    }

    // The steps share a single Reader and depend on each other, so they run as one test in a
    // fixed order.
    #[test]
    fn test_reader() {
        env_logger::init().unwrap();

        let mut r = Reader::new();
        test_read(&mut r);
        test_resolve_links(&mut r);
        test_links(&mut r);
        test_reread(&mut r);
    }
}
|
||||||
97
indexer/src/main.rs
Normal file
97
indexer/src/main.rs
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
#[macro_use] extern crate log;
|
||||||
|
extern crate env_logger;
|
||||||
|
extern crate libarchive;
|
||||||
|
extern crate regex;
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
mod archive;
|
||||||
|
|
||||||
|
|
||||||
|
// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page
|
||||||
|
// location, otherwise Some((manPageName, Section, Locale)).
|
||||||
|
fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
|
||||||
|
// Roughly: man[/locale]/man1/manpage.section[.compression]+
|
||||||
|
// TODO: lazy_static
|
||||||
|
let re = Regex::new(r"(?x)
|
||||||
|
man
|
||||||
|
(?: / ([^/]+) )? # Optional locale
|
||||||
|
/man[a-z0-9]/ # Subdir
|
||||||
|
([^/]+?) # Man page name (non-greedy)
|
||||||
|
\. ([^/\.]+) # Section
|
||||||
|
(?: \. (?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions
|
||||||
|
").unwrap();
|
||||||
|
|
||||||
|
let cap = match re.captures(path) { Some(x) => x, None => return None };
|
||||||
|
let locale = cap.at(1).unwrap_or("");
|
||||||
|
let name = cap.at(2).unwrap();
|
||||||
|
let section = cap.at(3).unwrap();
|
||||||
|
|
||||||
|
// Not everything matching the regex is necessarily a man page, exclude some special cases.
|
||||||
|
match (name, section, locale) {
|
||||||
|
// Files that totally aren't man pages
|
||||||
|
("Makefile", "in", _) |
|
||||||
|
("Makefile", "am", _) |
|
||||||
|
(".cvsignore", _, _) |
|
||||||
|
(_, "gz", _) |
|
||||||
|
(_, "lzma", _) |
|
||||||
|
(_, "bz2", _) |
|
||||||
|
(_, "xz", _) |
|
||||||
|
(_, "html", _) => None,
|
||||||
|
// Some weird directories that happen to match the locale
|
||||||
|
(n, s, "5man") |
|
||||||
|
(n, s, "c") |
|
||||||
|
(n, s, "man1") |
|
||||||
|
(n, s, "man2") |
|
||||||
|
(n, s, "man3") |
|
||||||
|
(n, s, "man4") |
|
||||||
|
(n, s, "man5") |
|
||||||
|
(n, s, "man6") |
|
||||||
|
(n, s, "man7") |
|
||||||
|
(n, s, "man8") |
|
||||||
|
(n, s, "Man-Part1") |
|
||||||
|
(n, s, "Man-Part2") => Some((n, s, "")),
|
||||||
|
// Nothing special!
|
||||||
|
x => Some(x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Entry point. There is nothing to run yet: it sets up logging through env_logger and emits a
// single info-level message.
fn main() {
    // unwrap(): a failure to initialize the logger aborts the program
    env_logger::init().unwrap();

    info!("Hello, world!");
}
|
||||||
|
|
||||||
|
|
||||||
|
#[test]
fn test_parse_path() {
    // (input path, expected result)
    let cases: &[(&str, Option<(&str, &str, &str)>)] = &[
        // Generic tests
        ("/", None),
        ("/man1/ncdu.1", None),
        ("/man/man?/ncdu.1", None),
        ("/man/man1/ncdu.1", Some(("ncdu", "1", ""))),
        ("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz", Some(("ncdu", "1", ""))), // This stuff happens
        ("/man/en_US.UTF-8/man1/ncdu.1", Some(("ncdu", "1", "en_US.UTF-8"))),

        // Special cases
        ("/usr/share/man/man1/INDEX", None),
        ("/usr/share/man/man1/Makefile", None),
        ("/usr/share/man/man1/Makefile.am", None),
        ("/usr/share/man/man1/Makefile.in", None),
        ("/usr/share/man/man1/.cvsignore", None),
        ("/usr/share/man/man1/.cvsignore.gz", None),

        // Some actual locations
        ("/usr/local/man/man1/list_audio_tracks.1.gz", Some(("list_audio_tracks", "1", ""))),
        ("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz", Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))),
        ("/usr/man/man3/exit.3tk", Some(("exit", "3tk", ""))),
        ("/usr/local/brlcad/share/man/mann/exit.nged.gz", Some(("exit", "nged", ""))),
        ("/usr/X11R6/man/man3/intro.3xglut.gz", Some(("intro", "3xglut", ""))),
        ("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz", Some(("intro", "3", "ko_KR.eucKR"))),

        ("/usr/lib/scilab/man/Man-Part1/man1/ans.1", Some(("ans", "1", ""))),
        ("/heirloom/usr/share/man/5man/man1/chgrp.1.gz", Some(("chgrp", "1", ""))),

        ("/usr/local/plan9/man/man8/index.html", None),
        ("/usr/local/share/doc/gmt/html/man/grdpaste.html", None),
    ];

    for &(path, expected) in cases {
        // The path is part of both sides so that a failure report names the offending input.
        assert_eq!((path, parse_path(path)), (path, expected));
    }
}
|
||||||
49
indexer/tests/mktar.sh
Executable file
49
indexer/tests/mktar.sh
Executable file
|
|
@ -0,0 +1,49 @@
|
||||||
|
#!/bin/sh

# Builds testarchive.tar.xz, the fixture used by the tests in src/archive.rs.
# Works in the current directory, so it is meant to be run from within tests/.
#
# The order of inserting the files into the tar is not fully deterministic this
# way. The tests will fail quite badly if hardlink.6 is considered the
# "original" version.

# Stop at the first failure. Without this, a failed 'mkdir man' or 'cd man'
# would leave the commands below creating files in, and later removing, the
# wrong directory.
set -e

mkdir man
cd man

mkdir man1
mkdir man3
mkdir man6
# Link to a directory
ln -s man3 mans

# The only real man page, and two files outside of the man* directories
echo 'Hello World' >man3/helloworld.3
echo 'Not a very interesting file' >notinteresting
echo 'Potentially interesting file' >possiblyinteresting

# Hard link to the man page
ln man3/helloworld.3 man6/hardlink.6

# Symlinks to the man page
ln -s ../man3/helloworld.3 man1/symlinkbefore.1
ln -s ../man3/helloworld.3 man6/symlinkafter.6

# Symlinks that must not resolve
ln -s notadir/../../man3/helloworld.3 man1/badsymlink1.1
ln -s man3/helloworld.3 man1/badsymlink2.1
ln -s ../man3/helloworld.3/. man1/badsymlink3.1
ln -s ../man3/helloworld.3/../helloworld.3 man1/badsymlink4.1
ln -s ../man1/symlinkbefore.1/../../man1/helloworld.3 man1/badsymlink5.1

# Symlinks that go through other symlinks, and one that points at itself
ln -s symlinkbefore.1 man1/doublesymlink1.1
ln -s ../mans/helloworld.3 man1/doublesymlink2.1
ln -s ../mans/../man1/symlinkbefore.1 man1/triplesymlink.1
ln -s infinitesymlink.1 man1/infinitesymlink.1

# Interesting links to a file that is not interesting by itself
ln -s ../possiblyinteresting man3/needreread.3
ln -s ../possiblyinteresting man6/needreread.6

cd ..
rm -f testarchive.tar
tar -cf testarchive.tar man/
rm -r man/

# Append a second man/ tree so that the archive contains duplicate entries
mkdir man
echo 'Overwritten file' >man/possiblyinteresting
tar -rf testarchive.tar man/
rm -r man/

rm -f testarchive.tar.xz
xz testarchive.tar
|
||||||
BIN
indexer/tests/testarchive.tar.xz
Normal file
BIN
indexer/tests/testarchive.tar.xz
Normal file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue