manned/indexer/src/archive.rs
2016-11-15 21:15:35 +01:00

365 lines
11 KiB
Rust

use std::str;
use std::ptr;
use std::error::Error as ErrorTrait;
use std::io::{Result,Error,Read};
use std::ffi::{CStr,CString};
use libc::{c_void,ssize_t};
use libarchive3_sys::ffi;
/* This is a safe, limited and opinionated wrapper around the libarchive C bindings.
* I initially used the libarchive crate, but it has several issues, some of which are not fixable
* without a complete rewrite:
* - Panics on non-UTF8 path names
* - Panics on hard links (PR #6)
* - API is far too flexible, easy to misuse and get panics/segfaults
* - Impossible to correctly read files from an archive (issue #7)
* - Does not provide a convenient Read interface for files
*
* Barring any unexpected behaviour or bugs in libarchive, the API below should not panic or
* segfault for any archive or usage pattern.
*/
/// State belonging to one open libarchive read handle.
///
/// A pointer to this struct is registered with libarchive as the client data of the read
/// callback, which is why it is always kept inside a `Box` by the rest of this module.
pub struct Archive<'a> {
    /// Raw libarchive handle; released with `archive_read_free()` when the `Archive` is dropped.
    a: *mut ffi::Struct_archive,
    /// Reader that supplies the raw (possibly compressed) archive bytes.
    rd: &'a mut Read,
    /// Scratch buffer that the read callback fills from `rd` and hands to libarchive.
    buf: Vec<u8>,
    /// Error returned by `rd` inside the read callback, kept so it can be reported unchanged.
    err: Option<Error>,
    /// Set once a data read has returned zero bytes; cleared when moving to the next entry.
    eof: bool,
}
/// A single entry (file, directory, link, ...) of an archive opened with
/// `Archive::open_archive()`.
///
/// The entry owns the underlying `Archive`; moving on with `next()` consumes the entry, so the
/// raw entry pointer cannot be used after libarchive has advanced to the next header.
pub struct ArchiveEntry<'a> {
    /// The archive this entry was read from.
    a: Box<Archive<'a>>,
    /// Entry header as filled in by `archive_read_next_header()`.
    e: *mut ffi::Struct_archive_entry,
}
/// Reader returned by `Archive::open_raw()`: the input is treated as one single blob of data
/// (libarchive's "raw" / "empty" formats) with all supported filters applied.
pub struct RawEntry<'a>(
    Box<Archive<'a>>,
);
/// Simplified classification of an archive entry, as returned by `ArchiveEntry::filetype()`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileType {
    /// Regular file.
    File,
    /// Directory.
    Directory,
    /// Symbolic or hard link, carrying the (UTF-8) link target.
    Link(String),
    /// Anything else. Also includes Link(<non-utf8-path>).
    Other,
}
/// Top-level archive formats, as in libarchive's ARCHIVE_FORMAT_* constants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Any tar variant.
    Tar,
    /// Any ar variant.
    Ar,
    /// Every other format; they are not distinguished (out of laziness).
    Other,
}
/// Read callback registered with libarchive through `archive_read_open()`.
///
/// `data` is the client-data pointer passed to `archive_read_open()`, i.e. a pointer to the boxed
/// `Archive`. The callback fills the archive's scratch buffer from its reader, stores the buffer
/// address in `*buf` and returns the number of bytes read (0 at end of input). On a read error it
/// records the error both in libarchive and in `Archive::err` and returns -1.
///
/// This function must never panic: unwinding out of an `extern "C"` function is undefined
/// behaviour.
///
/// NOTE(review): this re-derives a `&mut Archive` from the raw pointer while the Rust caller
/// that triggered the libarchive call may still hold its own `&mut Archive` -- confirm this is
/// acceptable with respect to Rust's aliasing rules.
unsafe extern "C" fn archive_read_cb(_: *mut ffi::Struct_archive, data: *mut c_void, buf: *mut *const c_void) -> ssize_t {
    let arch: &mut Archive = &mut *(data as *mut Archive);
    *buf = arch.buf.as_ptr() as *const c_void;
    match arch.rd.read(&mut arch.buf[..]) {
        Ok(s) => s as ssize_t,
        Err(e) => {
            // Forward the error text to libarchive on a best-effort basis. CString::new() fails
            // if the text contains a NUL byte; in that case the libarchive-side message is
            // skipped instead of panicking. The original error is kept in `arch.err` below
            // regardless, and that is what `Archive::error()` reports first.
            if let (Ok(fmt), Ok(desc)) = (CString::new("%s"), CString::new(e.description())) {
                ffi::archive_set_error(arch.a, e.raw_os_error().unwrap_or(0), fmt.as_ptr(), desc.as_ptr());
            }
            arch.err = Some(e);
            -1
        }
    }
}
impl<'a> Archive<'a> {
    /// Allocates a fresh libarchive read handle.
    ///
    /// `archive_read_new()` returns NULL when it cannot allocate the handle; that is reported
    /// as an error here so that no other libarchive function is ever called with a NULL handle.
    fn alloc_handle() -> Result<*mut ffi::Struct_archive> {
        let a = unsafe { ffi::archive_read_new() };
        if a.is_null() {
            Err(Error::new(::std::io::ErrorKind::Other, "archive_read_new() failed to allocate an archive handle"))
        } else {
            Ok(a)
        }
    }

    /// Boxes a new `Archive` around the handle `a` (which must already have its filters and
    /// formats enabled) and opens it for reading from `rd`.
    ///
    /// Returns an error if `archive_read_open()` reports ARCHIVE_FATAL. In that case the handle
    /// is released again when the box is dropped.
    fn new(rd: &mut Read, a: *mut ffi::Struct_archive) -> Result<Box<Archive>> {
        let bufsize = 64*1024;
        // The buffer is handed to `rd.read()` as a `&mut [u8]`, so it has to be initialised
        // memory; zero-fill it rather than using set_len() on an empty allocation.
        let buf = vec![0u8; bufsize];
        let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None, eof: false });
        // libarchive keeps this pointer as the client data for archive_read_cb(). The Archive
        // lives in a Box, so its address does not change when the box itself is moved.
        let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void;
        let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) };
        if r == ffi::ARCHIVE_FATAL {
            return Err(ret.error());
        }
        Ok(ret)
    }

    /// Builds an `Error` describing the most recent failure.
    ///
    /// An error stored by the read callback takes precedence (and is consumed). Otherwise the
    /// error is built from libarchive's errno and, if it is available and valid UTF-8, its
    /// error string.
    fn error(&mut self) -> Error {
        if let Some(e) = self.err.take() {
            return e;
        }
        let err = Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) });
        let desc = unsafe { ffi::archive_error_string(self.a) };
        if desc.is_null() {
            return err;
        }
        match str::from_utf8(unsafe { CStr::from_ptr(desc) }.to_bytes()) {
            Ok(s) => Error::new(err.kind(), s),
            Err(_) => err,
        }
    }

    /// Reads the next entry header.
    ///
    /// Returns `Ok(None)` at the end of the archive (the archive is dropped), `Err` on
    /// ARCHIVE_FATAL, and the entry for every other return code.
    fn entry(self: Box<Self>) -> Result<Option<ArchiveEntry<'a>>> {
        let mut ent = ArchiveEntry {
            a: self,
            e: ptr::null_mut()
        };
        // A new entry starts with fresh data, so forget the EOF state of the previous one.
        ent.a.eof = false;
        let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) };
        match res {
            ffi::ARCHIVE_EOF => Ok(None),
            ffi::ARCHIVE_FATAL => Err(ent.a.error()),
            _ => Ok(Some(ent))
        }
    }

    /// Reads data of the current entry into `buf`, returning the number of bytes read
    /// (0 at the end of the entry's data).
    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        // libarchive tends to throw an error if you try to read after an EOF; handle that case
        // here.
        // An empty `buf` is answered here as well: archive_read_data() would return 0 for it,
        // which must not be mistaken for (and remembered as) the end of the data.
        if self.eof || buf.is_empty() {
            return Ok(0);
        }
        let cbuf = buf.as_mut_ptr() as *mut c_void;
        let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) };
        if n >= 0 {
            self.eof = n == 0;
            Ok(n as usize)
        } else {
            Err(self.error())
        }
    }

    /// Opens `rd` as an archive with all filters and formats supported by libarchive enabled.
    ///
    /// Returns the first entry, or `Ok(None)` if the archive has no entries.
    pub fn open_archive(rd: &mut Read) -> Result<Option<ArchiveEntry>> {
        let a = try!(Self::alloc_handle());
        unsafe {
            ffi::archive_read_support_filter_all(a);
            ffi::archive_read_support_format_all(a);
        }
        try!(Self::new(rd, a)).entry()
    }

    /// Opens `rd` as a single stream of data, with all filters supported by libarchive enabled
    /// but only the "raw" and "empty" formats.
    ///
    /// The returned `RawEntry` implements `Read`; for an empty input it yields no data.
    pub fn open_raw(rd: &mut Read) -> Result<RawEntry> {
        let a = try!(Self::alloc_handle());
        unsafe {
            ffi::archive_read_support_filter_all(a);
            ffi::archive_read_support_format_raw(a);
            ffi::archive_read_support_format_empty(a);
        }
        let mut a = try!(Self::new(rd, a));
        let mut e: *mut ffi::Struct_archive_entry = ptr::null_mut();
        let res = unsafe { ffi::archive_read_next_header(a.a, &mut e) };
        match res {
            ffi::ARCHIVE_FATAL => Err(a.error()),
            ffi::ARCHIVE_EOF => {
                // No entry at all: make every read() return 0 instead of asking libarchive.
                a.eof = true;
                Ok(RawEntry(a))
            },
            _ => Ok(RawEntry(a))
        }
    }
}
impl<'a> Drop for Archive<'a> {
    /// Releases the libarchive handle; the return code of `archive_read_free()` is ignored.
    fn drop(&mut self) {
        let handle = self.a;
        unsafe { ffi::archive_read_free(handle); }
    }
}
impl<'a> ArchiveEntry<'a> {
    /// Consumes this entry and reads the next one; `Ok(None)` marks the end of the archive.
    pub fn next(self) -> Result<Option<ArchiveEntry<'a>>> {
        self.a.entry()
    }

    /// Path name of the entry, lightly normalized.
    ///
    /// Returns None on NULL (when does that even happen?) or on invalid UTF-8.
    pub fn path(&self) -> Option<&str> {
        let c_str: &CStr = unsafe {
            let ptr = ffi::archive_entry_pathname(self.e);
            if ptr.is_null() {
                return None;
            }
            CStr::from_ptr(ptr)
        };
        str::from_utf8(c_str.to_bytes()).ok()
            // Perform some simple opinionated normalization. Full normalization might be better,
            // but also slower and more complex. This solution covers the most important cases.
            .map(|s| s.trim_left_matches('/').trim_left_matches("./").trim_right_matches('/'))
    }

    /// Size of the entry as reported by libarchive.
    ///
    /// libarchive reports a signed 64-bit value; negative values are returned as 0 and values
    /// that do not fit in a `usize` are clamped to the largest `usize`.
    pub fn size(&self) -> usize {
        let sz = unsafe { ffi::archive_entry_size(self.e) };
        if sz <= 0 {
            0
        } else if sz as u64 > usize::max_value() as u64 {
            usize::max_value()
        } else {
            sz as usize
        }
    }

    /// Top-level format of the archive this entry belongs to.
    pub fn format(&self) -> Format {
        // Interestingly, archive_format() is a property of the entry itself, not of the top-level
        // archive. Hence it requires archive_read_next_header() and hence it's better placed as
        // part of this ArchiveEntry object rather than the Archive object.
        // ...that said, the top-level format isn't likely to change, it's the lower 16 bits that
        // might be different.
        match unsafe { ffi::archive_format(self.a.a) } >> 16 {
            0x3 => Format::Tar,
            0x7 => Format::Ar,
            _ => Format::Other,
        }
    }

    /// Symlink target of the entry; None if there is none or it is not valid UTF-8.
    fn symlink(&self) -> Option<String> {
        let c_str: &CStr = unsafe {
            let ptr = ffi::archive_entry_symlink(self.e);
            if ptr.is_null() {
                return None;
            }
            CStr::from_ptr(ptr)
        };
        str::from_utf8(c_str.to_bytes()).map(str::to_string).ok()
    }

    /// Hard link target of the entry, prefixed with a '/'; None if there is none or it is not
    /// valid UTF-8.
    fn hardlink(&self) -> Option<String> {
        let c_str: &CStr = unsafe {
            let ptr = ffi::archive_entry_hardlink(self.e);
            if ptr.is_null() {
                return None;
            }
            CStr::from_ptr(ptr)
        };
        // Hard links have the same name as an earlier pathname(), and those typically don't have a
        // preceding slash. Add this slash here so that the same resolution logic can be used for
        // both hardlinks and symlinks. I really don't care about the difference between these two.
        str::from_utf8(c_str.to_bytes()).map(|p| format!("/{}", p)).ok()
    }

    /// Classifies the entry as file, directory, link or other.
    pub fn filetype(&self) -> FileType {
        // If it has a symlink/hardlink path, then just consider it a link regardless of what
        // _filetype() says.
        // or_else() keeps the hard link lookup (and its String allocation) lazy: it only runs
        // when there is no usable symlink target.
        if let Some(l) = self.symlink().or_else(|| self.hardlink()) {
            return FileType::Link(l);
        }
        match unsafe { ffi::archive_entry_filetype(self.e) } {
            ffi::AE_IFDIR => FileType::Directory,
            ffi::AE_IFREG => FileType::File,
            _ => FileType::Other,
        }
    }
}
impl<'a> Read for ArchiveEntry<'a> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
self.a.read(buf)
}
}
impl<'a> Read for RawEntry<'a> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
self.0.read(buf)
}
}
/// Calls `cb` for `ent` and every entry following it.
///
/// An Iterator over ArchiveEntries can't be provided because Rust doesn't support streaming
/// iterators; this function is the convenient alternative.
///
/// `cb` should return Ok(true) to continue with the next entry and Ok(false) to stop. An error
/// from `cb` or from advancing to the next entry ends the walk and is returned.
pub fn walk<F>(ent: Option<ArchiveEntry>, mut cb: F) -> Result<()>
    where F: FnMut(&mut ArchiveEntry) -> Result<bool>
{
    let mut cursor = ent;
    loop {
        let mut current = match cursor {
            Some(entry) => entry,
            None => return Ok(()),
        };
        let keep_going = try!(cb(&mut current));
        if !keep_going {
            return Ok(());
        }
        cursor = try!(current.next());
    }
}
// Unit tests for the wrapper. The `archive` and `raw` tests read fixture files from the
// `tests/` directory, relative to the directory the tests are run from.
#[cfg(test)]
mod tests {
use super::*;
use std;
use std::io::Read;
use std::fs::File;
// 64 KiB of newline bytes is not a recognizable archive: opening it must fail with an error.
#[test]
fn invalid() {
let mut r = std::io::repeat(0x0a).take(64*1024);
let ent = Archive::open_archive(&mut r);
assert!(ent.is_err());
}
// Empty input: open_archive() reports "no entries", open_raw() yields a reader with no data.
#[test]
fn zerolength() {
let mut r = std::io::empty();
{
let ent = Archive::open_archive(&mut r);
assert!(ent.unwrap().is_none());
}
{
let mut ent = Archive::open_raw(&mut r).unwrap();
let mut v = Vec::new();
assert_eq!(ent.read_to_end(&mut v).unwrap(), 0);
}
}
// Steps through every entry of the fixture tarball and checks format, path, size, file type
// and contents of each, in archive order.
#[test]
fn archive() {
let mut f = File::open("tests/simpletest.tar.gz").unwrap();
let mut ent = Archive::open_archive(&mut f).unwrap().unwrap();
// Checks one entry against the expected path, size, file type and contents.
let t = |e:&mut ArchiveEntry, path, size, ft, cont| {
assert_eq!(e.format(), Format::Tar);
assert_eq!(e.path(), path);
assert_eq!(e.size(), size);
assert_eq!(e.filetype(), ft);
let mut contents = String::new();
assert_eq!(e.read_to_string(&mut contents).unwrap(), size);
assert_eq!(&contents, cont);
};
t(&mut ent, Some("simple"), 0, FileType::Directory, "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/file"), 3, FileType::File, "Hi\n");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/link"), 0, FileType::Link("file".to_string()), "");
ent = ent.next().unwrap().unwrap();
// Hard link targets are reported with a leading slash added by hardlink().
t(&mut ent, Some("simple/hardlink"), 0, FileType::Link("/simple/file".to_string()), "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/fifo"), 0, FileType::Other, "");
ent = ent.next().unwrap().unwrap();
// path() is expected to be None for the last entry -- presumably its name is not valid
// UTF-8; verify against the fixture.
t(&mut ent, None, 0, FileType::File, "");
assert!(ent.next().unwrap().is_none());
}
// Raw mode on a fixture that, going by its name, is wrapped in several compression layers:
// reading must yield the innermost contents.
#[test]
fn raw() {
let mut f = File::open("tests/rawtest.gz.xz.bzip2").unwrap();
let mut r = Archive::open_raw(&mut f).unwrap();
let mut c = String::new();
r.read_to_string(&mut c).unwrap();
assert_eq!(&c, "File contents!\n");
}
// Raw mode on uncompressed input must pass the data through unchanged.
#[test]
fn raw_passthrough() {
let mut r = std::io::Cursor::new(&b"This is an uncompressed text file"[..]);
let mut ent = Archive::open_raw(&mut r).unwrap();
let mut s = String::new();
ent.read_to_string(&mut s).unwrap();
assert_eq!(&s, "This is an uncompressed text file");
}
}