From 0cab7586655c328392c2c5bee437e29728c8d8f9 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 30 Oct 2016 11:06:14 +0100 Subject: [PATCH] Add support for man page reading & decoding --- indexer/Cargo.lock | 82 ++++++++ indexer/Cargo.toml | 2 + indexer/src/archive.rs | 92 +++++++-- indexer/src/main.rs | 2 + indexer/src/man.rs | 218 +++++++++++++++++++++- indexer/tests/exit.3.gz.lzma | Bin 0 -> 1970 bytes indexer/tests/{mktar.sh => mkarchives.sh} | 11 +- indexer/tests/rawtest.gz.xz.bzip2 | Bin 0 -> 35 bytes indexer/tests/simpletest.tar.gz | Bin 247 -> 248 bytes indexer/tests/testarchive.tar.xz | Bin 616 -> 620 bytes 10 files changed, 394 insertions(+), 13 deletions(-) create mode 100644 indexer/tests/exit.3.gz.lzma rename indexer/tests/{mktar.sh => mkarchives.sh} (88%) create mode 100644 indexer/tests/rawtest.gz.xz.bzip2 diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index dbd3057..1d0ea13 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -2,12 +2,14 @@ name = "indexer" version = "0.1.0" dependencies = [ + "encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", + "ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -18,6 +20,63 @@ dependencies = [ "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "env_logger" version = "0.3.5" @@ -90,6 +149,15 @@ name = "regex-syntax" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "ring" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "thread-id" version = "2.0.0" @@ -107,6 +175,11 @@ dependencies = [ "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "untrusted" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "utf8-ranges" version = "0.1.3" @@ -124,6 +197,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" +"checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +"checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +"checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +"checksum encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +"checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" @@ -134,8 +214,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" "checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" "checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" +"checksum ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c0743ef007bcff4909b107907a410418eb7e5c6ad55b843d70b39f62bfb7112e" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5d9bc0e6e73a10975d1fbff8ac3541e221181b0d8998351600fb5523de634c0d" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 4fd665f..f97465d 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -10,3 +10,5 @@ env_logger = "0.3.5" lazy_static = "0.2.1" libc = "0.2.17" libarchive3-sys = "0.1.2" +encoding = "0.2.33" +ring = "0.5.3" diff --git a/indexer/src/archive.rs b/indexer/src/archive.rs index 17e85c4..c536dcb 100644 --- a/indexer/src/archive.rs +++ b/indexer/src/archive.rs @@ -26,6 +26,7 @@ pub struct Archive<'a> { rd: &'a mut Read, buf: Vec, err: Option, + eof: bool, } @@ -34,6 +35,8 @@ pub struct ArchiveEntry<'a> { e: *mut ffi::Struct_archive_entry, } +pub struct RawEntry<'a>(Box>); + #[derive(Debug,PartialEq,Eq)] pub enum FileType { @@ -65,7 +68,7 @@ impl<'a> Archive<'a> { let bufsize = 64*1024; let mut buf = Vec::with_capacity(bufsize); unsafe { buf.set_len(bufsize) }; - let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None }); + let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None, eof: false }); let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void; let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) }; @@ -76,10 +79,18 @@ impl<'a> Archive<'a> { } fn error(&mut self) -> Error { - // TODO: Do something with the description - self.err.take().unwrap_or_else(|| - Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) }) - ) + self.err.take().unwrap_or_else(|| { + let err = Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) }); + let desc = unsafe { ffi::archive_error_string(self.a) }; + if desc.is_null() { + return err; + } + if let Ok(s) = str::from_utf8(unsafe { CStr::from_ptr(desc) }.to_bytes()) { + Error::new(err.kind(), s) + } else { + err + } + }) } fn entry(self: Box) -> Result>> { @@ -87,6 +98,7 @@ impl<'a> Archive<'a> { a: self, e: ptr::null_mut() }; + ent.a.eof = false; let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) }; match res { ffi::ARCHIVE_EOF => Ok(None), @@ -96,9 +108,15 @@ impl<'a> Archive<'a> { } fn read(&mut self, buf: &mut [u8]) -> Result { + // libarchive tends to throw an error if you try to read after an EOF; handle that case + // here. + if self.eof { + return Ok(0); + } let cbuf = buf.as_mut_ptr() as *mut c_void; let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) }; if n >= 0 { + self.eof = n == 0; Ok(n as usize) } else { Err(self.error()) @@ -114,6 +132,27 @@ impl<'a> Archive<'a> { }; try!(Self::new(rd, a)).entry() } + + pub fn open_raw(rd: &mut Read) -> Result { + let a = unsafe { + let a = ffi::archive_read_new(); + ffi::archive_read_support_filter_all(a); + ffi::archive_read_support_format_raw(a); + ffi::archive_read_support_format_empty(a); + a + }; + let mut a = try!(Self::new(rd, a)); + let mut e: *mut ffi::Struct_archive_entry = ptr::null_mut(); + let res = unsafe { ffi::archive_read_next_header(a.a, &mut e) }; + match res { + ffi::ARCHIVE_FATAL => Err(a.error()), + ffi::ARCHIVE_EOF => { + a.eof = true; + Ok(RawEntry(a)) + }, + _ => Ok(RawEntry(a)) + } + } } @@ -197,6 +236,13 @@ impl<'a> Read for ArchiveEntry<'a> { } +impl<'a> Read for RawEntry<'a> { + fn read(&mut self, buf: &mut [u8]) -> Result { + self.0.read(buf) + } +} + + // We can't provide an Iterator object for ArchiveEntries because Rust doesn't support streaming // iterators. Let's instead provide a walk function for convenience. // cb should return Ok(true) to continue, Ok(false) to break @@ -223,22 +269,28 @@ mod tests { use std::fs::File; #[test] - fn invalid_archive() { + fn invalid() { let mut r = std::io::repeat(0x0a).take(64*1024); let ent = Archive::open_archive(&mut r); assert!(ent.is_err()); } #[test] - fn zerolength_archive() { + fn zerolength() { let mut r = std::io::empty(); - let ent = Archive::open_archive(&mut r); - // I expected an error here rather than None, whatever. - assert!(ent.unwrap().is_none()); + { + let ent = Archive::open_archive(&mut r); + assert!(ent.unwrap().is_none()); + } + { + let mut ent = Archive::open_raw(&mut r).unwrap(); + let mut v = Vec::new(); + assert_eq!(ent.read_to_end(&mut v).unwrap(), 0); + } } #[test] - fn read() { + fn archive() { let mut f = File::open("tests/simpletest.tar.gz").unwrap(); let mut ent = Archive::open_archive(&mut f).unwrap().unwrap(); @@ -270,4 +322,22 @@ mod tests { assert!(ent.next().unwrap().is_none()); } + + #[test] + fn raw() { + let mut f = File::open("tests/rawtest.gz.xz.bzip2").unwrap(); + let mut r = Archive::open_raw(&mut f).unwrap(); + let mut c = String::new(); + r.read_to_string(&mut c).unwrap(); + assert_eq!(&c, "File contents!\n"); + } + + #[test] + fn raw_passthrough() { + let mut r = std::io::Cursor::new(&b"This is an uncompressed text file"[..]); + let mut ent = Archive::open_raw(&mut r).unwrap(); + let mut s = String::new(); + ent.read_to_string(&mut s).unwrap(); + assert_eq!(&s, "This is an uncompressed text file"); + } } diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 5661568..bcb5ee8 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -4,6 +4,8 @@ extern crate env_logger; extern crate regex; extern crate libarchive3_sys; extern crate libc; +extern crate ring; +extern crate encoding; mod archive; mod archread; diff --git a/indexer/src/man.rs b/indexer/src/man.rs index b268fe8..56a45b3 100644 --- a/indexer/src/man.rs +++ b/indexer/src/man.rs @@ -1,4 +1,19 @@ +use std::str; +use std::io; +use std::io::Read; +use regex::bytes; use regex::Regex; +use encoding; +use encoding::{all,EncodingRef}; +use encoding::label::encoding_from_whatwg_label; +use ring::digest; + +use archive::Archive; + +// Anything larger than this just isn't a man page. I hope. +const MAX_MAN_SIZE: u64 = 20*1024*1024; +// I've also not seen valid man pages smaller than this +const MIN_MAN_SIZE: u64 = 9; // Checks a path for a man page candidate. Returns None if it doesn't seem like a man page @@ -24,9 +39,9 @@ fn parse_path(path: &str) -> Option<(&str, &str, &str)> { // Not everything matching the regex is necessarily a man page, exclude some special cases. match (name, section, locale) { // Files that totally aren't man pages - ("Makefile", "in", _) | ("Makefile", "am", _) | (".cvsignore", _, _) | + (_, "in", _) | (_, "gz", _) | (_, "lzma", _) | (_, "bz2", _) | @@ -51,6 +66,165 @@ fn parse_path(path: &str) -> Option<(&str, &str, &str)> { } +// Convenient wrapper for archread's interest_cb +pub fn ismanpath(path: &str) -> bool { + parse_path(path).is_some() +} + + +fn validate(data: &Vec) -> Option<&'static str> { + lazy_static! { + static ref HTML: bytes::Regex = bytes::Regex::new(r"^\s*<(?:html|head|!DOCTYPE)").unwrap(); + } + + if data.len() >= MAX_MAN_SIZE as usize { + Some("File too large") + } else if data.len() < MIN_MAN_SIZE as usize { + Some("File too small") + } else if &data[..] == &b".so man3/\n"[..] { + Some("Contents: '.so man3/'") + } else if &data[..] == &b"timestamp\n"[..] { + Some("Contents: 'timestamp'") + } else if HTML.is_match(&data) { + Some("Looks like an HTML file") + } else { + None + } +} + + +// Look for 'coding:' indications in the file header, a la preconv(1). +fn codec_from_tag(data: &Vec) -> Option { + lazy_static! { + // According to the emacs docs the tag should be on the first line; according to preconv(1) + // it should be on the first or second line. I've also seen some files with the tag on the + // last line. I've not seen the tag itself used in a different context, so just get it from + // anywhere... + static ref TAG: bytes::Regex = bytes::Regex::new(r"-\*-.*coding:\s*(?u:([^\s;]+)).*-\*").unwrap(); + } + let cap = match TAG.captures(&data) { Some(x) => x, None => return None }; + let tag = str::from_utf8(cap.at(1).unwrap()).unwrap().to_lowercase(); + + match &tag[..] { + // Deny some common UTF-8-compatible encodings. These tags are obviously incorrect. + "us-ascii" | "ascii" | "utf8" | "utf-8" | "utf-8-unix" => None, + + // latin-1 isn't in the whatwg spec under that name + "latin-1" => Some(all::WINDOWS_1252), + + // Waaaaaaaaah we can't decode this :( + "armscii-8" => None, + + // Anything else should be found by its whatwg label. + x => match encoding_from_whatwg_label(x) { + Some(x) => Some(x), + None => { warn!("Unknown encoding in emacs tag: {}", x); None }, + } + } +} + + +fn codec_from_path(path: &str) -> Option { + let locale = match parse_path(path) { + Some((_,_,l)) if l != "" => l.to_lowercase(), + _ => return None, + }; + + lazy_static! { + static ref RE: Regex = Regex::new(r"^(?x) + ([a-z]+) # primary language + (?:_ ([a-z]+))? # secondary language + (?:@ [a-z]+)? # script (potentially useful, but uncommon and not currently used) + (?:\. ([^\.@]+))? # encoding (FUCKING USEFUL) + $").unwrap(); + } + + let cap = match RE.captures(&locale) { Some(x) => x, None => return None }; + let lang = cap.at(1).unwrap(); + let seclang = cap.at(2); + let enc = cap.at(3); + + // Try to do something with the encoding tag + match (lang, enc) { + (_, Some("eucjp")) | + (_, Some("ujis")) | // Not sure about this one, but it seems to come out alright + ("ja", Some("euc")) => return Some(all::EUC_JP), + + (_, Some("euckr")) => return Some(all::WINDOWS_949), + + ("ja", Some("jis7")) | + ("ja", Some("pck")) => return None, /* WAT? TODO: DO SOMETHING WITH THESE */ + + (_, Some(x)) => match encoding_from_whatwg_label(x) { + Some(x) => return Some(x), + _ => { warn!("Unknown encoding in locale: {}", x) }, + }, + _ => {}, + }; + + // Fall back to language + match (lang, seclang) { + ("pl", _) | + ("cs", _) | + ("hr", _) | + ("hu", _) | + ("sl", _) | + ("sk", _) => Some(all::ISO_8859_2), + ("bg", _) | + ("be", _) | + ("uk", _) => Some(all::ISO_8859_5), + ("el", _) => Some(all::ISO_8859_7), + ("et", _) => Some(all::ISO_8859_15), + ("tr", _) => Some(all::WINDOWS_1254), + ("ru", _) => Some(all::KOI8_R), + ("ja", _) | + ("jp", _) => Some(all::EUC_JP), // Tricky; but JIS is certainly less common + ("zh", Some("cn")) => Some(all::GBK), // These are based purely on what I've observed, + ("zh", _) => Some(all::BIG5_2003), // perhaps some heuristics based on contents can do better + ("ko", _) => Some(all::WINDOWS_949), + (_, _) => None, + } +} + + +// Decompresses / decodes a man page and returns its SHA-1 hash, encoding name, and UTF-8 contents. +pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'static str,String)> { + let mut decomp = try!(Archive::open_raw(ent)).take(MAX_MAN_SIZE+1); + let mut data = Vec::new(); + try!(decomp.read_to_end(&mut data)); + + if let Some(e) = validate(&data) { + return Err(io::Error::new(io::ErrorKind::InvalidData, e)); + } + + let dig = digest::digest(&digest::SHA1, &data); + + // TODO: Handle BOM? UTF-16? + // If it passes as UTF-8, then just consider it UTF-8. + if let Ok(_) = str::from_utf8(&data) { + return Ok((dig, "utf8", unsafe { String::from_utf8_unchecked(data) } )); + } + // Otherwise, look for a coding tag in the contents + if let Some(e) = codec_from_tag(&data) { + if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) { + return Ok((dig, e.name(), s)); + } + } + // If that fails as well, look for clues in the file path. + for path in paths { + if let Some(e) = codec_from_path(path) { + if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) { + return Ok((dig, e.name(), s)); + } + } + } + // If all else fails, use a lossy iso-8859-1 + Ok((dig, "iso-8859-1", (all::ISO_8859_1 as EncodingRef).decode(&data, encoding::DecoderTrap::Ignore).unwrap() )) +} + + + + #[test] fn test_parse_path() { // Generic tests @@ -83,3 +257,45 @@ fn test_parse_path() { assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None); } + + +#[test] +fn test_codec_from_path() { + let t = |p,n| { + assert_eq!(codec_from_path(p).unwrap().name(), n); + }; + t("man/de_DE.ISO8859-15/man1/scribus.1.gz", "iso-8859-15"); + t("man/de_DE.ISO_8859-1/man1/scribus.1.gz", "windows-1252"); + t("man/ja.UTF-8/man1/test.1", "utf-8"); + t("man/ja_JP/man1/test.1", "euc-jp"); + t("man/ja_JP.EUC/man1/test.1", "euc-jp"); + t("man/ja_JP.SJIS/man1/test.1", "windows-31j"); + t("man/jp.eucJP/man1/test.1", "euc-jp"); + t("man/jp/man1/test.1", "euc-jp"); + t("man/lt.ISO8859-13/man1/test.1", "iso-8859-13"); + t("man/ru/man1/test.1", "koi8-r"); + t("man/ru_RU@Cyr/man1/test.1", "koi8-r"); + t("man/zh_CN/man1/test.1", "gbk"); + t("man/zh_TW/man1/test.1", "big5-2003"); +} + + +#[test] +fn test_decode_zh() { + use std::fs::File; + use ring::test::from_hex; + + // cat exit.1.gz | lzma -d | gzip -d | sha1sum + let filehash = from_hex("cdf9b3e8d96a83c908eb0a0c277485e2f3eebe87").unwrap(); + // cat exit.1.gz | lzma -d | gzip -d | iconv -f gbk -t utf8 | sha1sum + let utf8hash = from_hex("47f3e441137b207c0abdc38adac692298da4927a").unwrap(); + + let mut f = File::open("tests/exit.3.gz.lzma").unwrap(); + let (dig, enc, s) = decode(&["bullshit", "/usr/share/man/zh_CN/man3/exit.3.gz"][..], &mut f).unwrap(); + + assert_eq!(dig.as_ref(), &filehash[..]); + assert_eq!(enc, "gbk"); + + let utf8dig = digest::digest(&digest::SHA1, s.as_bytes()); + assert_eq!(utf8dig.as_ref(), &utf8hash[..]); +} diff --git a/indexer/tests/exit.3.gz.lzma b/indexer/tests/exit.3.gz.lzma new file mode 100644 index 0000000000000000000000000000000000000000..7c84e6aa5a44c01b7106a5a0d2730216a95a1e98 GIT binary patch literal 1970 zcmV;j2Tk~0004jh|NsC0|NsC001u<4_yZ||V@;Sg2wDj?>szYj(xOW}Vch82$Y~u$ zjYL;`1p&cVr{I`sKrcxOZ%A)1C1F@zIrpYVZX)rs0J#2mAj^4j>Sl>0>S2i6A!c9vjDu2zF~F&n$$+QVGp_xowyZj%S)Ay z-HUhdC8A>iQ7tgS`y!zQA{0EKa{cSx4xvCt#CKZ1@x9er1-nAHog6EX4|Riu9h(YI z+h${fYkX2cBpu!4K|R-u4nI3^xMC27%7L_yL+5yflLhnB`a>AhpeRKVd6ecWzDq|f zrAtb(S})kdtx0ZNJuk~76YNs3VtOkowSb2ybckcyV6hjyA`fue198k@XNbYUW=v!?2^htr zk>epthPRGfzJ4=rBrrU`SZ@{(!C+Uz#GsBIIN)ka??0Ud!KD;9IVDM}rjVhHzpFgD z#n=kFB(KZszKgn-al(QAB5_o!K^)`C5D86qaRZ&wFMXGgYSUSYsV}4c0BSE^fILR~ z9HY9cXx#I_Cj)7$ZSB$Cy7(ITWiEG@Xu61WFQhyGKnIQ^L3^jXT!foRhD{Crg+;>p z0#FFwpnQFEtbN0Qtzv{f-1krHSn3a5u^sW+24`1n8nUDWI11(!6I1pr9H4NhoaB=p zRV|>i-QCitHX`KDl+2MMTP>9ruIU1Awi#l=y^sX{|96CHq#W69|4J(hx`Qws3@C_o z%%)Kc7O8~JmS~}5c1;`=@LkQ4zst(0G^~ZaDGUO-%Z|pxEi~d`10hwHfFO zp{h8Uc(eL9DYjO_%Jr~PLsbPvUVz8UlQ2j7HUmfqjqNL00gj54g<_eg zV2=$f9D~#Or+6Zc^yCK5I}Q+wm`K`qd4PeRrm<8!07AM5^IIlXAm+zw2VT{IOM~f6 zB@>pH!|`1PPN$9?=e}zrg^+ji^|Rm4i$xosY!G(=D0C=!cv*~RXPD=n;M|n2KQ|2u zs$owGj~yfJ(W;@vkOd);akANrTrOa!;2YZhbK)hc4J*nSIgOSEz^h-TuHs)hKHXNY zn}e>HQRbFY<4Ci+Qi_iF5jdI_t+8k7bk$EzVBg+Bt#Wy!3f%N1imR*5`vEoJgd9RZ zNQ;W!aYPa6JCS3=+)9@WaZ|rj$U?CKKggVv5x^24a6hf$8He|?n?F|-57KtomSaFt z{aDCtdMlK!HstCR7IRcWEi_z~YTb&}y~;69)9^yJG2ZMO?UUAOs%b54w08TkU z%K_d)gnj19%g|j=L@MYY+!WAAD6D4aA%vv;_D^9E8VnP1+yEFq($RKSi|~2zk0sZM zS<>z7h(SL)&XYIydMtkVje{u_LOO9P`F+7D`qf=KeUGsmK&tiD$0e{djfPy2R5&3n z99C3_--m4jWj>DgsLrW{&6ik zE==JYrU-;&5G2cm*w53cLZ1gV1%Oc!Y8y)BIfzd%02hord?xu(itB7RZLm~P^`3?; z@PLXF#;-t>R2`C;io!!~ZcRk9l2XAl+^d3z7xmn_cIg1PuZsWN#tkt z0{t*67=dgY6c;t|z10LOhM<)pzo5n^Y_P#7lM$=ttni}nu*Uvz1s#~zQV_nmbEwY= z3?Rqqj%8V~?}b@N=jD3Qseo{X9oW} z7q{_Mz*b)l#jnFa_t2W1#4e*CV}Ep^L^{cs^k5(MV5RDfm}C^JSjaAsWS~XuBM$q~ zwM!q(j!xF_$yi7aJ6n>B&bfX%&P}kkQgE2wo0z9RlZMT_fy)lM&=TDL|Mmo+ E0EY0t$p8QV literal 0 HcmV?d00001 diff --git a/indexer/tests/mktar.sh b/indexer/tests/mkarchives.sh similarity index 88% rename from indexer/tests/mktar.sh rename to indexer/tests/mkarchives.sh index 9f8844d..169f2bd 100755 --- a/indexer/tests/mktar.sh +++ b/indexer/tests/mkarchives.sh @@ -1,10 +1,12 @@ #!/bin/sh # The order of inserting the files into the tar is not fully deterministic this -# way. The tests will fail quite badly if hardlink.6 is considered the +# way. The tests will fail quite badly if a hardlink is considered the # "original" version. +# simpletest.tar.gz + mkdir simple echo Hi >simple/file ln -s file simple/link @@ -17,6 +19,13 @@ rm -rf $badfn simple +# rawtest.gz.xz.bzip2 + +echo "File contents!" | gzip | xz | bzip2 >rawtest.gz.xz.bzip2 + + +# testarchive.tar.xz + mkdir man cd man diff --git a/indexer/tests/rawtest.gz.xz.bzip2 b/indexer/tests/rawtest.gz.xz.bzip2 new file mode 100644 index 0000000000000000000000000000000000000000..bc4f2e85dcc775acb097d4476898ccc2d20ceaad GIT binary patch literal 35 rcmb2|=3tnUCKAEGTzvYBXQ;=y^V*(gwKc<@FfpiITUE`^z`y_i*Q5)w literal 0 HcmV?d00001 diff --git a/indexer/tests/simpletest.tar.gz b/indexer/tests/simpletest.tar.gz index 06a535cdc1f5a295ca3254f00ae7728166efebff..409f5ca796b57bff6cbc29ea7b3932e71e5bd26a 100644 GIT binary patch literal 248 zcmVhs{7`rQLw4A) z)p*Q*?Rd6__#l603IBfp000000000000000Q{4gjPIt-xC;$ND_;|wr literal 247 zcmVkvZ6|;hx|Jj1u2vU@$3^k}Fw}GD1G!UW0 z;3nbuUgp7pd-2R%EwjTZ#2ZFQ)V#ILD!mo`^@T|-gz>^iS?`n5Yr(iB?W%Jwt8KwK zyXHk2hxhx&p8u(`%|FRPOje@_*ctO7e?7=wS*IBnWF7S^JpU+L&D{dbsJs6mJMPJ9 zJmip#|K2FHLeZ@&mR%3ep%6i4P xBkQPV;rU0m5bo>hQW4hpBL8MP{{H{~00000000000000cdH@H84l)2J00691c?tjk diff --git a/indexer/tests/testarchive.tar.xz b/indexer/tests/testarchive.tar.xz index 55b69f76ca05b8ac257a26ca0598ac6560c880ef..9892fae93b1a3e7d6a3ed69b5dfeeb8f67143cb5 100644 GIT binary patch delta 581 zcmV-L0=oU^1ndNm904nl9U6b%A<(^aTIQTy)Rvi5@`fweX8+Kxd_<}wbZns^xI_gC zPscT6q{l6`<^HZIMuQ_HozpU3V}a4|;w{!!esY3HGl_!dJ9(p?2_!}Gi@=jR8}ZK2 zg!G+JbcTu#*UF0kE_fwvyJ1Z}i^3e8MqwdP!;-@VHW-GUyykBbR0)6mrfYUC*9wH8 zBwxBO(59yLgRrguJkSPudU|T!uaHHvlGO}6@`t~(n%--3QwNv7#*#nJltHkq# z@h6%%ls*r&7m>G1b%;w_X$IO%tT6DQMW{jak%};Zg{XY>sxZ=`SQ&p_Gw=S2Ac`8? zA&^al_HD^xA)9?w za+YxXGfG@nJo$CUP$e{~sl-TkpUW|ha(r4&W6>e2A})>FzO2Y~J^D@kXg_Kbxki;< z&F#Y^fL&VeD0>>;EAUg|TqA)lfhw z!F5PVjM6*1n%OxuwR=jC-l?Wb#c|r7&qgL{O_|ul6X*Z{i^Y T3x)z0vBYQl0ssI200dcDsB;>Y delta 577 zcmV-H0>1t11n2~i904ei9U6bcUXsFzs4O9PO_a20_F!?Bihx;Fu@F9s3ri%53V?Y2 zoess^YJ1$v)lBFR-xDhG*hv3dQ@qt-)_kn250?xJkOHG_G=jU&%u0(RFN z{U}9mYEXk1*SbX}TDOl_b*YnIj5DPQZ%@c@+;2jClq>t{RCGS6zJ)S>k56E`WEYNA zP0`sjs;0i_7OiV+v!H)0zoG?K(0z%91Ym2f97zjQ^Rdym)@TaSo{xA2wKvDuo2*N< z%e@vlTQaDqV$1U&++S!nmvGro(Y`p`KpEX@ds;ts^5d3QMx~Z*_?E9{AAl4P8*k$38pzr|GAPif83mu$V>i?_@Dv9OJ-UB_AY;4^fC$+y`>9^R z49|VtEnKHoi*E368ac59T6P-`3R}ec<=nR6I$)(SwRbj?rixk?&Q8-9-RF4eNmSg2 z663^ss$*tXHNAo@8yOyBkASE&QX!_g_eZL5kz%?d$|7bzfA_}WcIHM15qHC3n0?ng z9cxHui7)&*6puBt^Ta8g#l`S(4(D`T5ol>m!Z!c_H06BdkU$A+00G1VfS>^YrSz7O PvBYQl0ssI200dcD>**BG