Add support for man page reading & decoding
This commit is contained in:
parent
c8bb4da246
commit
0cab758665
10 changed files with 394 additions and 13 deletions
82
indexer/Cargo.lock
generated
82
indexer/Cargo.lock
generated
|
|
@ -2,12 +2,14 @@
|
|||
name = "indexer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -18,6 +20,63 @@ dependencies = [
|
|||
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding"
|
||||
version = "0.2.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-japanese"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-korean"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-simpchinese"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-singlebyte"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding-index-tradchinese"
|
||||
version = "1.20141219.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_index_tests"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.3.5"
|
||||
|
|
@ -90,6 +149,15 @@ name = "regex-syntax"
|
|||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread-id"
|
||||
version = "2.0.0"
|
||||
|
|
@ -107,6 +175,11 @@ dependencies = [
|
|||
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "utf8-ranges"
|
||||
version = "0.1.3"
|
||||
|
|
@ -124,6 +197,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
|
||||
[metadata]
|
||||
"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
|
||||
"checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
|
||||
"checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
|
||||
"checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
|
||||
"checksum encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
|
||||
"checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
|
||||
"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
|
||||
"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
|
||||
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
|
||||
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
|
||||
"checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f"
|
||||
|
|
@ -134,8 +214,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
"checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa"
|
||||
"checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f"
|
||||
"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957"
|
||||
"checksum ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c0743ef007bcff4909b107907a410418eb7e5c6ad55b843d70b39f62bfb7112e"
|
||||
"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03"
|
||||
"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
|
||||
"checksum untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5d9bc0e6e73a10975d1fbff8ac3541e221181b0d8998351600fb5523de634c0d"
|
||||
"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"
|
||||
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
|
||||
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
|
||||
|
|
|
|||
|
|
@ -10,3 +10,5 @@ env_logger = "0.3.5"
|
|||
lazy_static = "0.2.1"
|
||||
libc = "0.2.17"
|
||||
libarchive3-sys = "0.1.2"
|
||||
encoding = "0.2.33"
|
||||
ring = "0.5.3"
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ pub struct Archive<'a> {
|
|||
rd: &'a mut Read,
|
||||
buf: Vec<u8>,
|
||||
err: Option<Error>,
|
||||
eof: bool,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -34,6 +35,8 @@ pub struct ArchiveEntry<'a> {
|
|||
e: *mut ffi::Struct_archive_entry,
|
||||
}
|
||||
|
||||
pub struct RawEntry<'a>(Box<Archive<'a>>);
|
||||
|
||||
|
||||
#[derive(Debug,PartialEq,Eq)]
|
||||
pub enum FileType {
|
||||
|
|
@ -65,7 +68,7 @@ impl<'a> Archive<'a> {
|
|||
let bufsize = 64*1024;
|
||||
let mut buf = Vec::with_capacity(bufsize);
|
||||
unsafe { buf.set_len(bufsize) };
|
||||
let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None });
|
||||
let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None, eof: false });
|
||||
|
||||
let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void;
|
||||
let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) };
|
||||
|
|
@ -76,10 +79,18 @@ impl<'a> Archive<'a> {
|
|||
}
|
||||
|
||||
fn error(&mut self) -> Error {
|
||||
// TODO: Do something with the description
|
||||
self.err.take().unwrap_or_else(||
|
||||
Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) })
|
||||
)
|
||||
self.err.take().unwrap_or_else(|| {
|
||||
let err = Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) });
|
||||
let desc = unsafe { ffi::archive_error_string(self.a) };
|
||||
if desc.is_null() {
|
||||
return err;
|
||||
}
|
||||
if let Ok(s) = str::from_utf8(unsafe { CStr::from_ptr(desc) }.to_bytes()) {
|
||||
Error::new(err.kind(), s)
|
||||
} else {
|
||||
err
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn entry(self: Box<Self>) -> Result<Option<ArchiveEntry<'a>>> {
|
||||
|
|
@ -87,6 +98,7 @@ impl<'a> Archive<'a> {
|
|||
a: self,
|
||||
e: ptr::null_mut()
|
||||
};
|
||||
ent.a.eof = false;
|
||||
let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) };
|
||||
match res {
|
||||
ffi::ARCHIVE_EOF => Ok(None),
|
||||
|
|
@ -96,9 +108,15 @@ impl<'a> Archive<'a> {
|
|||
}
|
||||
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
|
||||
// libarchive tends to throw an error if you try to read after an EOF; handle that case
|
||||
// here.
|
||||
if self.eof {
|
||||
return Ok(0);
|
||||
}
|
||||
let cbuf = buf.as_mut_ptr() as *mut c_void;
|
||||
let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) };
|
||||
if n >= 0 {
|
||||
self.eof = n == 0;
|
||||
Ok(n as usize)
|
||||
} else {
|
||||
Err(self.error())
|
||||
|
|
@ -114,6 +132,27 @@ impl<'a> Archive<'a> {
|
|||
};
|
||||
try!(Self::new(rd, a)).entry()
|
||||
}
|
||||
|
||||
pub fn open_raw(rd: &mut Read) -> Result<RawEntry> {
|
||||
let a = unsafe {
|
||||
let a = ffi::archive_read_new();
|
||||
ffi::archive_read_support_filter_all(a);
|
||||
ffi::archive_read_support_format_raw(a);
|
||||
ffi::archive_read_support_format_empty(a);
|
||||
a
|
||||
};
|
||||
let mut a = try!(Self::new(rd, a));
|
||||
let mut e: *mut ffi::Struct_archive_entry = ptr::null_mut();
|
||||
let res = unsafe { ffi::archive_read_next_header(a.a, &mut e) };
|
||||
match res {
|
||||
ffi::ARCHIVE_FATAL => Err(a.error()),
|
||||
ffi::ARCHIVE_EOF => {
|
||||
a.eof = true;
|
||||
Ok(RawEntry(a))
|
||||
},
|
||||
_ => Ok(RawEntry(a))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -197,6 +236,13 @@ impl<'a> Read for ArchiveEntry<'a> {
|
|||
}
|
||||
|
||||
|
||||
impl<'a> Read for RawEntry<'a> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
|
||||
self.0.read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// We can't provide an Iterator object for ArchiveEntries because Rust doesn't support streaming
|
||||
// iterators. Let's instead provide a walk function for convenience.
|
||||
// cb should return Ok(true) to continue, Ok(false) to break
|
||||
|
|
@ -223,22 +269,28 @@ mod tests {
|
|||
use std::fs::File;
|
||||
|
||||
#[test]
|
||||
fn invalid_archive() {
|
||||
fn invalid() {
|
||||
let mut r = std::io::repeat(0x0a).take(64*1024);
|
||||
let ent = Archive::open_archive(&mut r);
|
||||
assert!(ent.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn zerolength_archive() {
|
||||
fn zerolength() {
|
||||
let mut r = std::io::empty();
|
||||
let ent = Archive::open_archive(&mut r);
|
||||
// I expected an error here rather than None, whatever.
|
||||
assert!(ent.unwrap().is_none());
|
||||
{
|
||||
let ent = Archive::open_archive(&mut r);
|
||||
assert!(ent.unwrap().is_none());
|
||||
}
|
||||
{
|
||||
let mut ent = Archive::open_raw(&mut r).unwrap();
|
||||
let mut v = Vec::new();
|
||||
assert_eq!(ent.read_to_end(&mut v).unwrap(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn read() {
|
||||
fn archive() {
|
||||
let mut f = File::open("tests/simpletest.tar.gz").unwrap();
|
||||
let mut ent = Archive::open_archive(&mut f).unwrap().unwrap();
|
||||
|
||||
|
|
@ -270,4 +322,22 @@ mod tests {
|
|||
|
||||
assert!(ent.next().unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
fn raw() {
    // The fixture holds "File contents!\n" compressed with gzip, then xz, then bzip2.
    let mut file = File::open("tests/rawtest.gz.xz.bzip2").unwrap();
    let mut entry = Archive::open_raw(&mut file).unwrap();

    let mut contents = String::new();
    entry.read_to_string(&mut contents).unwrap();
    assert_eq!(&contents, "File contents!\n");
}
|
||||
|
||||
#[test]
fn raw_passthrough() {
    // Uncompressed input must come out of open_raw() unchanged.
    let mut input = std::io::Cursor::new(&b"This is an uncompressed text file"[..]);
    let mut entry = Archive::open_raw(&mut input).unwrap();

    let mut text = String::new();
    entry.read_to_string(&mut text).unwrap();
    assert_eq!(&text, "This is an uncompressed text file");
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ extern crate env_logger;
|
|||
extern crate regex;
|
||||
extern crate libarchive3_sys;
|
||||
extern crate libc;
|
||||
extern crate ring;
|
||||
extern crate encoding;
|
||||
|
||||
mod archive;
|
||||
mod archread;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,19 @@
|
|||
use std::str;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use regex::bytes;
|
||||
use regex::Regex;
|
||||
use encoding;
|
||||
use encoding::{all,EncodingRef};
|
||||
use encoding::label::encoding_from_whatwg_label;
|
||||
use ring::digest;
|
||||
|
||||
use archive::Archive;
|
||||
|
||||
// Anything larger than this just isn't a man page. I hope.
|
||||
const MAX_MAN_SIZE: u64 = 20*1024*1024;
|
||||
// I've also not seen valid man pages smaller than this
|
||||
const MIN_MAN_SIZE: u64 = 9;
|
||||
|
||||
|
||||
// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page
|
||||
|
|
@ -24,9 +39,9 @@ fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
|
|||
// Not everything matching the regex is necessarily a man page, exclude some special cases.
|
||||
match (name, section, locale) {
|
||||
// Files that totally aren't man pages
|
||||
("Makefile", "in", _) |
|
||||
("Makefile", "am", _) |
|
||||
(".cvsignore", _, _) |
|
||||
(_, "in", _) |
|
||||
(_, "gz", _) |
|
||||
(_, "lzma", _) |
|
||||
(_, "bz2", _) |
|
||||
|
|
@ -51,6 +66,165 @@ fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
|
|||
}
|
||||
|
||||
|
||||
// Convenient wrapper for archread's interest_cb
|
||||
pub fn ismanpath(path: &str) -> bool {
|
||||
parse_path(path).is_some()
|
||||
}
|
||||
|
||||
|
||||
fn validate(data: &Vec<u8>) -> Option<&'static str> {
|
||||
lazy_static! {
|
||||
static ref HTML: bytes::Regex = bytes::Regex::new(r"^\s*<(?:html|head|!DOCTYPE)").unwrap();
|
||||
}
|
||||
|
||||
if data.len() >= MAX_MAN_SIZE as usize {
|
||||
Some("File too large")
|
||||
} else if data.len() < MIN_MAN_SIZE as usize {
|
||||
Some("File too small")
|
||||
} else if &data[..] == &b".so man3/\n"[..] {
|
||||
Some("Contents: '.so man3/'")
|
||||
} else if &data[..] == &b"timestamp\n"[..] {
|
||||
Some("Contents: 'timestamp'")
|
||||
} else if HTML.is_match(&data) {
|
||||
Some("Looks like an HTML file")
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Look for 'coding:' indications in the file header, a la preconv(1).
|
||||
fn codec_from_tag(data: &Vec<u8>) -> Option<EncodingRef> {
|
||||
lazy_static! {
|
||||
// According to the emacs docs the tag should be on the first line; according to preconv(1)
|
||||
// it should be on the first or second line. I've also seen some files with the tag on the
|
||||
// last line. I've not seen the tag itself used in a different context, so just get it from
|
||||
// anywhere...
|
||||
static ref TAG: bytes::Regex = bytes::Regex::new(r"-\*-.*coding:\s*(?u:([^\s;]+)).*-\*").unwrap();
|
||||
}
|
||||
let cap = match TAG.captures(&data) { Some(x) => x, None => return None };
|
||||
let tag = str::from_utf8(cap.at(1).unwrap()).unwrap().to_lowercase();
|
||||
|
||||
match &tag[..] {
|
||||
// Deny some common UTF-8-compatible encodings. These tags are obviously incorrect.
|
||||
"us-ascii" | "ascii" | "utf8" | "utf-8" | "utf-8-unix" => None,
|
||||
|
||||
// latin-1 isn't in the whatwg spec under that name
|
||||
"latin-1" => Some(all::WINDOWS_1252),
|
||||
|
||||
// Waaaaaaaaah we can't decode this :(
|
||||
"armscii-8" => None,
|
||||
|
||||
// Anything else should be found by its whatwg label.
|
||||
x => match encoding_from_whatwg_label(x) {
|
||||
Some(x) => Some(x),
|
||||
None => { warn!("Unknown encoding in emacs tag: {}", x); None },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn codec_from_path(path: &str) -> Option<EncodingRef> {
|
||||
let locale = match parse_path(path) {
|
||||
Some((_,_,l)) if l != "" => l.to_lowercase(),
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
lazy_static! {
|
||||
static ref RE: Regex = Regex::new(r"^(?x)
|
||||
([a-z]+) # primary language
|
||||
(?:_ ([a-z]+))? # secondary language
|
||||
(?:@ [a-z]+)? # script (potentially useful, but uncommon and not currently used)
|
||||
(?:\. ([^\.@]+))? # encoding (FUCKING USEFUL)
|
||||
$").unwrap();
|
||||
}
|
||||
|
||||
let cap = match RE.captures(&locale) { Some(x) => x, None => return None };
|
||||
let lang = cap.at(1).unwrap();
|
||||
let seclang = cap.at(2);
|
||||
let enc = cap.at(3);
|
||||
|
||||
// Try to do something with the encoding tag
|
||||
match (lang, enc) {
|
||||
(_, Some("eucjp")) |
|
||||
(_, Some("ujis")) | // Not sure about this one, but it seems to come out alright
|
||||
("ja", Some("euc")) => return Some(all::EUC_JP),
|
||||
|
||||
(_, Some("euckr")) => return Some(all::WINDOWS_949),
|
||||
|
||||
("ja", Some("jis7")) |
|
||||
("ja", Some("pck")) => return None, /* WAT? TODO: DO SOMETHING WITH THESE */
|
||||
|
||||
(_, Some(x)) => match encoding_from_whatwg_label(x) {
|
||||
Some(x) => return Some(x),
|
||||
_ => { warn!("Unknown encoding in locale: {}", x) },
|
||||
},
|
||||
_ => {},
|
||||
};
|
||||
|
||||
// Fall back to language
|
||||
match (lang, seclang) {
|
||||
("pl", _) |
|
||||
("cs", _) |
|
||||
("hr", _) |
|
||||
("hu", _) |
|
||||
("sl", _) |
|
||||
("sk", _) => Some(all::ISO_8859_2),
|
||||
("bg", _) |
|
||||
("be", _) |
|
||||
("uk", _) => Some(all::ISO_8859_5),
|
||||
("el", _) => Some(all::ISO_8859_7),
|
||||
("et", _) => Some(all::ISO_8859_15),
|
||||
("tr", _) => Some(all::WINDOWS_1254),
|
||||
("ru", _) => Some(all::KOI8_R),
|
||||
("ja", _) |
|
||||
("jp", _) => Some(all::EUC_JP), // Tricky; but JIS is certainly less common
|
||||
("zh", Some("cn")) => Some(all::GBK), // These are based purely on what I've observed,
|
||||
("zh", _) => Some(all::BIG5_2003), // perhaps some heuristics based on contents can do better
|
||||
("ko", _) => Some(all::WINDOWS_949),
|
||||
(_, _) => None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Decompresses / decodes a man page and returns its SHA-1 hash, encoding name, and UTF-8 contents.
|
||||
pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'static str,String)> {
|
||||
let mut decomp = try!(Archive::open_raw(ent)).take(MAX_MAN_SIZE+1);
|
||||
let mut data = Vec::new();
|
||||
try!(decomp.read_to_end(&mut data));
|
||||
|
||||
if let Some(e) = validate(&data) {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, e));
|
||||
}
|
||||
|
||||
let dig = digest::digest(&digest::SHA1, &data);
|
||||
|
||||
// TODO: Handle BOM? UTF-16?
|
||||
// If it passes as UTF-8, then just consider it UTF-8.
|
||||
if let Ok(_) = str::from_utf8(&data) {
|
||||
return Ok((dig, "utf8", unsafe { String::from_utf8_unchecked(data) } ));
|
||||
}
|
||||
// Otherwise, look for a coding tag in the contents
|
||||
if let Some(e) = codec_from_tag(&data) {
|
||||
if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) {
|
||||
return Ok((dig, e.name(), s));
|
||||
}
|
||||
}
|
||||
// If that fails as well, look for clues in the file path.
|
||||
for path in paths {
|
||||
if let Some(e) = codec_from_path(path) {
|
||||
if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) {
|
||||
return Ok((dig, e.name(), s));
|
||||
}
|
||||
}
|
||||
}
|
||||
// If all else fails, use a lossy iso-8859-1
|
||||
Ok((dig, "iso-8859-1", (all::ISO_8859_1 as EncodingRef).decode(&data, encoding::DecoderTrap::Ignore).unwrap() ))
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_parse_path() {
|
||||
// Generic tests
|
||||
|
|
@ -83,3 +257,45 @@ fn test_parse_path() {
|
|||
assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None);
|
||||
assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None);
|
||||
}
|
||||
|
||||
|
||||
#[test]
fn test_codec_from_path() {
    // (path, name of the encoding codec_from_path() is expected to pick)
    let cases = [
        ("man/de_DE.ISO8859-15/man1/scribus.1.gz", "iso-8859-15"),
        ("man/de_DE.ISO_8859-1/man1/scribus.1.gz", "windows-1252"),
        ("man/ja.UTF-8/man1/test.1", "utf-8"),
        ("man/ja_JP/man1/test.1", "euc-jp"),
        ("man/ja_JP.EUC/man1/test.1", "euc-jp"),
        ("man/ja_JP.SJIS/man1/test.1", "windows-31j"),
        ("man/jp.eucJP/man1/test.1", "euc-jp"),
        ("man/jp/man1/test.1", "euc-jp"),
        ("man/lt.ISO8859-13/man1/test.1", "iso-8859-13"),
        ("man/ru/man1/test.1", "koi8-r"),
        ("man/ru_RU@Cyr/man1/test.1", "koi8-r"),
        ("man/zh_CN/man1/test.1", "gbk"),
        ("man/zh_TW/man1/test.1", "big5-2003"),
    ];
    for &(path, expected) in cases.iter() {
        assert_eq!(codec_from_path(path).unwrap().name(), expected);
    }
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_decode_zh() {
|
||||
use std::fs::File;
|
||||
use ring::test::from_hex;
|
||||
|
||||
// cat exit.3.gz.lzma | lzma -d | gzip -d | sha1sum
|
||||
let filehash = from_hex("cdf9b3e8d96a83c908eb0a0c277485e2f3eebe87").unwrap();
|
||||
// cat exit.3.gz.lzma | lzma -d | gzip -d | iconv -f gbk -t utf8 | sha1sum
|
||||
let utf8hash = from_hex("47f3e441137b207c0abdc38adac692298da4927a").unwrap();
|
||||
|
||||
let mut f = File::open("tests/exit.3.gz.lzma").unwrap();
|
||||
let (dig, enc, s) = decode(&["bullshit", "/usr/share/man/zh_CN/man3/exit.3.gz"][..], &mut f).unwrap();
|
||||
|
||||
assert_eq!(dig.as_ref(), &filehash[..]);
|
||||
assert_eq!(enc, "gbk");
|
||||
|
||||
let utf8dig = digest::digest(&digest::SHA1, s.as_bytes());
|
||||
assert_eq!(utf8dig.as_ref(), &utf8hash[..]);
|
||||
}
|
||||
|
|
|
|||
BIN
indexer/tests/exit.3.gz.lzma
Normal file
BIN
indexer/tests/exit.3.gz.lzma
Normal file
Binary file not shown.
|
|
@ -1,10 +1,12 @@
|
|||
#!/bin/sh
|
||||
|
||||
# The order of inserting the files into the tar is not fully deterministic this
|
||||
# way. The tests will fail quite badly if hardlink.6 is considered the
|
||||
# way. The tests will fail quite badly if a hardlink is considered the
|
||||
# "original" version.
|
||||
|
||||
|
||||
# simpletest.tar.gz
|
||||
|
||||
mkdir simple
|
||||
echo Hi >simple/file
|
||||
ln -s file simple/link
|
||||
|
|
@ -17,6 +19,13 @@ rm -rf $badfn simple
|
|||
|
||||
|
||||
|
||||
# rawtest.gz.xz.bzip2
|
||||
|
||||
echo "File contents!" | gzip | xz | bzip2 >rawtest.gz.xz.bzip2
|
||||
|
||||
|
||||
# testarchive.tar.xz
|
||||
|
||||
mkdir man
|
||||
cd man
|
||||
|
||||
BIN
indexer/tests/rawtest.gz.xz.bzip2
Normal file
BIN
indexer/tests/rawtest.gz.xz.bzip2
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue