indexer: Switch to ureq + debloat stuff a bit
Also stop using the "url" crate directly; its API is too unstable to be worth using. (That applies to several other crates as well, but meh.)
This commit is contained in:
parent
4588e67b64
commit
c48feedc85
5 changed files with 184 additions and 1065 deletions
1190
indexer/Cargo.lock
generated
1190
indexer/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -11,10 +11,10 @@ lazy_static = "1.0.0"
|
||||||
libc = "0.2.39"
|
libc = "0.2.39"
|
||||||
libarchive3-sys = "0.1.2"
|
libarchive3-sys = "0.1.2"
|
||||||
encoding = { git = "https://github.com/lifthrasiir/rust-encoding", features = ["no-optimized-legacy-encoding"] }
|
encoding = { git = "https://github.com/lifthrasiir/rust-encoding", features = ["no-optimized-legacy-encoding"] }
|
||||||
ring = "0.14.6"
|
ring = "0.16.20"
|
||||||
postgres = "0.17.5"
|
postgres = "0.17.5"
|
||||||
clap = "2.31.2"
|
clap = "2.31.2"
|
||||||
reqwest = "0.9.17"
|
ureq = "2.3.1"
|
||||||
url = "1.7.0"
|
percent-encoding = "2.0"
|
||||||
chrono = "0.4.0"
|
chrono = "0.4.0"
|
||||||
quick-xml = "0.14.0"
|
quick-xml = "0.14.0"
|
||||||
|
|
|
||||||
|
|
@ -8,10 +8,10 @@ extern crate libc;
|
||||||
extern crate ring;
|
extern crate ring;
|
||||||
extern crate encoding;
|
extern crate encoding;
|
||||||
extern crate postgres;
|
extern crate postgres;
|
||||||
extern crate reqwest;
|
extern crate ureq;
|
||||||
extern crate url;
|
|
||||||
extern crate chrono;
|
extern crate chrono;
|
||||||
extern crate quick_xml;
|
extern crate quick_xml;
|
||||||
|
extern crate percent_encoding;
|
||||||
|
|
||||||
mod archive;
|
mod archive;
|
||||||
mod archread;
|
mod archread;
|
||||||
|
|
|
||||||
|
|
@ -204,7 +204,7 @@ pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'st
|
||||||
return Err(io::Error::new(io::ErrorKind::InvalidData, e));
|
return Err(io::Error::new(io::ErrorKind::InvalidData, e));
|
||||||
}
|
}
|
||||||
|
|
||||||
let dig = digest::digest(&digest::SHA1, &data);
|
let dig = digest::digest(&digest::SHA1_FOR_LEGACY_USE_ONLY, &data);
|
||||||
|
|
||||||
// Create a list of encodings to try, starting with UTF-8
|
// Create a list of encodings to try, starting with UTF-8
|
||||||
let mut encs : Vec<EncodingRef> = vec![all::UTF_8];
|
let mut encs : Vec<EncodingRef> = vec![all::UTF_8];
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,8 @@ use std::fs::{File,create_dir_all,metadata,read_dir,remove_file};
|
||||||
use std::time::{Duration,SystemTime};
|
use std::time::{Duration,SystemTime};
|
||||||
use regex::bytes::Regex;
|
use regex::bytes::Regex;
|
||||||
use ring::digest;
|
use ring::digest;
|
||||||
use url::Url;
|
use percent_encoding::percent_decode;
|
||||||
use url::percent_encoding::percent_decode;
|
use ureq;
|
||||||
use reqwest;
|
|
||||||
|
|
||||||
|
|
||||||
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
|
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
|
||||||
|
|
@ -20,29 +19,16 @@ pub struct Path<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
fn cache_fn(url: &Url) -> String {
|
|
||||||
let name = url.path_segments().unwrap().last().unwrap();
|
|
||||||
let name = if name == "" { "index" } else { name };
|
|
||||||
|
|
||||||
let hash = digest::digest(&digest::SHA1, url.as_str().as_bytes())
|
|
||||||
.as_ref()[0..8].into_iter()
|
|
||||||
.fold(0u64, |a, &e| (a<<8) + e as u64);
|
|
||||||
|
|
||||||
format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
fn fetch(url: &str) -> Result<Box<Read>> {
|
fn fetch(url: &str) -> Result<Box<Read>> {
|
||||||
let res = try!(reqwest::Client::new()
|
let res = try!(ureq::get(url)
|
||||||
.get(url)
|
.set("User-Agent", "Man page crawler (info@manned.org; https://manned.org/)")
|
||||||
.header("User-Agent", "Man page crawler (info@manned.org; https://manned.org/)")
|
.call()
|
||||||
.send()
|
.map_err(|e| Error::new(ErrorKind::Other, format!("Ureq: {}", e)))
|
||||||
.map_err(|e| Error::new(ErrorKind::Other, format!("Reqwest: {}", e)))
|
|
||||||
);
|
);
|
||||||
if !res.status().is_success() {
|
if res.status() != 200 {
|
||||||
return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status()) ));
|
return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status()) ));
|
||||||
}
|
}
|
||||||
Ok(Box::new(res) as Box<Read>)
|
Ok(Box::new(res.into_reader()) as Box<Read>)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -66,27 +52,26 @@ pub fn clear_cache() -> Result<()> {
|
||||||
|
|
||||||
impl<'a> Path<'a> {
|
impl<'a> Path<'a> {
|
||||||
pub fn open(&self) -> Result<Box<Read>> {
|
pub fn open(&self) -> Result<Box<Read>> {
|
||||||
if let Ok(url) = Url::parse(self.path) {
|
if self.path.starts_with("http://") || self.path.starts_with("https://") {
|
||||||
if url.scheme() != "http" {
|
|
||||||
return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
|
|
||||||
}
|
|
||||||
|
|
||||||
if self.cache {
|
if self.cache {
|
||||||
let cfn = cache_fn(&url);
|
let hash = digest::digest(&digest::SHA256, self.path.as_bytes())
|
||||||
|
.as_ref()[0..8].into_iter()
|
||||||
|
.fold(0u64, |a, &e| (a<<8) + e as u64);
|
||||||
|
|
||||||
|
let cfn = format!("{}/{:x}", CACHE_PATH, hash);
|
||||||
if let Ok(f) = file(&cfn) {
|
if let Ok(f) = file(&cfn) {
|
||||||
return Ok(f);
|
return Ok(f);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut rd = try!(fetch(url.as_str()));
|
let mut rd = try!(fetch(self.path));
|
||||||
let mut wr = try!(File::create(&cfn));
|
let mut wr = try!(File::create(&cfn));
|
||||||
try!(copy(&mut rd, &mut wr));
|
try!(copy(&mut rd, &mut wr));
|
||||||
}
|
}
|
||||||
file(&cfn)
|
file(&cfn)
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
fetch(url.as_str())
|
fetch(self.path)
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if self.canbelocal {
|
} else if self.canbelocal {
|
||||||
file(self.path)
|
file(self.path)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue