indexer: Switch to ureq + debloat stuff a bit

And stop using the "url" crate directly; its API is too unstable to be
worth depending on.

...that applies to several other crates as well, but meh.
This commit is contained in:
Yorhel 2021-12-11 11:38:26 +01:00
parent 4588e67b64
commit c48feedc85
5 changed files with 184 additions and 1065 deletions

1190
indexer/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -11,10 +11,10 @@ lazy_static = "1.0.0"
libc = "0.2.39"
libarchive3-sys = "0.1.2"
encoding = { git = "https://github.com/lifthrasiir/rust-encoding", features = ["no-optimized-legacy-encoding"] }
ring = "0.14.6"
ring = "0.16.20"
postgres = "0.17.5"
clap = "2.31.2"
reqwest = "0.9.17"
url = "1.7.0"
ureq = "2.3.1"
percent-encoding = "2.0"
chrono = "0.4.0"
quick-xml = "0.14.0"

View file

@ -8,10 +8,10 @@ extern crate libc;
extern crate ring;
extern crate encoding;
extern crate postgres;
extern crate reqwest;
extern crate url;
extern crate ureq;
extern crate chrono;
extern crate quick_xml;
extern crate percent_encoding;
mod archive;
mod archread;

View file

@ -204,7 +204,7 @@ pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'st
return Err(io::Error::new(io::ErrorKind::InvalidData, e));
}
let dig = digest::digest(&digest::SHA1, &data);
let dig = digest::digest(&digest::SHA1_FOR_LEGACY_USE_ONLY, &data);
// Create a list of encodings to try, starting with UTF-8
let mut encs : Vec<EncodingRef> = vec![all::UTF_8];

View file

@ -3,9 +3,8 @@ use std::fs::{File,create_dir_all,metadata,read_dir,remove_file};
use std::time::{Duration,SystemTime};
use regex::bytes::Regex;
use ring::digest;
use url::Url;
use url::percent_encoding::percent_decode;
use reqwest;
use percent_encoding::percent_decode;
use ureq;
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
@ -20,29 +19,16 @@ pub struct Path<'a> {
}
fn cache_fn(url: &Url) -> String {
let name = url.path_segments().unwrap().last().unwrap();
let name = if name == "" { "index" } else { name };
let hash = digest::digest(&digest::SHA1, url.as_str().as_bytes())
.as_ref()[0..8].into_iter()
.fold(0u64, |a, &e| (a<<8) + e as u64);
format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash)
}
fn fetch(url: &str) -> Result<Box<Read>> {
let res = try!(reqwest::Client::new()
.get(url)
.header("User-Agent", "Man page crawler (info@manned.org; https://manned.org/)")
.send()
.map_err(|e| Error::new(ErrorKind::Other, format!("Reqwest: {}", e)))
let res = try!(ureq::get(url)
.set("User-Agent", "Man page crawler (info@manned.org; https://manned.org/)")
.call()
.map_err(|e| Error::new(ErrorKind::Other, format!("Ureq: {}", e)))
);
if !res.status().is_success() {
if res.status() != 200 {
return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status()) ));
}
Ok(Box::new(res) as Box<Read>)
Ok(Box::new(res.into_reader()) as Box<Read>)
}
@ -66,27 +52,26 @@ pub fn clear_cache() -> Result<()> {
impl<'a> Path<'a> {
pub fn open(&self) -> Result<Box<Read>> {
if let Ok(url) = Url::parse(self.path) {
if url.scheme() != "http" {
return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
}
if self.path.starts_with("http://") || self.path.starts_with("https://") {
if self.cache {
let cfn = cache_fn(&url);
let hash = digest::digest(&digest::SHA256, self.path.as_bytes())
.as_ref()[0..8].into_iter()
.fold(0u64, |a, &e| (a<<8) + e as u64);
let cfn = format!("{}/{:x}", CACHE_PATH, hash);
if let Ok(f) = file(&cfn) {
return Ok(f);
}
{
let mut rd = try!(fetch(url.as_str()));
let mut rd = try!(fetch(self.path));
let mut wr = try!(File::create(&cfn));
try!(copy(&mut rd, &mut wr));
}
file(&cfn)
} else {
fetch(url.as_str())
fetch(self.path)
}
} else if self.canbelocal {
file(self.path)