manned/indexer/src/open.rs

118 lines
3.6 KiB
Rust

use std::io::{BufRead,BufReader,Read,Result,Error,ErrorKind,copy};
use std::fs::{File,create_dir_all,metadata,read_dir,remove_file};
use std::time::{Duration,SystemTime};
use regex::bytes::Regex;
use ring::digest;
use percent_encoding::percent_decode;
use ureq;
/// Directory in which downloaded files are cached.
const CACHE_PATH: &str = "/var/tmp/manned-indexer";
/// Maximum age of a cache entry, in seconds (20 hours), before `clear_cache` removes it.
const CACHE_TIME: u64 = 20 * 60 * 60;
/// A location to read from: either an `http://` / `https://` URL or a local file path.
#[derive(Debug, Clone, Copy)]
pub struct Path<'a> {
    /// The URL or local file path.
    pub path: &'a str,
    /// Whether a remote fetch should be served from, and stored in, the on-disk cache.
    pub cache: bool,
    /// Whether `path` may be opened as a local file when it is not an HTTP(S) URL.
    pub canbelocal: bool,
}
/// Performs an HTTP GET on `url` using the crawler's User-Agent and returns a reader over
/// the response body.
///
/// # Errors
///
/// A failed request is reported as an `ErrorKind::Other` error prefixed with `Ureq: `;
/// a response whose status is not 200 is reported as `HTTP: <status>`.
fn fetch(url: &str) -> Result<Box<dyn Read>> {
    let outcome = ureq::get(url)
        .set("User-Agent", "Man page crawler (info@manned.org; https://manned.org/)")
        .call();
    let response = match outcome {
        Ok(response) => response,
        Err(e) => return Err(Error::new(ErrorKind::Other, format!("Ureq: {}", e))),
    };
    let status = response.status();
    if status == 200 {
        Ok(Box::new(response.into_reader()) as Box<dyn Read>)
    } else {
        Err(Error::new(ErrorKind::Other, format!("HTTP: {}", status)))
    }
}
/// Opens the local file at `path` and returns it as a boxed reader.
///
/// # Errors
///
/// Returns the error from `File::open` unchanged.
fn file(path: &str) -> Result<Box<dyn Read>> {
    File::open(path).map(|handle| Box::new(handle) as Box<dyn Read>)
}
/// Removes every entry in the cache directory whose modification time is more than
/// `CACHE_TIME` seconds in the past. The directory is created first if it does not exist.
///
/// # Errors
///
/// Returns the first I/O error encountered while creating or listing the directory,
/// reading an entry's metadata or modification time, or removing an entry.
pub fn clear_cache() -> Result<()> {
    create_dir_all(CACHE_PATH)?;
    // Compute the expiry threshold once. `checked_sub` avoids the panic that `-` would
    // raise if the system clock is too close to the earliest representable time; in that
    // case no file can be older than the threshold, so there is nothing to remove.
    let cutoff = match SystemTime::now().checked_sub(Duration::from_secs(CACHE_TIME)) {
        Some(cutoff) => cutoff,
        None => return Ok(()),
    };
    for entry in read_dir(CACHE_PATH)? {
        let path = entry?.path();
        // Propagate a failure to read the modification time instead of panicking.
        if metadata(&path)?.modified()? < cutoff {
            remove_file(&path)?;
        }
    }
    Ok(())
}
impl<'a> Path<'a> {
pub fn open(&self) -> Result<Box<dyn Read>> {
if self.path.starts_with("http://") || self.path.starts_with("https://") {
if self.cache {
let hash = digest::digest(&digest::SHA256, self.path.as_bytes())
.as_ref()[0..8].into_iter()
.fold(0u64, |a, &e| (a<<8) + e as u64);
let cfn = format!("{}/{:x}", CACHE_PATH, hash);
if let Ok(f) = file(&cfn) {
return Ok(f);
}
{
let mut rd = fetch(self.path)?;
let mut wr = File::create(&cfn)?;
copy(&mut rd, &mut wr)?;
}
file(&cfn)
} else {
fetch(self.path)
}
} else if self.canbelocal {
file(self.path)
} else {
Err(Error::new(ErrorKind::Other, "Invalid URL"))
}
}
// Attempt to parse a HTTP directory listing. Returns the name and whether it's a directory for
// each item.
// Only tested with a lighttpd/1.4 and apache 2.4 server.
// (I tried using FTP before, but that didn't work out well; While FTP does return a more easily
// parsable file list, some servers have issues with generating a list of a large directory)
pub fn dirlist(&self) -> Result<Vec<(String,bool)>> {
lazy_static!(
static ref RE: Regex = Regex::new("(?i:<a +href *= *\"([^?/\"]+)(/)?\">)").unwrap();
);
let rd = self.open()?;
let brd = BufReader::new(rd);
let mut res = Vec::new();
for line in brd.split(b'\n') {
let line = line?;
let mut matches = RE.captures_iter(&line);
let first = matches.next();
// There's only a single link per line.
if first.is_some() && matches.next().is_some() {
continue;
}
if let Some(cap) = first {
let name = &cap[1];
if name == b".." || name.starts_with(b"/") {
continue;
}
if let Ok(name) = percent_decode(name).decode_utf8() {
let isdir = cap.get(2).is_some();
res.push((name.to_string(), isdir));
}
}
}
Ok(res)
}
}