manned/indexer/src/open.rs

use std::io::{BufRead,BufReader,Read,Result,Error,ErrorKind,copy};
use std::fs::{File,create_dir_all,metadata,read_dir,remove_file};
use std::hash::{Hash,Hasher,SipHasher};
use std::time::{Duration,SystemTime};
use regex::bytes::Regex;
use url::Url;
use url::percent_encoding::percent_decode;
use hyper;


/// Directory in which downloaded files are cached (see `cache_fn()` and `clear_cache()`).
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
/// Maximum age of a cache file in seconds (20 hours); `clear_cache()` removes older files.
const CACHE_TIME: u64 = 20*3600;


/// A readable location: either an http(s) URL or, optionally, a local file path.
///
/// See `Path::open()` for how the fields are interpreted.
#[derive(Clone,Copy)]
pub struct Path<'a> {
    /// The URL, or local file name, to read from.
    pub path: &'a str,
    /// When set, URL contents are read from / stored in the on-disk cache instead of being
    /// fetched on every `open()`.
    pub cache: bool,
    /// When set, a `path` that does not parse as a URL is opened as a local file.
    pub canbelocal: bool,
}


/// Returns the cache file name used for `url`.
///
/// The result has the form `<CACHE_PATH>/<host>-<name>-<hash>`, where `<name>` is the last path
/// segment of the URL (or `index` when that segment is empty) and `<hash>` is the hash of the
/// whole URL in lowercase hexadecimal.
///
/// Panics if the URL has no path segments or no host.
fn cache_fn(url: &Url) -> String {
    let leaf = match url.path_segments().unwrap().last().unwrap() {
        "" => "index",
        segment => segment,
    };

    // The hash keeps URLs that share a host and file name from colliding.
    let mut hasher = SipHasher::new();
    url.hash(&mut hasher);
    let digest = hasher.finish();

    let host = url.host_str().unwrap();
    format!("{}/{}-{}-{:x}", CACHE_PATH, host, leaf, digest)
}


fn fetch(url: &str) -> Result<Box<Read>> {
    let res = try!(hyper::Client::new()
        .get(url)
        .header(hyper::header::UserAgent("Man page crawler (info@manned.org; https://manned.org/)".to_owned()))
        .send()
        .map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e)))
    );
    if !res.status.is_success() {
        return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) ));
    }
    Ok(Box::new(res) as Box<Read>)
}


fn file(path: &str) -> Result<Box<Read>> {
    Ok(Box::new(try!(File::open(path))) as Box<Read>)
}


/// Creates the cache directory if necessary and removes every entry in it that was last
/// modified more than `CACHE_TIME` seconds ago.
///
/// # Errors
///
/// Returns the first I/O error encountered while creating or listing the directory, reading an
/// entry's metadata or modification time, or removing a file. Entries that have not been
/// visited yet at that point are left untouched.
pub fn clear_cache() -> Result<()> {
    create_dir_all(CACHE_PATH)?;

    // The cutoff does not depend on the entry, so compute it once.
    let cutoff = SystemTime::now() - Duration::from_secs(CACHE_TIME);

    for entry in read_dir(CACHE_PATH)? {
        let path = entry?.path();
        // `modified()` can fail (e.g. on platforms without modification times); report that to
        // the caller instead of panicking.
        if metadata(&path)?.modified()? < cutoff {
            remove_file(&path)?;
        }
    }
    Ok(())
}


impl<'a> Path<'a> {
    pub fn open(&self) -> Result<Box<Read>> {
        if let Ok(url) = Url::parse(self.path) {
            if url.scheme() != "http" && url.scheme() != "https" {
                return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
            }

            if self.cache {
                let cfn = cache_fn(&url);
                if let Ok(f) = file(&cfn) {
                    return Ok(f);
                }
                {
                    let mut rd = try!(fetch(url.as_str()));
                    let mut wr = try!(File::create(&cfn));
                    try!(copy(&mut rd, &mut wr));
                }
                file(&cfn)

            } else {
                fetch(url.as_str())
            }

        } else if self.canbelocal {
            file(self.path)

        } else {
            Err(Error::new(ErrorKind::Other, "Invalid URL"))
        }
    }

    // Attempt to parse a HTTP directory listing. Returns the name and whether it's a directory for
    // each item.
    // Only tested with a lighttpd/1.4 and apache 2.4 server.
    // (I tried using FTP before, but that didn't work out well; While FTP does return a more easily
    // parsable file list, some servers have issues with generating a list of a large directory)
    pub fn dirlist(&self) -> Result<Vec<(String,bool)>> {
        lazy_static!(
            static ref RE: Regex = Regex::new("(?i:<a +href *= *\"([^?/\"]+)(/?)\">)").unwrap();
        );
        let rd = self.open()?;
        let brd = BufReader::new(rd);
        let mut res = Vec::new();
        for line in brd.split(b'\n') {
            let line = line?;
            let mut matches = RE.captures_iter(&line);
            let first = matches.next();

            // There's only a single link per line.
            if first.is_some() && matches.next().is_some() {
                continue;
            }

            if let Some(cap) = first {
                let name = cap.at(1).unwrap();
                if name == b".." || name.starts_with(b"/") {
                    continue;
                }
                if let Ok(name) = percent_decode(name).decode_utf8() {
                    let isdir = cap.at(2) == Some(b"/");
                    res.push((name.to_string(), isdir));
                }
            }
        }
        Ok(res)
    }

}