indexer: Support for indexing FreeBSD <9.3 repositories

This commit is contained in:
Yorhel 2016-12-11 10:59:54 +01:00
parent 1ca0cd4325
commit defaa032f8
9 changed files with 892 additions and 826 deletions

View file

@ -19,6 +19,7 @@ mod open;
mod pkg;
mod sys_arch;
mod sys_deb;
mod sys_freebsd1;
// Convenience function to get a system id by short-name. Panics if the system doesn't exist.
@ -60,6 +61,12 @@ fn main() {
(@arg contents: --contents +takes_value "Contents file")
(@arg packages: --packages +required +takes_value "Packages file")
)
(@subcommand freebsd1 =>
(about: "Index packages from a FreeBSD <= 9.2 package repo")
(@arg sys: --sys +required +takes_value "System short-name")
(@arg mirror: --mirror +required +takes_value "Mirror URL (should point to the packages/ dir)")
(@arg arch: --arch +required +takes_value "Arch")
)
).get_matches();
unsafe { pkg::DRY_RUN = arg.is_present("dry") };
@ -93,6 +100,7 @@ fn main() {
if let Some(matches) = arg.subcommand_matches("pkg") {
let date = match matches.value_of("date").unwrap() {
"deb" => pkg::Date::Deb,
"desc" => pkg::Date::Desc,
s => pkg::Date::Known(s),
};
pkg::pkg(&db, pkg::PkgOpt {
@ -124,5 +132,13 @@ fn main() {
);
}
if let Some(matches) = arg.subcommand_matches("freebsd1") {
sys_freebsd1::sync(&db,
sysbyshort(&db, matches.value_of("sys").unwrap()),
matches.value_of("arch").unwrap(),
matches.value_of("mirror").unwrap()
).unwrap_or_else(|e| error!("{}", e));
}
trace!("Exiting");
}

View file

@ -1,8 +1,10 @@
use std::io::{Read,Result,Error,ErrorKind,copy};
use std::io::{BufRead,BufReader,Read,Result,Error,ErrorKind,copy};
use std::fs::{File,create_dir_all,metadata,read_dir,remove_file};
use std::hash::{Hash,Hasher,SipHasher};
use std::time::{Duration,SystemTime};
use regex::bytes::Regex;
use url::Url;
use url::percent_encoding::percent_decode;
use hyper;
@ -90,4 +92,41 @@ impl<'a> Path<'a> {
Err(Error::new(ErrorKind::Other, "Invalid URL"))
}
}
// Fetch this path and scrape it as an HTML directory index.
//
// Returns one (name, is_directory) pair per recognised entry. The name is the percent-decoded
// link target; an entry counts as a directory when its link target ends in a slash.
// Returns an error if opening the path or reading from it fails.
//
// Only tested against lighttpd/1.4 and apache 2.4 listings.
// (FTP was tried first, but that didn't work out well: although FTP returns a file list that is
// easier to parse, some servers have trouble generating the list for a large directory.)
pub fn dirlist(&self) -> Result<Vec<(String,bool)>> {
    lazy_static!(
        static ref RE: Regex = Regex::new("(?i:<a +href *= *\"([^?/\"]+)(/?)\">)").unwrap();
    );

    let mut entries = Vec::new();
    for raw in BufReader::new(self.open()?).split(b'\n') {
        let raw = raw?;
        let mut links = RE.captures_iter(&raw);

        // Lines without any link are of no interest.
        let cap = match links.next() {
            Some(cap) => cap,
            None => continue,
        };
        // A listing entry has exactly one link on its line; skip lines that have more.
        if links.next().is_some() {
            continue;
        }

        let href = cap.at(1).unwrap();
        // Skip the parent directory link and anything that looks like an absolute path.
        if href == b".." || href.starts_with(b"/") {
            continue;
        }

        let isdir = cap.at(2) == Some(b"/");
        // Names that are not valid UTF-8 after percent-decoding are silently dropped.
        if let Ok(decoded) = percent_decode(href).decode_utf8() {
            entries.push((decoded.to_string(), isdir));
        }
    }
    Ok(entries)
}
}

View file

@ -16,6 +16,7 @@ pub enum Date<'a> {
Known(&'a str), // Given in PkgOpt
Found(i64), // Found in package
Deb, // Should be read from the timestamp of the 'debian-binary' file
Desc, // Should be read from the '+DESC' file (FreeBSD <= 9.2)
}
@ -24,6 +25,7 @@ impl<'a> Date<'a> {
// TODO: Validate that the mtime() date is sensible (e.g. 1990 < date < now)
*self = match *self {
Date::Deb if ent.format() == Format::Ar && ent.path() == Some("debian-binary") => Date::Found(ent.mtime()),
Date::Desc if ent.path() == Some("+DESC") => Date::Found(ent.mtime()),
x => x,
}
}

186
indexer/src/sys_freebsd1.rs Normal file
View file

@ -0,0 +1,186 @@
use std::collections::HashSet;
use std::ascii::AsciiExt;
use std::io::Result;
use regex::Regex;
use postgres;
use open;
use pkg;
// Sync a FreeBSD <= 9.2 package repository.
//
// Reads "." to get a list of categories, "Latest" to get a list of all packages, and all category
// directories to figure out which package belongs in which category.
//
// Splitting a package filename into a package name and version is a hard problem. There are two
// strategies:
// 1. Use the listing from 'Latest' to get the list of package names, and use that to find the
//    longest matching substring in the package filename to split off the version.
// 2. Guessing, like splitver() below.
//
// Both strategies lead to errors. (1) doesn't always work because the 'Latest' directory tends to
// miss a few packages. (2) doesn't always work because version strings are too damn irregular.
// This function tries (1) first, then falls back to (2) if it couldn't find a matching package.
// This combined solution also isn't perfect, as sometimes a package prefix does exist, but is
// incomplete. E.g. 'pear-PHPUnit-1.3.3.tbz' is parsed as 'pear version PHPUnit-1.3.3' rather than
// 'pear-PHPUnit version 1.3.3', because there is a 'pear' package in 'Latest' but no
// 'pear-PHPUnit'. This is handled with a static list of package names to add to the 'pkgs' list,
// see EXTRA_PKGS below.
//
// Arguments:
//   pg      Database connection, passed through to pkg::pkg().
//   sys     System id, passed through to pkg::pkg().
//   arch    Architecture name, passed through to pkg::pkg().
//   mirror  URL of the packages/ directory. A trailing slash is appended if it is missing.
//
// Returns an error as soon as one of the directory listings cannot be fetched or read.
pub fn sync(pg: &postgres::GenericConnection, sys: i32, arch: &str, mirror: &str) -> Result<()> {
    // All URLs below are formed by appending a relative path directly to the mirror URL, so it
    // has to end with a slash; otherwise ".../packages" + "Latest/" becomes ".../packagesLatest/".
    let base = if mirror.ends_with('/') {
        mirror.to_string()
    } else {
        format!("{}/", mirror)
    };
    let mirror: &str = &base;

    let path = format!("{}Latest/", mirror);
    let mut pkgs : Vec<String> = open::Path{path: &path, cache: true, canbelocal: false}
        .dirlist()?.into_iter()
        .map(|(n,_)| trimext(&n).to_string())
        .collect();
    pkgs.extend(EXTRA_PKGS.into_iter().map(|e| e.to_string()));
    pkgs.sort_by(|a, b| b.len().cmp(&a.len())); // Longest first

    // List of packages (name+version) we've already seen; Some packages are present in multiple
    // categories, we only index the first found.
    let mut seenpkgs = HashSet::new();

    // Every directory in the top-level listing is a category, except for the two special ones.
    let cats = open::Path{path: mirror, cache: true, canbelocal: false}
        .dirlist()?.into_iter()
        .filter(|&(ref n,i)| i && n != "All" && n != "Latest")
        .map(|(n,_)| n);

    for cat in cats {
        trace!("Category: {}", cat);
        let path = format!("{}{}/", mirror, cat);
        let lst = open::Path{path: &path, cache: true, canbelocal: false}.dirlist()?.into_iter().map(|(n,_)| n);

        for f in lst {
            let name = trimext(&f);
            // The byte-offset slicing below relies on the name being pure ASCII.
            if !name.is_ascii() {
                warn!("Non-ASCII package name: {}", f);
                continue;
            }

            // Strategy (1): the longest known package name that is followed by a '-' and a
            // non-empty version. Strategy (2), splitver(), is the fallback.
            // The take() mystifies me; why is it necessary?
            let pkg = pkgs.iter()
                .find(|p| name.len() > p.len()+1 && name.starts_with(&p as &str) && &name[p.len() .. p.len()+1] == "-")
                .take().map(|p| (p as &str, &name[p.len()+1 .. ]))
                .or_else(|| splitver(name));

            if let Some((pkg, ver)) = pkg {
                // insert() returns false if this name+version was already indexed.
                if !seenpkgs.insert((pkg.to_string(), ver.to_string())) {
                    continue;
                }
                let path = format!("{}{}/{}", mirror, cat, f);
                pkg::pkg(pg, pkg::PkgOpt{
                    force: false,
                    sys: sys,
                    cat: &cat,
                    pkg: pkg,
                    ver: ver,
                    date: pkg::Date::Desc,
                    arch: Some(arch),
                    file: open::Path{
                        path: &path,
                        cache: false,
                        canbelocal: false,
                    },
                });
            } else {
                warn!("Unknown package: {}/{}", cat, f);
            }
        }
    }
    Ok(())
}
// Strip the package archive extension from a file name.
//
// All trailing ".tgz" occurrences are removed first, then all trailing ".tbz" occurrences.
// Names without either extension are returned unchanged.
fn trimext(n: &str) -> &str {
    let mut stem = n;
    for &ext in [".tgz", ".tbz"].iter() {
        while stem.ends_with(ext) {
            stem = &stem[..stem.len() - ext.len()];
        }
    }
    stem
}
// Guess where the package name ends and the version begins in 'n'.
//
// Two patterns are tried in order:
// - RE1 splits at the first '-' that is directly followed by a digit.
// - RE2 splits at the last '-' in the string.
// Returns (name, version), or None if neither pattern matches.
fn splitver(n: &str) -> Option<(&str, &str)> {
    lazy_static!(
        static ref RE1: Regex = Regex::new("^(.+?)-([0-9].*)$").unwrap();
        static ref RE2: Regex = Regex::new("^(.+)-([^-]+)$").unwrap();
    );
    RE1.captures(n)
        .or_else(|| RE2.captures(n))
        // Both patterns have exactly two non-optional groups, so these lookups cannot fail.
        .map(|cap| (cap.at(1).unwrap(), cap.at(2).unwrap()))
}
// Package names that sync() adds to the names found in the 'Latest' listing, so that file names
// such as 'pear-PHPUnit-1.3.3.tbz' are split after the full package name rather than after a
// shorter package name that happens to be a prefix of it.
// This list may not be complete, and these packages may not necessarily have man pages.
const EXTRA_PKGS : &'static [&'static str] = &[
    "amanda-client",
    "amanda-server",
    "apache-event",
    "apache-itk",
    "apache-peruser",
    "apache-tomcat",
    "apache-worker",
    "bison-devel",
    "boxbackup-devel",
    "ffmpeg-devel",
    "flex-sdk",
    "fpc-gdb",
    "freeradius-mysql",
    "gdb-insight",
    "glib-reference",
    "gmime-24",
    "gmime-24-sharp",
    "gtk-reference",
    "gtk-sharp",
    "gtkmm-reference",
    "horde-content",
    "horde-groupware",
    "horde-timeobjects",
    "horde-webmail",
    "hping-devel",
    "ja-jvim-direct_canna",
    "ja-mutt-devel",
    "kdelibs-experimental",
    "kdepim-runtime",
    "lame-devel",
    "libdivxdecore-devel",
    "libquicktime-lame",
    "libtorrent-rasterbar",
    "linux-netscape-communicator",
    "mkisofs-devel",
    "mldonkey-core-devel",
    "mldonkey-gui-devel",
    "mod_log_sql-dtc",
    "nethack-qt",
    "nfdump-devel",
    "openssl-beta",
    "pear-PHPUnit",
    "pear-XML_Query2XML",
    "pear-phpunit-PHPUnit",
    "pgadmin3-unicode",
    "proftpd-mod_ldap",
    "proftpd-mod_sql_mysql",
    "proftpd-mod_sql_odbc",
    "proftpd-mod_sql_postgres",
    "proftpd-mod_sql_sqlite",
    "proftpd-mod_sql_tds",
    "qt-static",
    "rsyslog-gnutls",
    "rsyslog-gssapi",
    "rsyslog-libdbi",
    "rsyslog-mysql",
    "rsyslog-pgsql",
    "rsyslog-relp",
    "rsyslog-rfc3195",
    "rsyslog-snmp",
    "samba-libsmbclient",
    "samba-nmblookup",
    "squirrelmail-shared_calendars-plugin",
    "tcl-thread",
    "wxgtk2-common-devel",
    "wxgtk2-contrib-common-devel",
    "wxgtk2-utils-devel",
];