Add arch/encoding metadata to DB + Fetch Arch Linux x86_64

The encoding metadata will be very useful in finding badly decoded man
pages. The package 'arch' is necessary to properly identify which
package was used, which is not obvious now that I'm going to switch more
systems to the (more common) x86_64 arch.
This commit is contained in:
Yorhel 2016-11-06 16:05:13 +01:00
parent b8a1945d38
commit cb81bedac1
6 changed files with 32 additions and 20 deletions

View file

@ -42,6 +42,7 @@ fn main() {
(@arg pkg: --pkg +required +takes_value "Package name")
(@arg ver: --ver +required +takes_value "Package version")
(@arg date: --date +required +takes_value "Package release date")
(@arg arch: --arch +takes_value "Architecture")
(@arg FILE: +required "Package file")
)
(@subcommand arch =>
@ -81,6 +82,7 @@ fn main() {
pkg: matches.value_of("pkg").unwrap(),
ver: matches.value_of("ver").unwrap(),
date: matches.value_of("date").unwrap(),
arch: matches.value_of("arch"),
file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true},
});
}

View file

@ -14,6 +14,7 @@ pub struct PkgOpt<'a> {
pub pkg: &'a str,
pub ver: &'a str,
pub date: &'a str, // TODO: Option to extract date from package metadata itself
pub arch: Option<&'a str>,
pub file: open::Path<'a>
}
@ -36,8 +37,8 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i
let verid : i32;
if res.is_empty() {
let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id";
verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0);
let q = "INSERT INTO package_versions (package, version, released, arch) VALUES($1, $2, $3::text::date, $4) RETURNING id";
verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date, &opt.arch]).unwrap().get(0).get(0);
info!("New package pkgid {} verid {}", pkgid, verid);
Some(verid)
@ -54,12 +55,11 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i
}
fn insert_man_row(tr: &postgres::GenericConnection, verid: i32, path: &str, hash: &[u8]) {
// TODO: Store 'encoding' in the database
fn insert_man_row(tr: &postgres::GenericConnection, verid: i32, path: &str, enc: &str, hash: &[u8]) {
let (name, sect, locale) = man::parse_path(path).unwrap();
if let Err(e) = tr.execute(
"INSERT INTO man (package, name, filename, locale, hash, section) VALUES ($1, $2, '/'||$3, $4, $5, $6)",
&[&verid, &name, &path, &locale, &hash, &sect]
"INSERT INTO man (package, name, filename, locale, hash, section, encoding) VALUES ($1, $2, '/'||$3, $4, $5, $6, $7)",
&[&verid, &name, &path, &locale, &hash, &sect, &enc]
) {
// I think this can only happen if archread gives us the same file twice, which really
// shouldn't happen. But I'd rather continue with an error logged than panic.
@ -84,20 +84,21 @@ fn insert_man(tr: &postgres::GenericConnection, verid: i32, paths: &[&str], ent:
).unwrap();
for path in paths {
insert_man_row(tr, verid, path, dig.as_ref());
insert_man_row(tr, verid, path, enc, dig.as_ref());
debug!("Inserted man page: {} ({})", path, enc);
}
}
fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &str) {
let hash = tr.query("SELECT hash FROM man WHERE package = $1 AND filename = '/'||$2", &[&verid, &dest]).unwrap();
if hash.is_empty() { /* Can happen if man::decode() failed previously. */
let res = tr.query("SELECT hash, encoding FROM man WHERE package = $1 AND filename = '/'||$2", &[&verid, &dest]).unwrap();
if res.is_empty() { /* Can happen if man::decode() failed previously. */
error!("Link to unindexed man page: {} -> {}", src, dest);
return;
}
let hash: Vec<u8> = hash.get(0).get(0);
insert_man_row(tr, verid, src, &hash);
let hash: Vec<u8> = res.get(0).get(0);
let enc: String = res.get(0).get(1);
insert_man_row(tr, verid, src, &enc, &hash);
debug!("Inserted man link: {} -> {}", src, dest);
}

View file

@ -15,6 +15,7 @@ struct Meta {
name: String,
version: String,
date: String,
arch: Option<String>,
}
@ -43,6 +44,7 @@ fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
let mut name = None;
let mut version = None;
let mut builddate = None;
let mut arch = None;
for kv in RE.captures_iter(&data) {
let key = kv.at(1).unwrap();
@ -53,6 +55,7 @@ fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
"NAME" => name = Some(val),
"VERSION" => version = Some(val),
"BUILDDATE" => builddate = i64::from_str(val).ok(),
"ARCH" => arch = Some(val),
_ => {},
}
}
@ -63,6 +66,7 @@ fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
name: name.unwrap().to_string(),
version: version.unwrap().to_string(),
date: NaiveDateTime::from_timestamp(builddate.unwrap(), 0).format("%Y-%m-%d").to_string(),
arch: arch.map(str::to_string),
}))
} else {
warn!("Metadata missing from package description: {}", path);
@ -71,11 +75,10 @@ fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
}
// TODO: Switch to x86_64 instead of i686
pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) {
info!("Reading packages from {} {}", mirror, repo);
let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo);
let path = format!("{}/{}/os/x86_64/{1:}.files.tar.gz", mirror, repo);
let path = open::Path{ path: &path, cache: true, canbelocal: false };
let mut index = match path.open() {
Err(e) => { error!("Can't read package index: {}", e); return },
@ -103,7 +106,7 @@ pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str
hasman = false;
let m = meta.take().unwrap();
let p = format!("{}/{}/os/i686/{}", mirror, repo, m.filename);
let p = format!("{}/{}/os/x86_64/{}", mirror, repo, m.filename);
pkg::pkg(pg, pkg::PkgOpt{
force: false,
sys: sys,
@ -111,6 +114,7 @@ pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str
pkg: &m.name,
ver: &m.version,
date: &m.date,
arch: m.arch.as_ref().map(|e| &e[..]),
file: open::Path{
path: &p,
cache: false,

View file

@ -24,16 +24,18 @@ CREATE TABLE package_versions (
package integer NOT NULL REFERENCES packages(id) ON DELETE CASCADE,
version varchar NOT NULL,
released date NOT NULL,
arch varchar,
UNIQUE(package, version)
);
CREATE TABLE man (
package integer NOT NULL REFERENCES package_versions(id) ON DELETE CASCADE,
name varchar NOT NULL,
section varchar NOT NULL,
filename varchar NOT NULL,
locale varchar,
hash bytea NOT NULL REFERENCES contents(hash),
section varchar NOT NULL,
encoding varchar,
UNIQUE(package, filename)
);

View file

@ -0,0 +1,2 @@
ALTER TABLE package_versions ADD COLUMN arch varchar;
ALTER TABLE man ADD COLUMN encoding varchar;

View file

@ -238,7 +238,9 @@ sub about {
<dl>
<dt>Arch Linux</dt><dd>
The core, extra and community repositories are fetched from a local
Arch mirror. Indexing started around begin June 2012.</dd>
Arch mirror. Indexing started around begin June 2012. The i686
architecture was indexed until November 6th, 2016, packages after that
were fetched from x86_64.</dd>
<dt>Debian</dt><dd>
Historical releases were fetched from <a
href="http://archive.debian.org/debian/">http://archive.debian.org/debian/</a>
@ -268,13 +270,12 @@ sub about {
$release-security repositories are indexed. Backports are not included at
the moment. Indexing started around mid June 2012.</dd>
</dl>
Only packages for a single architecture (i386 or i686) are scanned. To my
Only packages for a single architecture (i386 or amd64) are scanned. To my
knowledge, packages that come with different manuals for different
architectures either don't exist or are extremely rare. It does happen that
some packages are not available for all architectures. Usually, though,
every package is at least available for i386/i686, so hopefully we're not
missing out on much.
<br /><br />
every package is at least available for the most popular architecture, so
hopefully we're not missing out on much. <br /><br />
The repositories are scanned for new packages on a daily basis.
_
end;