Initial commit

This commit is contained in:
Yorhel 2012-06-15 14:23:29 +02:00
commit c47f450934
11 changed files with 1271 additions and 0 deletions

159
util/add_dir.pl Executable file
View file

@ -0,0 +1,159 @@
#!/usr/bin/perl
# Usage: ./add_dir.pl <dir> <pkgid>
# Prints the path names of the found man pages on stdout.
# May throw errors or warnings on stderr.
# Returns 0 if it has added something, 1 on error or if nothing has been found.
use strict;
use warnings;
no warnings 'once';
use Encode 'decode', 'find_encoding', 'decode_utf8';
use Digest::SHA 'sha1_hex';
use File::Find;
use DBI;
# Both the directory to scan and the package id are mandatory.
@ARGV >= 2 or die "Not enough arguments\n";
my($dir, $pkgid) = @ARGV;

# AutoCommit is off, so everything this script inserts is part of a single
# transaction that is committed or rolled back at the very end. RaiseError
# turns any database failure into an exception.
my $db = DBI->connect(
    'dbi:Pg:dbname=manned', 'manned', '',
    {
        pg_enable_utf8 => 1,
        PrintError     => 0,
        RaiseError     => 1,
        AutoCommit     => 0,
    },
);
# Reads the file $ofn and returns its contents with every layer of
# compression removed. The layers are derived from the file name: as long as
# the name ends in a known compression suffix, that suffix is stripped and the
# data is decompressed once more (so 'foo.1.gz' and even 'foo.1.bz2.gz' work).
#
# Dies with a descriptive message if the file cannot be opened or if any
# decompression step fails.
sub readman {
    my($ofn) = @_;

    open my $fh, '<', $ofn or die "Unable to open '$ofn': $!\n";
    my $dat = do { local $/; <$fh> };
    close $fh;

    # Note: Don't forget to update 'section_from_filename()' in SQL when a new
    # compression file extension is recognized.
    my @codecs = (
        [ qr/\.gz$/ => sub {
            my($in) = @_;
            require Compress::Zlib;
            my $out = Compress::Zlib::memGunzip($in);
            die "Error decompressing '$ofn': $Compress::Zlib::gzerrno\n" if !defined $out;
            return $out;
        } ],
        [ qr/\.bz2$/ => sub {
            my($in) = @_;
            # Don't try to use Compress::Bzip2::memBunzip() here. It's been terribly
            # broken for at least 3 years:
            #   https://rt.cpan.org/Public/Bug/Display.html?id=48128
            require Compress::Raw::Bzip2;
            my($bz, $status) = Compress::Raw::Bunzip2->new();
            die "Error decompressing '$ofn': Opening bzip2 decompressor: $status\n"
                if $status != Compress::Raw::Bzip2::BZ_OK();
            my $out;
            $status = $bz->bzinflate($in, $out);
            die "Error decompressing '$ofn': $status\n"
                if $status != Compress::Raw::Bzip2::BZ_STREAM_END();
            return $out;
        } ],
        [ qr/\.lzma$/ => sub {
            my($in) = @_;
            require Compress::Raw::Lzma;
            my($lz, $status) = Compress::Raw::Lzma::AutoDecoder->new();
            die "Error decompressing '$ofn': Opening lzma decompressor: $status\n"
                if $status != Compress::Raw::Lzma::LZMA_OK();
            my $out;
            $status = $lz->code($in, $out);
            die "Error decompressing '$ofn': $status\n"
                if $status != Compress::Raw::Lzma::LZMA_STREAM_END();
            return $out;
        } ],
    );

    # Peel off one suffix per pass; restart from the first codec after every
    # successful pass and stop once no suffix matches anymore.
    my $fn = $ofn;
    LAYER: while(1) {
        for my $codec (@codecs) {
            my($suffix, $unpack) = @$codec;
            next unless $fn =~ s/$suffix//;
            $dat = $unpack->($dat);
            next LAYER;
        }
        last;
    }
    return $dat;
}
# Decodes the raw bytes of a man page into a Perl character string.
#
# Arguments:
#   $data   - raw (already decompressed) contents of the man page
#   $locale - locale component taken from the file's path; may be undef
#
# Candidate encodings are tried in the following order, and the first one that
# strictly decodes the data into a true (non-empty) string wins:
#   1. whatever find_encoding() resolves $locale to, if anything
#   2. utf-8
#   3. the encoding named by a 'coding:' tag on one of the first two lines
#   4. a language-specific fallback selected by $locale
#   5. iso-8859-1
#
# Returns ($encoding_name, $decoded_text), or an empty list when no candidate
# produced a usable result (which is also what happens for an empty file).
sub decodeman {
    my($data, $locale) = @_;

    # No harm in trying utf-8 first.
    my @candidates = ('utf-8');

    # Check for 'coding:' indications in the file header.
    # According to preconv.1, only the first two lines are checked. I've not seen
    # any man page where this coding information was on the second line, though.
    # Note that that man page also mentions some aliases that Perl's
    # find_encoding doesn't have. Again, I've not found any man page using those.
    my $tag = qr/[\.']?\\["#].+-\*-.*coding: *([^ ;]+).+-\*-/;
    my $declared;
    if($data =~ /^$tag/) {
        $declared = $1;
    }
    elsif($data =~ /^.*\n$tag/) {
        $declared = $1;
    }
    if(defined $declared) {
        # Strip an Emacs-style end-of-line suffix before looking the name up.
        $declared =~ s/-(?:dos|unix|mac)$//;
        my $resolved = find_encoding($declared);
        my $name = $resolved ? $resolved->name : undef;
        push @candidates, $name
            if $name && $name ne 'ascii' && $name ne 'utf8' && $name ne 'utf-8-strict';
    }

    if($locale) {
        # Get encoding from the locale part of the path; it takes precedence
        # over everything else.
        my $from_locale = find_encoding($locale);
        unshift @candidates, $from_locale->name if $from_locale;

        # Some language-specific fallbacks
        # TODO: Handle zh_* locales
        my @fallbacks = (
            [ qr/^(pl|cs|sk)/i => 'iso-8859-2' ],
            [ qr/^tr/i         => 'iso-8859-9' ],
            [ qr/^ru/i         => 'koi8-r'     ], # TODO: Or iso-8859-5, probably want to autodetect that?
            [ qr/^ja/i         => 'euc-jp'     ], # TODO: Works for everything I've found yet, but Japanese isn't that simple. Probably want to detect Shift-JIS as well?
            [ qr/^ko/i         => 'euc-kr'     ],
            #[ qr/^el/i        => 'iso-8859-7' ], # So far, all el mans I've seen were UTF-8.
        );
        for my $fallback (@fallbacks) {
            my($pattern, $name) = @$fallback;
            next if $locale !~ $pattern;
            push @candidates, $name;
            last;
        }
    }

    # If all else fails.
    push @candidates, 'iso-8859-1';

    # Now try decoding. decode() is called with CHECK=1, which makes it die on
    # malformed input and allows it to modify its argument, hence the copy.
    for my $candidate (@candidates) {
        my $text = eval { my $copy = $data; decode($candidate, $copy, 1) };
        return ($candidate, $text) if $text;
    }
    return;
}
# Adds a single man page to the database, as part of the current transaction.
#
# Arguments:
#   $pkg    - id of the package the man page belongs to
#   $path   - path of the page as stored in the 'man' table; the name and
#             section are derived from it by SQL functions
#   $fn     - name of the file to read, as handed to readman()
#   $locale - locale component of the path; may be undef
#
# The page contents are stored in 'contents' keyed by the SHA-1 of the raw
# (decompressed) bytes, and only inserted when that hash is not present yet.
#
# Prints "<path> (<encoding>)" on stdout and returns 1 when the page has been
# added. Prints a diagnostic and returns nothing when the page could not be
# decoded or is empty. Dies if reading the file or a database call fails.
sub addman {
    my($pkg, $path, $fn, $locale) = @_;

    my $dat = readman($fn);
    my $hash = sha1_hex($dat);
    my($enc, $dec) = decodeman($dat, $locale);

    # Return unconditionally here: the previous 'print ... and return' form
    # only returned when the print itself succeeded.
    if(!$enc) {
        print "Invalid encoding or empty file: $path\n";
        return;
    }

    $db->do(q{INSERT INTO contents (hash, content) VALUES(decode(?, 'hex'),?)}, {}, $hash, $dec)
        if !$db->selectrow_arrayref(q{SELECT 1 FROM contents WHERE hash = decode(?, 'hex')}, {}, $hash);

    $db->do(q{
        INSERT INTO man (package, name, section, filename, locale, hash)
        VALUES(?,name_from_filename(?),section_from_filename(?),?,?,decode(?, 'hex'))}, {},
        $pkg, $path, $path, $path, $locale, $hash);

    # Plain print, not printf: the path is data and may contain '%'.
    print "$path ($enc)\n";
    return 1;
}
# Walk the directory tree, add every man page found, and finish the
# transaction: commit if at least one page was actually added, otherwise roll
# back and exit with status 1.
my $found = 0;
find sub {
    return if !-f $_;

    # Path of the file relative to the directory given on the command line.
    (my $path = $File::Find::name) =~ s/^\Q$dir\E//;

    # Only accept <...>man[/<locale>]/man<section>/<file>; $1 is the optional
    # locale directory and $2 the file name.
    # Note: fltk also creates pre-formatted pages in /cat$sectre/, but those are ignored.
    # TODO: Also ignore html and INDEX sections
    return warn "Ignoring $path\n" if $path !~ m{man(?:/([^/]+))?/man[0-9n]/([^/]+)$};

    # addman() returns false for pages it had to skip (undecodable or empty);
    # those must not count as "added", or the script would report success
    # without having stored anything.
    $found++ if addman($pkgid, $path, $2, $1);
}, $dir;

if($found) {
    $db->commit;
} else {
    warn "No man pages found.\n";
    $db->rollback;
    exit 1;
}

16
util/add_tar.sh Executable file
View file

@ -0,0 +1,16 @@
#!/bin/sh
# Usage: add_tar.sh <file> <pkgid> <flags>
# Requires a recent GNU tar for compression autodetect and xz support.
# Scratch directory the archive is unpacked into; created relative to the
# current working directory. Bail out right away if it cannot be created
# instead of running tar with an empty -C argument.
TMP=$(mktemp -d manned.XXXXXXX) || exit 1
# TODO: tar throws an error if there are no man pages. This isn't really an error, though.
# $3 is intentionally left unquoted: it carries optional extra tar flags and
# has to disappear completely when it is empty.
tar --warning=no-unknown-keyword -C "$TMP" $3 -xf "$1" --wildcards '*/man/*' \
    && ./add_dir.pl "$TMP" "$2"
# Remember the status of the tar/add_dir.pl chain; it becomes our exit status
# once the scratch directory has been cleaned up.
RET=$?
rm -rf "$TMP"
exit $RET

95
util/arch.sh Executable file
View file

@ -0,0 +1,95 @@
#!/bin/sh
# Usage: ./arch.sh
# Synchronises the database with an Arch mirror, fetching any packages that
# aren't yet in the database and may have man pages.
# Base URL of the Arch mirror and the repositories to synchronise from it.
MIRROR="http://ftp.nluug.nl/pub/os/Linux/distr/archlinux"
REPOS="core extra community"

# DEBUG is executed as a command ('true' or 'false') to switch the extra
# progress messages on or off.
DEBUG=false

# Value stored in the 'system' column of every package added by this script.
SYSID=1

# Command prefixes; expanded unquoted so that the options are split into words.
CURL="curl -Ss"
PSQL="psql -U manned -Awtq"

# Scratch directory, created relative to the current working directory.
TMP=$(mktemp -d manned.arch.XXXXXX)
# add_pkginfo <category> <name> <version> <release date>
#
# Returns 0 if the package is already in the database or if an error occurred.
# Otherwise adds the package, sets PKGID to the new ID, and returns 1.
PKGID=
add_pkginfo() { # cat name ver date
    # Is there already a row for this system/name/version?
    RES=$(echo "SELECT id FROM package WHERE system = :'sysid' AND name = :'name' AND version = :'ver'" |
        $PSQL -v "sysid=$SYSID" -v "name=$2" -v "ver=$3")
    if [ "$?" -ne 0 ] || [ -n "$RES" ]; then
        return 0
    fi

    # Not yet: insert it and pick up the generated id.
    RES=$(echo "INSERT INTO package (system, category, name, version, released) VALUES(:'sysid',:'cat',:'name',:'ver',:'rel') RETURNING id" |
        $PSQL -v "sysid=$SYSID" -v "cat=$1" -v "name=$2" -v "ver=$3" -v "rel=$4") || return 0

    PKGID=$RES
    return 1
}
# checkpkg <repo> <entry>
#
# Handles one entry of an unpacked repository file list: if its file list
# mentions man pages and the package version is not in the database yet, the
# package is registered, downloaded and handed to add_tar.sh.
checkpkg() {
    REPO=$1
    FN=$2
    D="$TMP/$REPO/$FN"

    # A valid entry is a directory holding both a 'files' and a 'desc' file.
    if ! [ -d "$D" ] || ! [ -f "$D/files" ] || ! [ -f "$D/desc" ]; then
        echo "===> $FN"
        echo "Invalid item, ignoring"
        return
    fi

    # Nothing to do for packages that do not ship anything under /man/.
    if ! grep -q /man/ "$D/files"; then
        $DEBUG && echo "===> $FN"
        $DEBUG && echo "No mans"
        return
    fi

    # Somewhat inefficient description parsing: each value is the line that
    # follows its %KEY% marker.
    FILENAME=$(grep -A 1 '%FILENAME%' "$D/desc" | tail -n 1)
    NAME=$(grep -A 1 '%NAME%' "$D/desc" | tail -n 1)
    VERSION=$(grep -A 1 '%VERSION%' "$D/desc" | tail -n 1)
    BUILDDATE=$(grep -A 1 '%BUILDDATE%' "$D/desc" | tail -n 1)
    if [ -z "$FILENAME" ] || [ -z "$NAME" ] || [ -z "$VERSION" ] || [ -z "$BUILDDATE" ]; then
        echo "===> $FN"
        echo "Invalid/missing description info"
        return
    fi

    # Convert the build timestamp (seconds since the epoch) to YYYY-MM-DD.
    BUILDDATE=$(date -d "@$BUILDDATE" '+%F')

    # add_pkginfo returns 0 when there is nothing to add (or on error).
    if add_pkginfo "$REPO" "$NAME" "$VERSION" "$BUILDDATE"; then
        $DEBUG && echo "===> $FN"
        $DEBUG && echo "Already up-to-date"
        return
    fi

    # New package: fetch it, import its man pages and drop the download.
    echo "===> $FN"
    F="$TMP/$REPO/$FILENAME"
    $CURL "$MIRROR/$REPO/os/i686/$FILENAME" -o "$F" || return
    ./add_tar.sh "$F" "$PKGID"
    rm -f "$F"
}
# syncrepo <repo>
#
# Downloads and unpacks the file list database of one repository into
# "$TMP/<repo>" (which must already exist) and runs checkpkg on every entry.
# Returns 1 if the database cannot be downloaded or unpacked.
syncrepo() {
    REPO=$1
    F="$TMP/$REPO/repo.tar.gz"
    echo "============ $REPO"
    $CURL "$MIRROR/$REPO/os/i686/$REPO.files.tar.gz" -o "$F" || return 1
    tar -C "$TMP/$REPO" -xf "$F" || return 1
    rm -f "$F"
    for fn in "$TMP/$REPO"/*; do
        # Quote the entry name so that it always reaches checkpkg as a single
        # argument, without word splitting or glob expansion.
        checkpkg "$REPO" "$(basename "$fn")"
    done
}
# Process the repositories one after another, each in its own scratch
# subdirectory that is removed again as soon as the repository is done.
for r in $REPOS; do
    mkdir "$TMP/$r"
    syncrepo "$r"
    rm -rf "$TMP/$r"
done

# Finally get rid of the scratch directory itself.
rm -rf "$TMP"

121
util/deb.sh Executable file
View file

@ -0,0 +1,121 @@
#!/bin/bash
# A fetcher for debian-style repositories.
# Command prefixes; expanded unquoted so that the options are split into words.
CURL="curl -Ss"
PSQL="psql -U manned -Awtq"

# Scratch directory, created relative to the current working directory.
TMP=$(mktemp -d manned.deb.XXXXXX)
# checkpkg <sysid> <repo url> <name> <version> <section> <file>
#
# Downloads one .deb, registers the package in the database and feeds the man
# pages in its data archive to add_tar.sh.
checkpkg() {
    SYSID=$1
    REPO=$2
    NAME=$3
    VERSION=$4
    SECTION=$5
    FILE=$6

    echo "===> $NAME-$VERSION"
    FN="$TMP/$NAME-$VERSION.deb"
    $CURL "$REPO/$FILE" -o "$FN" || return

    # Get the date from the last modification time of the debian-binary file
    # inside the .deb. Preferably, the date we store in the database indicates
    # when the *source* package has been uploaded, but this will work fine as
    # an approximation, I guess.
    # The perl one-liner cuts the timestamp out of the 'ar tv' listing line.
    MTIME=$(ar tv "$FN" debian-binary | perl -lne 's/^[^ ]+ [^ ]+ +\d+ (.+) debian-binary$/print $1/e')
    DATE=$(date -d "$MTIME" "+%F")

    # Insert package in the database
    PKGID=$(echo "INSERT INTO package (system, category, name, version, released) VALUES(:'sysid',:'cat',:'name',:'ver',:'rel') RETURNING id" |
        $PSQL -v "sysid=$SYSID" -v "cat=$SECTION" -v "name=$NAME" -v "ver=$VERSION" -v "rel=$DATE")

    # Extract and handle the man pages, but only if the insert went through
    # and gave us an id.
    if [ "$?" -eq 0 ] && [ -n "$PKGID" ]; then
        ar p "$FN" data.tar.gz | ./add_tar.sh - $PKGID -z
    fi

    rm "$FN"
}
# syncrepo <sysid> <repo url> <distro> <components> [contents url]
#
# Synchronises the database with one distribution of a debian-style
# repository:
#   1. download the Contents index and the Packages index of every component;
#   2. let an embedded Perl script list the packages that ship a file under
#      /man/ and are not in the database yet, one "name version section
#      filename" line per package;
#   3. run checkpkg on each of those lines.
#
# <components> is a space-separated list. [contents url] is relative to the
# repository and defaults to "dists/<distro>/Contents-i386.gz".
# Returns 1 if one of the index files cannot be downloaded.
syncrepo() {
    SYSID=$1
    REPO=$2
    DISTRO=$3
    COMPONENTS=$4
    CONTENTSURL=${5:-"dists/$DISTRO/Contents-i386.gz"}
    echo "============ $REPO $DISTRO ($COMPONENTS)"

    # Get Contents.gz and Packages
    CFN="$TMP/Contents"
    PFN="$TMP/Packages"
    printf "" >"$PFN"
    $CURL "$REPO/$CONTENTSURL" -o "$CFN.gz" || return 1
    # -f: an earlier call that returned early may have left a Contents file
    # behind. Without -f gunzip refuses to overwrite it and the stale index
    # would be parsed instead of the one just downloaded.
    gunzip -f "$CFN.gz"
    for CMP in $COMPONENTS; do
        echo "MANDIFF-COMPONENT: $CMP" >>"$PFN"
        TFN="$TMP/Packages-$CMP.bz2"
        $CURL "$REPO/dists/$DISTRO/$CMP/binary-i386/Packages.bz2" -o "$TFN" || return 1
        bzcat "$TFN" >>"$PFN"
        rm "$TFN"
    done

    # Parse the Contents and Packages files and check with the database to figure
    # out which packages we need to download. The Perl script runs in the
    # background and hands its result to the loop below through a fifo.
    mkfifo "$TMP/fifo"
    perl -l - "$CFN" "$PFN" "$SYSID" <<'EOP' >"$TMP/fifo" &
($cfn, $pfn, $sysid) = @ARGV;
use DBI;
$db = DBI->connect('dbi:Pg:dbname=manned', 'manned', '', {RaiseError => 1});

# Pass 1: remember the name of every package that owns a file under /man/.
# The second column of a Contents line is a comma-separated list of
# [section/]package entries; only the part after the last '/' is kept.
open F, '<', $cfn or die $!;
while(<F>) {
  chomp; @l=split/ +/;
  grep{ s{^.+/([^/]+)$}{$1}; $_ ne"-" and ($pkg{$_}=1) } split/,/, $l[1] if $l[0]=~/\/man\//
}
close F;

# Pass 2: walk the Packages stanzas. An empty line ends a stanza; a package is
# printed when it was seen in pass 1, has not been printed before and is not
# in the database yet. $pkg{name} is set to 2 once a stanza has been handled,
# which is what the duplicate check relies on.
open F, '<', $pfn or die $!;
while(<F>) {
  chomp;
  $p = $1 if /^Package: (.+)/;
  $v = $1 if /^Version: (.+)/;
  $s = $1 if /^Section: (.+)/;
  $f = $1 if /^Filename: (.+)/;
  if(!$_) {
    if($p && $v && $s && $f) {
      print "$p $v $s $f" if $pkg{$p} && $pkg{$p} == 1
        && !$db->selectrow_arrayref(q{SELECT 1 FROM package WHERE system = ? AND name = ? AND version = ?}, {}, $sysid, $p, $v);
      warn "Duplicate package? $p\n" if $pkg{$p} && $pkg{$p} == 2;
      $pkg{$p} = 2;
    }
    # Reset every field, including the section, so that a stanza without a
    # Section line cannot inherit the section of the previous package.
    $p=$v=$s=$f=undef
  }
}
close F;
EOP

    # $l is deliberately unquoted: it is split into the name, version, section
    # and filename arguments of checkpkg.
    while read l; do
        checkpkg "$SYSID" "$REPO" $l
    done <"$TMP/fifo"
    rm -f "$TMP/fifo" "$CFN" "$PFN"
}
# Repositories to synchronise. All invocations are currently commented out, so
# running this script only creates and removes the scratch directory.
# Arguments of syncrepo:
#   <sysid> <repo url> <distro> <components> [contents url]
# The -updates and -security distributions pass the Contents file of their
# base distribution as the fifth argument.
# TODO: backports?
#syncrepo 2 "http://old-releases.ubuntu.com/ubuntu/" "warty" "main multiverse restricted universe"
#syncrepo 2 "http://old-releases.ubuntu.com/ubuntu/" "warty-updates" "main multiverse restricted universe" "dists/warty/Contents-i386.gz"
#syncrepo 2 "http://old-releases.ubuntu.com/ubuntu/" "warty-security" "main multiverse restricted universe" "dists/warty/Contents-i386.gz"
#syncrepo 3 "http://old-releases.ubuntu.com/ubuntu/" "hoary" "main multiverse restricted universe"
#syncrepo 3 "http://old-releases.ubuntu.com/ubuntu/" "hoary-updates" "main multiverse restricted universe" "dists/hoary/Contents-i386.gz"
#syncrepo 3 "http://old-releases.ubuntu.com/ubuntu/" "hoary-security" "main multiverse restricted universe" "dists/hoary/Contents-i386.gz"
#syncrepo 4 "http://old-releases.ubuntu.com/ubuntu/" "breezy" "main multiverse restricted universe"
#syncrepo 4 "http://old-releases.ubuntu.com/ubuntu/" "breezy-updates" "main multiverse restricted universe" "dists/breezy/Contents-i386.gz"
#syncrepo 4 "http://old-releases.ubuntu.com/ubuntu/" "breezy-security" "main multiverse restricted universe" "dists/breezy/Contents-i386.gz"
# Remove the scratch directory created at the top of the script.
rm -rf "$TMP"