From 531882296f66568df0a062f699d5e23cd2ca0a8f Mon Sep 17 00:00:00 2001 From: Yorhel Date: Wed, 4 Jul 2012 12:34:08 +0200 Subject: [PATCH] Added and indexed early Debian versions --- schema.sql | 6 ++++- util/arch.sh | 2 +- util/deb.sh | 72 +++++++++++++++++++++++++++++++++++++++------------- www/index.pl | 9 ++++++- www/man.js | 1 + 5 files changed, 70 insertions(+), 20 deletions(-) diff --git a/schema.sql b/schema.sql index 995119e..0f073e3 100644 --- a/schema.sql +++ b/schema.sql @@ -70,7 +70,10 @@ INSERT INTO systems (id, name, release, short, relorder) VALUES (14, 'Ubuntu', '10.10', 'ubuntu-maverick', 12), (15, 'Ubuntu', '11.04', 'ubuntu-natty', 13), (16, 'Ubuntu', '11.10', 'ubuntu-oneiric', 14), - (17, 'Ubuntu', '12.04', 'ubuntu-precise', 15); + (17, 'Ubuntu', '12.04', 'ubuntu-precise', 15), + (18, 'Debian', '1.1', 'debian-buzz', 0), + (19, 'Debian', '1.2', 'debian-rex', 1), + (20, 'Debian', '1.3', 'debian-bo', 2); -- Removes any path components and compression extensions from the filename. @@ -110,3 +113,4 @@ $$ LANGUAGE SQL; --DELETE FROM contents c WHERE NOT EXISTS(SELECT 1 FROM man m WHERE m.hash = c.hash); --COMMIT; +--DELETE FROM package WHERE system = 18 AND NOT EXISTS(SELECT 1 FROM man WHERE id = package); diff --git a/util/arch.sh b/util/arch.sh index 7844761..4efaf42 100755 --- a/util/arch.sh +++ b/util/arch.sh @@ -9,7 +9,7 @@ REPOS="core extra community" DEBUG=false SYSID=1 -CURL="curl -Ss" +CURL="curl -fSs" PSQL="psql -U manned -Awtq" TMP=`mktemp -d manned.arch.XXXXXX` diff --git a/util/deb.sh b/util/deb.sh index f87f8f4..2c86a11 100755 --- a/util/deb.sh +++ b/util/deb.sh @@ -2,7 +2,7 @@ # A fetcher for debian-style repositories. 
-CURL="curl -Ss" +CURL="curl -fSs" PSQL="psql -U manned -Awtq" TMP=`mktemp -d manned.deb.XXXXXX` @@ -16,13 +16,22 @@ checkpkg() { FILE=$6 echo "===> $NAME-$VERSION" FN="$TMP/$NAME-$VERSION.deb" - $CURL "$REPO$FILE" -o "$FN" || return + $CURL "$REPO$FILE" -o "$FN" || return 1 + + # For 0.939000 formats: + # control.tar.gz = tail -n+3 $FILE | head -c"`head -n2 $FILE | tail -n1`" + # data.tar.gz = tail -n+3 $FILE | tail -c+"`head -n2 $FILE | tail -n1`" | tail -c+2 # Get the date from the last modification time of the debian-binary file # inside the .deb. Preferably, the date we store in the database indicates # when the *source* package has been uploaded, but this will work fine as # an approximation, I guess. - DATE=`date -d "\`ar tv \"$FN\" debian-binary | perl -lne 's/^[^ ]+ [^ ]+ +\d+ (.+) debian-binary$/print $1/e'\`" "+%F"` + if [ "`head -c8 \"$FN\"`" = "0.939000" ]; then + DATE=`tail -n+3 "$FN" | head -c"\`head -n2 \"$FN\" | tail -n1\`" | tar -tvzf - | grep control | perl -lne 's/.+ ([^ ]+ [^ ]+) [^ ]*control$/print $1/e'` + else + DATE=`ar tv "$FN" debian-binary | perl -lne 's/^[^ ]+ [^ ]+ +\d+ (.+) debian-binary$/print $1/e'` + fi + DATE=`date -d "$DATE" +%F` # Insert package in the database PKGID=`echo "INSERT INTO package (system, category, name, version, released) VALUES(:'sysid',:'cat',:'name',:'ver',:'rel') RETURNING id"\ @@ -30,16 +39,23 @@ checkpkg() { # Extract and handle the man pages if [ "$?" 
-eq 0 -a -n "$PKGID" ]; then - DATAFN=`ar t $FN | grep -F data.tar` - case "$DATAFN" in - "data.tar.gz") DATAZ="-z" ;; - "data.tar.bz2") DATAZ="-j" ;; - "data.tar.lzma") DATAZ="--lzma" ;; - "data.tar.xz") DATAZ="-J" ;; - *) echo "No data.tar found, or unknown compression format."; DATAZ="ERR" ;; - esac + # Old format + if [ "`head -c8 \"$FN\"`" = "0.939000" ]; then + tail -n+3 "$FN" | tail -c+"`head -n2 \"$FN\" | tail -n1`" | tail -c+2 | ./add_tar.sh - $PKGID -z - [ "$DATAZ" != "ERR" ] && ar p "$FN" "$DATAFN" | ./add_tar.sh - $PKGID $DATAZ + # New format + else + DATAFN=`ar t $FN | grep -F data.tar` + case "$DATAFN" in + "data.tar.gz") DATAZ="-z" ;; + "data.tar.bz2") DATAZ="-j" ;; + "data.tar.lzma") DATAZ="--lzma" ;; + "data.tar.xz") DATAZ="-J" ;; + *) echo "No data.tar found, or unknown compression format."; DATAZ="ERR" ;; + esac + + [ "$DATAZ" != "ERR" ] && ar p "$FN" "$DATAFN" | ./add_tar.sh - $PKGID $DATAZ + fi fi rm "$FN" @@ -66,8 +82,13 @@ syncrepo() { for CMP in $COMPONENTS; do echo "MANDIFF-COMPONENT: $CMP" >>"$PFN" TFN="$TMP/Packages-$CMP.bz2" - $CURL "${REPO}dists/$DISTRO/$CMP/binary-i386/Packages.bz2" -o "$TFN" || return 1 - bzcat "$TFN" >>"$PFN" + $CURL "${REPO}dists/$DISTRO/$CMP/binary-i386/Packages.bz2" -o "$TFN" + if [ -s "$TFN" ]; then + bzcat "$TFN" >>"$PFN" + else + $CURL "${REPO}dists/$DISTRO/$CMP/binary-i386/Packages.gz" -o "$TFN" || return 1 + zcat "$TFN" >>"$PFN" + fi rm "$TFN" done @@ -91,11 +112,12 @@ syncrepo() { while() { chomp; $p = $1 if /^Package: (.+)/; - $v = $1 if /^Version: (.+)/; - $s = $1 if /^Section: (.+)/; - $f = $1 if /^Filename: (.+)/; + $v = $1 if /^[Vv]ersion: (.+)/; + $s = $1 if /^[Ss]ection: (.+)/; + $f = $1 if /^[Ff]ilename: (.+)/; if(!$_) { if($p && $v && $s && $f) { + $f =~ s{^(Debian-1.[12])/}{dists/$1/main/}; print "$p $v $s $f" if $pkg{$p} && $pkg{$p} == 1 && !$db->selectrow_arrayref(q{SELECT 1 FROM package WHERE system = ? AND name = ? AND version = ?}, {}, $sysid, $p, $v); #warn "Duplicate package? 
$p\n" if $pkg{$p} && $pkg{$p} == 2; @@ -250,6 +272,22 @@ ubuntu_active() { } +debian_buzz() { + # Contrib uses a rather non-standard arch directory ("binary" and "binary-all"), so let's stick with main for now. + syncrepo 18 "http://archive.debian.org/debian/" "buzz" "main" "dists/buzz/main/Contents.gz" +} + +debian_rex() { + # (Same note on contrib) + syncrepo 19 "http://archive.debian.org/debian/" "rex" "main" "dists/rex/main/Contents.gz" +} + +debian_bo() { + # Contrib and non-free don't have a Contents file :( + syncrepo 20 "http://archive.debian.org/debian/" "bo" "main" "dists/bo/main/Contents-i386.gz" +} + + "$@" rm -rf "$TMP" diff --git a/www/index.pl b/www/index.pl index b26e304..9e5f22c 100755 --- a/www/index.pl +++ b/www/index.pl @@ -57,7 +57,7 @@ sub home { p style => 'float: none'; # Relevant query: SELECT count(distinct hash), count(distinct name), count(*), count(distinct package) FROM man; # It's far too slow to run that on every pageview. :-( - lit 'Indexing 485,506 versions of 119,406 manual pages found in 1,578,498 files of 170,215 packages.'; + lit 'Indexing 493,399 versions of 120,090 manual pages found in 1,598,828 files of 171,724 packages.'; br; txt 'At this point only Arch Linux and Ubuntu have been indexed. More systems and repositories will be added later on.'; end; @@ -123,6 +123,13 @@ sub about { restricted and multiverse) from the $release, $release-updates and $release-security repositories are indexed. Backports are not included at the moment. +
Debian
+ Historical releases were fetched from http://archive.debian.org/debian/. + For buzz, rex and bo, only the 'main' component has been indexed, and + we're missing a few man pages because some packages were missing from the + repository archives. +

Only packages for a single architecture (i386 or i686) are scanned. To my knowledge, packages that come with different manuals for different diff --git a/www/man.js b/www/man.js index 4b7b1bb..7112246 100644 --- a/www/man.js +++ b/www/man.js @@ -45,6 +45,7 @@ function setText(obj, txt) { /* What follows is specific to manned.org */ // TODO: Fix the 'pkg' link +// TODO: Keep same view when switching to different version of the same man page // TODO: Allow showing/hiding old package versions individually. // TODO: Allow complete hiding of old systems. (And enable that by default)