diff --git a/util/arch.sh b/util/arch.sh index d72269c..96733ca 100755 --- a/util/arch.sh +++ b/util/arch.sh @@ -9,7 +9,7 @@ REPOS="core extra community" DEBUG=false SYSID=1 -CURL="curl -fSs" +CURL="curl -fSs -A manual-page-crawler,info@manned.org" PSQL="psql -U manned -Awtq" TMP=`mktemp -d manned.arch.XXXXXX` diff --git a/util/deb.sh b/util/deb.sh index 5889f1d..39d8e92 100755 --- a/util/deb.sh +++ b/util/deb.sh @@ -2,7 +2,7 @@ # A fetcher for debian-style repositories. -CURL="curl -fSs" +CURL="curl -fSs -A manual-page-crawler,info@manned.org --limit-rate 500k" PSQL="psql -U manned -Awtq" TMP=`mktemp -d manned.deb.XXXXXX` @@ -76,7 +76,7 @@ syncrepo() { printf "" >"$PFN" if [ "$CONTENTSURL" != "-" ]; then $CURL "$REPO$CONTENTSURL" -o "$CFN.gz" || return 1 - gunzip "$CFN.gz" + gunzip -f "$CFN.gz" fi for CMP in $COMPONENTS; do @@ -350,6 +350,31 @@ debian() { } + +# Fetch older packages from snapshot.debian.org + +debian_snapshot_month() { + YEAR=$1 + MONTH=$2 + ROOT="http://snapshot.debian.org/archive/debian/" + DATES=`$CURL "$ROOT?year=$YEAR&month=$MONTH" | perl -lne 'm| "20050607" \) -a \( $DATE \< "20081028" \) ] && syncrepo 25 "$ROOT$DATE/" "sarge" "main contrib non-free" + [ \( $DATE \> "20070409" \) -a \( $DATE \< "20100620" \) ] && syncrepo 26 "$ROOT$DATE/" "etch" "main contrib non-free" + [ \( $DATE \> "20090218" \) -a \( $DATE \< "20120326" \) ] && syncrepo 27 "$ROOT$DATE/" "lenny" "main contrib non-free" + if [ $DATE \> "20110206" ]; then + syncrepo 28 "$ROOT$DATE/" "squeeze" "main contrib non-free" + syncrepo 28 "$ROOT$DATE/" "squeeze-updates" "main contrib non-free" + fi + done +} + + "$@" rm -rf "$TMP" diff --git a/www/index.pl b/www/index.pl index 0dab6ee..b6cf40f 100755 --- a/www/index.pl +++ b/www/index.pl @@ -57,8 +57,8 @@ sub home { # Relevant query: SELECT count(distinct hash), count(distinct name), count(*), count(distinct package) FROM man; # It's far too slow to run that on every pageview. :-( p style => 'float: none'; lit <<' _'; - Indexing 612,354 versions of 132,474 manual pages found in - 2,056,796 files of 217,369 packages. + Indexing 679,885 versions of 132,493 manual pages found in + 2,213,486 files of 222,543 packages.

Manned.org aims to index all manual pages from a variety of systems, both old and new, and provides a convenient interface for looking up and viewing @@ -152,12 +152,13 @@ sub about { the moment. Indexing started around mid June 2012.
Debian
Historical releases were fetched from http://archive.debian.org/debian/. + href="http://archive.debian.org/debian/">http://archive.debian.org/debian/ + and http://snapshot.debian.org/. For buzz, rex and bo, only the 'main' component has been indexed, and we're missing a few man pages because some packages were missing from the repository archives. For the other releases, all components (main, contrib - and non-free) from the $release and $release-updates repositories are - indexed. + and non-free) from the $release and $release-updates (where available) + repositories are indexed.

Only packages for a single architecture (i386 or i686) are scanned. To my