diff --git a/util/arch.sh b/util/arch.sh
index d72269c..96733ca 100755
--- a/util/arch.sh
+++ b/util/arch.sh
@@ -9,7 +9,7 @@ REPOS="core extra community"
DEBUG=false
SYSID=1
-CURL="curl -fSs"
+CURL="curl -fSs -A manual-page-crawler,info@manned.org"
PSQL="psql -U manned -Awtq"
TMP=`mktemp -d manned.arch.XXXXXX`
diff --git a/util/deb.sh b/util/deb.sh
index 5889f1d..39d8e92 100755
--- a/util/deb.sh
+++ b/util/deb.sh
@@ -2,7 +2,7 @@
# A fetcher for debian-style repositories.
-CURL="curl -fSs"
+CURL="curl -fSs -A manual-page-crawler,info@manned.org --limit-rate 500k"
PSQL="psql -U manned -Awtq"
TMP=`mktemp -d manned.deb.XXXXXX`
@@ -76,7 +76,7 @@ syncrepo() {
printf "" >"$PFN"
if [ "$CONTENTSURL" != "-" ]; then
$CURL "$REPO$CONTENTSURL" -o "$CFN.gz" || return 1
- gunzip "$CFN.gz"
+ gunzip -f "$CFN.gz"
fi
for CMP in $COMPONENTS; do
@@ -350,6 +350,31 @@ debian() {
}
+
+# Fetch older packages from snapshot.debian.org
+
+debian_snapshot_month() {
+ YEAR=$1
+ MONTH=$2
+ ROOT="http://snapshot.debian.org/archive/debian/"
+ DATES=`$CURL "$ROOT?year=$YEAR&month=$MONTH" | perl -lne 'm| "20050607" \) -a \( $DATE \< "20081028" \) ] && syncrepo 25 "$ROOT$DATE/" "sarge" "main contrib non-free"
+ [ \( $DATE \> "20070409" \) -a \( $DATE \< "20100620" \) ] && syncrepo 26 "$ROOT$DATE/" "etch" "main contrib non-free"
+ [ \( $DATE \> "20090218" \) -a \( $DATE \< "20120326" \) ] && syncrepo 27 "$ROOT$DATE/" "lenny" "main contrib non-free"
+ if [ $DATE \> "20110206" ]; then
+ syncrepo 28 "$ROOT$DATE/" "squeeze" "main contrib non-free"
+ syncrepo 28 "$ROOT$DATE/" "squeeze-updates" "main contrib non-free"
+ fi
+ done
+}
+
+
"$@"
rm -rf "$TMP"
diff --git a/www/index.pl b/www/index.pl
index 0dab6ee..b6cf40f 100755
--- a/www/index.pl
+++ b/www/index.pl
@@ -57,8 +57,8 @@ sub home {
# Relevant query: SELECT count(distinct hash), count(distinct name), count(*), count(distinct package) FROM man;
# It's far too slow to run that on every pageview. :-(
p style => 'float: none'; lit <<' _';
- Indexing 612,354 versions of 132,474 manual pages found in
- 2,056,796 files of 217,369 packages.
+ Indexing 679,885 versions of 132,493 manual pages found in
+ 2,213,486 files of 222,543 packages.
Manned.org aims to index all manual pages from a variety of systems, both
old and new, and provides a convenient interface for looking up and viewing
@@ -152,12 +152,13 @@ sub about {
the moment. Indexing started around mid June 2012.
Debian
Historical releases were fetched from http://archive.debian.org/debian/.
+ href="http://archive.debian.org/debian/">http://archive.debian.org/debian/
+ and http://snapshot.debian.org/.
For buzz, rex and bo, only the 'main' component has been indexed, and
we're missing a few man pages because some packages were missing from the
repository archives. For the other releases, all components (main, contrib
- and non-free) from the $release and $release-updates repositories are
- indexed.
+ and non-free) from the $release and $release-updates (where available)
+ repositories are indexed.
Only packages for a single architecture (i386 or i686) are scanned. To my