Indexed snapshot.debian.org + give crawler a user-agent

This commit is contained in:
Yorhel 2012-07-15 16:38:32 +02:00
parent f2bf6829c3
commit d9def9d542
3 changed files with 34 additions and 8 deletions

View file

@ -9,7 +9,7 @@ REPOS="core extra community"
DEBUG=false DEBUG=false
SYSID=1 SYSID=1
CURL="curl -fSs" CURL="curl -fSs -A manual-page-crawler,info@manned.org"
PSQL="psql -U manned -Awtq" PSQL="psql -U manned -Awtq"
TMP=`mktemp -d manned.arch.XXXXXX` TMP=`mktemp -d manned.arch.XXXXXX`

View file

@ -2,7 +2,7 @@
# A fetcher for debian-style repositories. # A fetcher for debian-style repositories.
CURL="curl -fSs" CURL="curl -fSs -A manual-page-crawler,info@manned.org --limit-rate 500k"
PSQL="psql -U manned -Awtq" PSQL="psql -U manned -Awtq"
TMP=`mktemp -d manned.deb.XXXXXX` TMP=`mktemp -d manned.deb.XXXXXX`
@ -76,7 +76,7 @@ syncrepo() {
printf "" >"$PFN" printf "" >"$PFN"
if [ "$CONTENTSURL" != "-" ]; then if [ "$CONTENTSURL" != "-" ]; then
$CURL "$REPO$CONTENTSURL" -o "$CFN.gz" || return 1 $CURL "$REPO$CONTENTSURL" -o "$CFN.gz" || return 1
gunzip "$CFN.gz" gunzip -f "$CFN.gz"
fi fi
for CMP in $COMPONENTS; do for CMP in $COMPONENTS; do
@ -350,6 +350,31 @@ debian() {
} }
# Fetch older packages from snapshot.debian.org
debian_snapshot_month() {
YEAR=$1
MONTH=$2
ROOT="http://snapshot.debian.org/archive/debian/"
DATES=`$CURL "$ROOT?year=$YEAR&month=$MONTH" | perl -lne 'm|<a href="([0-9]{8}T[0-9]{6}Z)/"| && print $1'`
PREVDATE="00000000"
for DATE in $DATES; do
CURDATE=`echo $DATE | head -c8`
[ "$CURDATE" = "$PREVDATE" ] && continue
PREVDATE=$CURDATE
[ $DATE \< "20070104" ] && syncrepo 24 "$ROOT$DATE/" "woody" "main contrib non-free"
[ \( $DATE \> "20050607" \) -a \( $DATE \< "20081028" \) ] && syncrepo 25 "$ROOT$DATE/" "sarge" "main contrib non-free"
[ \( $DATE \> "20070409" \) -a \( $DATE \< "20100620" \) ] && syncrepo 26 "$ROOT$DATE/" "etch" "main contrib non-free"
[ \( $DATE \> "20090218" \) -a \( $DATE \< "20120326" \) ] && syncrepo 27 "$ROOT$DATE/" "lenny" "main contrib non-free"
if [ $DATE \> "20110206" ]; then
syncrepo 28 "$ROOT$DATE/" "squeeze" "main contrib non-free"
syncrepo 28 "$ROOT$DATE/" "squeeze-updates" "main contrib non-free"
fi
done
}
"$@" "$@"
rm -rf "$TMP" rm -rf "$TMP"

View file

@ -57,8 +57,8 @@ sub home {
# Relevant query: SELECT count(distinct hash), count(distinct name), count(*), count(distinct package) FROM man; # Relevant query: SELECT count(distinct hash), count(distinct name), count(*), count(distinct package) FROM man;
# It's far too slow to run that on every pageview. :-( # It's far too slow to run that on every pageview. :-(
p style => 'float: none'; lit <<' _'; p style => 'float: none'; lit <<' _';
Indexing <b>612,354</b> versions of <b>132,474</b> manual pages found in Indexing <b>679,885</b> versions of <b>132,493</b> manual pages found in
<b>2,056,796</b> files of <b>217,369</b> packages. <b>2,213,486</b> files of <b>222,543</b> packages.
<br /><br /> <br /><br />
Manned.org aims to index all manual pages from a variety of systems, both Manned.org aims to index all manual pages from a variety of systems, both
old and new, and provides a convenient interface for looking up and viewing old and new, and provides a convenient interface for looking up and viewing
@ -152,12 +152,13 @@ sub about {
the moment. Indexing started around mid June 2012.</dd> the moment. Indexing started around mid June 2012.</dd>
<dt>Debian</dt><dd> <dt>Debian</dt><dd>
Historical releases were fetched from <a Historical releases were fetched from <a
href="http://archive.debian.org/debian/">http://archive.debian.org/debian/</a>. href="http://archive.debian.org/debian/">http://archive.debian.org/debian/</a>
and <a href="http://snapshot.debian.org/">http://snapshot.debian.org/</a>.
For buzz, rex and bo, only the 'main' component has been indexed, and For buzz, rex and bo, only the 'main' component has been indexed, and
we're missing a few man pages because some packages were missing from the we're missing a few man pages because some packages were missing from the
repository archives. For the other releases, all components (main, contrib repository archives. For the other releases, all components (main, contrib
and non-free) from the $release and $release-updates repositories are and non-free) from the $release and $release-updates (where available)
indexed. repositories are indexed.
</dd> </dd>
</dl><br /> </dl><br />
Only packages for a single architecture (i386 or i686) are scanned. To my Only packages for a single architecture (i386 or i686) are scanned. To my