Indexed snapshot.debian.org + give crawler a user-agent
This commit is contained in:
parent
f2bf6829c3
commit
d9def9d542
3 changed files with 34 additions and 8 deletions
|
|
@ -9,7 +9,7 @@ REPOS="core extra community"
|
||||||
DEBUG=false
|
DEBUG=false
|
||||||
SYSID=1
|
SYSID=1
|
||||||
|
|
||||||
CURL="curl -fSs"
|
CURL="curl -fSs -A manual-page-crawler,info@manned.org"
|
||||||
PSQL="psql -U manned -Awtq"
|
PSQL="psql -U manned -Awtq"
|
||||||
TMP=`mktemp -d manned.arch.XXXXXX`
|
TMP=`mktemp -d manned.arch.XXXXXX`
|
||||||
|
|
||||||
|
|
|
||||||
29
util/deb.sh
29
util/deb.sh
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
# A fetcher for debian-style repositories.
|
# A fetcher for debian-style repositories.
|
||||||
|
|
||||||
CURL="curl -fSs"
|
CURL="curl -fSs -A manual-page-crawler,info@manned.org --limit-rate 500k"
|
||||||
PSQL="psql -U manned -Awtq"
|
PSQL="psql -U manned -Awtq"
|
||||||
TMP=`mktemp -d manned.deb.XXXXXX`
|
TMP=`mktemp -d manned.deb.XXXXXX`
|
||||||
|
|
||||||
|
|
@ -76,7 +76,7 @@ syncrepo() {
|
||||||
printf "" >"$PFN"
|
printf "" >"$PFN"
|
||||||
if [ "$CONTENTSURL" != "-" ]; then
|
if [ "$CONTENTSURL" != "-" ]; then
|
||||||
$CURL "$REPO$CONTENTSURL" -o "$CFN.gz" || return 1
|
$CURL "$REPO$CONTENTSURL" -o "$CFN.gz" || return 1
|
||||||
gunzip "$CFN.gz"
|
gunzip -f "$CFN.gz"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for CMP in $COMPONENTS; do
|
for CMP in $COMPONENTS; do
|
||||||
|
|
@ -350,6 +350,31 @@ debian() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch older packages from snapshot.debian.org
|
||||||
|
|
||||||
|
# Fetch one month of older Debian packages from snapshot.debian.org.
#
# Usage: debian_snapshot_month YEAR MONTH
#
# Scrapes the snapshot index page for the given year/month, extracts every
# snapshot directory name (YYYYMMDDTHHMMSSZ), skips entries whose day equals
# the day of the previously seen entry, and calls syncrepo for each release
# whose date window contains the snapshot.
#
# The snapshot name is compared as a string against bare YYYYMMDD bounds.
# Because a longer string sorts after its own prefix, a lower bound such as
# '> "20050607"' already matches snapshots taken on that day, while an upper
# bound such as '< "20081028"' excludes snapshots taken on that day.
debian_snapshot_month() {
  YEAR=$1
  MONTH=$2
  ROOT="http://snapshot.debian.org/archive/debian/"
  # $CURL is deliberately unquoted: it holds the command plus its options.
  DATES=$($CURL "$ROOT?year=$YEAR&month=$MONTH" | perl -lne 'm|<a href="([0-9]{8}T[0-9]{6}Z)/"| && print $1')
  PREVDATE="00000000"
  for DATE in $DATES; do
    # Strip the "THHMMSSZ" suffix to get the calendar day (first 8 characters).
    CURDATE=${DATE%%T*}
    [ "$CURDATE" = "$PREVDATE" ] && continue
    PREVDATE=$CURDATE

    # NOTE(review): the first syncrepo argument is presumably the numeric
    # system id of the release -- confirm against syncrepo's definition.
    # Separate [ ] tests joined with && replace the obsolescent -a / \( \)
    # forms of test, which are ambiguous with more than four arguments.
    [ "$DATE" \< "20070104" ] && syncrepo 24 "$ROOT$DATE/" "woody" "main contrib non-free"
    [ "$DATE" \> "20050607" ] && [ "$DATE" \< "20081028" ] && syncrepo 25 "$ROOT$DATE/" "sarge" "main contrib non-free"
    [ "$DATE" \> "20070409" ] && [ "$DATE" \< "20100620" ] && syncrepo 26 "$ROOT$DATE/" "etch" "main contrib non-free"
    [ "$DATE" \> "20090218" ] && [ "$DATE" \< "20120326" ] && syncrepo 27 "$ROOT$DATE/" "lenny" "main contrib non-free"
    if [ "$DATE" \> "20110206" ]; then
      # squeeze has no upper bound here; its -updates repository is synced
      # under the same first argument as the release itself.
      syncrepo 28 "$ROOT$DATE/" "squeeze" "main contrib non-free"
      syncrepo 28 "$ROOT$DATE/" "squeeze-updates" "main contrib non-free"
    fi
  done
}
|
||||||
|
|
||||||
|
|
||||||
"$@"
|
"$@"
|
||||||
|
|
||||||
rm -rf "$TMP"
|
rm -rf "$TMP"
|
||||||
|
|
|
||||||
11
www/index.pl
11
www/index.pl
|
|
@ -57,8 +57,8 @@ sub home {
|
||||||
# Relevant query: SELECT count(distinct hash), count(distinct name), count(*), count(distinct package) FROM man;
|
# Relevant query: SELECT count(distinct hash), count(distinct name), count(*), count(distinct package) FROM man;
|
||||||
# It's far too slow to run that on every pageview. :-(
|
# It's far too slow to run that on every pageview. :-(
|
||||||
p style => 'float: none'; lit <<' _';
|
p style => 'float: none'; lit <<' _';
|
||||||
Indexing <b>612,354</b> versions of <b>132,474</b> manual pages found in
|
Indexing <b>679,885</b> versions of <b>132,493</b> manual pages found in
|
||||||
<b>2,056,796</b> files of <b>217,369</b> packages.
|
<b>2,213,486</b> files of <b>222,543</b> packages.
|
||||||
<br /><br />
|
<br /><br />
|
||||||
Manned.org aims to index all manual pages from a variety of systems, both
|
Manned.org aims to index all manual pages from a variety of systems, both
|
||||||
old and new, and provides a convenient interface for looking up and viewing
|
old and new, and provides a convenient interface for looking up and viewing
|
||||||
|
|
@ -152,12 +152,13 @@ sub about {
|
||||||
the moment. Indexing started around mid June 2012.</dd>
|
the moment. Indexing started around mid June 2012.</dd>
|
||||||
<dt>Debian</dt><dd>
|
<dt>Debian</dt><dd>
|
||||||
Historical releases were fetched from <a
|
Historical releases were fetched from <a
|
||||||
href="http://archive.debian.org/debian/">http://archive.debian.org/debian/</a>.
|
href="http://archive.debian.org/debian/">http://archive.debian.org/debian/</a>
|
||||||
|
and <a href="http://snapshot.debian.org/">http://snapshot.debian.org/</a>.
|
||||||
For buzz, rex and bo, only the 'main' component has been indexed, and
|
For buzz, rex and bo, only the 'main' component has been indexed, and
|
||||||
we're missing a few man pages because some packages were missing from the
|
we're missing a few man pages because some packages were missing from the
|
||||||
repository archives. For the other releases, all components (main, contrib
|
repository archives. For the other releases, all components (main, contrib
|
||||||
and non-free) from the $release and $release-updates repositories are
|
and non-free) from the $release and $release-updates (where available)
|
||||||
indexed.
|
repositories are indexed.
|
||||||
</dd>
|
</dd>
|
||||||
</dl><br />
|
</dl><br />
|
||||||
Only packages for a single architecture (i386 or i686) are scanned. To my
|
Only packages for a single architecture (i386 or i686) are scanned. To my
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue