Reorganize indexing scripts + use Rust for Debian

This commit is contained in:
Yorhel 2016-11-20 12:29:01 +01:00
parent 5d44d0e2ec
commit 2ee2f7495b
7 changed files with 142 additions and 148 deletions

13
util/arch.sh Executable file
View file

@ -0,0 +1,13 @@
#!/bin/sh
# Arch Linux indexing entry point.
# Usage: ./arch.sh active
# Any other argument (or none) does nothing.
# The index function is expected to be provided by common.sh.
. ./common.sh

if [ "$1" = active ]; then
  MIRROR=http://ftp.nluug.nl/pub/os/Linux/distr/archlinux
  REPOS="core extra community"
  # REPOS is a whitespace-separated list; it is left unquoted on purpose.
  for REPO in $REPOS; do
    index arch --sys arch --mirror "$MIRROR" --repo "$REPO"
  done
fi

View file

@ -1,8 +1,50 @@
#!/bin/bash
# Shared settings and helper functions. The per-distribution scripts load
# this file with ". ./common.sh", some of them under #!/bin/sh, so the
# top-level code here sticks to POSIX constructs.

# curl command line used for fetching; word-split on use.
CURL="curl -fSs -A manual-page-crawler,info@manned.org --limit-rate 500k"

# Load optional local settings from the current directory.
# "." is used instead of the bash-only "source", and the explicit "./"
# keeps the lookup in the current directory (where the -f test looked)
# rather than going through $PATH.
if test -f .config; then
  . ./.config
fi
# Run ./indexer with the given arguments, announcing the invocation first.
# Usage: index <indexer arguments...>
# Outputs: a "====> indexer -vv ..." banner, the indexer's stdout and stderr
#          (merged into stdout), then an empty line.
# Returns: 0 (the status of the trailing echo), regardless of the indexer.
# NOTE(review): --dryrun is always passed to the indexer but is not shown in
# the banner; confirm this is intended.
index() {
  # $* joins the arguments into one string for display only.
  printf '%s\n' "====> indexer -vv $*"
  # "$@" hands every argument over unchanged, so values containing
  # whitespace or glob characters are not re-split or expanded.
  ./indexer -vv --dryrun "$@" 2>&1
  echo
}
# Convenient wrapper around index() for debian repos
# TODO: Use x86_64 for new releases
# Usage: index_deb sys mirror distro list-of-components [contents]
#   sys:        value passed to the indexer as --sys
#   mirror:     base URL; paths are appended to it directly, so it has to
#               end with a slash
#   distro:     directory name below dists/
#   components: whitespace-separated list; index() is called once for each
#   contents:
#     empty for global dists/$distro/Contents-i386.gz location
#     "cmp" for per-component dists/$distro/$component/Contents-i386.gz location
#     Otherwise, path of the Contents file, appended to the mirror URL
# Returns: the status of the last index() call (0 for an empty list).
index_deb() {
  local SYS="$1"
  local MIRROR="$2"
  local DISTRO="$3"
  local COMPONENTS="$4"
  local CONTENTS="${5:-dists/$DISTRO/Contents-i386.gz}"
  local CMP CONT
  # COMPONENTS is deliberately unquoted: it is a whitespace-separated list.
  for CMP in $COMPONENTS; do
    CONT=$CONTENTS
    if [ "$CONT" = cmp ]; then
      CONT="dists/$DISTRO/$CMP/Contents-i386.gz"
    fi
    index deb --sys "$SYS" --mirror "$MIRROR" --contents "$MIRROR$CONT" --packages "${MIRROR}dists/$DISTRO/$CMP/binary-i386/Packages.gz"
  done
}
# psql command line; word-split on use (e.g. "$PSQL -f file.sql").
PSQL='psql -U manned -Awtq'

## THE STUFF BELOW IS OLD
# To be replaced with calls to index()
CURL='curl -fSs -A manual-page-crawler,info@manned.org --limit-rate 500k'
# Working directory, created inside the current directory.
TMP=$(mktemp -d manned.XXXXXX)
# bash-ism, remove the working directory when we're done.

View file

@ -2,9 +2,8 @@
# Load the shared helpers and settings; $PSQL below is expected to come from here.
. ./common.sh
# Run the individual indexing scripts one after another.
./index.sh daily
./deb.sh ubuntu_active
./deb.sh debian_active
./arch.sh active
./debian.sh active
# After the indexing scripts have run, feed update_indices.sql to psql.
# $PSQL is left unquoted on purpose: it holds a command plus its options.
echo "============ Updating SQL indices"
$PSQL -f update_indices.sql

78
util/debian.sh Executable file
View file

@ -0,0 +1,78 @@
#!/bin/sh
# Index Debian releases.
# Usage: ./debian.sh <release>|old|active|all
#   <release>  buzz, rex, bo, hamm, slink, potato, woody, sarge, etch, lenny,
#              squeeze, wheezy or jessie
#   old        every release from buzz up to and including squeeze
#   active     wheezy and jessie
#   all        old, then active
# Any other argument (or none) does nothing.
# index and index_deb are expected to be provided by common.sh.
. ./common.sh

# AMIRROR serves buzz through squeeze below, CMIRROR serves wheezy and jessie.
AMIRROR=http://archive.debian.org/debian/
CMIRROR=http://ftp.nl.debian.org/debian/

# XXX: buzz and rex have some deb-old formatted packages, the indexer doesn't support these.
case "$1" in
  buzz)
    index deb --sys debian-buzz --mirror "$AMIRROR" --contents "${AMIRROR}dists/buzz/main/Contents.gz" --packages "${AMIRROR}dists/buzz/main/binary-i386/Packages.gz"
    index deb --sys debian-buzz --mirror "$AMIRROR" --contents "${AMIRROR}dists/buzz/contrib/Contents.gz" --packages "${AMIRROR}dists/buzz/contrib/binary/Packages.gz"
    ;;
  rex)
    index deb --sys debian-rex --mirror "$AMIRROR" --contents "${AMIRROR}dists/rex/main/Contents.gz" --packages "${AMIRROR}dists/rex/main/binary-i386/Packages.gz"
    index deb --sys debian-rex --mirror "$AMIRROR" --contents "${AMIRROR}dists/rex/contrib/Contents.gz" --packages "${AMIRROR}dists/rex/contrib/binary/Packages.gz"
    ;;
  bo)
    index deb --sys debian-bo --mirror "$AMIRROR" --contents "${AMIRROR}dists/bo/main/Contents-i386.gz" --packages "${AMIRROR}dists/bo/main/binary-i386/Packages.gz"
    # There's no Contents file for contrib and non-free
    index deb --sys debian-bo --mirror "$AMIRROR" --packages "${AMIRROR}dists/bo/contrib/binary/Packages.gz"
    index deb --sys debian-bo --mirror "$AMIRROR" --packages "${AMIRROR}dists/bo/non-free/binary/Packages.gz"
    ;;
  hamm)
    index_deb debian-hamm "$AMIRROR" hamm "main hamm contrib non-free"
    ;;
  slink)
    index_deb debian-slink "$AMIRROR" slink "main contrib non-free"
    ;;
  potato)
    index_deb debian-potato "$AMIRROR" potato "main contrib non-free"
    ;;
  woody)
    index_deb debian-woody "$AMIRROR" woody "main contrib non-free"
    ;;
  sarge)
    index_deb debian-sarge "$AMIRROR" sarge "main contrib non-free"
    ;;
  etch)
    index_deb debian-etch "$AMIRROR" etch "main contrib non-free"
    ;;
  lenny)
    index_deb debian-lenny "$AMIRROR" lenny "main contrib non-free"
    ;;
  squeeze)
    index_deb debian-squeeze "$AMIRROR" squeeze "main contrib non-free"
    index_deb debian-squeeze "$AMIRROR" squeeze-lts "main contrib non-free" cmp
    ;;
  wheezy)
    index_deb debian-wheezy "$CMIRROR" wheezy "main contrib non-free"
    index_deb debian-wheezy "$CMIRROR" wheezy-updates "main contrib non-free" cmp
    ;;
  jessie)
    index_deb debian-jessie "$CMIRROR" jessie "main contrib non-free" cmp
    index_deb debian-jessie "$CMIRROR" jessie-updates "main contrib non-free" cmp
    ;;
  old)
    # Re-run this script once per release; "$0" is quoted so that a script
    # path containing whitespace still works.
    for release in buzz rex bo hamm slink potato woody sarge etch lenny squeeze; do
      "$0" "$release"
    done
    ;;
  active)
    for release in wheezy jessie; do
      "$0" "$release"
    done
    ;;
  all)
    "$0" old
    "$0" active
    ;;
esac

View file

@ -1,22 +0,0 @@
# Pull in optional local settings when a .config file is present.
if [ -f .config ]; then
  source .config
fi

# Indexer command line; expanded unquoted by its users so it splits into words.
INDEX='./indexer -vv'

# Trace every command from here on.
set -x
# Index the Arch Linux core, extra and community repositories.
# Globals:  INDEX (read) - indexer command line, word-split on use
# Returns:  status of the last indexer invocation
arch() {
  local MIRROR=http://ftp.nluug.nl/pub/os/Linux/distr/archlinux
  local REPOS="core extra community"
  # Declared local so the loop variable does not leak into the caller's scope.
  local REPO
  # REPOS and INDEX are left unquoted on purpose: both are whitespace-separated.
  for REPO in $REPOS; do
    $INDEX arch --sys arch --mirror "$MIRROR" --repo "$REPO"
  done
}
# Everything that gets indexed on the daily run.
daily() {
  arch
}

# Dispatch: the first command-line argument names the function to run and
# the remaining arguments are passed to it. Quoting "$@" keeps each
# argument intact instead of re-splitting and glob-expanding it.
"$@"

View file

@ -339,120 +339,5 @@ ubuntu() {
}
# buzz, rex and bo: only the "main" component is synced, with an explicit
# Contents file path as the last argument.
# NOTE(review): syncrepo is defined outside this view; its first argument
# looks like a numeric system id - confirm.
debian_buzz() {
  # Contrib uses a rather non-standard arch directory ("binary" and "binary-all"), so let's stick with main for now.
  local deb_archive='http://archive.debian.org/debian/'
  syncrepo 18 "$deb_archive" buzz main dists/buzz/main/Contents.gz
}

debian_rex() {
  # (Same note on contrib)
  local deb_archive='http://archive.debian.org/debian/'
  syncrepo 19 "$deb_archive" rex main dists/rex/main/Contents.gz
}

debian_bo() {
  # Contrib and non-free don't have a Contents file :(
  local deb_archive='http://archive.debian.org/debian/'
  syncrepo 20 "$deb_archive" bo main dists/bo/main/Contents-i386.gz
}
# hamm through squeeze: every component is synced in a single syncrepo call
# (no explicit Contents path is given).
# NOTE(review): syncrepo is defined outside this view; its first argument
# looks like a numeric system id - confirm.
debian_hamm() {
  syncrepo 21 'http://archive.debian.org/debian/' hamm 'main hamm contrib non-free'
}

debian_slink() {
  syncrepo 22 'http://archive.debian.org/debian/' slink 'main contrib non-free'
}

debian_potato() {
  syncrepo 23 'http://archive.debian.org/debian/' potato 'main contrib non-free'
}

debian_woody() {
  syncrepo 24 'http://archive.debian.org/debian/' woody 'main contrib non-free'
}

debian_sarge() {
  syncrepo 25 'http://archive.debian.org/debian/' sarge 'main contrib non-free'
}

debian_etch() {
  syncrepo 26 'http://archive.debian.org/debian/' etch 'main contrib non-free'
}

debian_lenny() {
  syncrepo 27 'http://archive.debian.org/debian/' lenny 'main contrib non-free'
}

# squeeze is fetched from the ftp.nl mirror, followed by squeeze-updates.
debian_squeeze() {
  local dsq_suite
  for dsq_suite in squeeze squeeze-updates; do
    syncrepo 28 'http://ftp.nl.debian.org/debian/' "$dsq_suite" 'main contrib non-free'
  done
}
# wheezy: the base suite is synced in one call; wheezy-updates is synced
# component by component, each with its own Contents file.
debian_wheezy() {
  local dw_mirror='http://ftp.nl.debian.org/debian/'
  local dw_comp
  syncrepo 83 "$dw_mirror" wheezy 'main contrib non-free'
  # The Contents-* files have moved...
  for dw_comp in main contrib non-free; do
    syncrepo 83 "$dw_mirror" wheezy-updates "$dw_comp" "dists/wheezy-updates/$dw_comp/Contents-i386.gz"
  done
}

# jessie: both jessie and jessie-updates are synced component by component,
# each with its own Contents file.
debian_jessie() {
  local dj_mirror='http://ftp.nl.debian.org/debian/'
  local dj_suite dj_comp
  for dj_suite in jessie jessie-updates; do
    for dj_comp in main contrib non-free; do
      syncrepo 91 "$dj_mirror" "$dj_suite" "$dj_comp" "dists/$dj_suite/$dj_comp/Contents-i386.gz"
    done
  done
}
# Run the per-release functions for buzz through squeeze, in that order.
debian_old() {
  local agg_release
  for agg_release in buzz rex bo hamm slink potato woody sarge etch lenny squeeze; do
    "debian_$agg_release"
  done
}

# Run the per-release functions for wheezy and jessie.
debian_active() {
  local agg_release
  for agg_release in wheezy jessie; do
    "debian_$agg_release"
  done
}

# Run everything: the old releases first, then the active ones.
debian() {
  debian_old
  debian_active
}
# Fetch older packages from snapshot.debian.org
# Usage: debian_snapshot_month YEAR MONTH
# Globals: CURL (read) - download command line, word-split on use
#
# Reads the snapshot listing for the given month, keeps the first snapshot
# of each day, and calls syncrepo for every release whose date window
# contains that snapshot. The window bounds are compared as strings against
# the full timestamp (YYYYMMDDThhmmssZ).
debian_snapshot_month() {
  local YEAR="$1"
  local MONTH="$2"
  local ROOT="http://snapshot.debian.org/archive/debian/"
  local PREVDATE="00000000"
  local DATES DATE CURDATE
  DATES=$($CURL "$ROOT?year=$YEAR&month=$MONTH" | perl -lne 'm|<a href="([0-9]{8}T[0-9]{6}Z)/"| && print $1')
  # DATES is a whitespace-separated list of timestamps; unquoted on purpose.
  for DATE in $DATES; do
    # The pattern above guarantees "YYYYMMDD" followed by "T...", so
    # stripping from the "T" onwards leaves the 8-character day.
    CURDATE=${DATE%T*}
    # Only the first snapshot of each day is used.
    [ "$CURDATE" = "$PREVDATE" ] && continue
    PREVDATE=$CURDATE
    if [ "$DATE" \< "20070104" ]; then
      syncrepo 24 "$ROOT$DATE/" "woody" "main contrib non-free"
    fi
    if [ "$DATE" \> "20050607" ] && [ "$DATE" \< "20081028" ]; then
      syncrepo 25 "$ROOT$DATE/" "sarge" "main contrib non-free"
    fi
    if [ "$DATE" \> "20070409" ] && [ "$DATE" \< "20100620" ]; then
      syncrepo 26 "$ROOT$DATE/" "etch" "main contrib non-free"
    fi
    if [ "$DATE" \> "20090218" ] && [ "$DATE" \< "20120326" ]; then
      syncrepo 27 "$ROOT$DATE/" "lenny" "main contrib non-free"
    fi
    if [ "$DATE" \> "20110206" ]; then
      syncrepo 28 "$ROOT$DATE/" "squeeze" "main contrib non-free"
      syncrepo 28 "$ROOT$DATE/" "squeeze-updates" "main contrib non-free"
    fi
  done
}

# Dispatch: the first command-line argument names the function to run and
# the remaining arguments are passed to it.
"$@"

View file

@ -245,10 +245,9 @@ sub about {
Historical releases were fetched from <a
href="http://archive.debian.org/debian/">http://archive.debian.org/debian/</a>
and <a href="http://snapshot.debian.org/">http://snapshot.debian.org/</a>.
For buzz, rex and bo, only the 'main' component has been indexed, and
we're missing a few man pages because some packages were missing from the
repository archives. For the other releases, all components (main, contrib
and non-free) from the $release and $release-updates (where available)
For buzz, rex and bo, we're missing a few man pages because some packages
were missing from the repository archives. Where available, all components
(main, contrib and non-free) from the $release and $release-updates
repositories are indexed.</dd>
<dt>FreeBSD</dt><dd>
Historical releases were fetched from <a
@ -267,8 +266,8 @@ sub about {
href="http://old-releases.ubuntu.com/ubuntu/">http://old-releases.ubuntu.com/ubuntu/</a>,
supported releases from a local mirror. All components (main, universe,
restricted and multiverse) from the $release, $release-updates and
$release-security repositories are indexed. Backports are not included at
the moment. Indexing started around mid June 2012.</dd>
$release-security repositories are indexed. Indexing started around mid
June 2012.</dd>
</dl>
Only packages for a single architecture (i386 or amd64) are scanned. To my
knowledge, packages that come with different manuals for different