Merge branch 'indexer'

This commit is contained in:
Yorhel 2016-11-06 15:26:42 +01:00
commit b8a1945d38
20 changed files with 2282 additions and 79 deletions

2
.gitignore vendored
View file

@ -2,3 +2,5 @@
!/lib/ManUtils/Build.PL
!/lib/ManUtils/ManUtils.pm
!/lib/ManUtils/ManUtils.xs
indexer/target

View file

@ -1,4 +1,6 @@
.PHONY: ManUtils
.PHONY: ManUtils indexer clean
# Aggregate target: build both the Perl ManUtils module and the indexer.
all: ManUtils indexer
# Configure ManUtils and install it with install base `inst`; the recipe runs
# inside lib/ManUtils, so the result lands in lib/ManUtils/inst.
ManUtils: lib/ManUtils/Build
cd lib/ManUtils && perl Build.PL && ./Build install --install-base=inst
@ -6,7 +8,12 @@ ManUtils: lib/ManUtils/Build
# Regenerate the `Build` script whenever Build.PL changes.
lib/ManUtils/Build: lib/ManUtils/Build.PL
cd lib/ManUtils && perl Build.PL
# `indexer` is a phony alias for the release binary produced by Cargo.
indexer: indexer/target/release/indexer
# Rebuild when the manifest or any .rs file directly under indexer/src changes
# (the glob does not descend into subdirectories).
indexer/target/release/indexer: indexer/Cargo.toml indexer/src/*.rs
cd indexer && cargo build --release
# Remove the build output of both the Perl module and the Rust indexer.
# NOTE(review): `./Build distclean` presumably requires lib/ManUtils/Build to
# exist already; confirm that `make clean` on a fresh checkout is acceptable.
clean:
cd lib/ManUtils && ./Build distclean
rm -rf lib/ManUtils/inst
cd indexer && cargo clean

703
indexer/Cargo.lock generated Normal file
View file

@ -0,0 +1,703 @@
[root]
name = "indexer"
version = "0.1.0"
dependencies = [
"chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"hyper 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
"ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "aho-corasick"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ansi_term"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "bitflags"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "bufstream"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "byteorder"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "chrono"
version = "0.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "clap"
version = "2.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
"bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"term_size 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-segmentation 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-width 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"vec_map 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cookie"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)",
"url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding"
version = "0.3.0-dev"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
dependencies = [
"encoding-index-japanese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)",
"encoding-index-korean 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)",
"encoding-index-simpchinese 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)",
"encoding-index-singlebyte 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)",
"encoding-index-tradchinese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)",
"encoding-types 0.2.0 (git+https://github.com/lifthrasiir/rust-encoding)",
]
[[package]]
name = "encoding-index-japanese"
version = "1.20141219.6"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
dependencies = [
"encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)",
]
[[package]]
name = "encoding-index-korean"
version = "1.20141219.6"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
dependencies = [
"encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)",
]
[[package]]
name = "encoding-index-simpchinese"
version = "1.20160120.0"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
dependencies = [
"encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)",
]
[[package]]
name = "encoding-index-singlebyte"
version = "1.20160120.0"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
dependencies = [
"encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)",
]
[[package]]
name = "encoding-index-tradchinese"
version = "1.20141219.6"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
dependencies = [
"encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)",
]
[[package]]
name = "encoding-types"
version = "0.2.0"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
[[package]]
name = "encoding_index_tests"
version = "0.1.5"
source = "git+https://github.com/lifthrasiir/rust-encoding#61e331b0820311572fa00a06349b0f02511e810c"
[[package]]
name = "env_logger"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fallible-iterator"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "gcc"
version = "0.3.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "gdi32-sys"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "hex"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "hpack"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "httparse"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "hyper"
version = "0.9.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)",
"httparse 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-verify 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)",
"solicit 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)",
"traitobject 0.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"typeable 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"unicase 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "idna"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-bidi 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "kernel32-sys"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "language-tags"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "lazy_static"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libarchive3-sys"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "libc"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libressl-pnacl-sys"
version = "2.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"pnacl-build-helper 1.4.10 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "log"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "matches"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "md5"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "memchr"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "mime"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num"
version = "0.1.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
"num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-integer"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-iter"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-traits"
version = "0.1.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "num_cpus"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "openssl"
version = "0.7.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"gcc 0.3.38 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-sys-extras 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "openssl-sys"
version = "0.7.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"gdi32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"libressl-pnacl-sys 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"user32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "openssl-sys-extras"
version = "0.7.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"gcc 0.3.38 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "openssl-verify"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "phf"
version = "0.7.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"phf_shared 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "phf_shared"
version = "0.7.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "pkg-config"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "pnacl-build-helper"
version = "1.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "postgres"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"fallible-iterator 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"phf 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)",
"postgres-protocol 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "postgres-protocol"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"fallible-iterator 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
"hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex"
version = "0.1.80"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "ring"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rustc-serialize"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rustc_version"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"semver 0.1.20 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "semver"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "solicit"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"hpack 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "strsim"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "tempdir"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "term_size"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "thread-id"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "thread_local"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "time"
version = "0.1.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "traitobject"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "typeable"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unicase"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "unicode-bidi"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "unicode-normalization"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unicode-segmentation"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unicode-width"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "untrusted"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "url"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"idna 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "user32-sys"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "utf8-ranges"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "vec_map"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi-build"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata]
"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
"checksum ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "23ac7c30002a5accbf7e8987d0632fa6de155b7c3d39d0067317a391e00a2ef6"
"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
"checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074"
"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855"
"checksum chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)" = "9213f7cd7c27e95c2b57c49f0e69b1ea65b27138da84a170133fd21b07659c00"
"checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430"
"checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626"
"checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum encoding-index-japanese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum encoding-index-korean 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum encoding-index-simpchinese 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum encoding-index-singlebyte 1.20160120.0 (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum encoding-index-tradchinese 1.20141219.6 (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum encoding-types 0.2.0 (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum encoding_index_tests 0.1.5 (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
"checksum fallible-iterator 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "5d48ab1bc11a086628e8cc0cc2c2dc200b884ac05c4b48fb71d6036b6999ff1d"
"checksum gcc 0.3.38 (registry+https://github.com/rust-lang/crates.io-index)" = "553f11439bdefe755bf366b264820f1da70f3aaf3924e594b886beb9c831bcf5"
"checksum gdi32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0912515a8ff24ba900422ecda800b52f4016a56251922d397c576bf92c690518"
"checksum hex 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d6a22814455d41612f41161581c2883c0c6a1c41852729b17d5ed88f01e153aa"
"checksum hpack 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3d2da7d3a34cf6406d9d700111b8eafafe9a251de41ae71d8052748259343b58"
"checksum httparse 1.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "46534074dbb80b070d60a5cb8ecadd8963a00a438ae1a95268850a7ef73b67ae"
"checksum hyper 0.9.11 (registry+https://github.com/rust-lang/crates.io-index)" = "edd47c66782933e546a32ae89ca3c49263b2ba9bc29f3a0d5c52fff48e0ac67c"
"checksum idna 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1053236e00ce4f668aeca4a769a09b3bf5a682d802abd6f3cb39374f6b162c11"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum language-tags 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a91d884b6667cd606bb5a69aa0c99ba811a115fc68915e7056ec08a46e93199a"
"checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f"
"checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7"
"checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8"
"checksum libressl-pnacl-sys 2.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "cbc058951ab6a3ef35ca16462d7642c4867e6403520811f28537a4e2f2db3e71"
"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054"
"checksum matches 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "efd7622e3022e1a6eaa602c4cea8912254e5582c9c692e9167714182244801b1"
"checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd"
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
"checksum mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5c93a4bd787ddc6e7833c519b73a50883deb5863d76d9b71eb8216fb7f94e66"
"checksum num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "bde7c03b09e7c6a301ee81f6ddf66d7a28ec305699e3d3b056d2fc56470e3120"
"checksum num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "fb24d9bfb3f222010df27995441ded1e954f8f69cd35021f6bef02ca9552fb92"
"checksum num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "287a1c9969a847055e1122ec0ea7a5c5d6f72aad97934e131c83d5c08ab4e45c"
"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c"
"checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad"
"checksum openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "c4117b6244aac42ed0150a6019b4d953d28247c5dd6ae6f46ae469b5f2318733"
"checksum openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)" = "89c47ee94c352eea9ddaf8e364be7f978a3bb6d66d73176572484238dd5a5c3f"
"checksum openssl-sys-extras 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "11c5e1dba7d3d03d80f045bf0d60111dc69213b67651e7c889527a3badabb9fa"
"checksum openssl-verify 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3ed86cce894f6b0ed4572e21eb34026f1dc8869cb9ee3869029131bc8c3feb2d"
"checksum phf 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)" = "17896951e179a6cbed7d3519b3078ac6c03a347d3e9cf8f303c8a1a73c5a3e44"
"checksum phf_shared 0.7.15 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6c14aac1140c2b06b41477096f249416b17c893d56386a892ac657edfdffba"
"checksum pkg-config 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa"
"checksum pnacl-build-helper 1.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "61c9231d31aea845007443d62fcbb58bb6949ab9c18081ee1e09920e0cf1118b"
"checksum postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7ef92468927003a037e175b54320319e358886865899b37f7318837a646a9fd"
"checksum postgres-protocol 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7e2fc3d800dacc2dd749b690ad15b9b78bc04c26c3f0525cbe163436559bc3fc"
"checksum rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "2791d88c6defac799c3f20d74f094ca33b9332612d9aef9078519c82e4fe04a5"
"checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f"
"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957"
"checksum ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c0743ef007bcff4909b107907a410418eb7e5c6ad55b843d70b39f62bfb7112e"
"checksum rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)" = "6159e4e6e559c81bd706afe9c8fd68f547d3e851ce12e76b1de7914bab61691b"
"checksum rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084"
"checksum semver 0.1.20 (registry+https://github.com/rust-lang/crates.io-index)" = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac"
"checksum solicit 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "172382bac9424588d7840732b250faeeef88942e37b6e35317dce98cafdd75b2"
"checksum strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "50c069df92e4b01425a8bf3576d5d417943a6a7272fbabaf5bd80b1aaa76442e"
"checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6"
"checksum term_size 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3f7f5f3f71b0040cecc71af239414c23fd3c73570f5ff54cf50e03cef637f2a0"
"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03"
"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
"checksum time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)" = "3c7ec6d62a20df54e07ab3b78b9a3932972f4b7981de295563686849eb3989af"
"checksum traitobject 0.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "07eaeb7689bb7fca7ce15628319635758eda769fed481ecfe6686ddef2600616"
"checksum typeable 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1410f6f91f21d1612654e7cc69193b0334f909dcf2c790c4826254fbb86f8887"
"checksum unicase 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "13a5906ca2b98c799f4b1ab4557b76367ebd6ae5ef14930ec841c74aed5f3764"
"checksum unicode-bidi 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c1f7ceb96afdfeedee42bade65a0d585a6a0106f681b6749c8ff4daa8df30b3f"
"checksum unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "26643a2f83bac55f1976fb716c10234485f9202dcd65cfbdf9da49867b271172"
"checksum unicode-segmentation 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b905d0fc2a1f0befd86b0e72e31d1787944efef9d38b9358a9e92a69757f7e3b"
"checksum unicode-width 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2d6722facc10989f63ee0e20a83cd4e1714a9ae11529403ac7e0afd069abc39e"
"checksum untrusted 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5d9bc0e6e73a10975d1fbff8ac3541e221181b0d8998351600fb5523de634c0d"
"checksum url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "48ccf7bd87a81b769cf84ad556e034541fb90e1cd6d4bc375c822ed9500cd9d7"
"checksum user32-sys 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ef4711d107b21b410a3a974b1204d9accc8b10dad75d8324b5d755de1617d47"
"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"
"checksum vec_map 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cac5efe5cb0fa14ec2f84f83c701c562ee63f6dcc680861b21d65c682adfb05f"
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"

19
indexer/Cargo.toml Normal file
View file

@ -0,0 +1,19 @@
[package]
name = "indexer"
version = "0.1.0"
authors = ["Yorhel <git@yorhel.nl>"]
[dependencies]
regex = "0.1.77"
log = "0.3.6"
env_logger = "0.3.5"
lazy_static = "0.2.1"
libc = "0.2.17"
libarchive3-sys = "0.1.2"
encoding = { git = "https://github.com/lifthrasiir/rust-encoding", features = ["no-optimized-legacy-encoding"] }
ring = "0.5.3"
postgres = "0.12.0"
clap = "2.16.3"
hyper = "0.9.11"
url = "1.2.3"
chrono = "0.2.25"

343
indexer/src/archive.rs Normal file
View file

@ -0,0 +1,343 @@
use std::str;
use std::ptr;
use std::error::Error as ErrorTrait;
use std::io::{Result,Error,Read};
use std::ffi::{CStr,CString};
use libc::{c_void,ssize_t};
use libarchive3_sys::ffi;
/* This is a safe, limited and opinionated wrapper around the libarchive C bindings.
* I initially used the libarchive crate, but it has several issues. Some of which are not fixable
* without a complete rewrite.
* - Panics on non-UTF8 path names
* - Panics on hard links (PR #6)
* - API is far too flexible, easy to misuse and get panics/segfaults
* - Impossible to correctly read files from an archive (issue #7)
* - Does not provide a convenient Read interface for files
*
* Barring any unexpected behaviour or bugs in libarchive, the API below should not panic or
* segfault for any archive or usage pattern.
*/
// State shared between this wrapper and libarchive's read callback. Archive::new() boxes the
// struct and registers a pointer to the boxed value as the callback's client data.
pub struct Archive<'a> {
// libarchive read handle; released with archive_read_free() when the Archive is dropped.
a: *mut ffi::Struct_archive,
// Reader that supplies the raw archive bytes.
rd: &'a mut Read,
// Scratch buffer that archive_read_cb() fills from `rd` and hands to libarchive.
buf: Vec<u8>,
// I/O error returned by `rd` inside the callback, kept so that error() can return it.
err: Option<Error>,
// Set when archive_read_data() returned 0 for the current entry (or when open_raw() hit EOF
// right away); read() then returns Ok(0) without calling into libarchive again.
eof: bool,
}
// An archive positioned at one entry. It owns the Archive, so next() consumes this value and
// returns a new ArchiveEntry for the following header (if any).
pub struct ArchiveEntry<'a> {
// The archive this entry belongs to; entry data is read through it.
a: Box<Archive<'a>>,
// Entry header pointer filled in by archive_read_next_header().
e: *mut ffi::Struct_archive_entry,
}
// Wrapper around an Archive opened with Archive::open_raw(). Its Read implementation forwards to
// the archive's data stream.
pub struct RawEntry<'a>(Box<Archive<'a>>);
// Simplified classification of an archive entry, as returned by ArchiveEntry::filetype().
#[derive(Debug,PartialEq,Eq)]
pub enum FileType {
// Regular file (AE_IFREG).
File,
// Directory (AE_IFDIR).
Directory,
// Symlink or hardlink; holds the link target. Hardlink targets get a '/' prefix, see
// ArchiveEntry::hardlink().
Link(String),
Other, // Also includes Link(<non-utf8-path>)
}
unsafe extern "C" fn archive_read_cb(_: *mut ffi::Struct_archive, data: *mut c_void, buf: *mut *const c_void) -> ssize_t {
let arch: &mut Archive = &mut *(data as *mut Archive);
*buf = arch.buf.as_mut_ptr() as *mut c_void;
match arch.rd.read(&mut arch.buf[..]) {
Ok(s) => s as ssize_t,
Err(e) => {
let desc = CString::new(e.description()).unwrap();
let fmt = CString::new("%s").unwrap();
ffi::archive_set_error(arch.a, e.raw_os_error().unwrap_or(0), fmt.as_ptr(), desc.as_ptr());
arch.err = Some(e);
-1
}
}
}
impl<'a> Archive<'a> {
fn new(rd: &mut Read, a: *mut ffi::Struct_archive) -> Result<Box<Archive>> {
let bufsize = 64*1024;
let mut buf = Vec::with_capacity(bufsize);
unsafe { buf.set_len(bufsize) };
let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None, eof: false });
let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void;
let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) };
if r == ffi::ARCHIVE_FATAL {
return Err(ret.error());
}
Ok(ret)
}
fn error(&mut self) -> Error {
self.err.take().unwrap_or_else(|| {
let err = Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) });
let desc = unsafe { ffi::archive_error_string(self.a) };
if desc.is_null() {
return err;
}
if let Ok(s) = str::from_utf8(unsafe { CStr::from_ptr(desc) }.to_bytes()) {
Error::new(err.kind(), s)
} else {
err
}
})
}
fn entry(self: Box<Self>) -> Result<Option<ArchiveEntry<'a>>> {
let mut ent = ArchiveEntry {
a: self,
e: ptr::null_mut()
};
ent.a.eof = false;
let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) };
match res {
ffi::ARCHIVE_EOF => Ok(None),
ffi::ARCHIVE_FATAL => Err(ent.a.error()),
_ => Ok(Some(ent))
}
}
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
// libarchive tends to throw an error if you try to read after an EOF; handle that case
// here.
if self.eof {
return Ok(0);
}
let cbuf = buf.as_mut_ptr() as *mut c_void;
let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) };
if n >= 0 {
self.eof = n == 0;
Ok(n as usize)
} else {
Err(self.error())
}
}
pub fn open_archive(rd: &mut Read) -> Result<Option<ArchiveEntry>> {
let a = unsafe {
let a = ffi::archive_read_new();
ffi::archive_read_support_filter_all(a);
ffi::archive_read_support_format_all(a);
a
};
try!(Self::new(rd, a)).entry()
}
pub fn open_raw(rd: &mut Read) -> Result<RawEntry> {
let a = unsafe {
let a = ffi::archive_read_new();
ffi::archive_read_support_filter_all(a);
ffi::archive_read_support_format_raw(a);
ffi::archive_read_support_format_empty(a);
a
};
let mut a = try!(Self::new(rd, a));
let mut e: *mut ffi::Struct_archive_entry = ptr::null_mut();
let res = unsafe { ffi::archive_read_next_header(a.a, &mut e) };
match res {
ffi::ARCHIVE_FATAL => Err(a.error()),
ffi::ARCHIVE_EOF => {
a.eof = true;
Ok(RawEntry(a))
},
_ => Ok(RawEntry(a))
}
}
}
// Releases the libarchive handle together with the wrapper.
impl<'a> Drop for Archive<'a> {
    fn drop(&mut self) {
        let handle = self.a;
        unsafe { ffi::archive_read_free(handle) };
    }
}
impl<'a> ArchiveEntry<'a> {
pub fn next(self) -> Result<Option<ArchiveEntry<'a>>> {
self.a.entry()
}
// Returns None in NULL (when does that even happen?) or on invalid UTF-8.
pub fn path(&self) -> Option<&str> {
let c_str: &CStr = unsafe {
let ptr = ffi::archive_entry_pathname(self.e);
if ptr.is_null() {
return None;
}
CStr::from_ptr(ptr)
};
str::from_utf8(c_str.to_bytes()).ok()
// Perform some simple opinionated normalization. Full normalization might be better,
// but also slower and more complex. This solution covers the most important cases.
.map(|s| s.trim_left_matches('/').trim_left_matches("./").trim_right_matches('/'))
}
pub fn size(&self) -> usize {
unsafe { ffi::archive_entry_size(self.e) as usize }
}
fn symlink(&self) -> Option<String> {
let c_str: &CStr = unsafe {
let ptr = ffi::archive_entry_symlink(self.e);
if ptr.is_null() {
return None;
}
CStr::from_ptr(ptr)
};
str::from_utf8(c_str.to_bytes()).map(str::to_string).ok()
}
fn hardlink(&self) -> Option<String> {
let c_str: &CStr = unsafe {
let ptr = ffi::archive_entry_hardlink(self.e);
if ptr.is_null() {
return None;
}
CStr::from_ptr(ptr)
};
// Hard links have the same name as an earlier pathname(), and those typically don't have a
// preceding slash. Add this slash here so that the same resolution logic can be used for
// both hardlinks and symlinks. I really don't care about the difference between these two.
str::from_utf8(c_str.to_bytes()).map(|p| format!("/{}", p)).ok()
}
pub fn filetype(&self) -> FileType {
// If it has a symlink/hardlink path, then just consider it a link regardless of what
// _filetype() says.
if let Some(l) = self.symlink().or(self.hardlink()) {
return FileType::Link(l);
}
match unsafe { ffi::archive_entry_filetype(self.e) } {
ffi::AE_IFDIR => FileType::Directory,
ffi::AE_IFREG => FileType::File,
_ => FileType::Other,
}
}
}
impl<'a> Read for ArchiveEntry<'a> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
self.a.read(buf)
}
}
impl<'a> Read for RawEntry<'a> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
self.0.read(buf)
}
}
// Rust has no streaming iterators, so ArchiveEntry cannot implement Iterator. This function is
// the replacement: it calls `cb` on every entry in order, starting with `ent`.
// `cb` returns Ok(true) to continue with the next entry and Ok(false) to stop early. An error
// from `cb` or from advancing the archive aborts the walk and is returned.
pub fn walk<F>(ent: Option<ArchiveEntry>, mut cb: F) -> Result<()>
    where F: FnMut(&mut ArchiveEntry) -> Result<bool>
{
    let mut cursor = ent;
    loop {
        let mut current = match cursor {
            Some(entry) => entry,
            None => return Ok(()),
        };
        let keep_going = try!(cb(&mut current));
        if !keep_going {
            return Ok(());
        }
        cursor = try!(current.next());
    }
}
// Tests for the libarchive wrapper. The archive() and raw() tests read fixture files from the
// tests/ directory.
#[cfg(test)]
mod tests {
use super::*;
use std;
use std::io::Read;
use std::fs::File;
// 64 KiB of newline bytes must be rejected by open_archive().
#[test]
fn invalid() {
let mut r = std::io::repeat(0x0a).take(64*1024);
let ent = Archive::open_archive(&mut r);
assert!(ent.is_err());
}
// Empty input: open_archive() reports no entries, open_raw() yields zero bytes.
#[test]
fn zerolength() {
let mut r = std::io::empty();
{
let ent = Archive::open_archive(&mut r);
assert!(ent.unwrap().is_none());
}
{
let mut ent = Archive::open_raw(&mut r).unwrap();
let mut v = Vec::new();
assert_eq!(ent.read_to_end(&mut v).unwrap(), 0);
}
}
// Walks the fixture entry by entry; the assertions depend on the order of entries in the tar.
#[test]
fn archive() {
let mut f = File::open("tests/simpletest.tar.gz").unwrap();
let mut ent = Archive::open_archive(&mut f).unwrap().unwrap();
// Checks path, size, file type and full contents of one entry.
let t = |e:&mut ArchiveEntry, path, size, ft, cont| {
assert_eq!(e.path(), path);
assert_eq!(e.size(), size);
assert_eq!(e.filetype(), ft);
let mut contents = String::new();
assert_eq!(e.read_to_string(&mut contents).unwrap(), size);
assert_eq!(&contents, cont);
};
t(&mut ent, Some("simple"), 0, FileType::Directory, "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/file"), 3, FileType::File, "Hi\n");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/link"), 0, FileType::Link("file".to_string()), "");
ent = ent.next().unwrap().unwrap();
// Hardlink targets are reported with a leading '/'.
t(&mut ent, Some("simple/hardlink"), 0, FileType::Link("/simple/file".to_string()), "");
ent = ent.next().unwrap().unwrap();
t(&mut ent, Some("simple/fifo"), 0, FileType::Other, "");
ent = ent.next().unwrap().unwrap();
// The last entry is expected to have no usable path (path() returns None).
t(&mut ent, None, 0, FileType::File, "");
assert!(ent.next().unwrap().is_none());
}
// open_raw() must undo every compression layer of the fixture.
#[test]
fn raw() {
let mut f = File::open("tests/rawtest.gz.xz.bzip2").unwrap();
let mut r = Archive::open_raw(&mut f).unwrap();
let mut c = String::new();
r.read_to_string(&mut c).unwrap();
assert_eq!(&c, "File contents!\n");
}
// Uncompressed input must pass through open_raw() unchanged.
#[test]
fn raw_passthrough() {
let mut r = std::io::Cursor::new(&b"This is an uncompressed text file"[..]);
let mut ent = Archive::open_raw(&mut r).unwrap();
let mut s = String::new();
ent.read_to_string(&mut s).unwrap();
assert_eq!(&s, "This is an uncompressed text file");
}
}

363
indexer/src/archread.rs Normal file
View file

@ -0,0 +1,363 @@
use std::io::Result;
use std::collections::HashMap;
use archive::{walk,ArchiveEntry,FileType};
/* I had hoped that reading man pages from an archive would just be a simple:
*
* 1. Walk through all files in the archive in a streaming fashion
* 2. Parse/index man pages
*
* But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to...
*
* 1. Walk through all entries in the archive in a streaming fashion
* 2. Parse/index regular file man pages
* 3. Keep track of all paths in the archive
* 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file
* 5. Read the entire damn archive again if one of the links resolved to a file that was not
* recognized as a man page in step (2). Luckily, this isn't very common.
*
* And this doesn't even cover the problem of duplicate entries in a tar, which is also quite
* annoying to handle.
*
* What annoys me the most about all of this is that it's not possible to stream an archive from
* the network and read/index the entire thing in a single step. Now we either have to buffer
* packages to disk or redownload the archive in order to be able to follow all links to man pages.
*
* (Note that it is possible to resolve links while walking through the entries, which will allow
* us to match files found later in the archive against links found earlier, thus potentially
* saving the need to read the archive a second time. This is merely a performance improvement for
* an uncommon case, and it certainly won't simplify the code)
*
* (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the
* need for a second archive read, but that's going to significantly slow down the common case in
* order to handle a rare case. It's possible to further optimize this using some heuristics to
* determine whether a file is potentially a man page, but that's both complex and may not even
* save much)
*
* (* So apparently some man pages are close to 10MB...)
*/
// What FileList remembers about each path seen in the archive. This is the information needed
// to resolve links after the archive has been read.
#[derive(Clone,Debug,PartialEq,Eq)]
pub enum EntryType {
// Regular file that has been handled/indexed
Handled,
// Regular file that hasn't been handled because the caller wasn't interested in it. Could
// still be an interesting file if it is referenced from an interesting path.
Regular,
// Link to another file (interesting or not is irrelevant)
// Holds the link target as reported by the archive (FileType::Link).
Link(String),
// Directory; need this information when resolving links
Directory,
// Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to
// this we know we're done.
Other,
}
// Result of a first pass over an archive, built by FileList::read().
pub struct FileList {
// List of seen files. This is used to resolve links
// Keyed by the entry path as returned by ArchiveEntry::path().
seen: HashMap<String, EntryType>,
// List of interesting links
// Paths of link entries for which interest_cb() returned true; each is also a key in `seen`.
links: Vec<String>,
}
// Files that still have to be read in a second pass over the archive. Maps the resolved path of
// a regular file to the interesting link paths that point at it (filled in by FileList::links()).
pub struct MissedFiles(HashMap<String, Vec<String>>);
impl FileList {
/* Read an archive until the end. Accepts two callbacks:
*
* interest_cb: Called on every path in the archive, should return whether the file is
* interesting (i.e. whether we want to know its contents).
* file_cb: Called on every regular file for which interest_cb() showed an interest.
* The callback accepts multiple path names, but this function will only provide one.
*
* Returns a FileList struct that can be used to retreive all interesting non-regular files.
*/
pub fn read<F,G>(ent: Option<ArchiveEntry>, interest_cb: F, mut file_cb: G) -> Result<FileList>
where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()>
{
let mut fl = FileList {
seen: HashMap::new(),
links: Vec::new(),
};
try!(walk(ent, |mut e| {
let path = match e.path() {
Some(x) => x.to_string(),
None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) }
};
let ft = e.filetype();
trace!("Archive entry: {:10} {} {:?}", e.size(), path, ft);
// We ought to throw away the result of the previous entry with the same name and use
// this new entry instead, but fuck it. This case is too rare, so let's just warn.
if let Some(_) = fl.seen.get(&path) {
warn!("Duplicate file entry: {}", path);
return Ok(true);
}
let et = match ft {
FileType::File => {
if interest_cb(&path) {
let pathv = [&path as &str];
try!(file_cb(&pathv[..], &mut e));
EntryType::Handled
} else {
EntryType::Regular
}
},
FileType::Link(l) => {
if interest_cb(&path) {
fl.links.push(path.clone());
}
EntryType::Link(l)
},
FileType::Directory => EntryType::Directory,
FileType::Other => EntryType::Other,
};
fl.seen.insert(path, et);
Ok(true)
}));
Ok(fl)
}
// This is basically realpath(), using the virtual filesystem in self.seen.
// This method is not particularly efficient, it allocates like crazy.
fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec<String>)> {
if depth < 1 {
warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path);
return None
}
// Remove filename from the base
let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None };
let comp : Vec<&str> =
if path.starts_with('/') { path.split('/').collect() }
else { basedir.split('/').chain(path.split('/')).collect() };
let mut dest = Vec::new();
for (i, &c) in comp.iter().enumerate() {
if c == "" || c == "." {
continue;
}
if c == ".." {
if dest.len() > 1 {
dest.pop();
}
continue;
}
dest.push(c.to_string());
let curpath = dest.join("/");
match self.seen.get(&curpath) {
// If it's a directory, we're good
Some(&EntryType::Directory) => (),
// If it's a file or man page, it must be the last item.
Some(& ref x@ EntryType::Regular) |
Some(& ref x@ EntryType::Handled) => return
if i == comp.len()-1 {
Some((x.clone(), dest))
} else {
warn!("Unresolved link: {} -> {}; Non-directory component", base, path);
None
},
// Links... Ugh
Some(&EntryType::Link(ref d)) => {
match self.resolve_link(&curpath, &d, depth-1) {
// Same as above, with dirs we can continue, files have to be last
Some((EntryType::Directory, d)) => dest = d,
x@Some((EntryType::Regular, _)) |
x@Some((EntryType::Handled, _)) => return
if i == comp.len()-1 { x }
else {
warn!("Unresolved link: {} -> {}; Non-directory link component", base, path);
None
},
_ => return None,
}
},
// Don't care about anything else, just stop.
_ => {
warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path);
return None
}
}
}
Some((EntryType::Directory, dest))
}
/* Calls cb() on every 'interesting' link to a file that has already been passed to a file_cb()
* in FileList::read().
* If there are any interesting links that have not yet been passed to file_cb(), a MissedFiles
* struct is returned that can be used to retrieve those files by re-reading the archive.
*/
pub fn links<F>(self, mut cb: F) -> Option<MissedFiles> where F: FnMut(&str, &str) {
let mut missed = HashMap::new();
for p in self.links.iter() {
let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() };
match self.resolve_link(&p, dest, 32) {
Some((EntryType::Handled, d)) => {
let dstr = d.join("/");
cb(&p, &dstr);
},
Some((EntryType::Regular, d)) => {
let dstr = d.join("/");
missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string());
}
_ => (),
}
}
if missed.len() > 0 {
Some(MissedFiles(missed))
} else {
None
}
}
}
impl MissedFiles {
    /* Second pass over the archive: calls file_cb() on every file that FileList::links() found to
     * be the target of an interesting link but that was not handled during FileList::read().
     * file_cb has the same signature as in FileList::read, but here its first argument holds the
     * paths of all interesting links that point at the file, which can be more than one.
     * The walk stops as soon as no missed files remain. */
    pub fn read<G>(mut self, ent: Option<ArchiveEntry>, mut file_cb: G) -> Result<()>
        where G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()>
    {
        walk(ent, |mut e| {
            // Taking the entry out of the map ensures each file is delivered at most once.
            let wanted = e.path().and_then(|p| self.0.remove(p));
            if let Some(links) = wanted {
                let names: Vec<&str> = links.iter().map(|x| x as &str).collect();
                try!(file_cb(&names, &mut e));
            }
            let remaining = self.0.len();
            Ok(remaining > 0)
        })
    }
}
// End-to-end test of the two-pass reader against tests/testarchive.tar.xz. The helper functions
// below are stages of a single test (test_reader) and hand their results to each other.
#[cfg(test)]
mod tests {
use super::*;
use archive::Archive;
use std::io::Read;
use std::fs::File;
// First pass: exactly one interesting regular file is expected to reach file_cb().
fn test_read() -> FileList {
let mut f = File::open("tests/testarchive.tar.xz").unwrap();
let arch = Archive::open_archive(&mut f).unwrap();
let mut cnt = 0;
FileList::read(arch,
|p| p.starts_with("man/man"),
|p,e| {
assert_eq!(cnt, 0);
cnt += 1;
assert_eq!(p, &["man/man3/helloworld.3"][..]);
assert_eq!(e.size(), 12);
let mut cont = String::new();
e.read_to_string(&mut cont).unwrap();
assert_eq!(&cont, "Hello World\n");
Ok(())
}
).unwrap()
}
// Checks resolve_link() directly for every link in the fixture, with a recursion limit of 5.
fn test_resolve_links(r: &FileList) {
let res = |p| {
if let Some(&EntryType::Link(ref l)) = r.seen.get(p) {
r.resolve_link(p, &l, 5)
} else {
panic!("Not found or not a link: {}", p);
}
};
let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()]));
assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()])));
assert_eq!(res("man/man6/hardlink.6"), helloworld);
assert_eq!(res("man/man1/symlinkbefore.1"), helloworld);
assert_eq!(res("man/man6/symlinkafter.6"), helloworld);
assert_eq!(res("man/man1/badsymlink1.1"), None);
assert_eq!(res("man/man1/badsymlink2.1"), None);
assert_eq!(res("man/man1/badsymlink3.1"), None);
assert_eq!(res("man/man1/badsymlink4.1"), None);
assert_eq!(res("man/man1/badsymlink5.1"), None);
assert_eq!(res("man/man1/doublesymlink1.1"), helloworld);
assert_eq!(res("man/man1/doublesymlink2.1"), helloworld);
assert_eq!(res("man/man1/triplesymlink.1"), helloworld);
assert_eq!(res("man/man1/infinitesymlink.1"), None);
}
// Checks which links are reported by links() for the already handled file, and returns the
// files that still need a second pass.
fn test_links(r: FileList) -> Option<MissedFiles> {
let mut links = Vec::new();
let missed = r.links(|p,d| links.push((p.to_string(), d.to_string())));
// links() reports in no particular order that this test relies on; sort before comparing.
links.sort();
{
let mut res = |p:&str| {
let r = links.remove(0);
assert_eq!(r.0, p.to_string());
assert_eq!(r.1, "man/man3/helloworld.3".to_string());
};
res("man/man1/doublesymlink1.1");
res("man/man1/doublesymlink2.1");
res("man/man1/symlinkbefore.1");
res("man/man1/triplesymlink.1");
res("man/man6/hardlink.6");
res("man/man6/symlinkafter.6");
}
assert_eq!(links.len(), 0);
missed
}
// Second pass: re-reads the archive and checks the (link path, contents) pairs delivered.
fn test_reread(r: MissedFiles) {
let mut f = File::open("tests/testarchive.tar.xz").unwrap();
let ent = Archive::open_archive(&mut f).unwrap();
let mut files = Vec::new();
r.read(ent,
|p,e| {
let mut cont = String::new();
e.read_to_string(&mut cont).unwrap();
files.extend(p.iter().map(|x| (x.to_string(), cont.clone()) ));
Ok(())
}
).unwrap();
files.sort();
{
let mut res = |a:&str, b:&str| {
let r = files.remove(0);
assert_eq!(&r.0, a);
assert_eq!(&r.1, b);
};
res("man/man3/needreread.3", "Potentially interesting file\n");
res("man/man6/needreread.6", "Potentially interesting file\n");
}
assert_eq!(files.len(), 0);
}
// Runs the stages above in order: read, resolve, links, re-read.
#[test]
fn test_reader() {
//use env_logger;
//env_logger::init().unwrap();
let r = test_read();
test_resolve_links(&r);
let l = test_links(r).unwrap();
test_reread(l);
}
}

95
indexer/src/main.rs Normal file
View file

@ -0,0 +1,95 @@
#[macro_use] extern crate log;
#[macro_use] extern crate lazy_static;
#[macro_use] extern crate clap;
extern crate env_logger;
extern crate regex;
extern crate libarchive3_sys;
extern crate libc;
extern crate ring;
extern crate encoding;
extern crate postgres;
extern crate hyper;
extern crate url;
extern crate chrono;
mod archive;
mod archread;
mod man;
mod open;
mod pkg;
mod sys_arch;
// Looks up the numeric id of a system by its short-name in the `systems` table.
// Panics if the query fails or if no system with that short-name exists.
fn sysbyshort(conn: &postgres::GenericConnection, short: &str) -> i32 {
    let rows = conn.query("SELECT id FROM systems WHERE short = $1", &[&short]).unwrap();
    if rows.is_empty() {
        panic!("Invalid system: {}", short);
    }
    // First column of the first row.
    let id: i32 = rows.get(0).get(0);
    id
}
fn main() {
let arg = clap_app!(indexer =>
(about: "Manned.org man page indexer")
(@arg v: -v +multiple "Increase verbosity")
(@subcommand pkg =>
(about: "Index a single package")
(@arg force: --force "Overwrite existing indexed package")
(@arg sys: --sys +required +takes_value "System short-name")
(@arg cat: --cat +required +takes_value "Package category")
(@arg pkg: --pkg +required +takes_value "Package name")
(@arg ver: --ver +required +takes_value "Package version")
(@arg date: --date +required +takes_value "Package release date")
(@arg FILE: +required "Package file")
)
(@subcommand arch =>
(about: "Index an Arch Linux repository")
(@arg sys: --sys +required +takes_value "System short-name")
(@arg mirror: --mirror +required +takes_value "Mirror URL")
(@arg repo: --repo +required +takes_value "Repository name")
)
).get_matches();
let verbose = arg.occurrences_of("v");
env_logger::LogBuilder::new()
.filter(Some("indexer"), match verbose {
0 => log::LogLevelFilter::Warn,
1 => log::LogLevelFilter::Info,
2 => log::LogLevelFilter::Debug,
_ => log::LogLevelFilter::Trace,
})
.filter(Some("postgres"), if verbose >= 4 { log::LogLevelFilter::Trace } else { log::LogLevelFilter::Info })
.init().unwrap();
let dbhost = match std::env::var("MANNED_PG") {
Ok(x) => x,
Err(_) => { error!("MANNED_PG not set."); return }
};
let db = match postgres::Connection::connect(&dbhost[..], postgres::TlsMode::None) {
Ok(x) => x,
Err(x) => { error!("Can't connect to postgres: {}", x); return },
};
debug!("Connected to database");
if let Some(matches) = arg.subcommand_matches("pkg") {
pkg::pkg(&db, pkg::PkgOpt {
force: matches.is_present("force"),
sys: sysbyshort(&db, matches.value_of("sys").unwrap()),
cat: matches.value_of("cat").unwrap(),
pkg: matches.value_of("pkg").unwrap(),
ver: matches.value_of("ver").unwrap(),
date: matches.value_of("date").unwrap(),
file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true},
});
}
if let Some(matches) = arg.subcommand_matches("arch") {
sys_arch::sync(&db,
sysbyshort(&db, matches.value_of("sys").unwrap()),
matches.value_of("mirror").unwrap(),
matches.value_of("repo").unwrap()
);
}
}

301
indexer/src/man.rs Normal file
View file

@ -0,0 +1,301 @@
use std::str;
use std::io;
use std::io::Read;
use regex::bytes;
use regex::Regex;
use encoding;
use encoding::{all,EncodingRef};
use encoding::label::encoding_from_whatwg_label;
use ring::digest;
use archive::Archive;
// Anything larger than this just isn't a man page. I hope.
// In bytes (20 MiB). decode() reads at most one byte more than this, and validate() rejects any
// input whose length is greater than or equal to this value.
const MAX_MAN_SIZE: u64 = 20*1024*1024;
// I've also not seen valid man pages smaller than this
// In bytes; validate() rejects anything shorter.
const MIN_MAN_SIZE: u64 = 9;
// Decides whether `path` looks like the location of a man page.
// Returns None if it does not, otherwise Some((manPageName, Section, Locale)). Locale is the
// empty string when the path has no locale directory, or when the directory in that position is
// known not to be a locale.
pub fn parse_path(path: &str) -> Option<(&str, &str, &str)> {
    // Roughly: man[/locale]/man1/manpage.section[.compression]+
    lazy_static! {
        static ref RE: Regex = Regex::new(r"(?x)
man
(?: / ([^/]+) )? # Optional locale
/man[a-z0-9]/ # Subdir
([^/]+?) # Man page name (non-greedy)
\. ([^/\.]+) # Section
(?: \. (?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions
").unwrap();
    }

    let groups = match RE.captures(path) {
        None => return None,
        Some(g) => g,
    };
    let name = groups.at(2).unwrap();
    let section = groups.at(3).unwrap();
    let locale = groups.at(1).unwrap_or("");

    // Not everything matching the regex is necessarily a man page, exclude some special cases.
    // Files that totally aren't man pages
    let bogus_section = match section {
        "in" | "gz" | "lzma" | "bz2" | "xz" | "html" => true,
        _ => false,
    };
    if (name == "Makefile" && section == "am") || name == ".cvsignore" || bogus_section {
        return None;
    }

    // Some weird directories that happen to match the locale
    const NOT_LOCALES: [&'static str; 12] = [
        "5man", "c",
        "man1", "man2", "man3", "man4", "man5", "man6", "man7", "man8",
        "Man-Part1", "Man-Part2",
    ];
    if NOT_LOCALES.iter().any(|&dir| dir == locale) {
        return Some((name, section, ""));
    }

    // Nothing special!
    Some((name, section, locale))
}
// Boolean form of parse_path(), usable as archread's interest_cb.
pub fn ismanpath(path: &str) -> bool {
    match parse_path(path) {
        Some(_) => true,
        None => false,
    }
}
// Sanity checks on the (decompressed) contents of a man page candidate.
// Returns None if the data is acceptable, otherwise a short reason for rejecting it.
// Note that a file of exactly MAX_MAN_SIZE bytes is rejected as too large as well.
fn validate(data: &Vec<u8>) -> Option<&'static str> {
    lazy_static! {
        static ref HTML: bytes::Regex = bytes::Regex::new(r"^\s*<(?:html|head|!DOCTYPE)").unwrap();
    }

    let len = data.len();
    if len >= MAX_MAN_SIZE as usize {
        return Some("File too large");
    }
    if len < MIN_MAN_SIZE as usize {
        return Some("File too small");
    }

    // Known junk files that show up in man directories.
    let bytes = &data[..];
    if bytes == &b".so man3/\n"[..] {
        return Some("Contents: '.so man3/'");
    }
    if bytes == &b"timestamp\n"[..] {
        return Some("Contents: 'timestamp'");
    }

    if HTML.is_match(bytes) {
        return Some("Looks like an HTML file");
    }
    None
}
// Searches the file contents for an emacs-style "-*- coding: X -*-" tag, a la preconv(1), and
// maps the tag to an encoding. Returns None when there is no tag, when the tag names a
// UTF-8-compatible encoding, or when the encoding is unknown (the latter is logged).
fn codec_from_tag(data: &Vec<u8>) -> Option<EncodingRef> {
    lazy_static! {
        // According to the emacs docs the tag should be on the first line; according to preconv(1)
        // it should be on the first or second line. I've also seen some files with the tag on the
        // last line. I've not seen the tag itself used in a different context, so just get it from
        // anywhere...
        static ref TAG: bytes::Regex = bytes::Regex::new(r"-\*-.*coding:\s*(?u:([^\s;]+)).*-\*").unwrap();
    }

    let found = match TAG.captures(&data) {
        None => return None,
        Some(c) => c,
    };
    let tag = str::from_utf8(found.at(1).unwrap()).unwrap().to_lowercase();
    let label = &tag[..];

    // Deny some common UTF-8-compatible encodings. These tags are obviously incorrect.
    if label == "us-ascii" || label == "ascii" || label == "utf8" || label == "utf-8" || label == "utf-8-unix" {
        return None;
    }
    // latin-1 isn't in the whatwg spec under that name
    if label == "latin-1" {
        return Some(all::WINDOWS_1252);
    }
    // armscii isn't in the whatwg spec at all
    if label == "armscii-8" {
        return Some(all::ARMSCII_8);
    }

    // Anything else should be found by its whatwg label.
    let codec = encoding_from_whatwg_label(label);
    if codec.is_none() {
        warn!("Unknown encoding in emacs tag: {}", label);
    }
    codec
}
// Guesses the encoding of a man page from the locale directory in its path.
// An explicit encoding in the locale name ("ja_JP.eucJP") wins; otherwise a per-language default
// is used. Returns None if the path has no locale, the locale cannot be parsed, or nothing is
// known about the language.
fn codec_from_path(path: &str) -> Option<EncodingRef> {
    let locale = match parse_path(path) {
        Some((_,_,l)) if l != "" => l.to_lowercase(),
        _ => return None,
    };

    lazy_static! {
        static ref RE: Regex = Regex::new(r"^(?x)
([a-z]+) # primary language
(?:_ ([a-z]+))? # secondary language
(?:@ [a-z]+)? # script (potentially useful, but uncommon and not currently used)
(?:\. ([^\.@]+))? # encoding (FUCKING USEFUL)
$").unwrap();
    }

    let parts = match RE.captures(&locale) {
        None => return None,
        Some(p) => p,
    };
    let lang = parts.at(1).unwrap();
    let seclang = parts.at(2);
    let enc = parts.at(3);

    // Try to do something with the encoding tag
    if let Some(label) = enc {
        match (lang, label) {
            (_, "eucjp") |
            (_, "ujis") | // Not sure about this one, but it seems to come out alright
            ("ja", "euc") => return Some(all::EUC_JP),
            (_, "euckr") => return Some(all::WINDOWS_949),
            ("ja", "jis7") |
            ("ja", "pck") => return None, /* WAT? TODO: DO SOMETHING WITH THESE */
            _ => match encoding_from_whatwg_label(label) {
                Some(codec) => return Some(codec),
                // Unknown label: warn and fall through to the language defaults below.
                None => warn!("Unknown encoding in locale: {}", label),
            },
        }
    }

    // Fall back to language
    match lang {
        "pl" | "cs" | "hr" | "hu" | "sl" | "sk" => Some(all::ISO_8859_2),
        "bg" | "be" | "uk" => Some(all::ISO_8859_5),
        "el" => Some(all::ISO_8859_7),
        "et" => Some(all::ISO_8859_15),
        "tr" => Some(all::WINDOWS_1254),
        "ru" => Some(all::KOI8_R),
        "ja" | "jp" => Some(all::EUC_JP), // Tricky; but JIS is certainly less common
        // These are based purely on what I've observed, perhaps some heuristics based on contents
        // can do better
        "zh" if seclang == Some("cn") => Some(all::GBK),
        "zh" => Some(all::BIG5_2003),
        "ko" => Some(all::WINDOWS_949),
        _ => None,
    }
}
// Decompresses / decodes a man page and returns the SHA-1 hash of its decompressed bytes, the
// name of the encoding that was used, and its contents converted to UTF-8.
// `paths` lists the path names under which the file is known; they are only used as hints for
// guessing the encoding. Fails with InvalidData if validate() rejects the contents.
pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'static str,String)> {
    // Read one byte more than the limit so that oversized files can be detected.
    let raw = try!(Archive::open_raw(ent));
    let mut limited = raw.take(MAX_MAN_SIZE+1);
    let mut data = Vec::new();
    try!(limited.read_to_end(&mut data));

    match validate(&data) {
        Some(reason) => return Err(io::Error::new(io::ErrorKind::InvalidData, reason)),
        None => {},
    }

    let dig = digest::digest(&digest::SHA1, &data);

    // TODO: Handle BOM? UTF-16?
    // If it passes as UTF-8, then just consider it UTF-8. Otherwise get the bytes back.
    let data = match String::from_utf8(data) {
        Ok(text) => return Ok((dig, "utf8", text)),
        Err(e) => e.into_bytes(),
    };

    // Otherwise, look for a coding tag in the contents; if that fails as well, look for clues in
    // the file paths. The first candidate that decodes without errors wins.
    let candidates = codec_from_tag(&data).into_iter()
        .chain(paths.iter().filter_map(|p| codec_from_path(p)));
    for codec in candidates {
        if let Ok(text) = codec.decode(&data, encoding::DecoderTrap::Strict) {
            return Ok((dig, codec.name(), text));
        }
    }

    // If all else fails, use a lossy iso-8859-1
    let fallback = all::ISO_8859_1 as EncodingRef;
    Ok((dig, "iso-8859-1", fallback.decode(&data, encoding::DecoderTrap::Ignore).unwrap() ))
}
// Table-driven check of parse_path(): each row is (path, expected result).
#[test]
fn test_parse_path() {
    let cases: Vec<(&str, Option<(&str, &str, &str)>)> = vec![
        // Generic tests
        ("/", None),
        ("/man1/ncdu.1", None),
        ("/man/man?/ncdu.1", None),
        ("/man/man1/ncdu.1", Some(("ncdu", "1", ""))),
        ("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz", Some(("ncdu", "1", ""))), // This stuff happens
        ("/man/en_US.UTF-8/man1/ncdu.1", Some(("ncdu", "1", "en_US.UTF-8"))),

        // Special cases
        ("/usr/share/man/man1/INDEX", None),
        ("/usr/share/man/man1/Makefile", None),
        ("/usr/share/man/man1/Makefile.am", None),
        ("/usr/share/man/man1/Makefile.in", None),
        ("/usr/share/man/man1/.cvsignore", None),
        ("/usr/share/man/man1/.cvsignore.gz", None),

        // Some actual locations
        ("/usr/local/man/man1/list_audio_tracks.1.gz", Some(("list_audio_tracks", "1", ""))),
        ("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz", Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))),
        ("/usr/man/man3/exit.3tk", Some(("exit", "3tk", ""))),
        ("/usr/local/brlcad/share/man/mann/exit.nged.gz", Some(("exit", "nged", ""))),
        ("/usr/X11R6/man/man3/intro.3xglut.gz", Some(("intro", "3xglut", ""))),
        ("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz", Some(("intro", "3", "ko_KR.eucKR"))),
        ("/usr/lib/scilab/man/Man-Part1/man1/ans.1", Some(("ans", "1", ""))),
        ("/heirloom/usr/share/man/5man/man1/chgrp.1.gz", Some(("chgrp", "1", ""))),
        ("/usr/local/plan9/man/man8/index.html", None),
        ("/usr/local/share/doc/gmt/html/man/grdpaste.html", None),
    ];

    for (path, expected) in cases {
        assert_eq!(parse_path(path), expected);
    }
}
// Table-driven check of codec_from_path(): each row is (path, expected encoding name).
#[test]
fn test_codec_from_path() {
    let cases = [
        ("man/de_DE.ISO8859-15/man1/scribus.1.gz", "iso-8859-15"),
        ("man/de_DE.ISO_8859-1/man1/scribus.1.gz", "windows-1252"),
        ("man/ja.UTF-8/man1/test.1", "utf-8"),
        ("man/ja_JP/man1/test.1", "euc-jp"),
        ("man/ja_JP.EUC/man1/test.1", "euc-jp"),
        ("man/ja_JP.SJIS/man1/test.1", "windows-31j"),
        ("man/jp.eucJP/man1/test.1", "euc-jp"),
        ("man/jp/man1/test.1", "euc-jp"),
        ("man/lt.ISO8859-13/man1/test.1", "iso-8859-13"),
        ("man/ru/man1/test.1", "koi8-r"),
        ("man/ru_RU@Cyr/man1/test.1", "koi8-r"),
        ("man/zh_CN/man1/test.1", "gbk"),
        ("man/zh_TW/man1/test.1", "big5-2003"),
    ];

    for &(path, name) in cases.iter() {
        let codec = codec_from_path(path).unwrap();
        assert_eq!(codec.name(), name);
    }
}
// decode() on a doubly compressed GBK man page: the digest must be that of the decompressed
// bytes, and the encoding must be guessed from the zh_CN locale in the second path.
#[test]
fn test_decode_zh() {
use std::fs::File;
use ring::test::from_hex;
// cat tests/exit.3.gz.lzma | lzma -d | gzip -d | sha1sum
// (this comment used to name exit.1.gz; the fixture opened below is exit.3.gz.lzma)
let filehash = from_hex("cdf9b3e8d96a83c908eb0a0c277485e2f3eebe87").unwrap();
// cat tests/exit.3.gz.lzma | lzma -d | gzip -d | iconv -f gbk -t utf8 | sha1sum
let utf8hash = from_hex("47f3e441137b207c0abdc38adac692298da4927a").unwrap();
let mut f = File::open("tests/exit.3.gz.lzma").unwrap();
// The first path gives no encoding hint, so the second one has to be used.
let (dig, enc, s) = decode(&["bullshit", "/usr/share/man/zh_CN/man3/exit.3.gz"][..], &mut f).unwrap();
assert_eq!(dig.as_ref(), &filehash[..]);
assert_eq!(enc, "gbk");
let utf8dig = digest::digest(&digest::SHA1, s.as_bytes());
assert_eq!(utf8dig.as_ref(), &utf8hash[..]);
}

82
indexer/src/open.rs Normal file
View file

@ -0,0 +1,82 @@
use std::io::{Read,Result,Error,ErrorKind,copy};
use std::fs::{File,create_dir_all,metadata};
use std::hash::{Hash,Hasher,SipHasher};
use std::time::{Duration,SystemTime};
use url::Url;
use hyper;
/// Directory in which downloaded files are cached; `cache_fn` builds file names below it.
const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
/// Maximum age of a cache file, in seconds (24 hours), before it is downloaded again.
const CACHE_TIME: u64 = 24*3600;
/// A location that `open()` turns into a readable stream.
pub struct Path<'a> {
    /// An http(s) URL, or a local file name when `canbelocal` is set.
    pub path: &'a str,
    /// When set, URL contents are read through (and stored in) the on-disk cache.
    pub cache: bool,
    /// When set, a `path` that does not parse as a URL is opened as a local file.
    pub canbelocal: bool,
}
/// Builds the cache file name for `url`: `CACHE_PATH/<host>-<last path segment>-<hex hash>`.
///
/// Panics if the URL has no path segments or no host.
fn cache_fn(url: &Url) -> String {
    // A URL ending in '/' has an empty last segment; store it as "index".
    let last_segment = match url.path_segments().unwrap().last().unwrap() {
        "" => "index",
        segment => segment,
    };
    // The hash of the complete URL keeps equally named files from different paths apart.
    let mut hasher = SipHasher::new();
    url.hash(&mut hasher);
    let url_hash = hasher.finish();
    format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), last_segment, url_hash)
}
fn fetch(url: &str) -> Result<Box<Read>> {
let res = try!(hyper::Client::new()
.get(url)
.header(hyper::header::UserAgent("Man page crawler (info@manned.org; https://manned.org/)".to_owned()))
.send()
.map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e)))
);
if !res.status.is_success() {
return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) ));
}
Ok(Box::new(res) as Box<Read>)
}
fn file(path: &str) -> Result<Box<Read>> {
Ok(Box::new(try!(File::open(path))) as Box<Read>)
}
impl<'a> Path<'a> {
pub fn open(&self) -> Result<Box<Read>> {
if let Ok(url) = Url::parse(self.path) {
if url.scheme() != "http" && url.scheme() != "https" {
return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
}
if self.cache {
let cfn = cache_fn(&url);
if let Ok(m) = metadata(&cfn) {
if m.modified().unwrap() > SystemTime::now() - Duration::from_secs(CACHE_TIME) {
return file(&cfn);
}
}
try!(create_dir_all(CACHE_PATH));
{
let mut rd = try!(fetch(url.as_str()));
let mut wr = try!(File::create(&cfn));
try!(copy(&mut rd, &mut wr));
}
file(&cfn)
} else {
fetch(url.as_str())
}
} else if self.canbelocal {
file(self.path)
} else {
Err(Error::new(ErrorKind::Other, "Invalid URL"))
}
}
}

142
indexer/src/pkg.rs Normal file
View file

@ -0,0 +1,142 @@
use std;
use std::io::Read;
use postgres;
use open;
use archread;
use man;
use archive::{Archive,ArchiveEntry};
/// Describes one package version to be indexed by `pkg()`.
pub struct PkgOpt<'a> {
    /// Re-index the package version even when it is already in the database.
    pub force: bool,
    /// Value stored in the `system` column of the `packages` table.
    pub sys: i32,
    /// Value stored in the `category` column of the `packages` table.
    pub cat: &'a str,
    /// Package name.
    pub pkg: &'a str,
    /// Package version string.
    pub ver: &'a str,
    /// Release date; the insert query casts this text to a SQL date.
    pub date: &'a str, // TODO: Option to extract date from package metadata itself
    /// Where the package archive is read from.
    pub file: open::Path<'a>
}
/// Makes sure the package and package version rows for `opt` exist.
///
/// Returns the `package_versions` id when the man pages of this version have to be (re)indexed:
/// either the version is new, or `opt.force` is set, in which case its existing `man` rows are
/// deleted first. Returns `None` when the version is already present and `force` is not set, or
/// when the package row could not be inserted.
fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i32> {
    // The ON CONFLICT .. DO UPDATE is used instead of DO NOTHING because in that case the
    // RETURNING clause wouldn't give us a package id.
    let upsert = "INSERT INTO packages (system, category, name) VALUES($1, $2, $3)
ON CONFLICT ON CONSTRAINT packages_system_name_category_key DO UPDATE SET name=$3 RETURNING id";
    let pkgid: i32 = match tr.query(upsert, &[&opt.sys, &opt.cat, &opt.pkg]) {
        Ok(rows) => rows.get(0).get(0),
        Err(e) => {
            error!("Can't insert package in database: {}", e);
            return None;
        }
    };

    let existing = tr.query(
        "SELECT id FROM package_versions WHERE package = $1 AND version = $2",
        &[&pkgid, &opt.ver]
    ).unwrap();

    // Unknown version: create it.
    if existing.is_empty() {
        let verid: i32 = tr.query(
            "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id",
            &[&pkgid, &opt.ver, &opt.date]
        ).unwrap().get(0).get(0);
        info!("New package pkgid {} verid {}", pkgid, verid);
        return Some(verid);
    }

    let verid: i32 = existing.get(0).get(0);
    if !opt.force {
        info!("Package already in database, pkgid {} verid {}", pkgid, verid);
        return None;
    }

    // Forced re-index: drop the old man rows so they can be inserted again.
    info!("Overwriting package pkgid {} verid {}", pkgid, verid);
    tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap();
    Some(verid)
}
fn insert_man_row(tr: &postgres::GenericConnection, verid: i32, path: &str, hash: &[u8]) {
// TODO: Store 'encoding' in the database
let (name, sect, locale) = man::parse_path(path).unwrap();
if let Err(e) = tr.execute(
"INSERT INTO man (package, name, filename, locale, hash, section) VALUES ($1, $2, '/'||$3, $4, $5, $6)",
&[&verid, &name, &path, &locale, &hash, &sect]
) {
// I think this can only happen if archread gives us the same file twice, which really
// shouldn't happen. But I'd rather continue with an error logged than panic.
error!("Can't insert verid {} fn {}: {}", verid, path, e);
}
}
fn insert_man(tr: &postgres::GenericConnection, verid: i32, paths: &[&str], ent: &mut Read) {
let (dig, enc, cont) = match man::decode(paths, ent) {
Err(e) => { error!("Error decoding {}: {}", paths[0], e); return },
Ok(x) => x,
};
// Overwrite entry if the contents are different. It's possible that earlier decoding
// implementations didn't properly detect the encoding. (On the other hand, due to differences
// in filenames it's also possible that THIS decoding step went wrong, but that's slightly less
// likely)
tr.execute(
"INSERT INTO contents (hash, content) VALUES($1, $2) ON CONFLICT (hash) DO UPDATE SET content = $2",
&[&dig.as_ref(), &cont]
).unwrap();
for path in paths {
insert_man_row(tr, verid, path, dig.as_ref());
debug!("Inserted man page: {} ({})", path, enc);
}
}
/// Registers `src` as another name for the already indexed man page `dest` by inserting a `man`
/// row for `src` that carries the content hash of `dest`.
fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &str) {
    let rows = tr.query(
        "SELECT hash FROM man WHERE package = $1 AND filename = '/'||$2",
        &[&verid, &dest]
    ).unwrap();

    if !rows.is_empty() {
        let target_hash: Vec<u8> = rows.get(0).get(0);
        insert_man_row(tr, verid, src, &target_hash);
        debug!("Inserted man link: {} -> {}", src, dest);
    } else {
        /* Can happen if man::decode() failed previously. */
        error!("Link to unindexed man page: {} -> {}", src, dest);
    }
}
/// Reads the package archive behind `opt.file` and stores its man pages and man page links for
/// package version `verid`.
///
/// Errors from opening or reading the archive are returned. Decoding errors of individual man
/// pages are logged by `insert_man` and do not stop the run.
fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> {
    // Callback invoked by archread with the path(s) and the contents of a man page.
    let indexfunc = |paths: &[&str], ent: &mut ArchiveEntry| {
        insert_man(tr, verid, paths, ent);
        Ok(()) /* Don't propagate errors, continue handling other man pages */
    };
    let mut rd = try!(opt.file.open());
    // First pass over the archive; afterwards links() reports each link through insert_link.
    // The closure is only borrowed here because it is needed again for the second pass.
    let missed = try!(archread::FileList::read(
            try!(Archive::open_archive(&mut rd)),
            man::ismanpath, &indexfunc))
        .links(|src, dest| { insert_link(tr, verid, src, dest) });
    // NOTE(review): links() presumably returns Some when link targets were not indexed during
    // the first pass -- confirm against archread.
    if let Some(missed) = missed {
        warn!("Some links were missed, reading package again");
        // The package is opened a second time and read again for the missed entries.
        let mut rd = try!(opt.file.open());
        try!(missed.read(try!(Archive::open_archive(&mut rd)), indexfunc));
    }
    Ok(())
}
pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) {
info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path);
let tr = conn.transaction().unwrap();
tr.set_rollback();
let verid = match insert_pkg(&tr, &opt) { Some(x) => x, None => return };
match index_pkg(&tr, &opt, verid) {
Err(e) => error!("Error reading package: {}", e),
Ok(_) => tr.set_commit()
}
if let Err(e) = tr.finish() {
error!("Error finishing transaction: {}", e);
}
}

128
indexer/src/sys_arch.rs Normal file
View file

@ -0,0 +1,128 @@
use std::str::FromStr;
use std::io::{Read,BufRead,BufReader,Result};
use regex::Regex;
use chrono::NaiveDateTime;
use postgres;
use archive;
use open;
use man;
use pkg;
/// Package metadata extracted from a `desc` entry by `read_desc`.
struct Meta {
    /// Value of the `%FILENAME%` field.
    filename: String,
    /// Value of the `%NAME%` field.
    name: String,
    /// Value of the `%VERSION%` field.
    version: String,
    /// The `%BUILDDATE%` timestamp, formatted as `%Y-%m-%d`.
    date: String,
}
/// Scans a file listing line by line and reports whether any line is a man page path
/// according to `man::ismanpath`. Stops at the first match; read errors are returned.
fn read_files<T: Read>(lst: T) -> Result<bool> {
    let mut lines = BufReader::new(lst).lines();
    while let Some(line) = lines.next() {
        let line = try!(line);
        if man::ismanpath(&line) {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Parses a package `desc` entry into a `Meta`.
///
/// Returns `Ok(None)`, after logging a warning, when any of the `%FILENAME%`, `%NAME%`,
/// `%VERSION%` or `%BUILDDATE%` fields is missing, or when `%BUILDDATE%` is not a timestamp
/// that can be represented as a date. Read errors are returned as `Err`.
fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
    let mut data = String::new();
    // Read at most 64 KiB of the entry.
    try!(rd.take(64*1024).read_to_string(&mut data));
    let path = rd.path().unwrap();

    lazy_static! {
        // A "%KEY%" line followed by a line holding the value.
        static ref RE: Regex = Regex::new(r"\s*%([^%]+)%\s*\n\s*([^\n]+)\s*\n").unwrap();
    }

    let mut filename = None;
    let mut name = None;
    let mut version = None;
    let mut builddate = None;

    for kv in RE.captures_iter(&data) {
        let key = kv.at(1).unwrap();
        let val = kv.at(2).unwrap();
        trace!("{}: {} = {}", path, key, val);
        match key {
            "FILENAME" => filename = Some(val),
            "NAME" => name = Some(val),
            "VERSION" => version = Some(val),
            "BUILDDATE" => builddate = i64::from_str(val).ok(),
            _ => {},
        }
    }

    // The timestamp comes from the mirror's index; from_timestamp() would panic on a value
    // outside the representable range, so use the checked variant and treat such a value like
    // a missing field.
    let date = builddate.and_then(|ts| NaiveDateTime::from_timestamp_opt(ts, 0));

    match (filename, name, version, date) {
        (Some(filename), Some(name), Some(version), Some(date)) => Ok(Some(Meta {
            filename: filename.to_string(),
            name: name.to_string(),
            version: version.to_string(),
            date: date.format("%Y-%m-%d").to_string(),
        })),
        _ => {
            warn!("Metadata missing from package description: {}", path);
            Ok(None)
        }
    }
}
// TODO: Switch to x86_64 instead of i686
/// Walks the `<repo>.files.tar.gz` index of `repo` on `mirror` and hands every package that
/// lists at least one man page path, and has complete metadata, to `pkg::pkg` for system `sys`.
///
/// Errors are logged; nothing is returned to the caller.
pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) {
    info!("Reading packages from {} {}", mirror, repo);
    // "{1:}" refers to positional argument 1 again, i.e. the repository name.
    let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo);
    // The index is read through the cache and must be a URL.
    let path = open::Path{ path: &path, cache: true, canbelocal: false };
    let mut index = match path.open() {
        Err(e) => { error!("Can't read package index: {}", e); return },
        Ok(x) => x,
    };
    let ent = match archive::Archive::open_archive(&mut index) {
        Err(e) => { error!("Can't read package index: {}", e); return },
        Ok(x) => x,
    };
    // State collected for the package currently being walked; a directory entry resets it.
    // NOTE(review): this assumes each package's "files" and "desc" entries follow that
    // package's directory entry in the archive -- confirm.
    let mut hasman = false;
    let mut meta = None;
    let r = archive::walk(ent, |x| {
        if x.filetype() == archive::FileType::Directory {
            hasman = false;
            meta = None;
        } else if x.path().unwrap().ends_with("/files") {
            hasman = try!(read_files(x));
        } else if x.path().unwrap().ends_with("/desc") {
            meta = try!(read_desc(x));
        }
        // Index the package as soon as both pieces of information are available, whichever of
        // the two entries came first. The state is cleared so that it is handled only once.
        if hasman && meta.is_some() {
            hasman = false;
            let m = meta.take().unwrap();
            let p = format!("{}/{}/os/i686/{}", mirror, repo, m.filename);
            pkg::pkg(pg, pkg::PkgOpt{
                force: false,
                sys: sys,
                cat: repo,
                pkg: &m.name,
                ver: &m.version,
                date: &m.date,
                file: open::Path{
                    path: &p,
                    cache: false,
                    canbelocal: false,
                },
            });
        }
        Ok(true)
    });
    if let Err(e) = r {
        error!("Error reading package index: {}", e);
    }
}

Binary file not shown.

71
indexer/tests/mkarchives.sh Executable file
View file

@ -0,0 +1,71 @@
#!/bin/sh
# Generates the archive fixtures simpletest.tar.gz, rawtest.gz.xz.bzip2 and
# testarchive.tar.xz in the current directory.
#
# The order of inserting the files into the tar is not fully deterministic this
# way. The tests will fail quite badly if a hardlink is considered the
# "original" version.

# Stop at the first failing step instead of silently producing a broken fixture.
set -e

# simpletest.tar.gz
mkdir simple
echo Hi >simple/file
ln -s file simple/link
ln simple/file simple/hardlink
mkfifo simple/fifo
# A file name that is not valid UTF-8. The source encoding is given explicitly
# so the result does not depend on the locale of whoever runs this script.
badfn=$(echo 'Héllö.txt' | iconv -f UTF-8 -t ISO-8859-1)
touch "$badfn"
tar -czf simpletest.tar.gz simple "$badfn"
rm -rf "$badfn" simple

# rawtest.gz.xz.bzip2
echo "File contents!" | gzip | xz | bzip2 >rawtest.gz.xz.bzip2

# testarchive.tar.xz
mkdir man
cd man
mkdir man1
mkdir man3
mkdir man6
ln -s man3 mans
echo 'Hello World' >man3/helloworld.3
echo 'Not a very interesting file' >notinteresting
echo 'Potentially interesting file' >possiblyinteresting
ln man3/helloworld.3 man6/hardlink.6
ln -s ../man3/helloworld.3 man1/symlinkbefore.1
ln -s ../man3/helloworld.3 man6/symlinkafter.6
ln -s notadir/../../man3/helloworld.3 man1/badsymlink1.1
ln -s man3/helloworld.3 man1/badsymlink2.1
ln -s ../man3/helloworld.3/. man1/badsymlink3.1
ln -s ../man3/helloworld.3/../helloworld.3 man1/badsymlink4.1
ln -s ../man1/symlinkbefore.1/../../man1/helloworld.3 man1/badsymlink5.1
ln -s symlinkbefore.1 man1/doublesymlink1.1
ln -s ../mans/helloworld.3 man1/doublesymlink2.1
ln -s ../mans/../man1/symlinkbefore.1 man1/triplesymlink.1
ln -s infinitesymlink.1 man1/infinitesymlink.1
ln -s ../possiblyinteresting man3/needreread.3
ln -s ../possiblyinteresting man6/needreread.6
cd ..
rm -f testarchive.tar
tar -cf testarchive.tar man/
rm -r man/
# Append a second man/possiblyinteresting entry with different contents to the
# same archive.
mkdir man
echo 'Overwritten file' >man/possiblyinteresting
tar -rf testarchive.tar man/
rm -r man/
rm -f testarchive.tar.xz
xz testarchive.tar

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,76 +0,0 @@
#!/bin/bash
# Usage: ./arch.sh
# Synchronises the database with an Arch mirror, fetching any packages that
# aren't yet in the database and may have man pages.

# Base URL of the mirror; repositories are read from $MIRROR/<repo>/os/i686/.
MIRROR=http://ftp.nluug.nl/pub/os/Linux/distr/archlinux
# Space-separated list of repositories to synchronise.
REPOS="core extra community"
# Executed as a command ("$DEBUG && echo ..."), so this must be "true" or "false".
DEBUG=false
# System id passed to add_pkginfo for every package.
SYSID=1
# NOTE(review): TMP, CURL, add_pkginfo and add_tar are used below but not
# defined in this script; presumably common.sh provides them -- confirm.
. ./common.sh
# checkpkg REPO FN
# Examines the extracted index entry $TMP/REPO/FN and, when the package lists
# man pages and add_pkginfo does not report it as up-to-date, downloads the
# package and hands it to add_tar.
# Globals read:    TMP, DEBUG, SYSID, CURL, MIRROR, PKGID
# Globals written: REPO, FN, D, FILENAME, NAME, VERSION, BUILDDATE, F
checkpkg() {
  REPO=$1
  FN=$2
  D="$TMP/$REPO/$FN"

  # An entry is a directory holding both a "files" and a "desc" file.
  if ! { [ -d "$D" ] && [ -f "$D/files" ] && [ -f "$D/desc" ]; }; then
    printf '%s\n' "===> $FN" "Invalid item, ignoring"
    return
  fi

  if ! grep -q /man/ "$D/files"; then
    $DEBUG && printf '%s\n' "===> $FN" "No mans"
    return
  fi

  # Somewhat inefficient description parsing
  FILENAME=$(grep -A 1 '%FILENAME%' "$D/desc" | tail -n 1)
  NAME=$(grep -A 1 '%NAME%' "$D/desc" | tail -n 1)
  VERSION=$(grep -A 1 '%VERSION%' "$D/desc" | tail -n 1)
  BUILDDATE=$(grep -A 1 '%BUILDDATE%' "$D/desc" | tail -n 1)
  if [ -z "$FILENAME" ] || [ -z "$NAME" ] || [ -z "$VERSION" ] || [ -z "$BUILDDATE" ]; then
    printf '%s\n' "===> $FN" "Invalid/missing description info"
    return
  fi
  # Unix timestamp -> YYYY-MM-DD
  BUILDDATE=$(date -d "@$BUILDDATE" '+%F')

  # A zero exit status from add_pkginfo means there is nothing to fetch.
  if add_pkginfo $SYSID "$REPO" "$NAME" "$VERSION" "$BUILDDATE"; then
    $DEBUG && printf '%s\n' "===> $FN" "Already up-to-date"
    return
  fi

  echo "===> $FN"
  F="$TMP/$REPO/$FILENAME"
  $CURL "$MIRROR/$REPO/os/i686/$FILENAME" -o "$F" || return
  add_tar "$F" "$PKGID"
  rm -f "$F"
}
# syncrepo REPO
# Downloads REPO's file index from $MIRROR, extracts it into $TMP/REPO and runs
# checkpkg on every extracted entry.
# Globals read:    TMP, MIRROR, CURL
# Globals written: REPO, F
# Returns 1 when the index cannot be downloaded or extracted.
syncrepo() {
  local fn
  REPO=$1
  F="$TMP/$REPO/repo.tar.gz"
  echo "============ $MIRROR $REPO"
  $CURL "$MIRROR/$REPO/os/i686/$REPO.files.tar.gz" -o "$F" || return 1
  tar -C "$TMP/$REPO" -xf "$F" || return 1
  rm -f "$F"
  for fn in "$TMP/$REPO"/*; do
    # Pass the entry name as exactly one argument, whatever characters it
    # contains (the former unquoted `basename` output was word-split).
    checkpkg "$REPO" "${fn##*/}"
  done
}
# Synchronise every configured repository, each in its own scratch directory.
# REPOS is a space-separated list and is split on purpose.
for r in $REPOS; do
  # ${TMP:?} aborts the script when TMP is unset or empty, so the rm -rf below
  # can never be pointed at "/$r".
  mkdir "${TMP:?}/$r"
  syncrepo "$r"
  rm -rf "${TMP:?}/$r"
done

View file

@ -2,7 +2,7 @@
. ./common.sh
./arch.sh
./index.sh daily
./deb.sh ubuntu_active
./deb.sh debian_active
echo "============ Updating SQL indices"

22
util/index.sh Executable file
View file

@ -0,0 +1,22 @@
#!/bin/bash
# Usage: ./index.sh FUNCTION [ARGS...]
# Runs one of the functions defined in this script, e.g. "./index.sh daily".

# Optional local settings. The explicit ./ makes sure the file that was tested
# is the one that gets sourced; a bare name is looked up in PATH first.
if test -f .config; then
  source ./.config
fi

# Indexer command line prefix; expanded unquoted so that it splits into the
# program and its options.
INDEX="./indexer -vv"

# Trace every command that is run.
set -x
# Runs the indexer's "arch" subcommand once for every Arch repository.
# Globals read:    INDEX
# Globals written: REPO (loop variable)
arch() {
  local MIRROR REPOS
  MIRROR=http://ftp.nluug.nl/pub/os/Linux/distr/archlinux
  REPOS="core extra community"
  # REPOS and INDEX are deliberately left unquoted so they split into words.
  for REPO in $REPOS; do
    $INDEX arch --sys arch --mirror "$MIRROR" --repo "$REPO"
  done
}
# Runs the indexing jobs that make up the daily update; currently only arch.
daily() { arch; }
# Run the function named by the first argument, passing the remaining arguments
# on unchanged. Quoted so that arguments are not split or glob-expanded again.
"$@"

1
util/indexer Symbolic link
View file

@ -0,0 +1 @@
../indexer/target/release/indexer