Auto-detect man compression from content rather than filename

This fixes the indexing of some packages. The most common error is to
have a doubly-gzip-compressed file with only one '.gz' extension, which
would previously not be detected.
This commit is contained in:
Yorhel 2012-07-06 12:39:12 +02:00
parent 862f42dce5
commit 39e61a3cfb

View file

@ -12,6 +12,9 @@ use Encode 'decode', 'find_encoding', 'decode_utf8';
use Digest::SHA 'sha1_hex'; use Digest::SHA 'sha1_hex';
use File::Find; use File::Find;
use DBI; use DBI;
use Compress::Zlib ();
use Compress::Raw::Bzip2 ();
use Compress::Raw::Lzma ();
die "Not enough arguments\n" if @ARGV < 2; die "Not enough arguments\n" if @ARGV < 2;
my($dir, $pkgid) = @ARGV; my($dir, $pkgid) = @ARGV;
@ -29,41 +32,39 @@ sub readman {
my $dat = <$F>; my $dat = <$F>;
close $F; close $F;
# Note: Don't forget to update 'section_from_filename()' in SQL when a new # Ignore the filename extensions when decompressing - those aren't reliable.
# compression file extension is recognized. # Instead just pass stuff through the decompressors and let them fail if the
my $fn = $ofn; # format isn't correct.
while(1) {
if($fn =~ s/\.gz$//) { my @alg;
require Compress::Zlib;
$dat = Compress::Zlib::memGunzip($dat); while(length $dat) {
die "Error decompressing '$ofn': $Compress::Zlib::gzerrno\n" if !defined $dat; my($ndat, $s, $o, $r);
next;
} # gzip
if($fn =~ s/\.bz2$//) { $ndat = Compress::Zlib::memGunzip($dat);
# Don't try to use Compress::Bzip2::memBunzip() here. It's been terribly defined $ndat and push(@alg, 'gz') and ($dat = $ndat) and next;
# broken for at least 3 years:
# https://rt.cpan.org/Public/Bug/Display.html?id=48128 # bzip2
require Compress::Raw::Bzip2; # Don't try to use Compress::Bzip2::memBunzip() here. It's been terribly
my($b, $s) = Compress::Raw::Bunzip2->new(); # broken for at least 3 years:
my $r; # https://rt.cpan.org/Public/Bug/Display.html?id=48128
die "Error decompressing '$ofn': Opening bzip2 decompressor: $s\n" if $s != Compress::Raw::Bzip2::BZ_OK(); ($o, $s) = Compress::Raw::Bunzip2->new();
die "Error decompressing '$ofn': $s\n" if ($s = $b->bzinflate($dat, $r)) != Compress::Raw::Bzip2::BZ_STREAM_END(); die "Error opening bzip2 decompressor: $s\n" if $s != Compress::Raw::Bzip2::BZ_OK();
$dat = $r; $ndat = $dat;
next; $o->bzinflate($ndat, $r) == Compress::Raw::Bzip2::BZ_STREAM_END() and push(@alg, 'bz2') and ($dat = $r) and next;
}
if($fn =~ s/\.lzma$//) { # lzma
require Compress::Raw::Lzma; ($o, $s) = Compress::Raw::Lzma::AutoDecoder->new();
my($l, $s) = Compress::Raw::Lzma::AutoDecoder->new(); die "Error opening lzma decompressor: $s\n" if $s != Compress::Raw::Lzma::LZMA_OK();
my $r; $ndat = $dat;
die "Error decompressing '$ofn': Opening lzma decompressor: $s\n" if $s != Compress::Raw::Lzma::LZMA_OK(); $r = '';
die "Error decompressing '$ofn': $s\n" if ($s = $l->code($dat, $r)) != Compress::Raw::Lzma::LZMA_STREAM_END(); $o->code($ndat, $r) == Compress::Raw::Lzma::LZMA_STREAM_END() and push(@alg, 'lzma') and ($dat = $r) and next;
$dat = $r;
next;
}
last; last;
} }
return $dat; return $dat, \@alg;
} }
@ -117,7 +118,7 @@ sub decodeman {
sub addman { sub addman {
my($pkg, $path, $fn, $locale) = @_; my($pkg, $path, $fn, $locale) = @_;
my $dat = readman $fn; my($dat, $alg) = readman $fn;
my $hash = sha1_hex $dat; my $hash = sha1_hex $dat;
my($enc, $dec) = decodeman($dat, $locale); my($enc, $dec) = decodeman($dat, $locale);
@ -131,7 +132,12 @@ sub addman {
VALUES(?,name_from_filename(?),section_from_filename(?),?,?,decode(?, 'hex'))}, {}, VALUES(?,name_from_filename(?),section_from_filename(?),?,?,decode(?, 'hex'))}, {},
$pkg, $path, $path, $path, $locale, $hash); $pkg, $path, $path, $path, $locale, $hash);
printf "$path ($enc)\n"; # Report issues with the decompression extension
my $correctext = @$alg ? '.'.join('.', @$alg) : '';
my($tmp, $foundext) = ($path, '');
$foundext = ".$1$foundext" while($tmp =~ s/\.(gz|bz2|lzma)$//);
printf "%s (%s%s)\n", $path, $enc, $foundext ne $correctext ? " - Incorrect file extension, expected $correctext" : '';
} }