Auto-detect man compression from content rather than filename
This fixes the indexing of some packages. The most common error is to have a doubly-gzip-compressed file with only one '.gz' extension, which would previously not be detected.
This commit is contained in:
parent
862f42dce5
commit
39e61a3cfb
1 changed files with 40 additions and 34 deletions
|
|
@ -12,6 +12,9 @@ use Encode 'decode', 'find_encoding', 'decode_utf8';
|
||||||
use Digest::SHA 'sha1_hex';
|
use Digest::SHA 'sha1_hex';
|
||||||
use File::Find;
|
use File::Find;
|
||||||
use DBI;
|
use DBI;
|
||||||
|
use Compress::Zlib ();
|
||||||
|
use Compress::Raw::Bzip2 ();
|
||||||
|
use Compress::Raw::Lzma ();
|
||||||
|
|
||||||
die "Not enough arguments\n" if @ARGV < 2;
|
die "Not enough arguments\n" if @ARGV < 2;
|
||||||
my($dir, $pkgid) = @ARGV;
|
my($dir, $pkgid) = @ARGV;
|
||||||
|
|
@ -29,41 +32,39 @@ sub readman {
|
||||||
my $dat = <$F>;
|
my $dat = <$F>;
|
||||||
close $F;
|
close $F;
|
||||||
|
|
||||||
# Note: Don't forget to update 'section_from_filename()' in SQL when a new
|
# Ignore the filename extensions when decompressing - those aren't reliable.
|
||||||
# compression file extension is recognized.
|
# Instead just pass stuff through the decompressors and let them fail if the
|
||||||
my $fn = $ofn;
|
# format isn't correct.
|
||||||
while(1) {
|
|
||||||
if($fn =~ s/\.gz$//) {
|
my @alg;
|
||||||
require Compress::Zlib;
|
|
||||||
$dat = Compress::Zlib::memGunzip($dat);
|
while(length $dat) {
|
||||||
die "Error decompressing '$ofn': $Compress::Zlib::gzerrno\n" if !defined $dat;
|
my($ndat, $s, $o, $r);
|
||||||
next;
|
|
||||||
}
|
# gzip
|
||||||
if($fn =~ s/\.bz2$//) {
|
$ndat = Compress::Zlib::memGunzip($dat);
|
||||||
# Don't try to use Compress::Bzip2::memBunzip() here. It's been terribly
|
defined $ndat and push(@alg, 'gz') and ($dat = $ndat) and next;
|
||||||
# broken for at least 3 years:
|
|
||||||
# https://rt.cpan.org/Public/Bug/Display.html?id=48128
|
# bzip2
|
||||||
require Compress::Raw::Bzip2;
|
# Don't try to use Compress::Bzip2::memBunzip() here. It's been terribly
|
||||||
my($b, $s) = Compress::Raw::Bunzip2->new();
|
# broken for at least 3 years:
|
||||||
my $r;
|
# https://rt.cpan.org/Public/Bug/Display.html?id=48128
|
||||||
die "Error decompressing '$ofn': Opening bzip2 decompressor: $s\n" if $s != Compress::Raw::Bzip2::BZ_OK();
|
($o, $s) = Compress::Raw::Bunzip2->new();
|
||||||
die "Error decompressing '$ofn': $s\n" if ($s = $b->bzinflate($dat, $r)) != Compress::Raw::Bzip2::BZ_STREAM_END();
|
die "Error opening bzip2 decompressor: $s\n" if $s != Compress::Raw::Bzip2::BZ_OK();
|
||||||
$dat = $r;
|
$ndat = $dat;
|
||||||
next;
|
$o->bzinflate($ndat, $r) == Compress::Raw::Bzip2::BZ_STREAM_END() and push(@alg, 'bz2') and ($dat = $r) and next;
|
||||||
}
|
|
||||||
if($fn =~ s/\.lzma$//) {
|
# lzma
|
||||||
require Compress::Raw::Lzma;
|
($o, $s) = Compress::Raw::Lzma::AutoDecoder->new();
|
||||||
my($l, $s) = Compress::Raw::Lzma::AutoDecoder->new();
|
die "Error opening lzma decompressor: $s\n" if $s != Compress::Raw::Lzma::LZMA_OK();
|
||||||
my $r;
|
$ndat = $dat;
|
||||||
die "Error decompressing '$ofn': Opening lzma decompressor: $s\n" if $s != Compress::Raw::Lzma::LZMA_OK();
|
$r = '';
|
||||||
die "Error decompressing '$ofn': $s\n" if ($s = $l->code($dat, $r)) != Compress::Raw::Lzma::LZMA_STREAM_END();
|
$o->code($ndat, $r) == Compress::Raw::Lzma::LZMA_STREAM_END() and push(@alg, 'lzma') and ($dat = $r) and next;
|
||||||
$dat = $r;
|
|
||||||
next;
|
|
||||||
}
|
|
||||||
last;
|
last;
|
||||||
}
|
}
|
||||||
|
|
||||||
return $dat;
|
return $dat, \@alg;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -117,7 +118,7 @@ sub decodeman {
|
||||||
|
|
||||||
sub addman {
|
sub addman {
|
||||||
my($pkg, $path, $fn, $locale) = @_;
|
my($pkg, $path, $fn, $locale) = @_;
|
||||||
my $dat = readman $fn;
|
my($dat, $alg) = readman $fn;
|
||||||
my $hash = sha1_hex $dat;
|
my $hash = sha1_hex $dat;
|
||||||
|
|
||||||
my($enc, $dec) = decodeman($dat, $locale);
|
my($enc, $dec) = decodeman($dat, $locale);
|
||||||
|
|
@ -131,7 +132,12 @@ sub addman {
|
||||||
VALUES(?,name_from_filename(?),section_from_filename(?),?,?,decode(?, 'hex'))}, {},
|
VALUES(?,name_from_filename(?),section_from_filename(?),?,?,decode(?, 'hex'))}, {},
|
||||||
$pkg, $path, $path, $path, $locale, $hash);
|
$pkg, $path, $path, $path, $locale, $hash);
|
||||||
|
|
||||||
printf "$path ($enc)\n";
|
# Report issues with the decompression extension
|
||||||
|
my $correctext = @$alg ? '.'.join('.', @$alg) : '';
|
||||||
|
my($tmp, $foundext) = ($path, '');
|
||||||
|
$foundext = ".$1$foundext" while($tmp =~ s/\.(gz|bz2|lzma)$//);
|
||||||
|
|
||||||
|
printf "%s (%s%s)\n", $path, $enc, $foundext ne $correctext ? " - Incorrect file extension, expected $correctext" : '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue