manned/util/add_dir.pl
2012-06-15 14:23:29 +02:00

159 lines
4.8 KiB
Perl
Executable file

#!/usr/bin/perl
# Usage: ./add_dir.pl <dir> <pkgid>
# Prints the path names of the found man pages on stdout.
# May throw errors or warnings on stderr.
# Returns 0 if it has added something, 1 on error or if nothing has been found.
use strict;
use warnings;
no warnings 'once';
use Encode 'decode', 'find_encoding', 'decode_utf8';
use Digest::SHA 'sha1_hex';
use File::Find;
use DBI;
die "Not enough arguments\n" if @ARGV < 2;
my($dir, $pkgid) = @ARGV;
my $db = DBI->connect('dbi:Pg:dbname=manned', 'manned', '', {
pg_enable_utf8 => 1, PrintError => 0, RaiseError => 1, AutoCommit => 0
});
sub readman {
my $ofn = shift;
local $/;
open my $F, '<', $ofn or die "Unable to open '$ofn': $!\n";
my $dat = <$F>;
close $F;
# Note: Don't forget to update 'section_from_filename()' in SQL when a new
# compression file extension is recognized.
my $fn = $ofn;
while(1) {
if($fn =~ s/\.gz$//) {
require Compress::Zlib;
$dat = Compress::Zlib::memGunzip($dat);
die "Error decompressing '$ofn': $Compress::Zlib::gzerrno\n" if !defined $dat;
next;
}
if($fn =~ s/\.bz2$//) {
# Don't try to use Compress::Bzip2::memBunzip() here. It's been terribly
# broken for at least 3 years:
# https://rt.cpan.org/Public/Bug/Display.html?id=48128
require Compress::Raw::Bzip2;
my($b, $s) = Compress::Raw::Bunzip2->new();
my $r;
die "Error decompressing '$ofn': Opening bzip2 decompressor: $s\n" if $s != Compress::Raw::Bzip2::BZ_OK();
die "Error decompressing '$ofn': $s\n" if ($s = $b->bzinflate($dat, $r)) != Compress::Raw::Bzip2::BZ_STREAM_END();
$dat = $r;
next;
}
if($fn =~ s/\.lzma$//) {
require Compress::Raw::Lzma;
my($l, $s) = Compress::Raw::Lzma::AutoDecoder->new();
my $r;
die "Error decompressing '$ofn': Opening lzma decompressor: $s\n" if $s != Compress::Raw::Lzma::LZMA_OK();
die "Error decompressing '$ofn': $s\n" if ($s = $l->code($dat, $r)) != Compress::Raw::Lzma::LZMA_STREAM_END();
$dat = $r;
next;
}
last;
}
return $dat;
}
sub decodeman {
my($data, $locale) = @_;
my @enc = ('utf-8'); # No harm in trying utf-8 first.
# Check for 'coding:' indications in the file header.
# According to preconv.1, only the first two lines are checked. I've not seen
# any man page where this coding information was on the second line, though.
# Note that that man page also mentions some aliasses that Perl's
# find_encoding doesn't have. Again, I've not found any man page using those.
my $re = qr/[\.']?\\["#].+-\*-.*coding: *([^ ;]+).+-\*-/;
if($data =~ /^$re/ || $data =~ /^.*\n$re/) {
(my $c = $1) =~ s/-(?:dos|unix|mac)$//;
$c = find_encoding $c;
$c = $c->name if $c;
push @enc, $c if $c && $c ne 'ascii' && $c ne 'utf8' && $c ne 'utf-8-strict';
}
# Get encoding from the locale part of the path
my $locenc = $locale && find_encoding $locale;
unshift @enc, $locenc->name if $locenc;
# Some language-specific fallbacks
# TODO: Handle zh_* locales
$locale && push @enc,
$locale =~ /^(pl|cs|sk)/i ? 'iso-8859-2'
: $locale =~ /^tr/i ? 'iso-8859-9'
: $locale =~ /^ru/i ? 'koi8-r' # TODO: Or iso-8859-5, probably want to autodetect that?
: $locale =~ /^ja/i ? 'euc-jp' # TODO: Works for everything I've found yet, but Japanese isn't that simple. Probably want to detect Shift-JIS as well?
: $locale =~ /^ko/i ? 'euc-kr'
#: $locale =~ /^el/i ? 'iso-8859-7' # So far, all el mans I've seen were UTF-8.
: ();
# If all else fails.
push @enc, 'iso-8859-1';
# Now try decoding
my($dec, $enc);
for(@enc) {
$enc = $_;
$dec = eval { my $tmp = $data; decode($enc, $tmp, 1) };
last if $dec;
}
return $dec ? ($enc, $dec) : ();
}
sub addman {
my($pkg, $path, $fn, $locale) = @_;
my $dat = readman $fn;
my $hash = sha1_hex $dat;
my($enc, $dec) = decodeman($dat, $locale);
print "Invalid encoding or empty file: $path\n" and return if !$enc;
$db->do(q{INSERT INTO contents (hash, content) VALUES(decode(?, 'hex'),?)}, {}, $hash, $dec)
if !$db->selectrow_arrayref(q{SELECT 1 FROM contents WHERE hash = decode(?, 'hex')}, {}, $hash);
$db->do(q{
INSERT INTO man (package, name, section, filename, locale, hash)
VALUES(?,name_from_filename(?),section_from_filename(?),?,?,decode(?, 'hex'))}, {},
$pkg, $path, $path, $path, $locale, $hash);
printf "$path ($enc)\n";
}
my $found = 0;
find sub {
return if !-f $_;
(my $path = $File::Find::name) =~ s/^\Q$dir\E//;
# Note: fltk also creates pre-formatted pages in /cat$sectre/, but those are ignored.
# TODO: Also ignore html and INDEX sections
return warn "Ignoring $path\n" if $path !~ m{man(?:/([^/]+))?/man[0-9n]/([^/]+)$};
addman $pkgid, $path, $2, $1;
$found++;
}, $dir;
if($found) {
$db->commit;
} else {
warn "No man pages found.\n";
$db->rollback;
exit 1;
}