#!/usr/bin/perl # Usage: ./add_dir.pl # Prints the path names of the found man pages on stdout. # May throw errors or warnings on stderr. # Returns 0 if it has added something, 1 on error or if nothing has been found. use strict; use warnings; no warnings 'once'; use Encode 'decode', 'find_encoding', 'decode_utf8'; use Digest::SHA 'sha1_hex'; use File::Find; use DBI; die "Not enough arguments\n" if @ARGV < 2; my($dir, $pkgid) = @ARGV; my $db = DBI->connect('dbi:Pg:dbname=manned', 'manned', '', { pg_enable_utf8 => 1, PrintError => 0, RaiseError => 1, AutoCommit => 0 }); sub readman { my $ofn = shift; local $/; open my $F, '<', $ofn or die "Unable to open '$ofn': $!\n"; my $dat = <$F>; close $F; # Note: Don't forget to update 'section_from_filename()' in SQL when a new # compression file extension is recognized. my $fn = $ofn; while(1) { if($fn =~ s/\.gz$//) { require Compress::Zlib; $dat = Compress::Zlib::memGunzip($dat); die "Error decompressing '$ofn': $Compress::Zlib::gzerrno\n" if !defined $dat; next; } if($fn =~ s/\.bz2$//) { # Don't try to use Compress::Bzip2::memBunzip() here. It's been terribly # broken for at least 3 years: # https://rt.cpan.org/Public/Bug/Display.html?id=48128 require Compress::Raw::Bzip2; my($b, $s) = Compress::Raw::Bunzip2->new(); my $r; die "Error decompressing '$ofn': Opening bzip2 decompressor: $s\n" if $s != Compress::Raw::Bzip2::BZ_OK(); die "Error decompressing '$ofn': $s\n" if ($s = $b->bzinflate($dat, $r)) != Compress::Raw::Bzip2::BZ_STREAM_END(); $dat = $r; next; } if($fn =~ s/\.lzma$//) { require Compress::Raw::Lzma; my($l, $s) = Compress::Raw::Lzma::AutoDecoder->new(); my $r; die "Error decompressing '$ofn': Opening lzma decompressor: $s\n" if $s != Compress::Raw::Lzma::LZMA_OK(); die "Error decompressing '$ofn': $s\n" if ($s = $l->code($dat, $r)) != Compress::Raw::Lzma::LZMA_STREAM_END(); $dat = $r; next; } last; } return $dat; } sub decodeman { my($data, $locale) = @_; my @enc = ('utf-8'); # No harm in trying utf-8 first. # Check for 'coding:' indications in the file header. # According to preconv.1, only the first two lines are checked. I've not seen # any man page where this coding information was on the second line, though. # Note that that man page also mentions some aliasses that Perl's # find_encoding doesn't have. Again, I've not found any man page using those. my $re = qr/[\.']?\\["#].+-\*-.*coding: *([^ ;]+).+-\*-/; if($data =~ /^$re/ || $data =~ /^.*\n$re/) { (my $c = $1) =~ s/-(?:dos|unix|mac)$//; $c = find_encoding $c; $c = $c->name if $c; push @enc, $c if $c && $c ne 'ascii' && $c ne 'utf8' && $c ne 'utf-8-strict'; } # Get encoding from the locale part of the path my $locenc = $locale && find_encoding $locale; unshift @enc, $locenc->name if $locenc; # Some language-specific fallbacks # TODO: Handle zh_* locales $locale && push @enc, $locale =~ /^(pl|cs|sk)/i ? 'iso-8859-2' : $locale =~ /^tr/i ? 'iso-8859-9' : $locale =~ /^ru/i ? 'koi8-r' # TODO: Or iso-8859-5, probably want to autodetect that? : $locale =~ /^ja/i ? 'euc-jp' # TODO: Works for everything I've found yet, but Japanese isn't that simple. Probably want to detect Shift-JIS as well? : $locale =~ /^ko/i ? 'euc-kr' #: $locale =~ /^el/i ? 'iso-8859-7' # So far, all el mans I've seen were UTF-8. : (); # If all else fails. push @enc, 'iso-8859-1'; # Now try decoding my($dec, $enc); for(@enc) { $enc = $_; $dec = eval { my $tmp = $data; decode($enc, $tmp, 1) }; last if $dec; } return $dec ? ($enc, $dec) : (); } sub addman { my($pkg, $path, $fn, $locale) = @_; my $dat = readman $fn; my $hash = sha1_hex $dat; my($enc, $dec) = decodeman($dat, $locale); print "Invalid encoding or empty file: $path\n" and return if !$enc; $db->do(q{INSERT INTO contents (hash, content) VALUES(decode(?, 'hex'),?)}, {}, $hash, $dec) if !$db->selectrow_arrayref(q{SELECT 1 FROM contents WHERE hash = decode(?, 'hex')}, {}, $hash); $db->do(q{ INSERT INTO man (package, name, section, filename, locale, hash) VALUES(?,name_from_filename(?),section_from_filename(?),?,?,decode(?, 'hex'))}, {}, $pkg, $path, $path, $path, $locale, $hash); printf "$path ($enc)\n"; } my $found = 0; find sub { return if !-f $_; (my $path = $File::Find::name) =~ s/^\Q$dir\E//; # Note: fltk also creates pre-formatted pages in /cat$sectre/, but those are ignored. # TODO: Also ignore html and INDEX sections return warn "Ignoring $path\n" if $path !~ m{man(?:/([^/]+))?/man[0-9n]/([^/]+)$}; addman $pkgid, $path, $2, $1; $found++; }, $dir; if($found) { $db->commit; } else { warn "No man pages found.\n"; $db->rollback; exit 1; }