#!/usr/bin/perl use strict; use warnings; use TUWF ':html', 'html_escape', ':xml'; use IPC::Open2; use IO::Select; use Encode 'encode_utf8', 'decode_utf8'; use Time::HiRes 'tv_interval', 'gettimeofday'; use JSON::XS; use Cwd 'abs_path'; our $ROOT; BEGIN { ($ROOT = abs_path $0) =~ s{/www/index\.pl$}{}; } use lib "$ROOT/lib/ManUtils/inst/lib/perl5"; use ManUtils; TUWF::set( logfile => $ENV{TUWF_LOG}, db_login => [undef, undef, undef], debug => 1, xml_pretty => 0, # Cache the system information pre_request_handler => sub { my $self = shift; if(!$self->{systems}) { $self->{systems} = $self->dbSystemGet; $_->{full} = $_->{name}.($_->{release}?' '.$_->{release}:'') for(@{$self->{systems}}); $self->{sysbyid} = { map +($_->{id}, $_), @{$self->{systems}} }; $self->{sysbyshort} = { map +($_->{short}, $_), @{$self->{systems}} }; } 1; }, error_404_handler => sub { my $self = shift; $self->resStatus(404); my $title = 'No manual entry for '.$self->reqPath; $self->htmlHeader(title => $title); h1 $title; p 'That is, the page you were looking for doesn\'t exist.'; $self->htmlFooter; }, ); TUWF::register( qr// => \&home, qr{info/about} => \&about, qr{browse/search} => \&browsesearch, qr{browse/([^/]+)} => \&browsesys, qr{browse/([^/]+)/([^/]+)(?:/([^/]+))?} => \&browsepkg, qr{xml/search\.xml} => \&xmlsearch, qr{([^/]+)/([0-9a-f]{8})} => \&man, qr{([^/]+)/([0-9a-f]{8})/src} => \&src, qr{([^/]+)} => \&man, ); TUWF::run(); sub home { my $self = shift; my $stats = $self->dbStats; my $fn = sub { local $_=shift; 1 while(s/(\d)(\d{3})($|,)/$1,$2/); $_ }; $self->htmlHeader(title => 'Man Pages Archive'); h1 'Man Pages Archive'; p; lit sprintf <<' _', map $fn->($stats->{$_}), qw|hashes mans files packages|; Indexing %s versions of %s manual pages found in %s files of %s packages.

Manned.org aims to index all manual pages from a variety of systems, both old and new, and provides a convenient interface for looking up and viewing the various versions of each man page. This site is still in its early life, so we're actively adding new systems to the database and finding ways to improve the online interface. About manned.org » _ end; h2 'Browse the manuals'; ul id => 'systems'; my %sys; push @{$sys{$_->{name}}}, $_ for(@{$self->{systems}}); for my $sys (sort keys %sys) { $sys = $sys{$sys}; (my $img = $sys->[0]{short}) =~ s/^(.+)-.+$/$1/; li; a href => "/browse/$sys->[0]{short}" if @$sys == 1; span style => "background-image: url('images/$img.png')", ''; b $sys->[0]{name}; if(@$sys > 1) { my $i = 0; for(reverse @$sys) { a href => "/browse/$_->{short}", ++$i > 3 ? (class => 'hidden') : (), $_->{release}; lit ' '; } a href => "#", class => 'more', 'more...' if $i > 3; } end 'a' if @$sys == 1; end; } end; h2 'Other sites'; ul id => 'external'; li; a href => 'http://man.cx/', 'Man.cx'; end; li; a href => 'http://man.he.net/', 'Man.he.net'; end; li; a href => 'http://linux.die.net/man/', 'Die.net'; end; li; a href => 'http://www.freebsd.org/cgi/man.cgi', 'FreeBSD.org Man Pages'; end; li; a href => 'http://www.openbsd.org/cgi-bin/man.cgi', 'OpenBSD Man Pages'; end; li; a href => 'http://linuxmanpages.net/', 'Fedora Manuals'; end; li; a href => 'http://manpages.ubuntu.com/', 'Ubuntu Manuals'; end; li; a href => 'http://www.manpagez.com/', 'Manpagez.com'; txt ' (Mac OS X, has some texinfo documentation as well)'; end; # li; a href => 'http://www.ma.utexas.edu/cgi-bin/man-cgi', 'ma.utexas.edu'; end; <- No idea what this has to offer when compared to the rest end; $self->htmlFooter; } sub about { my $self = shift; $self->htmlHeader(title => 'About'); h1 'About Manned.org'; div id => 'about'; h2 'Goal'; p; lit <<' _'; The state of online indices of manual pages is a sad one. Existing sites only offer you a single version of a man page: From one origin, and often only in a single language. Most don't even tell you where the manual actually originated from, making it very hard to determine whether the manual you found actually applies to your situation and even harder to find a manual from a specific system. Additionally, some sites render the manuals in an unreadable way, don't correctly handle special formatting - like tables - or don't correctly display non-ASCII characters.

Manned.org was created in order to improve this situation. This site aims to index the manual pages from a variaty of systems, both old and new, and allows you to browse through the various versions of a manual page to find out how each system behaves. The manuals are stored in the database as UTF-8, and are passed through groff to render them in (mostly) the same way as they are displayed in your terminal. _ end; h2 'The indexing process'; p; lit <<' _'; All man pages are fetched right from the (binary) packages available on the public repositories of Linux distributions. In particular:

Arch Linux
The core, extra and community repositories are fetched from a local Arch mirror. Indexing started around begin June 2012.
Ubuntu
Historical releases were fetched from http://old-releases.ubuntu.com/ubuntu/, supported releases from a local mirror. All components (main, universe, restricted and multiverse) from the $release, $release-updates and $release-security repositories are indexed. Backports are not included at the moment. Indexing started around mid June 2012.
Debian
Historical releases were fetched from http://archive.debian.org/debian/ and http://snapshot.debian.org/. For buzz, rex and bo, only the 'main' component has been indexed, and we're missing a few man pages because some packages were missing from the repository archives. For the other releases, all components (main, contrib and non-free) from the $release and $release-updates (where available) repositories are indexed.

Only packages for a single architecture (i386 or i686) are scanned. To my knowledge, packages that come with different manuals for different architectures either don't exist or are extremely rare. It does happen that some packages are not available for all architectures. Usually, though, every package is at least available for i386/i686, so hopefully we're not missing out on much.

The repositories are scanned for new packages on a daily basis. _ end; h2 'Other systems'; p; lit <<' _'; I'd love to index the manuals of most major Linux distributions in the future. In the short term, this means all Fedora and OpenSUSE releases will get indexed. In the long term, many others may be added as well.

It would also be great to index a few non-Linux systems such as *BSD, Solaris/Illumos and Mac OS X. Unfortunately, those don't always follow a binary package based approach, or are otherwise less easy to properly index. The FreeBSD ports look like a good future target, however.

In general, systems that follow an entirely source-based distribution approach can't be indexed without compiling everything. Since that is both very resource-heavy and open to security issues, there are no plans to include manuals from such systems at the moment. So unless someone comes with a solution I hadn't thought of yet, there won't be any Gentoo manuals here. :-( _ end; h2 'Copyright'; p; lit <<' _'; All manual pages are copyrighted by their respective authors. The manuals have been fetched from publically available repositories of free and (primarily) open source software. The distributors of said software have put in efforts to only include software and documentation that allows free distribution. Nonetheless, if a manual that does not allow to be redistributed has been inadvertently included in our index, please let me know and I will have it removed as soon as possible. _ end; end; $self->htmlFooter; } sub browsesearch { my $self = shift; my $q = $self->reqGet('q')||''; my $man = $self->dbSearch($q, 150); return $self->resRedirect("/$man->[0]{name}.".substr($man->[0]{section},0,1), 'temp') if @$man == 1; $self->htmlHeader(title => 'Search results for '.$q); h1 'Search results for '.$q; p 'Note: This is just a simple case-insensitive prefix match on the man names. In the future we\'ll have more powerful search functionality. Hopefully.'; if(@$man) { ul id => 'searchres'; for(@$man) { li; a href => "/$_->{name}.".substr($_->{section},0,1), $_->{name}; i " $_->{section}"; end; } end; } else { br; br; b 'No results :-('; } $self->htmlFooter; } sub browsesys { my($self, $short) = @_; my $sys = $self->{sysbyshort}{$short}; return $self->resNotFound if !$sys; my $f = $self->formValidate( { get => 'c', required => 0, enum => [ '0', 'all', 'a'..'z' ], default => 'all' }, { get => 's', required => 0, regex => qr/^[a-zA-Z0-9_+.-]+$/i }, ); return $self->resNotFound if $f->{_err}; my $pkg = $self->dbPackageList( hasman => 1, sysid => $sys->{id}, char => $f->{c} eq 'all' ? undef : $f->{c}, start => $f->{s}, results => 201, ); my $more = @$pkg > 200 && pop @$pkg; # TODO: A "previous" link would be nice... my $next = sub { use utf8; if($more) { p class => 'pagination'; a href => "/browse/$short?c=$f->{c};s=$pkg->[199]{name}", 'next »'; end; } }; my $title = "Packages for $sys->{name}".($sys->{release}?" $sys->{release}":""); $self->htmlHeader(title => $title); h1 $title; p id => 'charselect'; for('all', 0, 'a'..'z') { a href => "/browse/$short?c=$_", $_?uc$_:'#' if $_ ne $f->{c}; b $_?uc$_:'#' if $_ eq $f->{c}; } end; p 'Note: Packages without man pages are not listed.'; $next->(); ul id => 'packages'; for(@$pkg) { li; a href => "/browse/$short/$_->{name}", $_->{name}; i ' '.$_->{category}; end; } end; $next->(); $self->htmlFooter; } sub browsepkg { my($self, $short, $name, $ver) = @_; my $sys = $self->{sysbyshort}{$short}; return $self->resNotFound if !$sys; my $pkgs = $self->dbPackageGet($sys->{id}, $name); my $sel = $ver ? (grep $_->{version} eq $ver, @$pkgs)[0] : $pkgs->[0]; return $self->resNotFound if !$sel; my $f = $self->formValidate({ get => 's', required => 0}); return $self->resNotFound if $f->{_err}; my $mans = $self->dbManInfo(package => $sel->{id}, results => 201, start => $f->{s}, sort => 'name'); my $more = @$mans > 200 && pop @$mans; # TODO: A "previous" link would be nice... my $next = sub { use utf8; if($more) { p class => 'pagination'; a href => "/browse/$sys->{short}/$name/$sel->{version}?s=$mans->[199]{name}", 'next »'; end; } }; # Latest version of this package determines last modification date of the page. $self->setLastMod($pkgs->[0]{released}); my $title = "$sys->{name}".($sys->{release}?" $sys->{release}":"")." / $name $sel->{version}"; $self->htmlHeader(title => $title); h1 $title; # TODO: Link back to the system browsing page h2 'Versions'; ul id => 'packages'; for(@$pkgs) { li; txt "$_->{released} "; a href => "/browse/$sys->{short}/$name/$_->{version}", $_->{version} if $_ != $sel; b " $_->{version}" if $_ == $sel; i " $_->{category}"; end; } end; h2 "Manuals for version $sel->{version}"; $next->(); ul id => 'manuals'; for(@$mans) { li; a href => "/$_->{name}/".substr($_->{hash},0,8), "$_->{name}($_->{section})"; b " $_->{locale}" if $_->{locale}; i " $_->{filename}"; end; } end; $next->(); $self->htmlFooter; } sub manjslist { my($self, $m) = @_; # The structure we generate is described in the JS code. my %sys; push @{$sys{$_->{system}}}, $_ for (@$m); [ map [ $self->{sysbyid}{$_}{name}, $self->{sysbyid}{$_}{full}, $self->{sysbyid}{$_}{short}, do { my %pkgs; for(@{$sys{$_}}) { my $pn = "$_->{package}-$_->{version}"; $pkgs{$pn} = [ $_->{package}, $_->{version}, [], $_->{released} ] if !$pkgs{$pn}; push @{$pkgs{$pn}[2]}, [ $_->{section}, $_->{locale}, substr $_->{hash}, 0, 8 ]; } [ grep delete($_->[3]) && ($_->[2] = [sort { $a->[0] cmp $b->[0] || ($a->[1]||'') cmp ($b->[1]||'') } @{$_->[2]}]), sort { $a->[0] cmp $b->[0] || $b->[3] cmp $a->[3] } values %pkgs ]; } ], sort { my $x=$self->{sysbyid}{$a}; my $y=$self->{sysbyid}{$b}; $x->{name} cmp $y->{name} or $y->{relorder} <=> $x->{relorder} } keys %sys ] } # Given the name and optionally the hash of a man page, check with a list of # man pages with the same name to select the right one for display. sub getman { my($self, $name, $hash, $list) = @_; my $sect = $name =~ /\.([0-9n])$/ ? $1 : undef; # If we already have a shorthash, just get the full hash if($hash) { $_->{hash} =~ /^$hash/ && return $_ for (@$list); } # If that failed, use some heuristics my $cmp = sub { local($a,$b) = @_; # English or non-locale packages always win !(($a->{locale}||'') =~ /^(en|$)/) != !(($b->{locale}||'') =~ /^(en|$)/) ? (($a->{locale}||'') =~ /^(en|$)/ ? -1 : 1) # Newer versions of a package have higher priority : $a->{system} == $b->{system} && $a->{package} eq $b->{package} && $a->{version} ne $b->{version} ? $b->{released} cmp $a->{released} # Section prefix match. : $sect && !($a->{section} =~ /^\Q$sect/) != !($b->{section} =~ /^\Q$sect/) ? ($a->{section} =~ /^\Q$sect/ ? -1 : 1) # Give lower priority to pages in a non-standard directory : !($a->{filename} =~ q{^/usr/share/man}) != !($b->{filename} =~ q{^/usr/share/man}) ? ($a->{filename} =~ q{^/usr/share/man} ? -1 : 1) # Lower sections > higher sections (because 'man' does this as well) : substr($a->{section},0,1) ne substr($b->{section},0,1) ? $a->{section} cmp $b->{section} # Prefer Arch over other systems : $a->{system} != $b->{system} ? ($a->{system} == 1 ? -1 : 1) # Prefer a later system release over an older one : $a->{system} != $b->{system} && $self->{sysbyid}{$a->{system}}{name} eq $self->{sysbyid}{$b->{system}}{name} ? $self->{sysbyid}{$b->{system}}{relorder} <=> $self->{sysbyid}{$a->{system}}{relorder} # Sections without appendix before sections with appendix : $a->{section} ne $b->{section} ? $a->{section} cmp $b->{section} # Fallback to hash if nothing else matters (guarantees the order is at least stable) : $a->{hash} cmp $b->{hash}; }; my $winner = $list->[0]; $cmp->($winner, $_) > 0 and ($winner = $_) for (@$list); return $winner; } sub man { my($self, $name, $hash) = @_; my $m = $self->dbManInfo(name => $name); return $self->resNotFound() if !@$m; my $man = getman($self, $name, $hash, $m); my $view = $self->formValidate({get => 'v', regex => qr/^[a-z2-7]+$/}); $view = $view->{_err} ? '' : $view->{v}; # To be really correct, the last modification time of this page should be the # release date of the latest version of the man page, as that is displayed in # the menu. But let's just consider the content of the page as more # important, and use release date of the man page as last modification date. $self->setLastMod($man->{released}); $self->htmlHeader(title => $name); div id => 'nav', 'Sorry, this navigation menu won\'t display without Javascript. :-('; h1; txt $man->{name}.' '; a href => "/$man->{name}/".substr($man->{hash}, 0, 8).'/src', 'source'; end; div id => 'contents'; my $c = $self->dbManContent($man->{hash}); # TODO: Store/cache the result of fmt() in the database. pre; lit ManUtils::html(ManUtils::fmt_block $c); end; end; div id => 'locations'; h2 'Locations of this man page'; table; thead; Tr; td 'System'; td 'Package'; td 'Version'; td 'Name'; td 'Filename'; end; end; my @l = sort { $self->{sysbyid}{$a->{system}}{name} cmp $self->{sysbyid}{$b->{system}}{name} || $self->{sysbyid}{$b->{system}}{relorder} <=> $self->{sysbyid}{$a->{system}}{relorder} || $b->{released} cmp $a->{released} || $a->{filename} cmp $b->{filename} } @{$self->dbManInfo(hash => $man->{hash})}; for(@l) { Tr; td $self->{sysbyid}{$_->{system}}{full}; td "$_->{category}/$_->{package}"; td $_->{version}; td; a href => "/$_->{name}", $_->{name} if $_->{name} ne $man->{name}; txt $_->{name} if $_->{name} eq $man->{name}; txt ".$_->{section}"; end; td $_->{filename}; end; } end; end; $self->htmlFooter(js => { hash => substr($man->{hash}, 0, 8), name => $man->{name}, view => $view, mans => manjslist($self, $m) }); } sub src { my($self, $name, $hash) = @_; my $m = $self->dbManInfo(name => $name, shorthash => $hash); return $self->resNotFound if !@$m; $self->setLastMod($m->[0]{released}); $self->resHeader('Content-Type', 'text/plain; charset=UTF-8'); my $c = $self->dbManContent($m->[0]{hash}); lit $c; } sub xmlsearch { my $self = shift; my $q = $self->reqGet('q')||''; my $man = $self->dbSearch($q, 20); # The JS dropdown search expects this particular format. $self->resHeader('Content-Type' => 'text/xml; charset=UTF-8'); xml; tag 'results'; tag 'item', id => "$_->{name}.$_->{section}", %$_, undef for(@$man); end 'results'; } package TUWF::Object; use TUWF ':html', 'html_escape'; use Time::Local 'timegm'; sub htmlHeader { my $self = shift; my %o = @_; html; head; Link rel => 'stylesheet', type => 'text/css', href => '/man.css'; title $o{title}.' - manned.org'; end 'head'; body; div id => 'header'; a href => '/', 'manned.org'; form action => '/browse/search', method => 'get'; input type => 'text', name => 'q', id => 'q'; input type => 'submit', value => ' '; end; end; div id => 'body'; } sub htmlFooter { my($self, %o) = @_; br style => 'clear: both'; end; div id => 'footer'; lit 'All manual pages are copyrighted by their respective authors. | About manned.org | Contact'; end; if($o{js}) { script type => 'text/javascript'; lit 'VARS = '; lit(JSON::XS->new->ascii->encode($o{js})); lit ';'; end; } script type => 'text/javascript', src => '/man.js', ''; end; end 'html'; # write the SQL queries as a HTML comment when debugging is enabled # (stolen from VNDB code) if($self->debug) { lit "\n\n"; } } # Set the last modification time from a string in yyyy-mm-dd format. sub setLastMod { my($s, $d) = @_; return if $d !~ /^(\d{4})-(\d{2})-(\d{2})/; my @t = gmtime timegm 0,0,0,$3,$2-1,$1; $s->resHeader('Last-Modified', sprintf '%s, %02d %s %04d %02d:%02d:%02d GMT', (qw|Sun Mon Tue Wed Thu Fri Sat|)[$t[6]], $t[3], (qw|Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec|)[$t[4]], $t[5]+1900, $t[2], $t[1], $t[0]); } sub dbManContent { my($s, $hash) = @_; return $s->dbRow(q{SELECT content FROM contents WHERE hash = decode(?, 'hex')}, $hash)->{content}; } # Options: name, section, shorthash, locale, package, start, results, sort sub dbManInfo { my $s = shift; my %o = @_; (my $oname = $o{name}||'') =~ s/\.([0-9n])$//; my %where = ( $o{name} ? ('m.name IN(!l)' => [[ $o{name}, $oname ne $o{name} ? $oname : () ]]) : (), $o{package} ? ('m.package = ?' => $o{package}) : (), $o{section} ? ('m.section = ?' => $o{section}) : (), $o{shorthash} ? (q{substring(m.hash from 1 for 4) = decode(?, 'hex')} => $o{shorthash}) : (), $o{hash} ? (q{m.hash = decode(?, 'hex')} => $o{hash}) : (), $o{locale} ? ('m.locale = ?', $o{locale}) : exists $o{locale} ? ('m.locale IS NULL' => 1) : (), $o{start} ? ('m.name > ?' => $o{start}) : (), ); # TODO: Flags to indicate what to information to fetch return $s->dbAll(q{ SELECT p.system, p.category, p.name AS package, p.version, p.released, m.name, m.section, m.filename, m.locale, encode(m.hash, 'hex') AS hash FROM package p JOIN man m ON m.package = p.id !W !s LIMIT ? }, \%where, $o{sort} ? 'ORDER BY name, locale NULLS FIRST' : '', $o{results}||10000 ); } # Very simple (and fast) prefix match. sub dbSearch { my($s, $q, $limit) = @_; my $sect = $q =~ s/^([0-9])\s+// || $q =~ s/\(([a-zA-Z0-9]+)\)$// ? $1 : ''; my $name = $q =~ s/^([a-zA-Z0-9,.:_-]+)// ? $1 : ''; return !$name ? [] : $s->dbAll( 'SELECT name, section FROM man_index !W ORDER BY name, section LIMIT ?', { # Don't use wildcards in this query, prevents index usage. "lower(name) LIKE '\L$name\E%'" => 1, $sect ? ("section ILIKE '\L$sect\E%'" => 1) : () }, $limit ); } sub dbSystemGet { return shift->dbAll('SELECT id, name, release, short, relorder FROM systems ORDER BY name, relorder'); } # TODO: Optimize # Options: sysid char hasman start results sub dbPackageList { my $s = shift; my %o = (results => 10, @_); my @where = ( $o{sysid} ? ('system = ?' => $o{sysid}) : (), $o{start} ? ('name > ?' => $o{start}) : (), defined($o{hasman}) ? ('!s EXISTS(SELECT 1 FROM man m WHERE m.package = p.id)' => $o{hasman}?'':'NOT') : (), $o{char} ? ( 'LOWER(SUBSTR(name, 1, 1)) = ?' => $o{char} ) : (), defined($o{char}) && !$o{char} ? ( '(ASCII(name) < 97 OR ASCII(name) > 122) AND (ASCII(name) < 65 OR ASCII(name) > 90)' => 1 ) : (), # Only get the latest version # TODO: This is more efficient than using, e.g. SELECT DISTINCT to achieve # the same effect. The downside of this solution is that packages of which # the latest release does not include man pages are not listed, even if an # earlier version does have mans. 'NOT EXISTS(SELECT 1 FROM package p2 WHERE p2.system = p.system AND p2.name = p.name AND p2.released > p.released)' ); return $s->dbAll(q{ SELECT name, category FROM package p !W ORDER BY name LIMIT ?}, \@where, $o{results}) } # TODO: Optimize? sub dbPackageGet { my($s, $sysid, $name) = @_; return $s->dbAll(q{ SELECT id, category, name, version, released FROM package p WHERE system = ? AND name = ? AND EXISTS(SELECT 1 FROM man m WHERE m.package = p.id) ORDER BY released DESC}, $sysid, $name) } sub dbStats { return $_[0]->dbRow('SELECT * FROM stats_cache'); }