Add support for caching HTML-rendered pages
Downside is that this consumes significant disk space, requires recreating the entire cache when changing anything about the way pages are rendered, and removes flexibility to add dynamic render-influencing settings in the future. Alas, crawlers are getting more aggressive and I don't like the idea of adding more invasive anti-bot tech. This might not be enough in the long term — we also have a few slow SQL queries that I'm not yet sure how to optimize. But this ought to buy us more time, at least.
This commit is contained in:
parent
8edb226a18
commit
d3bebc8888
3 changed files with 64 additions and 5 deletions
|
|
@ -15,8 +15,10 @@ CREATE TABLE contents (
|
||||||
-- may not necessarily match the hash, and it's possible for the same content
|
-- may not necessarily match the hash, and it's possible for the same content
|
||||||
-- to be in the database under multiple hashes (but I suspect that's rare).
|
-- to be in the database under multiple hashes (but I suspect that's rare).
|
||||||
hash bytea NOT NULL UNIQUE,
|
hash bytea NOT NULL UNIQUE,
|
||||||
content text NOT NULL
|
content text NOT NULL,
|
||||||
|
html text
|
||||||
);
|
);
|
||||||
|
CREATE INDEX contents_nohtml ON contents (id) WHERE html IS NULL;
|
||||||
|
|
||||||
|
|
||||||
-- Unique man page, as identified by name & section
|
-- Unique man page, as identified by name & section
|
||||||
|
|
|
||||||
57
util/cache-html.pl
Executable file
57
util/cache-html.pl
Executable file
|
|
@ -0,0 +1,57 @@
|
||||||
|
#!/usr/bin/env perl

# This script populates the HTML-rendered man page cache in the database.
#
# Usage: cache-html.pl
#
#   --verbose
#     Be more verbose.
#
#   --delay=$SEC
#     Seconds (fraction supported) to wait between rendering subsequent pages.
#
#   --maxpages=$NUM
#     Maximum number of pages to render before exiting.
#
# Multiple instances of this script can run in parallel in order to speed up
# cache generation; the FOR UPDATE SKIP LOCKED query below ensures each
# instance claims disjoint rows.

use v5.36;
use FU::Pg;
use Getopt::Long;
# Import Time::HiRes's sleep() as well as time(): the core sleep() truncates
# to whole seconds, which would silently break fractional --delay values.
use Time::HiRes qw(time sleep);
use Cwd 'abs_path';
our $ROOT;
BEGIN { ($ROOT = abs_path $0) =~ s{/util/cache-html\.pl$}{}; }

use lib "$ROOT/ManUtils/blib/lib", "$ROOT/ManUtils/blib/arch";
use ManUtils;

my $verbose = 0;
my $delay = 0;      # seconds between pages; may be fractional
my $maxpages = 0;   # 0 = no limit (the !--$maxpages check below never fires)
# 'delay=f' (float), not 'delay=i': the usage text above promises fractional
# seconds. Bail out on unrecognized options rather than silently continuing.
GetOptions(
    'verbose'    => \$verbose,
    'delay=f'    => \$delay,
    'maxpages=i' => \$maxpages,
) or die "Error parsing command-line options\n";

my $conn = FU::Pg->connect($ENV{MANNED_PG}//'');

while (1) {
    # One transaction per page, so a crash loses at most the page in flight
    # and the row lock is released promptly for other instances.
    my $txn = $conn->txn;
    my($id, $content) = $txn->q('SELECT id, content FROM contents WHERE html IS NULL FOR UPDATE SKIP LOCKED LIMIT 1')->rowl;
    last if !$id;

    my $start = time;
    my $html = eval { ManUtils::html ManUtils::fmt $content };
    my $end = time;

    # Render failures should be rare. Do save *something* in the database, so
    # we won't get stuck trying this again and we can easily query for broken
    # pages.
    if (!defined $html) {
        $html = '(Error rendering man page)';
        warn "$id: Error rendering page: $@\n";
    }

    $txn->q('UPDATE contents SET html = $1 WHERE id = $2', $html, $id)->exec;
    $txn->commit;
    printf "%10d: %5.1f ms, %d raw, %d html\n", $id, ($end-$start)*1000, length($content), length($html) if $verbose;
    last if !--$maxpages;
    sleep $delay if $delay;
}
|
||||||
|
|
@ -709,7 +709,7 @@ sub man_nav_($man, $url, $toc, $htmllang) {
|
||||||
sub man_page($man, $url) {
|
sub man_page($man, $url) {
|
||||||
fu->set_lastmod($man->{released});
|
fu->set_lastmod($man->{released});
|
||||||
|
|
||||||
my($hash, $content) = fu->SQL('SELECT hash, content FROM contents WHERE id =', $man->{content})->rowl;
|
my($hash, $content, $fmt) = fu->SQL('SELECT hash, content, html FROM contents WHERE id =', $man->{content})->rowl;
|
||||||
if($url->{fmt} eq 'raw') {
|
if($url->{fmt} eq 'raw') {
|
||||||
fu->set_header('content-type', 'text/plain');
|
fu->set_header('content-type', 'text/plain');
|
||||||
fu->set_header('content-disposition', sprintf 'filename="%s.%s"', $man->{name}, $man->{section});
|
fu->set_header('content-disposition', sprintf 'filename="%s.%s"', $man->{name}, $man->{section});
|
||||||
|
|
@ -724,11 +724,11 @@ sub man_page($man, $url) {
|
||||||
my $data = $content =~ s/^\.\\".*//rmg;
|
my $data = $content =~ s/^\.\\".*//rmg;
|
||||||
if ($data =~ m{^\s*\.so (?:[^\s]*/)?([^\s/]+)\s*$}s) {
|
if ($data =~ m{^\s*\.so (?:[^\s]*/)?([^\s/]+)\s*$}s) {
|
||||||
($follow) = man_pref_name $1, SQL 'v.id =', $man->{verid};
|
($follow) = man_pref_name $1, SQL 'v.id =', $man->{verid};
|
||||||
$content = fu->SQL('SELECT content FROM contents WHERE id =', $follow->{content})->val if $follow;
|
($content, $fmt) = fu->SQL('SELECT content, html FROM contents WHERE id =', $follow->{content})->rowh if $follow;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
my $fmt = ManUtils::html ManUtils::fmt $content;
|
$fmt //= ManUtils::html ManUtils::fmt $content;
|
||||||
if($url->{fmt} eq 'txt') {
|
if($url->{fmt} eq 'txt') {
|
||||||
# TODO: The 'txt' format is kind of broken right now as it includes our HTML formatting codes.
|
# TODO: The 'txt' format is kind of broken right now as it includes our HTML formatting codes.
|
||||||
# This feature is a WIP and not advertised at the moment, anyway.
|
# This feature is a WIP and not advertised at the moment, anyway.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue