Rewrite to static site

With a complete reorganisation of the directory structure and most of
the content converted to pandoc-flavoured markdown.

Some TODOs left before this can go live:
- Main page
- Atom feeds
- Bug tracker
Yorhel 2019-03-23 11:52:08 +01:00
parent 5c85a7d32f
commit 6242b2ee9c
291 changed files with 4346 additions and 6141 deletions

89
.gitignore vendored

@@ -1,3 +1,88 @@
*.gz
*.zip
download/doc
*.gz
*.pdf
dat/globster/api.md
dat/globster/api.pod
dat/globster/ctl.md
dat/globster/ctl.pod
dat/globster/daemon.md
dat/globster/daemon.pod
dat/globster/launch.md
dat/globster/launch.pod
dat/ncdc/changes.log
dat/ncdc/changes.md
dat/ncdc/man.md
dat/ncdc/man.pod
dat/ncdu/changes.log
dat/ncdu/changes.md
dat/ncdu/man.md
dat/ncdu/man.pod
dat/nginx-confgen/changes.log
dat/nginx-confgen/changes.md
dat/nginx-confgen/man.md
dat/nginx-confgen/man.pod
dat/tuwf/changes.log
dat/tuwf/changes.md
dat/tuwf/man.md
dat/tuwf/man.pod
dat/tuwf/man/db.md
dat/tuwf/man/db.pod
dat/tuwf/man/intro.md
dat/tuwf/man/intro.pod
dat/tuwf/man/misc.md
dat/tuwf/man/misc.pod
dat/tuwf/man/request.md
dat/tuwf/man/request.pod
dat/tuwf/man/response.md
dat/tuwf/man/response.pod
dat/tuwf/man/validate.md
dat/tuwf/man/validate.pod
dat/tuwf/man/xml.md
dat/tuwf/man/xml.pod
dat/ylib.md
dat/ylib.pod
pub/doc.html
pub/doc/commvis.html
pub/doc/dcstats.html
pub/doc/easyipc.html
pub/doc/funcweb.html
pub/doc/sqlaccess.html
pub/dump.html
pub/dump/awshrink.html
pub/dump/btrfssize.html
pub/dump/demo.html
pub/dump/grenamr.html
pub/dump/insbench.html
pub/dump/nccolour.html
pub/globster.html
pub/globster/api.html
pub/globster/ctl.html
pub/globster/daemon.html
pub/globster/launch.html
pub/ncdc.html
pub/ncdc/changes.html
pub/ncdc/faq.html
pub/ncdc/install.html
pub/ncdc/man.html
pub/ncdc/scr.html
pub/ncdu.html
pub/ncdu/changes.html
pub/ncdu/jsonfmt.html
pub/ncdu/man.html
pub/ncdu/scr.html
pub/nginx-confgen.html
pub/nginx-confgen/changes.html
pub/nginx-confgen/man.html
pub/tuwf.html
pub/tuwf/changes.html
pub/tuwf/man.html
pub/tuwf/man/db.html
pub/tuwf/man/intro.html
pub/tuwf/man/misc.html
pub/tuwf/man/request.html
pub/tuwf/man/response.html
pub/tuwf/man/validate.html
pub/tuwf/man/xml.html
pub/ylib.html
pub/yxml.html
pub/yxml/man.html

139
Makefile Normal file

@@ -0,0 +1,139 @@
# List of all input files. Each file is converted into a .html file at the same path.
#
# The format of each line is: $path $URL $title
#
# If no $URL is given or the $URL is '-', then the input file is assumed to be
# in dat/, otherwise it will be fetched from $URL.
#
# A $title should be given for .pod and .log files; it is ignored for .md files
# because those already have a title embedded in the file.
#
# Supported file types:
# .md: Converted directly into .html with pandoc.
# .pod: Perl's Plain Old Documentation, converted through HTML into a .md
# file which is then converted into .html again with the proper template.
# .log: A ChangeLog-formatted file, converted through .md into .html.
PAGES=\
"doc.md"\
"doc/commvis.md"\
"doc/dcstats.md"\
"doc/easyipc.md"\
"doc/funcweb.md"\
"doc/sqlaccess.md"\
"dump.md"\
"dump/awshrink.md"\
"dump/btrfssize.md"\
"dump/demo.md"\
"dump/grenamr.md"\
"dump/insbench.md"\
"dump/nccolour.md"\
"globster.md"\
"globster/api.pod https://g.blicky.net/globster.git/plain/doc/api.pod The Globster D-Bus API"\
"globster/ctl.pod https://g.blicky.net/globster.git/plain/doc/globsterctl.pod The globsterctl(1) Man Page"\
"globster/daemon.pod https://g.blicky.net/globster.git/plain/doc/globster.pod The globster(1) Man Page"\
"globster/launch.pod https://g.blicky.net/globster.git/plain/doc/globster-launch.pod The globster-launch(1) Man Page"\
"ncdc.md"\
"ncdc/changes.log https://g.blicky.net/ncdc.git/plain/ChangeLog Ncdc Release History"\
"ncdc/faq.md"\
"ncdc/install.md"\
"ncdc/man.pod - Ncdc Manual"\
"ncdc/scr.md"\
"ncdu.md"\
"ncdu/changes.log https://g.blicky.net/ncdu.git/plain/ChangeLog Ncdu Release History"\
"ncdu/jsonfmt.md"\
"ncdu/man.pod https://g.blicky.net/ncdu.git/plain/doc/ncdu.pod Ncdu Manual"\
"ncdu/scr.md"\
"nginx-confgen.md"\
"nginx-confgen/changes.log https://g.blicky.net/nginx-confgen.git/plain/ChangeLog Nginx-confgen Release History"\
"nginx-confgen/man.pod https://g.blicky.net/nginx-confgen.git/plain/nginx-confgen.pod The nginx-confgen(1) Man Page"\
"tuwf.md"\
"tuwf/changes.log https://g.blicky.net/tuwf.git/plain/ChangeLog TUWF Release History"\
"tuwf/man.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF.pod TUWF Documentation"\
"tuwf/man/db.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF/DB.pod TUWF::DB Documentation"\
"tuwf/man/intro.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF/Intro.pod TUWF::Intro Documentation"\
"tuwf/man/misc.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF/Misc.pod TUWF::Misc Documentation"\
"tuwf/man/request.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF/Request.pod TUWF::Request Documentation"\
"tuwf/man/response.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF/Response.pod TUWF::Response Documentation"\
"tuwf/man/validate.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF/Validate.pod TUWF::Validate Documentation"\
"tuwf/man/xml.pod https://g.blicky.net/tuwf.git/plain/lib/TUWF/XML.pod TUWF::XML Documentation"\
"ylib.pod https://g.blicky.net/ylib.git/plain/README.pod Ylib"\
"yxml.md"\
"yxml/man.md"
# Files we need to download
FETCH := $(shell for i in ${PAGES}; do echo "$$i" | grep -Eo '^[^ ]+ +[^-][^ ]+' | sed -E 's/^([^ ]+).*/dat\/\1/'; done)
# List of generated .html files
HTML_OUT := $(shell for i in ${PAGES}; do echo "$$i" | sed -E 's/^([^ ]+)\.[^\. ]+.*$$/pub\/\1.html/'; done)
# List of .md files generated from .pod files
POD_MD := $(shell for i in ${PAGES}; do echo "$$i" | grep -Eo '^[^ ]+\.pod' | sed -E 's/(.+)\.pod$$/dat\/\1.md/'; done)
# List of .md files generated from .log files
CHANGES_MD := $(shell for i in ${PAGES}; do echo "$$i" | grep -Eo '^[^ ]+\.log' | sed -E 's/(.+)\.log$$/dat\/\1.md/'; done)
# All fetched & generated files
CLEAN := ${FETCH} ${POD_MD} ${CHANGES_MD} ${HTML_OUT}
.PHONY: all clean
all: .gitignore ${HTML_OUT}
${FETCH}: dat/%:
@echo "FETCH $*"
@mkdir -p $$(dirname "$@")
@curl -s ${shell for i in ${PAGES}; do case "$$i" in "$* "*) echo "$$i" | awk '{print$$2}';; esac; done} -o "$@"
# There is a 'pod2markdown' program, but going through HTML with a little bit
# of Perl magic tends to give better results, if only because definition lists
# are properly converted this way and I have more control over links.
${POD_MD}: dat/%.md: dat/%.pod mkpod.pl
@echo "POD $*"
@cat "$<" | ./mkpod.pl |\
pandoc -f html -t markdown -s -o "$@" \
--metadata title="${shell for i in ${PAGES}; do case "$$i" in "$*.pod "*) echo "$$i" | sed -E 's/[^ ]+ +[^ ]+ +//';; esac; done}"
@rm -f pod2htmd.tmp pod2html.tmp
${CHANGES_MD}: dat/%.md: dat/%.log mkchangelog.pl
@echo "MD $*"
@./mkchangelog.pl "$*" "${shell for i in ${PAGES}; do case "$$i" in "$*.log "*) echo "$$i" | sed -E 's/[^ ]+ +[^ ]+ +//';; esac; done}" <"$<" >"$@"
${HTML_OUT}: pub/%.html: dat/%.md template.html
@echo "HTML $*"
@mkdir -p $$(dirname "$@")
@cat "$<" |\
perl -pe 's{\[dllink ([^ \]]+)\]}{<a href="/download/$$1">$$1</a><b class="sig"><a href="/download/$$1.asc">pgp</a>-<a href="/download/$$1.sha1">sha1</a>-<a href="/download/$$1.md5">md5</a></b>}' |\
pandoc -f markdown -t html5 --template template.html \
--metadata path1=$$(echo "$*" | sed 's/\/.*//') \
--metadata path2=$$(echo "$*" | sed 's/\//-/' | sed 's/\/.*//') \
--metadata path3=$$(echo "$*" | sed 's/\//-/g') \
--variable menu-$$(case "$*" in\
globster*) echo "globster";;\
ncdc*) echo "ncdc";;\
ncdu*) echo "ncdu";;\
nginx-confgen*) echo "nginx-confgen";;\
tuwf*) echo "tuwf";;\
yxml*) echo "yxml";;\
*) echo "main";;\
esac)\
-o "$@"
.gitignore: Makefile
@echo "GIT"
@echo '*.zip' >$@
@echo '*.gz' >>$@
@echo '*.pdf' >>$@
@for i in ${CLEAN}; do echo "$$i"; done | sort >>$@
clean:
rm -rf ${CLEAN}
find dat pub -type d -empty -print -delete

16
README.md Normal file

@@ -0,0 +1,16 @@
# Requirements
Build-time:
- GNU Make
- curl
- Perl (+ Pod::Simple)
- pandoc
Run-time (for the issue tracker):
- Apache
- Perl
- TUWF
- DBI
- DBD::SQLite

60
dat/doc

@@ -1,60 +0,0 @@
=pod
I don't often write stuff. Certainly not enough to warrant a blog. But
sometimes I do feel the need to write down my thoughts. The results of those
rare occasions are published on this page.
=head2 Articles That May As Well Be Considered Blog Posts
=over
=item C<2017-05-28 > - L<An Opinionated Survey of Functional Web Development|https://dev.yorhel.nl/doc/funcweb>
The title says it all.
=item C<2014-07-29 > - L<The Sorry State of Convenient IPC|https://dev.yorhel.nl/doc/easyipc>
A long rant about IPC systems.
=item C<2014-01-09 > - L<Some Measurements on Direct Connect File Lists|https://dev.yorhel.nl/doc/dcstats>
A short measurement study on the file lists obtained from a Direct Connect hub.
Lots of graphs!
=item C<2012-02-15 > - L<A Distributed Communication System for Modular Applications|https://dev.yorhel.nl/doc/commvis>
In this article I explain a vision of mine, and the results of a small research
project aimed at realizing that vision.
=item C<2011-11-26 > - L<Multi-threaded Access to an SQLite3 Database|https://dev.yorhel.nl/doc/sqlaccess>
So you have a single database and some threads. How do you combine these in a
program?
=back
=head2 Longer Reports
=over
=item C<2014-06-10 > - L<Biased Random Periodic Switching in Direct Connect|https://dev.yorhel.nl/download/doc/brpsdc.pdf> (PDF)
My master's thesis.
=item C<2013-04-05 > - L<Peer Selection in Direct Connect|https://dev.yorhel.nl/download/doc/psdc.pdf> (PDF)
The rather long-ish literature study that preceded my master's thesis.
=item C<2010-06-02 > - L<Design and implementation of a compressed linked list library|https://dev.yorhel.nl/download/doc/compll.pdf> (PDF)
The report for the final project of my professional (HBO) bachelor of
Electrical Engineering. I was very liberal with some terminology in this
report. For example, "linked lists" aren't what you think they are, and I
didn't even use the term "locality of reference" where I really should have. It
was also written for an audience with little knowledge of the subject, so I
elaborated on a lot of things that should be obvious to most people in the
field. Then there is a lot of uninteresting overhead about the project itself,
which just happened to be mandatory for this report. Nonetheless, if you can
ignore these faults it's not such a bad read, if I may say so myself. :-)
=back

@@ -1,683 +0,0 @@
=pod
(Published on B<2014-07-29>.)
=head1 The Problem
How do you implement communication between two or more processes? This is a
question that has been haunting me for at least 6 years now. Of course, this
question is very broad and has many possible answers, depending on your
scenario. So let me get more specific by describing the problem I want to
solve.
What I want is to write a daemon process that runs in the background and can be
controlled from other programs or libraries. The intention is that people can
easily write custom interfaces or quick scripts to control the daemon. The
service that the daemon offers over this communication channel can be thought
of as its primary API; in this way, you can think of the daemon as a persistent
programming library. This concept is similar to existing programs such as
L<btpd|https://github.com/btpd/btpd>, L<MPD|http://www.musicpd.org/>,
L<Transmission|https://www.transmissionbt.com/> and
L<Telepathy|http://telepathy.freedesktop.org/wiki/> - I'll get back to these
later.
More specifically, the most recent project I've been working on that follows
this pattern is L<Globster|https://dev.yorhel.nl/globster>, a remotely
controllable Direct Connect client (if you're not familiar with Direct Connect,
think of it as IRC with some additional file sharing capabilities built in).
While the problem I describe is not specific to Globster, it still serves as an
important use case. I see many other projects with similar IPC requirements.
The IPC mechanism should support two messaging patterns: Request/response and
asynchronous notifications. The request/response pattern is what you typically
get in RPC systems - the client requests something of the daemon and the daemon
then replies with a response. Asynchronous notifications are useful in allowing
the daemon to send asynchronous status updates to the client, such as incoming
chat messages or file transfer status. Lack of support for such notifications
would mean that a client needs to continuously poll for updates, which is
inefficient.
So what I'm looking for is a high-level IPC mechanism that handles this
communication. Solutions are evaluated by the following criteria, in no
particular order.
=over
=item B<Easy>
By I<easy> I mean I<ease of use>. As mentioned above, other people
should be able to write applications and scripts to control the daemon. Not
many people are willing to invest days of work just to figure out how to
communicate with the daemon.
=item B<Simple>
Simplicity refers to the actual protocol and the complexity of the code
necessary to implement it. Complex protocols require complex code, and complex
code is hard to maintain and will inevitably contain bugs. Note that I<simple>
and I<easy> are very different things and often even conflict with each other.
=item B<Small>
The IPC implementation shouldn't be too large, and shouldn't depend on huge
libraries. If you need several megabytes worth of libraries just to send a few
messages over a socket, you're doing it wrong.
=item B<Language independent>
Control the daemon with whatever programming language you're familiar with.
=item B<Networked>
A good solution should be accessible from both the local system (daemon running
on the same machine as the client) and from the network (daemon and client
running on different machines).
=item B<Secure>
There are three parts to having a secure IPC mechanism. One part is to realize
that IPC operates at a I<trust boundary>: the daemon can't blindly trust
everything the client says and vice versa, so message validation and other
mechanisms to prevent DoS or information disclosure on either part are
necessary.
Then there's the matter of I<confidentiality>. On a local system, UNIX sockets
will provide all the confidentiality you can get, so that's trivial. Networked
access, on the other hand, requires some form of transport layer security.
And finally, we need some form of I<authentication>. There should be some
mechanism to prevent just about anyone from connecting to the daemon. A
coarse-grained solution such as file permissions on a local UNIX socket or a
password-based approach for networked access will do just fine for most
purposes. Really, just keep it simple.
=item B<Fast>
Although performance isn't really a primary goal, the communication between the
daemon and the clients shouldn't be too slow or heavyweight. For my purposes,
anything that supports about a hundred messages a second on average hardware
will do perfectly fine. And that shouldn't be particularly hard to achieve.
=item B<Proxy support>
This isn't really a hard requirement either, but it would be nice to allow
other processes (say, plugins of the daemon, or clients connecting to the
daemon) to export services over the same IPC channel as the main daemon. This
is especially useful in implementing a cross-language plugin architecture. But
again, not a hard requirement, because even if the IPC mechanism doesn't
directly support proxying, it's always possible for the daemon to implement
some custom APIs to achieve the same effect. This, however, requires extra work
and may not be as elegant as a built-in solution.
=back
Now let's discuss some existing solutions...
=head1 Custom Protocol
Why use an existing IPC mechanism in the first place when all you need is
UNIX/TCP sockets? This is the approach taken by
L<btpd|https://github.com/btpd/btpd>, L<MPD|http://www.musicpd.org/>
(L<protocol spec|http://www.musicpd.org/doc/protocol/index.html>) and older
versions of Transmission (see their L<1.2x
spec|https://trac.transmissionbt.com/browser/branches/1.2x/doc/ipcproto.txt>).
Btpd hasn't taken the time to document the protocol format, suggesting it's
not really intended to be used as a convenient API (other than through their
btcli), and Transmission has since changed to a different protocol. I'll mainly
focus on MPD here.
focus on MPD here.
MPD uses a text-based request/response mechanism, where each request is a
simple one-line command and a response consists of one or more lines, ending
with an C<OK> or C<ACK> line. There's no support for asynchronous
notifications, although that could obviously have been implemented, too.
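To give an idea of how little is needed to talk to MPD directly, here's a
rough Perl sketch (assuming a local MPD on its default port 6600; the exact
fields returned depend on the server version):

  use strict;
  use warnings;
  use IO::Socket::INET;

  my $sock = IO::Socket::INET->new(
      PeerAddr => 'localhost',
      PeerPort => 6600,
  ) or die "connect: $!";
  my $greeting = <$sock>;             # e.g. "OK MPD 0.20.0"
  print $sock "status\n";             # a request is a single-line command
  while (my $line = <$sock>) {        # a response is a list of "key: value"
      last if $line =~ /^(?:OK|ACK)/; # lines, terminated by OK or ACK
      print $line;
  }

Let's grade this protocol...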
=over
=item B<Easy?> Not really.
Although MPD has conventions for how messages are formatted, each individual
message still requires custom parsing and validation. This can be automated by
designing an
L<IDL|https://en.wikipedia.org/wiki/Interface_description_language> and
accompanying code generator, but writing one specific for a single project
doesn't seem like a particularly fun task.
The protocol, despite its apparent simplicity, is evidently painful enough to
use that there is a special I<libmpdclient> library to abstract away the
communication with MPD, and interfaces to this library are available in many
programming languages. If you have access to such an application-specific
library for your language of choice, then sure, using the IPC mechanism is easy
enough. But that applies to literally any IPC mechanism.
Ideally, such a library needs to be written only once for the IPC mechanism in
use, and after that no additional code is needed to communicate with
services/daemons using that particular IPC mechanism. Code re-use among
different projects is great, yo. A custom protocol also doesn't scale very well
when extending the services offered by the daemon: any addition to the API will
require modifications to all implementations.
=item B<Simple?> Definitely.
I only needed a quick glance at the MPD protocol reference and I was able to
play a bit with telnet and control my MPD. Writing an implementation doesn't
seem like a complex task. Of course, this doesn't necessarily apply to all
custom protocols, but you can make it as simple or complex as you want it to
be.
=item B<Small?> Sure.
This obviously depends on how elaborately you design your protocol. If you have
a large or complex API, a generic message parser and validator can easily pay
for itself compared to the custom parsing and validation code needed for each
message type. But for simple APIs, it's hard to beat a custom protocol in terms of
size.
=item B<Language independent?> Depends.
Of course, a socket library is available to most programming languages, and in
that sense any IPC mechanism built on sockets is language independent. This is,
as such, more of an argument as to how convenient it is to communicate with the
protocol directly rather than with a library that abstracts the protocol away.
In the case of MPD, the text-based protocol seems easy enough to use directly
from most languages, yet for some reason most people prefer language-specific
libraries for MPD.
If you design a binary protocol or anything more complex than simple
request/response message types, using your protocol directly is going to be a
pain in certain languages, and people will definitely want a library specific
to your daemon for their favourite programming language. Something you'll want
to avoid, I suppose.
=item B<Networked?> Sure enough.
Just a switch between UNIX sockets and TCP sockets. Whether a simple solution
like that is a good idea, however, depends on the next point...
=item B<Secure?> Ugh.
Security is hard to get right, so having an existing infrastructure that takes
care of most security sensitive features will help a lot. Implementing your own
protocol means that you also have to implement your own security, to some
extent at least.
Writing code to parse and validate custom messages is error-prone, and a bug in
this code could make both the daemon and the client vulnerable to crashes and
buffer overflows. A statically-typed abstraction that handles parsing and
validation would help a lot.
For networked communication, you'll need some form of confidentiality. MPD does
not seem to support this, so any networked access to an MPD server is
vulnerable to passive observers and MITM attacks. This may be fine for a local
network (presumably what it is intended to be used for), but certainly doesn't
work for exposing your MPD control interface to the wider internet. Existing
protocols such as TLS or SSH can be used to create a secure channel, but these
libraries tend to be large and hard to use securely. This is especially true
for TLS, but at least there's L<stunnel|https://www.stunnel.org/> to simplify
the implementation - at the cost of less convenient deployment.
In terms of authentication, you again need to implement this yourself. MPD
supports authentication using a plain-text password. This is fine for a trusted
network, but on an untrusted network you certainly want confidentiality to
prevent a random observer from reading your password.
=item B<Fast?> Sure.
Existing protocols may have put more effort into profiling and implementing
various optimizations than one would typically do with a custom and
quickly-hacked-together protocol, but still, it probably takes effort to design
a protocol that isn't fast enough.
=item B<Proxy support?> Depends...
Really depends on how elaborate you want to be. It can be very simple if all
you want is to route some messages, it can get very complex if you want to
ensure that these messages follow some format or if you want to reserve certain
interfaces or namespaces to certain clients. What surprised me about the MPD
protocol is that it actually has L<some support for
proxying|http://www.musicpd.org/doc/protocol/ch03s11.html>. But considering the
ad-hoc nature of the MPD protocol, the primitiveness and simplicity of this
proxy support wasn't too surprising. Gets the job done, I suppose.
=back
Overall, and as a rather obvious conclusion, a custom protocol really is what
you make of it. In general, though, it's a lot of work, not always easy to use,
and a challenge to get the security part right.
=head1 D-Bus
D-Bus is being used in L<Transmission|https://www.transmissionbt.com/> and is
what I used for L<Globster|https://dev.yorhel.nl/globster>.
On a quick glance, D-Bus looks I<perfect>. It is high-level, has the messaging
patterns I described, the L<protocol
specification|http://dbus.freedesktop.org/doc/dbus-specification.html> does not
seem I<overly> complex (though certainly could be simplified), it has
implementations for a number of programming languages, has support for
networking, proxying is part of normal operation, and it seems fast enough for
most purposes. When you actually give it a closer look, however, reality isn't
as rose-colored.
D-Bus is designed for two very specific use-cases. One is to allow local
applications to securely interact with system-level daemons such as
L<HAL|https://en.wikipedia.org/wiki/HAL_(software)> (now long dead) and
L<systemd|http://freedesktop.org/wiki/Software/systemd/>, and the other
use-case is to allow communication between different applications inside one
login session. As such, on a typical Linux system there are two D-Bus daemons
where applications can export interfaces and where messages can be routed
through. These are called the I<system bus> and the I<session bus>.
=over
=item B<Easy?> Almost.
The basic ideas behind D-Bus seem easy enough to use. The fact that it has
type-safe messages, interface descriptions and introspection really help in
making D-Bus a convenient IPC mechanism.
The main reason why I think D-Bus isn't all that easy to use in practice is the
lack of good introductory documentation and the crappy state of
the various D-Bus implementations. There is a L<fairly good
article|https://pythonhosted.org/txdbus/dbus_overview.html> providing a
high-level overview to D-Bus, but there isn't a lot of material that covers how
to actually use D-Bus to interact with applications or to implement a service.
On the implementations, I have had rather bad experiences with the actual
libraries. I've personally used the official libdbus-1, which markets itself as a
"low-level" library designed to facilitate writing bindings for other
languages. In practice, the functionality that it offers appears to be too
high-level for writing bindings (L<GDBus|https://developer.gnome.org/glib/>
doesn't use it for this reason), and it is indeed missing a lot of
functionality to make it convenient to use directly. I've also played around
with Perl's L<Net::DBus|http://search.cpan.org/perldoc?Net%3A%3ADBus> and was
highly disappointed. Not only is the documentation rather incomplete, the
actual implementation has more bugs than features. And instead of building on
top of one of the many good event loops for Perl (such as
L<AnyEvent|http://search.cpan.org/perldoc?AnyEvent>), it chooses to implement
L<its own event
loop|http://search.cpan.org/perldoc?Net%3A%3ADBus%3A%3AReactor>. The existence
of several different libraries for Python doesn't inspire much confidence,
either.
I was also disappointed in terms of the available tooling to help in the
development, testing and debugging of services. The L<gdbus(1)> tool is useful
for monitoring messages and scripting some things, but is not all that
convenient because D-Bus has too many namespaces and the terrible Java-like
naming conventions make typing everything out a rather painful experience.
L<D-Feet|http://live.gnome.org/DFeet/> offers a great way to explore services,
but lacks functionality for quick debugging sessions. I L<made an
attempt|http://g.blicky.net/dbush.git/> to write a convenient command-line
shell, but lost interest halfway. :-(
D-Bus has the potential to be an easy and convenient IPC mechanism, but the
lack of any centralized organization to offer good implementations,
documentation and tooling makes D-Bus a pain to use.
=item B<Simple?> Not quite.
D-Bus is conceptually easy and the message protocol is alright, too. Some
aspects of D-Bus, however, are rather more complex than they need to be.
I once made an attempt to fully understand how D-Bus discovers and
connects to the session bus, but I gave up halfway because there are too many
special cases. To quickly summarize what I found, there's the
C<DBUS_SESSION_BUS_ADDRESS> environment variable which could point to the
(filesystem or abstract) path of a UNIX socket or a TCP address. If that
variable isn't set, D-Bus will try to connect to your X server and get the
address from that. In order to avoid linking everything against X libraries, a
separate L<dbus-launch> utility is spawned instead. Then the bus address could
also be obtained from a file in your C<$HOME/.dbus/> directory, with added
complexity to still support a different session bus for each X session. I've no
idea how exactly connection initiation to the system bus works, but my
impression is that a bunch of special cases exist there, too, depending on
which init system your OS happens to use.
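For reference, the variable typically holds something like the following (the
exact path and transport vary between systems and setups):

  DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus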
As if all the options in connection initiation aren't annoying enough, there's
also work on L<kdbus|https://lwn.net/Articles/580194/>, a Linux kernel
implementation to get better performance. Not only will kdbus use a different
underlying communication mechanism, it will also switch to a completely
different serialization format. If/when this becomes widespread you will have
to implement and support two completely different protocols and pray that your
application works with both.
On the design aspect there is, in my opinion, needless complexity with regards
to naming and namespaces. First there is a global namespace for I<bus names>,
which are probably better called I<application names>, because that's usually
what they represent. Then, there is a separate I<object> namespace local to
each bus name. Each object has methods and properties, and these are
associated with an I<interface name>, in a namespace specific to the particular
object. Despite these different namespaces, the convention is to use a full and
globally unique path for everything that has a name. For example, to list the
IM protocols that Telepathy supports, you call the C<ListProtocols> method in
the C<org.freedesktop.Telepathy.ConnectionManager> interface on the
C</org/freedesktop/Telepathy/ConnectionManager> object at the
C<org.freedesktop.Telepathy> bus. Fun times indeed. I can understand the
reasoning behind most of these choices, but in my opinion they found the wrong
trade-off.
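To illustrate how these namespaces stack up in practice, here's roughly what
that Telepathy call looks like from Perl with Net::DBus (a sketch only - and
see my reservations about that module above):

  use strict;
  use warnings;
  use Net::DBus;

  my $bus     = Net::DBus->session;
  # The bus (really: application) name:
  my $service = $bus->get_service('org.freedesktop.Telepathy');
  # The object path and the interface name, spelled out yet again:
  my $object  = $service->get_object(
      '/org/freedesktop/Telepathy/ConnectionManager',
      'org.freedesktop.Telepathy.ConnectionManager',
  );
  # And finally, the method itself:
  my $protocols = $object->ListProtocols();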
Another point of complexity that annoys me is the fact that an XML format is
used to describe interfaces. Supporting XML as an IDL format is alright, but
requiring a separate format for an introspection interface gives me the
impression that the message format wasn't powerful enough for such a simple
purpose. The direct effect of this is that any application wishing to use
introspection data will have to link against an XML parser, and almost all
conforming XML parser implementations are as large as the D-Bus implementation
itself.
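For reference, such an interface description looks roughly like this (a
trimmed-down, hypothetical interface):

  <node>
    <interface name="org.example.Frobnicator">
      <method name="Frobnicate">
        <arg name="input" type="s" direction="in"/>
        <arg name="result" type="u" direction="out"/>
      </method>
    </interface>
  </node>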
=item B<Small?> Kind of.
C<libdbus-1.so.3.8.6> on my system is about 240 KiB. It doesn't cover parsing
interface descriptions or implementing a D-Bus daemon, but still covers most of
what is needed to interact with services and to offer services over D-Bus.
It's not I<that> small, but then again, libdbus-1 was not really written with
small size in mind. There's room for optimization.
=item B<Language independent?> Sure.
D-Bus libraries exist for a number of programming languages.
=item B<Networked?> Half-assed.
D-Bus I<officially> supports networked connections to a D-Bus daemon. Actually
using this, however, is painful. Convincing L<dbus-daemon(1)> to accept
connections on a TCP socket involves disabling all authentication (it expects
UNIX credential passing, normally) and requires adding an undocumented C<<
<allow_anonymous/> >> tag in the configuration (I only figured this out from
reading the source code).
Even when you've gotten that to work, there is the problem that D-Bus isn't
totally agnostic to the underlying socket protocol. D-Bus has support for
passing UNIX file descriptors over the connection, and this of course doesn't
work over TCP. While this feature is optional and easily avoided, some services
(I can't find one now) use UNIX fds in order to keep track of processes that
listen to a certain event. Obviously, those services can't be accessed over the
network.
=item B<Secure?> Only locally.
D-Bus has statically typed messages that can be validated automatically, so
that's a plus.
For local authentication, there is support for standard UNIX permissions and
credential passing for more fine-grained authorization. For remote
authentication, I think there is support for a shared secret cookie, but I
haven't tried to use this yet.
There is, as with MPD, no support at all for confidentiality, so using
networked D-Bus over an untrusted network would be a very bad idea anyway.
=item B<Fast?> Mostly.
The messaging protocol is fairly lightweight, so no problems there. I do have
to mention two potential performance issues, however.
The first issue is that the normal mode of operation in D-Bus is to proxy all
messages through an intermediate D-Bus daemon. This involves extra context
switches and message parsing passes in order to get one message from
application A to application B. I believe it is I<officially> supported to
bypass this daemon and to communicate directly between two processes, but after
my experience with networking I am wary of trying anything that isn't part of
how D-Bus is I<intended> to be used. This particular performance issue is what
kdbus addresses, so I suppose it won't apply to future Linux systems.
The other issue is that a daemon that provides a service over D-Bus does not
know whether there exists an application that is interested in receiving its
notifications. This means that the daemon always has to spend resources to send
out notification messages, even if no application is actually interested in
receiving them. In practice this means that the notification mechanism is
avoided for events that may occur fairly often, and an equally inefficient
polling approach has to be used instead. It is possible for a service provider
to keep track of interested applications, but this is not part of the D-Bus
protocol and not something you would want to implement for each possible event.
I've no idea if kdbus addresses this issue, but it would be stupid not to.
=item B<Proxy support?> Yup.
It's part of normal operation, even.
=back
D-Bus has many faults; some are by design, but many are fixable. I
would have contributed to improving the situation, but I get the feeling that
the goals of the D-Bus maintainers are not at all aligned with mine. My
impression is that the D-Bus maintainers are far too focussed on their own
specific needs and care little about projects with slightly different needs.
Especially with the introduction of kdbus, I now consider D-Bus too complex to
be worth the effort of improving. Starting from scratch seems less work.
=head1 JSON/XML RPC
While I haven't extensively used JSON-RPC or XML-RPC myself, it's still an
interesting alternative to study.
L<Transmission|https://www.transmissionbt.com/> uses JSON-RPC
(L<spec|https://trac.transmissionbt.com/browser/trunk/extras/rpc-spec.txt>) as
its primary IPC mechanism, and L<RTorrent|http://rakshasa.github.io/rtorrent/>
has support for an optional XML-RPC interface. (Why do I keep referencing
torrent clients? Surely there are other interesting applications? Oh well.)
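For a concrete picture, this is roughly what a JSON-RPC call over HTTP looks
like from Perl; the endpoint and method name here are invented for
illustration, and Transmission's actual protocol differs in its details:

  use strict;
  use warnings;
  use HTTP::Tiny;
  use JSON::PP qw(encode_json decode_json);

  my $res = HTTP::Tiny->new->post('http://localhost:8080/rpc', {
      headers => { 'Content-Type' => 'application/json' },
      content => encode_json({
          jsonrpc => '2.0',
          method  => 'torrent-list',  # hypothetical method name
          params  => {},
          id      => 1,
      }),
  });
  die "HTTP error: $res->{status}\n" unless $res->{success};
  my $reply = decode_json($res->{content});  # has a 'result' or 'error' key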
The main selling point of HTTP-based IPC is that it is accessible from
browser-based applications, assuming everything has been set up correctly. This
is a nice advantage, but lack of this support is not really a deal-breaker for
me. Browser-based applications can still use any other IPC mechanism, as long
as there are browser plugins or some form of proxy server that converts the
messages of the IPC mechanism to something that is usable over HTTP. For
example, both solutions exist for D-Bus, in the form of the L<Browser DBus
Bridge|http://sandbox.movial.com/wiki/index.php/Browser_DBus_Bridge> and
L<cloudeebus|https://github.com/01org/cloudeebus>. Of course, such solutions
typically aren't as convenient as native HTTP support.
Since HTTP is, by design, purely request-response, JSON-RPC and XML-RPC don't
generally support asynchronous notifications. It's possible to still get
asynchronous notifications by using
L<WebSockets|https://en.wikipedia.org/wiki/WebSocket> (Ugh, opaque stream
sockets, time to go back to our L<custom protocol|/Custom Protocol>) or by
having the client implement an HTTP server itself and send its URL to the
service provider (This is known as a
L<callback|https://duckduckgo.com/?q=web%20service%20callback> in the
L<SOAP|https://en.wikipedia.org/wiki/SOAP> world. I have a lot of respect for
developers who can put up with that crap). As I already hinted, neither
solution is simple or easy.
Let's move on to the usual grading...
=over
=item B<Easy?> Sure.
The ubiquity of HTTP, JSON and XML on the internet means that most developers
are already familiar with using it. And even if you aren't, there are so many
easy-to-use and well-documented libraries available that you're ready to go in
a matter of minutes.
Although interface description languages/formats exist for XML-RPC (and
possibly for JSON-RPC, too), I get the impression these are not often used
outside of the SOAP world. As a result, interacting with such a service tends
to be weakly/stringly typed, which, I imagine, is not as convenient in strongly
typed programming languages.
=item B<Simple?> Not really.
Many people have the impression that HTTP is somehow a simple protocol. Sure,
it may look simple on the wire, but in reality it is a hugely bloated and
complex protocol. I strongly encourage everyone to read through L<RFC
2616|https://tools.ietf.org/html/rfc2616> at least once to get an idea of its
size and complexity. To make things worse, there's a lot of recent activity to
standardize on a next generation HTTP
(L<SPDY|https://en.wikipedia.org/wiki/SPDY> and L<HTTP
2.0|https://en.wikipedia.org/wiki/HTTP_2.0>), but I suppose we can ignore these
developments for the foreseeable future for the use case of IPC.
Of course, a lot of the functionality specified for HTTP is optional and can be
ignored for the purpose of IPC, but that doesn't mean that these options don't
exist. When implementing a client, it would be useful to know exactly which
HTTP options the server supports. It would be wasteful to implement compression
support if the server doesn't support it, or keep-alive, or content
negotiation, or ranged requests, or authentication, or correct handling for all
response codes when the server will only ever send 'OK'. What also commonly
happens is that server implementors want to support as much as possible, to the
point that you can have JSON or XML output, depending on what the client
requested.
XML faces a similar problem. The format looks simple, but the specification has
a bunch of features that hardly anyone uses. In contrast to HTTP, however, a
correct XML parser can't just decide to not parse C<< <!DOCTYPE ..> >> stuff,
so it I<has> to implement some of this complexity.
On the upside, JSON is a really simple serialization format, and if you're
careful enough to only implement the functionality necessary for basic HTTP, a
JSON-RPC implementation I<can> be somewhat simple.
=item B<Small?> Not really.
What typically happens is that implementors take an existing HTTP library and
build on top of that. A generic HTTP library likely implements a lot more than
necessary for IPC, so that's not going to be very small. RTorrent, for example,
makes use of the not-very-small L<xmlrpc-c|http://xmlrpc-c.sourceforge.net/>,
which in turn uses L<libcurl|http://curl.haxx.se/> (400 KiB, excluding TLS
library) and either the bloated L<libxml2|http://xmlsoft.org/> (1.5 MiB) or
L<libexpat|http://www.libexpat.org/> (170 KiB). In any case, expect your
programs to grow by a megabyte or more if you go this route.
Transmission seems rather less bloated. It uses the HTTP library that is built
into L<libevent|http://libevent.org/> (totalling ~500 KiB, but libevent is also
used for other networking parts), and a simple JSON parser can't be that large
either. I'm sure that if you reimplement everything from scratch for the
purpose of building an API, you could get something much smaller. Then again,
even if you manage to shrink the size of the server that way, you can't expect
all your users to do the same.
If HTTPS is to be supported, add ~500 KiB more. TLS isn't the simplest
protocol, either.
=item B<Language independent?> Yes.
Almost every language has libraries for web stuff.
=item B<Networked?> Definitely.
In fact, I've never seen anyone use XML/JSON RPC over UNIX sockets.
=item B<Secure?> Alright.
HTTP has built-in support for authentication, but it also isn't uncommon to use
some other mechanism (based on cookies, I guess?).
Confidentiality can be achieved with HTTPS. There is the problem of verifying
the certificate, since I doubt anyone is going to have certificates of their
local applications signed by a certificate authority, but there's always the
option of trust-on-first use. Custom applications can also include a
fingerprint of the server certificate in the URL for verification, but this
won't work for web apps.
=item B<Fast?> No.
JSON/XML RPC messages add significant overhead to the network and require more
parsing than a simple custom solution or D-Bus. I wouldn't really call it
I<fast>, but admittedly, it might still be I<fast enough> for most purposes.
=item B<Proxy support?> Sure.
HTTP has native support for proxying, and it's always possible to proxy some
URI on the main server to another server, assuming the libraries you use
support that. It's not necessarily simple to implement, however.
=back
The lack of asynchronous notifications and the overhead and complexity of
JSON/XML RPC make me stay away from it, but it certainly is a solution that
many client developers will like because of its ease of use.
=head1 Other Systems
There are more alternatives out there than I have described so far. Most of
those were options I dismissed early on because they're either incomplete
solutions or specific to a single framework or language. I'll still mention a
few here.
=head2 Message Queues
In the context of IPC I see that message queues such as
L<RabbitMQ|https://www.rabbitmq.com/> and L<ZeroMQ|http://zeromq.org/> are
quite popular. I can't say I have much experience with any of these, but these
MQs don't seem to offer a solution to the problem I described in the
introduction. My impression of MQs is that they offer a higher-level and more
powerful alternative to TCP and UDP. That is, they route messages from one
endpoint to another. The contents of the messages are still completely up to
the application, so you're still on your own in implementing an RPC mechanism
on top of that. And for the purpose of building a simple RPC mechanism, I'm
convinced that plain old UNIX sockets or TCP will do just fine.
=head2 Cap'n Proto
I probably should be spending a full chapter on L<Cap'n
Proto|http://kentonv.github.io/capnproto/> instead of this tiny little section,
but I'm simply not familiar enough with it to offer any deep insights. I can
still offer my blatantly uninformed impression of it: It looks very promising,
but puts, in my opinion, too much emphasis on performance and too little
emphasis on ease of use. It lacks introspection and requires that clients have
already obtained the schema of the service in order to interact with it. It
also uses a capability system to handle authorization, which, despite being
elegant and powerful, increases complexity and cognitive load (though I
obviously need more experience to quantify this). It still lacks
confidentiality for networked access and the number of bindings to other
programming languages is limited, but these problems can be addressed.
Cap'n Proto seems like the ideal IPC mechanism for internal communication
within a single (distributed) application and offers a bunch of unique features
not found in other RPC systems. But it doesn't feel quite right as an easy API
for others to use.
=head2 CORBA
CORBA has been used by the GNOME project in the past, and was later abandoned
in favour of D-Bus, primarily (I think) because CORBA was deemed too L<complex
and incomplete|http://dbus.freedesktop.org/doc/dbus-faq.html#corba>. A system
that is deemed more complex than D-Bus is an immediate red flag. The L<long and
painful history of CORBA|http://queue.acm.org/detail.cfm?id=1142044> also makes
me want to avoid it, if only because that makes it very hard to judge the
quality and modernness of existing implementations.
=head2 Project Tanja
A bit over two years ago I was researching the same problem, but from a much
more generic angle. The result of that was a project that I called Tanja. I
described its concepts L<in an earlier
article|https://dev.yorhel.nl/doc/commvis>, and wrote an incomplete
L<specification|http://g.blicky.net/tanja.git/> along with implementations in
L<C|http://g.blicky.net/tanja-c.git/>, L<Go|http://g.blicky.net/tanja-go.git/>
and L<Perl|http://g.blicky.net/tanja-perl.git/>. I consider project Tanja a
failure, primarily because of its genericity. It supported too many
communication models; the lack of a specification as to which model was used,
and the lack of any guarantee that this model was actually followed, made
Tanja hard to use in practice. It was a very interesting experiment, but not
something I would actually use. I learned the hard way that you sometimes have
to move some complexity down into a lower abstraction layer in order to keep
the complexity in check at higher layers of abstraction.
=head1 Conclusions
This must be the longest rant I've written so far.
In any case, there isn't really a perfect IPC mechanism for my use case. A
custom protocol involves reimplementing a lot of stuff, D-Bus is a pain, and
JSON/XML RPC are bloat.
I am still undecided on what to do. I have a lot of ideas as to what a perfect
IPC solution would look like, both in terms of features and in how to implement
it, and I feel like I have enough experience by now to actually develop a
proper solution. Unfortunately, writing a complete IPC system with the required
utilities and language bindings takes B<a lot> of time and effort. It's not
really worth it if I am the only one using it.
So here is my plea to you, dear reader: If you know of any existing solutions
I've missed, please tell me. If you empathize with me and want a better
solution to this problem, please get in touch as well! I'd love to hear about
projects which face similar problems and have similar requirements.

@@ -1,571 +0,0 @@
An Opinionated Survey of Functional Web Development
=pod
(Published on B<2017-05-28>.)
=head1 Intro
TL;DR: In this article I provide an overview of the frameworks and libraries
available for creating websites in statically-typed functional programming
languages.
I recommend you now skip directly to the next section, but if you're interested
in some context and don't mind a rant, feel free to read on. :-)
B<< <Rant mode> >>
When compared to native desktop application development, web development just
sucks. Native development is relatively simple with toolkits such as
L<Qt|https://www.qt.io/>, L<GTK+|https://www.gtk.org/> and others: You have
convenient widget libraries, and you can describe your entire application, from
interface design to all behavioural aspects, in a single programming language.
You're also largely free to structure code in whichever way makes most sense.
You can describe what a certain input field looks like, what happens when the
user interacts with it and what will happen with the input data, all succinctly
in a single file. There are even drag-and-drop UI builders to speed up
development.
Web development is the exact opposite of that. There are several different
technologies you're forced to work with even when creating the most mundane
website, and there's a necessary but annoying split between code that runs on
the server and code that runs in the browser. Creating a simple input field
requires you to consider and maintain several ends:
=over
=item
The back end (server-side code) that describes how the input field interacts
with the database.
=item
Some JavaScript code to describe how the user can interact with the input
field.
=item
Some CSS to describe what the input field looks like.
=item
And then there's HTML to act as a glue between the above.
=back
In many web development setups, all four of the above technologies are
maintained in different files. If you want to add, remove or modify an input
field, or just about anything else on a page, you'll be editing at least four
different files with different syntax and meaning. I don't know how other
developers deal with this, but the only way I've been able to keep these places
synchronized is to just edit one or two places, test if it works in a browser,
and then edit the other places accordingly to fix whatever issues I find. This
doesn't always work well: I don't get a warning if I remove an HTML element
somewhere and forget to also remove the associated CSS. Heck, in larger
projects I can't even tell whether it's safe to remove or edit a certain line
of CSS because I have no way to know for sure that it's not still being used
elsewhere. Perhaps this particular case can be solved with proper organization
and discipline, but similar problems exist with the other technologies.
Yet despite that, why do I still create websites in my free time? Because it is
the only environment with high portability and low friction - after all, pretty
much anyone can browse the web. I would not have been able to create a useful
"L<Visual Novel Database|https://vndb.org/>" any other way than through a
website. And the entire purpose of L<Manned.org|https://manned.org/> is to
provide quick access to man pages from anywhere, which is not easily possible
with native applications.
B<< </Rant mode> >>
Fortunately, I am not the only one who sees the problems with the "classic"
development strategy mentioned above. There are many existing attempts to
improve on that situation. A popular approach to simplify development is the
L<Single-page
application|https://en.wikipedia.org/wiki/Single-page_application> (SPA). The
idea is to move as much code as possible to the front end, and keep only a
minimal back end. Both the HTML and the entire behaviour of the page can be
defined in the same language and same file. With libraries such as
L<React|https://facebook.github.io/react/> and browser support for L<Web
components|https://developer.mozilla.org/en-US/docs/Web/Web_Components>, the
split between files described above can be largely eliminated. And if
JavaScript isn't your favorite language, there are many alternative languages
that compile to JavaScript. (See L<The JavaScript
Minefield|http://walkercoderanger.com/blog/2014/02/javascript-minefield/> for
an excellent series of articles on that topic).
While that approach certainly has the potential to make web development more
pleasant, it has a very significant drawback: Performance. For some
applications, such as web based email clients or CRM systems, it can be
perfectly acceptable to have a megabyte of JavaScript as part of the initial
page load. But for most other sites, such as this one, or the two sites I
mentioned earlier, or sites like Wikipedia, a slow initial page load is
something I consider to be absolutely unacceptable. The web can be really fast,
and developer laziness is not a valid excuse to ruin it. (If you haven't seen
or read L<The Website Obesity
Crisis|http://idlewords.com/talks/website_obesity.htm> yet, please do so now).
I'm much more interested in the opposite approach to SPA: Move as much code as
possible to the back end, and only send a minimal amount of JavaScript to the
browser. This is arguably how web development has always been done in the past,
and there's little reason to deviate from it. The difference, however, is that
people tend to expect much more "interactivity" from web sites nowadays, so the
amount of JavaScript is increasing. And that is alright, so long as the
JavaScript doesn't prevent the initial page from loading quickly. But this
increase in JavaScript does amplify the "multiple files" problem I ranted about
earlier.
So my ideal solution is a framework where I can describe all aspects of a site
in a single language, and organize the code among files in a way that makes
sense to me. That is, I want the same kind of freedom that I get with native
desktop software development. Such a framework should run on the back end, and
automatically generate efficient JavaScript and, optionally, CSS for the front
end. As an additional requirement (or rather, strong preference), all this
should be in a statically-typed language - because I am seemingly incapable of
writing large reliable applications with dynamic typing - and in a language
from functional heritage - because programming in functional languages has
spoiled me.
I'm confident that what I describe is possible, and it's evident that I'm not
the only person to want this, as several (potential) solutions like this do
indeed exist. I've been looking around for these solutions and have
experimented with a few that looked promising. This article provides an
overview of what I have found so far.
=head1 OCaml
My adventure began with L<OCaml|https://ocaml.org/>. It's been a few years
since I last used OCaml for anything, but development on the language and its
ecosystem has been anything but halted. L<Real World OCaml|https://realworldocaml.org/>
has been a great resource to get me up to speed again.
=head2 Ocsigen
For OCaml there is one project that has it all: L<Ocsigen|http://ocsigen.org/>.
It comes with an OCaml to JavaScript compiler, a web server, several handy
libraries, and a L<framework|http://ocsigen.org/eliom/> to put everything
together. Its L<syntax
extension|http://ocsigen.org/eliom/6.2/manual/ppx-syntax> allows you to mix
front and back end code, and you can easily share code between both ends. The
final result is a binary that runs the server and a JavaScript file that
handles everything on the client side.
The framework comes with an embedded DSL with which you can conveniently
generate HTML without actually typing HTML. And best of all, this DSL works on
both the client and the server: On the server side it generates an HTML string
that can be sent to the client, and running the same code on the client side
will result in a DOM element that is ready to be used.
Ocsigen makes heavy use of the OCaml type system to statically guarantee the
correctness of various aspects of the application. The HTML DSL ensures not
only that the generated HTML is well-formed, but also prevents you from
incorrectly nesting certain elements and using the wrong attributes on the
wrong elements. Similarly, an HTML element generated on the server side can be
referenced from client side code without having to manually assign a unique ID
to the element. This prevents accidental typos in the ID naming and guarantees
that the element that the client side code refers to actually exists. URL
routing and links to internal pages are also checked at compile time.
Ocsigen almost exactly matches what I previously described as the perfect
development framework. Unfortunately, it has a few drawbacks:
=over
=item
The generated JavaScript is quite large, a bit over 400 KiB for a hello world.
In my brief experience with the framework, this also results in a noticeably
slower page load. I don't know if it was done for performance purposes, but
subsequent page views are per default performed via in-browser XHR requests,
which do not require that all the JavaScript is re-parsed and evaluated, and is
thus much faster. This, however, doesn't work well if the user opens pages in
multiple tabs or performs a page reload for whatever reason. And as I
mentioned, I care a lot about the initial page loading time.
=item
The framework has a steep learning curve, and the available documentation is
far from complete enough to help you. I've found myself wondering many times how
I was supposed to use a certain API and have had to look for example code for
enlightenment. At some point I ended up just reading the source code instead of
going for the documentation. What doesn't help here is that, because of the
heavy use of the type system to ensure code correctness, most of the function
signatures are far from intuitive and are sometimes very hard to interpret.
This problem is made even worse with the generally unhelpful error messages
from the compiler. (A few months with L<Rust|https://www.rust-lang.org/> and
its excellent error messages has really spoiled me on this aspect, I suppose).
=item
I believe they went a bit too far with the compile-time verification of certain
correctness properties. Apart from making the framework harder to learn, it
also increases the verbosity of the code and removes a lot of flexibility. For
instance, in order for internal links to be checked, you have to declare your
URLs (or I<services>, as they call them) somewhere central such that the view
part of your application can access it. Then elsewhere you have to register a
handler to that service. This adds boilerplate and enforces a certain code
structure. And the gain of all this is, in my opinion, pretty small: In the 15
years that I have been building web sites, I don't remember a single occasion
where I mistyped the URL in an internal link. I do suppose that this feature
makes it easy to change URLs without causing breakage, but there is a trivial
counter-argument to that: L<Cool URIs don't
change|https://www.w3.org/Provider/Style/URI.html>. (Also, somewhat ironically,
I have found more dead internal links on the Ocsigen website than on any other
site I have visited in the past year, so perhaps this was indeed a problem they
considered worth fixing. Too bad it didn't seem to work out so well for them).
=back
Despite these drawbacks, I am really impressed with what the Ocsigen project
has achieved, and it has set a high bar for the future frameworks that I will
be considering.
=head1 Haskell
I have always seen Haskell as that potentially awesome language that I just
can't seem to wrap my head around, despite several attempts in the past to
learn it. Apparently the only thing I was missing in those attempts was a
proper goal: When I finally started playing around with some web frameworks I
actually managed to get productive in Haskell with relative ease. What also
helped me this time was a practical introductory Haskell reference, L<What I
Wish I Knew When Learning Haskell|http://dev.stephendiehl.com/hask/>, in
addition to the more theoretical L<Learn You A Haskell for Great
Good|http://learnyouahaskell.com/>.
Haskell itself already has a few advantages when compared to OCaml: For one, it
has a larger ecosystem, so for any task you can think of there is probably
already at least one existing library. As an example, I was unable to find an
actively maintained SQL DSL for OCaml, while there are several available for
Haskell. Another advantage I found was the much friendlier and more detailed
error messages generated by the Haskell compiler, GHC. In terms of
build systems, Haskell has standardized on
L<Cabal|https://www.haskell.org/cabal/>, which works alright most of the time.
Packaging is still often complex and messy, but it's certainly improving as
L<Stack|http://haskellstack.org/> is gaining more widespread adoption. Finally,
I feel that the Haskell syntax is slightly less verbose, and more easily lends
itself to convenient DSLs.
Despite Haskell's larger web development community, I could not find a single
complete and integrated client/server development framework such as Ocsigen.
Instead, there are a whole bunch of different projects focussing on either the
back end or the front end. I'll explore some of them with the idea that,
perhaps, it's possible to mix and match different libraries and frameworks in
order to get the perfect development environment. And indeed, this seems to be
a common approach in many Haskell projects.
=head2 Server-side
Let's start with a few back end frameworks.
=over
=item Scotty
L<Scotty|https://github.com/scotty-web/scotty> is a web framework inspired by
L<Sinatra|http://www.sinatrarb.com/>. I have no experience with (web)
development in Ruby and have never used Sinatra, but it has some similarities
to what I have been using for a long time: L<TUWF|https://dev.yorhel.nl/tuwf>.
Scotty is a very minimalist framework; it does routing (that is, mapping URLs
to Haskell functions), it has some functions to access request data and some
functions to create and modify a response. That's it. No database handling,
session management, HTML generation, form handling or other niceties. But
that's alright, because there are many generic libraries to help you out there.
Thanks to its minimalism, I found Scotty to be very easy to learn and get used
to. Even as a Haskell newbie I had a simple website running within a day. The
documentation is adequate, but the idiomatic way of combining Scotty with
other libraries is through the use of monad transformers, and a few more
examples in this area would certainly have helped.
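To give an idea of the style, here's a minimal sketch of what a Scotty
application might look like (the routes are a toy example of my own, not taken
from the Scotty documentation):

  {-# LANGUAGE OverloadedStrings #-}
  import Web.Scotty

  main :: IO ()
  main = scotty 3000 $ do
    get "/" $
      text "Hello, world!"
    get "/greet/:name" $ do
      name <- param "name"  -- the ":name" fragment captured by the route
      html $ mconcat ["<h1>Hello, ", name, "</h1>"]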
=item Spock
Continuing with the Star Trek franchise, there's
L<Spock|https://www.spock.li/>. Spock is very similar to Scotty, but comes with
type-safe routing and various other goodies such as session and state
management, L<CSRF|https://en.wikipedia.org/wiki/Cross-site_request_forgery>
protection and database helpers.
As with everything that is (supposedly) more convenient, it also comes with a
slightly steeper learning curve. I haven't, for example, yet figured out how to
do regular-expression-based routing. I don't even know if that's still possible
in the latest version - the documentation isn't very clear. Likewise, it's
unclear to me what the session handling does exactly (Does it store something?
And where? Is there a timeout?) and how that interacts with CSRF protection.
Spock seems useful, but requires more than just a cursory glance.
=item Servant
L<Servant|http://haskell-servant.github.io/> is another minimalist web
framework, although it is primarily designed for creating RESTful APIs.
Servant distinguishes itself from Scotty and Spock by not only featuring
type-safe routing, but also letting you describe your complete public API as a
type and get strongly typed responses for free. This also enables support for
automatically generated documentation and client-side API wrappers.
Servant would be an excellent back end for a SPA, but it does not seem like an
obvious approach to building regular websites.
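As an impression of the approach, here's a minimal sketch (again a toy example
of my own) in which the API type drives both the routing and the type of the
handler:

  {-# LANGUAGE DataKinds, TypeOperators #-}
  import Data.Proxy (Proxy(..))
  import Network.Wai.Handler.Warp (run)
  import Servant

  -- The entire public API, described as a type.
  type API = "hello" :> Capture "name" String :> Get '[JSON] String

  -- The handler is checked against the API type.
  server :: Server API
  server name = return ("Hello, " ++ name)

  main :: IO ()
  main = run 8080 (serve (Proxy :: Proxy API) server)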
=item Happstack / Snap / Yesod
L<Happstack|http://www.happstack.com/>, L<Yesod|http://www.yesodweb.com/> and
L<Snap|http://snapframework.com/> are three large frameworks with many
auxiliary libraries. They all come with a core web server, routing, state and
database management. Many of the libraries are not specific to the framework
and can be used together with other frameworks. I won't go into a detailed
comparison between the three projects because I have no personal experience
with any of them, and fortunately L<someone else already wrote a
comparison|http://softwaresimply.blogspot.nl/2012/04/hopefully-fair-and-useful-comparison-of.html>
in 2012 - though I don't know how accurate that still is today.
=back
So there are a fair number of frameworks to choose from, and they can all work
together with other libraries to implement additional functionality. Apart from the
framework, another important aspect of web development is how you generate the
HTML to send to the client. In true Haskell style, there are several answers.
For those who prefer embedded DSLs, there are
L<xhtml|http://hackage.haskell.org/package/xhtml>,
L<BlazeHTML|https://jaspervdj.be/blaze/> and
L<Lucid|https://github.com/chrisdone/lucid>. The xhtml package is not being
used much nowadays and has been superseded by BlazeHTML, which is both faster
and offers a more readable DSL using Haskell's do-notation. Lucid is heavily
inspired by Blaze, and attempts to L<fix several of its
shortcomings|http://chrisdone.com/posts/lucid>. Having used Lucid a bit myself,
I can attest that it is easy to get started with and pretty convenient in use.
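For illustration, a small page in Lucid (a toy fragment of my own) reads like
this:

  {-# LANGUAGE OverloadedStrings #-}
  import Lucid

  page :: Html ()
  page = html_ $ do
    head_ $ title_ "Example"
    body_ $ do
      h1_ "Hello"
      p_ [class_ "intro"] "Plain Haskell, no templates."

  -- renderText page produces the HTML as lazy Text.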
I definitely prefer to generate HTML using DSLs as that keeps the entire
application in a single host language and with consistent syntax, but the
alternative approach, templating, is also fully supported in Haskell. The Snap
framework comes with L<Heist|https://github.com/snapframework/heist>, which
provides run-time interpreted templates, like similar systems in most other
languages.
Yesod comes with L<Shakespeare|http://hackage.haskell.org/package/shakespeare>,
which is a type-safe templating system with support for inlining the templates
in Haskell code. Interestingly, Shakespeare also has explicit support for
templating JavaScript code. Too bad that this doesn't take away the need to
write the JavaScript yourself, so I don't see how this is an improvement over
some other JavaScript solution that uses JSON for communication with the back
end.
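To give a taste of Shakespeare, here's a sketch of my own using the standalone
C<shamlet> quasi-quoter (Yesod itself normally uses the route-aware variants):

  {-# LANGUAGE QuasiQuotes #-}
  import Text.Hamlet (shamlet)
  import Text.Blaze.Html.Renderer.String (renderHtml)

  greeting :: String -> String
  greeting name = renderHtml [shamlet|
    <h1>Hello, #{name}!
    <p>Interpolated values are checked at compile time.
  |]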
=head2 Client-side
It is rather unusual to have multiple compiler implementations targeting
JavaScript for the same source language, but Haskell has three of them. All
three can be used to write front end code without touching a single line of
JavaScript, but there are large philosophical differences between the three
projects.
=over
=item Fay
L<Fay|https://github.com/faylang/fay/wiki> compiles Haskell code directly to
JavaScript. The main advantage of Fay is that it does not come with a large
runtime, resulting in small and efficient JavaScript. The main downside is that
it
only L<supports a subset of
Haskell|https://github.com/faylang/fay/wiki/What-is-not-supported?>. The
result is a development environment that is very browser-friendly, but where
you can't share much code between the front and back ends. You're basically
back to the separated front and back end situation in classic web development,
but at least you can use the same language for both - somewhat.
Fay itself doesn't come with many convenient UI libraries, but
L<Cinder|http://crooney.github.io/cinder/index.html> covers that with a
convenient HTML DSL and DOM manipulation library.
Fay is still seeing sporadic development activity, but there is not much of a
lively community around it. Most people have moved on to other solutions.
=item GHCJS
L<GHCJS|https://github.com/ghcjs/ghcjs> uses GHC itself to compile Haskell to a
low-level intermediate language, and then compiles that language to JavaScript.
This allows GHCJS to achieve excellent compatibility with native Haskell code,
but comes, quite predictably, at the high cost of duplicating a large part of
the Haskell runtime into the JavaScript output. The generated JavaScript code
is typically measured in megabytes rather than kilobytes, which is (in my
opinion) far too large for regular web sites. The upside of this high
compatibility, of course, is that you can re-use a lot of code between the
front and back ends, which will certainly make web development more tolerable.
The community around GHCJS seems to be more active than that of Fay. GHCJS
integrates properly with the Stack package manager, and there are a L<whole
bunch|http://hackage.haskell.org/packages/search?terms=ghcjs> of libraries
available.
=item Haste
L<Haste|https://github.com/valderman/haste-compiler> provides a middle ground
between Fay and GHCJS. Like GHCJS, Haste is based on GHC, but instead of using
GHC's low-level compiler output, it uses a higher-level intermediate language.
This results in good compatibility with regular Haskell code while
keeping the output size in check. Haste has a JavaScript runtime of around 60
KiB and the compiled code is roughly as space-efficient as Fay's.
While it should be possible to share a fair amount of code between the front
and back ends, not all libraries work well with Haste. I tried to use Lucid
within a Haste application, for example, but that did not work. Apparently one
of its dependencies (probably the UTF-8 codec, as far as I could debug the
problem) performs some low-level performance optimizations that are
incompatible with Haste.
Haste itself is still being sporadically developed, but not actively enough to
be called alive. The compiler lags behind on the GHC version, and the upcoming
0.6 version has been sitting unreleased, in limbo, in the git repository for at
least four months. The community around Haste is in a similar state. Various
libraries do exist, such as L<Shade|https://github.com/takeoutweight/shade>
(HTML DSL, Reactive UI), L<Perch|https://github.com/agocorona/haste-perch>
(another HTML DSL), L<haste-markup|https://github.com/ajnsit/haste-markup> (yet
another HTML DSL) and
L<haste-dome|https://github.com/wilfriedvanasten/haste-dome> (I<yet> another
HTML DSL), but they're all pretty much dead.
=back
Despite having three options available, only Haste offers worthwhile code
reuse while remaining efficient enough for the kind of site that I envision.
Haste really deserves more love than it is currently getting.
=head2 More Haskell
In my quest for Haskell web development frameworks and tools, I came across a
few other interesting libraries. One of them is
L<Clay|http://fvisser.nl/clay/>, a CSS preprocessor as a DSL. This will by
itself not solve the CSS synchronisation problem that I mentioned at the start
of this article, but it could still be used to keep the CSS closer to the code
implementing the rest of the site.
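As a quick sketch (my own, using Clay's documented combinators), a stylesheet
is just a Haskell value:

  {-# LANGUAGE OverloadedStrings #-}
  import Clay

  main :: IO ()
  main = putCss $ do  -- prints the generated CSS to stdout
    body ? do
      backgroundColor white
      fontFamily ["Helvetica"] [sansSerif]
    ".warning" ? color red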
It also would not do to write an article on Haskell web development and not
mention a set of related projects: L<MFlow|https://github.com/agocorona/MFlow>,
L<HPlayground|https://github.com/agocorona/hplayground> and the more recent
L<Axiom|https://github.com/transient-haskell/axiom>. These are ambitious
efforts at building a very high-level and functional framework for both front
and back end web development. I haven't spent nearly enough time on these
projects to fully understand their scope, but I'm afraid they are a bit too
high-level. This invariably results in reduced flexibility (i.e. too many
opinions being hard-coded in the API) and less efficient JavaScript output.
Axiom being based on GHCJS reinforces the latter concern.
=head1 Other languages
I've covered OCaml and Haskell now, but there are relevant projects in other
languages, too:
=over
=item PureScript
L<PureScript|http://www.purescript.org/> is the spiritual successor of Fay -
except it does not try to be compatible with Haskell, and in fact
L<intentionally deviates from
Haskell|https://github.com/purescript/documentation/blob/master/language/Differences-from-Haskell.md>
at several points. Like Fay, and perhaps even more so, PureScript compiles down
to efficient and small JavaScript.
PureScript being a not-quite-Haskell language, sharing code between a
PureScript front end and a Haskell back end is not possible; the differences
are simply too large. It is, however, possible to go in the other direction:
PureScript can also run on the back end in a NodeJS environment. I don't really
know how well this is supported by the language ecosystem, but I'm not sure I'm
comfortable with replacing the excellent Haskell back end frameworks with a
fragile NodeJS back end (or such is my perception; I admittedly don't have too
much faith in most JavaScript-heavy projects).
The PureScript community is very active and many libraries are available in the
L<Pursuit|https://pursuit.purescript.org/> package repository. Of note is
L<Halogen|https://pursuit.purescript.org/packages/purescript-halogen>, a
high-level reactive UI library. One thing to be aware of is that not all
libraries are written with space efficiency as their highest priority; the
simple L<Halogen
button|https://github.com/slamdata/purescript-halogen/tree/v2.0.1/examples/basic>
example already compiles down to a hefty 300 KB for me.
=item Elm
L<Elm|http://elm-lang.org/> is similar to PureScript, but rather than trying to
be a generic something-to-JavaScript compiler, Elm focuses exclusively on
providing a good environment to create web UIs. The reactive UI libraries are
well maintained and part of the core Elm project. Elm has a strong focus on
being easy to learn and comes with good documentation and many examples to get
started with.
=item Ur/Web
L<Ur/Web|http://www.impredicative.com/ur/> is an ML and Haskell inspired
programming language specifically designed for client/server programming. Based
on its description, Ur/Web is exactly the kind of thing I'm looking for: It
uses a single language for the front and back ends and provides convenient
methods for communication between the two.
This has been a low priority on my to-try list because it seems to be primarily
a one-man effort, and the ecosystem around it is pretty small. Using Ur/Web for
practical applications will likely involve writing your own libraries or
wrappers for many common tasks, such as for image manipulation or advanced text
processing. Nonetheless, I definitely should be giving this a try sometime.
(Besides, who still uses frames in this day and age? :-)
=item Opa
I'll be moving out of the functional programming world for a bit.
L<Opa|http://opalang.org/> is another language and environment designed for
client/server programming. Opa takes a similar approach to "everything in
PureScript": Just compile everything to JavaScript and run the server-side code
on NodeJS. The main difference with other to-JavaScript compilers is that Opa
supports mixing back end code with front end code, and it can automatically
figure out where the code should be run and how the back and front ends
communicate with each other.
Opa, as a language, is reminiscent of a statically-typed JavaScript with
various syntax extensions. While it does support SQL databases, its database
API seems to strongly favor object-oriented use rather than relational database
access.
=item GWT
Previously I compared web development to native GUI application development.
There is no reason why you can't directly apply native development structure
and strategies onto the web, and that's exactly what
L<GWT|http://www.gwtproject.org/> does. It provides a widget-based programming
environment that eventually runs on the server and compiles the client-side
part to JavaScript. I haven't really considered it further, as Java is not a
language I can be very productive in.
=item Webtoolkit
In the same vein, there's L<Wt|https://www.webtoolkit.eu/wt>. The name might
suggest that it is a web-based clone of Qt, and indeed that's what it looks
like. Wt is written in C++, but there are wrappers for L<other
languages|https://www.webtoolkit.eu/wt/other_language>. None of the languages
really interest me much, however.
That said, if I had to write a web UI for a resource-constrained device, this
seems like an excellent project to consider.
=back
=head1 To conclude
To be honest, I am a bit overwhelmed by the number of options. On the one hand,
it makes me very happy to see that a lot is happening in this world, and that
alternatives to boring web frameworks do exist. Yet after all this research I
still have no clue what I should use to develop my next website. I do like the
mix-and-match culture of Haskell, which has the potential to form a development
environment entirely to my own taste and with my own chosen trade-offs. On the
other hand, the client-side Haskell solutions are simply too immature and
integration with the back end frameworks is almost nonexistent.
Almost none of the frameworks I discussed attempt to tackle the CSS problem
that I mentioned in the introduction, so there is clearly room for more
research in this area.
There are a few technologies that I should spend more time on to familiarize
myself with. Ur/Web is an obvious candidate here, but perhaps it is possible to
create a Haskell interface to Wt. Or maybe some enhancements to the Haste
ecosystem could be enough to make that a workable solution instead.

45
dat/doc.md Normal file
View file

@ -0,0 +1,45 @@
% Writing
I don't often write stuff. Certainly not enough to warrant a blog. But
sometimes I do feel the need to write down my thoughts. The results of those
rare occasions are published on this page.
## Articles That May As Well Be Considered Blog Posts
`2017-05-28` - [An Opinionated Survey of Functional Web Development](/doc/funcweb)
: The title says it all.
`2014-07-29` - [The Sorry State of Convenient IPC](/doc/easyipc)
: A long rant about IPC systems.
`2014-01-09` - [Some Measurements on Direct Connect File Lists](/doc/dcstats)
: A short measurement study on the file lists obtained from a Direct Connect
hub. Lots of graphs!
`2012-02-15` - [A Distributed Communication System for Modular Applications](/doc/commvis)
: In this article I explain a vision of mine, and the results of a small
research project aimed at realizing that vision.
`2011-11-26` - [Multi-threaded Access to an SQLite3 Database](/doc/sqlaccess)
: So you have a single database and some threads. How do you combine these in
a program?
## Longer Reports
`2014-06-10` - [Biased Random Periodic Switching in Direct Connect](/download/doc/brpsdc.pdf) (PDF)
: My master's thesis.
`2013-04-05` - [Peer Selection in Direct Connect](/download/doc/psdc.pdf) (PDF)
: The rather long-ish literature study that preceded my master's thesis.
`2010-06-02` - [Design and implementation of a compressed linked list library](https://dev.yorhel.nl/download/doc/compll.pdf) (PDF)
: The report for the final project of my professional (HBO) bachelor of
Electrical Engineering. I was very liberal with some terminology in this
report. For example, "linked lists" aren't what you think they are, and I
didn't even use the term "locality of reference" where I really should
have. It was also written for an audience with little knowledge of the
subject, so I elaborated on a lot of things that should be obvious to most
people in the field. Then there is a lot of uninteresting overhead about
the project itself, which just happened to be mandatory for this report.
Nonetheless, if you can ignore these faults it's not such a bad read, if I
may say so myself. :-)

View file

@ -1,11 +1,8 @@
A Distributed Communication System for Modular Applications
% A Distributed Communication System for Modular Applications
=pod
(Published on **2012-02-15**)
(Published on B<2012-02-15>. Also available in L<POD|https://dev.yorhel.nl/dat/doc-commvis>.)
=head1 Introduction
# Introduction
I have a vision. A vision in which rigid point-to-point IPC is replaced with a
far more flexible and distributed communication system. A vision in which
@ -24,21 +21,19 @@ believe to have found an answer. In this article I'll try to explain my ideas
and how they may be used to realize this vision.
My ideas have been heavily inspired by
L<Linda|http://en.wikipedia.org/wiki/Linda_(coordination_language)>. If you're
already familiar with that, then what I present here probably won't be very
revolutionary. Still, there are several aspects in which my ideas differ
[Linda](https://en.wikipedia.org/wiki/Linda_\(coordination_language\)). If
you're already familiar with that, then what I present here probably won't be
very revolutionary. Still, there are several aspects in which my ideas differ
significantly from Linda, so you won't be bored reading this. :-)
=head1 The Concept
# The Concept
In this section I'll try to introduce the overall concept and some terminology.
This is going to be somewhat abstract and technical, but please bear with me.
I promise that things will get more interesting in the later sections.
Let me first define an abstract communications framework. We have a B<network>
and a bunch of B<sessions> connected to that network. Sessions can communicate
Let me first define an abstract communications framework. We have a **network**
and a bunch of **sessions** connected to that network. Sessions can communicate
with each other through this network (that's usually what a network is for,
after all). These sessions do not have to be static: they may come and go.
Keep in mind that, for the purpose of explaining this concept, these terms are
@ -49,15 +44,15 @@ sockets, pipes, a HTTP server, a broadcast network or just shared memory
between threads. If it allows sessions to communicate I'll call it a network.
Unlike many communication systems, this network does not have the concept of
I<addresses>. There is no direct way for one session to identify another, and
_addresses_. There is no direct way for one session to identify another, and
indeed there is no need to do so for the purposes of communication. Instead,
the primary means of communication is by using B<tuples> and patterns.
the primary means of communication is by using **tuples** and patterns.
A tuple is an ordered set (list, array, whatever terminology you prefer) of
zero or more elements. Each element may have a different type, so it can hold
booleans, integers, floating point numbers, strings and even more complex data
structures as arrays or maps. You may think of a tuple as an array in
L<JSON|http://json.org/> notation, if that makes things easier to understand.
[JSON](https://json.org/) notation, if that makes things easier to understand.
Sessions send and receive tuples to communicate with each other. On the sending
side, a session simply "passes" a tuple to the network. This is a non-blocking,
@ -66,12 +61,12 @@ action, because the sender can not know whether it will be received by any
other session anyway. The tuple may be received by many other sessions, or
there may not even be a single session interested in the tuple at all.
On the receiving side, sessions B<register> patterns. A pattern itself is
On the receiving side, sessions **register** patterns. A pattern itself is
mostly just a tuple, but with a more limited set of allowed types: only those
types for which exact matching makes sense, like booleans, integers and
strings. A pattern matches an incoming tuple if the first C<n> elements of the
strings. A pattern matches an incoming tuple if the first `n` elements of the
tuple exactly match the corresponding elements of the pattern. A special
I<wildcard> element may be used to match any value of any type.
_wildcard_ element may be used to match any value of any type.
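For illustration only, this matching rule amounts to something like the
following in Haskell (the type and function names are mine, not part of the
protocol sketch):

    -- A pattern element either requires an exact value or matches anything.
    data Value = B Bool | I Int | S String  deriving Eq
    data Pat   = Exact Value | Wild

    -- A pattern matches a tuple when each of its elements matches the
    -- corresponding tuple element; the pattern may be shorter than the
    -- tuple, since only the first n elements are constrained.
    matches :: [Pat] -> [Value] -> Bool
    matches pat tup = length pat <= length tup && and (zipWith ok pat tup)
      where ok Wild      _ = True
            ok (Exact v) x = v == x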
A session thus only receives tuples from other sessions if it has
registered a pattern for them. As mentioned, it is not illegal to send a tuple
@ -83,7 +78,7 @@ own patterns, then it will receive its own tuple. (However, programming
interfaces might allow this to be detected and/or disabled if this eases the
implementation of a session).
Finally, there is the concept of a B<return-path>. Upon sending out a tuple, a
Finally, there is the concept of a **return-path**. Upon sending out a tuple, a
session may indicate that it is interested in receiving replies. The network
is then responsible for providing a return-path: a way for receivers of the
tuple to reply to it. When a tuple is received, the session has the option to
@ -99,18 +94,15 @@ received by multiple sessions, then the replies will be interleaved over the
return-path, and the path is closed when all of the receiving sessions have
closed their end.
=head1 Common design patterns and solutions
# Common design patterns and solutions
The previous section was rather abstract. This section provides several
examples on how to do common tasks and design patterns by using the previously
described concepts.
## Broadcast notifications
=head2 Broadcast notifications
This is commonly implemented in OOP systems using the I<Observer pattern>.
This is commonly implemented in OOP systems using the _Observer pattern_.
Implementing the same using tuples and patterns is an order of magnitude more
simple, as broadcast notifications are pretty much the native means of
communication.
@ -125,56 +117,43 @@ that follows a certain pattern, then you very easily achieve the same
functionality as with an OOP implementation. In fact, there are some advantages
to doing it this way:
=over
=item *
Sessions stay registered to the same notifications even if the "object" (the
session that is being observed) is restarted or replaced with something else.
It's the network itself that keeps track of the registrations, not the sessions
that provide the notifications. Of course, this can be seen as a drawback, but
you can easily emulate OOP behaviour by providing an extra notification when
the "object" is shut down, indicating that the observing sessions can remove
their patterns.
=item *
Since there is no need for the session that is being observed to keep a list of
sessions that are observing it, it also doesn't have walk the list and send out
multiple notifications. Notifying the observers is as simple as sending out a
single tuple.
=item *
Many implementations of the Observer pattern maintain only a single list of
observers per object, and each listed observer will be notified for every
change to the object. For example, if an object maintains a list and provides
notifications when something is added and deleted to the list, every observer
will be notified of both the "added" action and the "deleted" action. The use
of tuples and patterns allows observers to register for all actions, or just
for a single one. If an "add" action would be notified with a tuple of
C<["object", "add", id]> and a "delete" action with
C<["object", "delete", id]>, then an observing session can register with the
pattern C<["object", *]> to be notified for both actions, or just
C<["object", "add"]> to register only for additions.
=back
- Sessions stay registered to the same notifications even if the "object" (the
session that is being observed) is restarted or replaced with something else.
It's the network itself that keeps track of the registrations, not the
sessions that provide the notifications. Of course, this can be seen as a
drawback, but you can easily emulate OOP behaviour by providing an extra
notification when the "object" is shut down, indicating that the observing
sessions can remove their patterns.
- Since there is no need for the session that is being observed to keep a list
  of sessions that are observing it, it also doesn't have to walk the list and
  send out multiple notifications. Notifying the observers is as simple as
sending out a single tuple.
- Many implementations of the Observer pattern maintain only a single list of
observers per object, and each listed observer will be notified for every
change to the object. For example, if an object maintains a list and provides
notifications when something is added and deleted to the list, every observer
will be notified of both the "added" action and the "deleted" action. The use
of tuples and patterns allows observers to register for all actions, or just
for a single one. If an "add" action would be notified with a tuple of
`["object", "add", id]` and a "delete" action with `["object", "delete",
id]`, then an observing session can register with the pattern `["object",
*]` to be notified for both actions, or just `["object", "add"]` to
register only for additions.
Of course, this is only one way to implement a notification mechanism. There
are also solutions that more accurately mimic the behaviour of the Observer
pattern OOP in cases where that is desired.
## Commands
=head2 Commands
A I<command> is what I call something along the lines of one session telling an
A _command_ is what I call something along the lines of one session telling an
other session to do something. Suppose we have a session representing a file
system. A command for this session could then be something like "delete file
X".
In a sense, this isn't much different from a notification as described above.
The file system session would have registered a pattern like
C<["fs", "delete", *]>, where the wildcard is used for the file name. If an
`["fs", "delete", *]`, where the wildcard is used for the file name. If an
other session then wants to have a file deleted, the only thing it will have to
do is send out a tuple matching that pattern, and the file system session will
take care of deleting it.
@ -183,25 +162,24 @@ In the above scenario, the session sending the command has no feedback
whatsoever on whether the command has been successfully executed or not.
Whether this is acceptable depends of course on the specific application. One
way of still providing some form of feedback is to have the file system session
send out a notification tuple, e.g. C<["fs", "deleted", "file"]> (Note that the
second element is now C<deleted> rather than C<delete>. Using the same tuple
send out a notification tuple, e.g. `["fs", "deleted", "file"]` (Note that the
second element is now `deleted` rather than `delete`. Using the same tuple
for actions and notifications is going to be very messy...). This way the
session sending the command, in addition to any other sessions that happen to
be interested in file deletion, will be notified of the deletion of the file.
An alternative solution is to use the RPC-like method, as described below.
## RPC
=head2 RPC
L<RPC|http://en.wikipedia.org/wiki/Remote_procedure_cal> is in essence nothing
else than providing an interface similar to a regular function call to a
component that can't be reached via a regular function call (e.g. because the
[RPC](https://en.wikipedia.org/wiki/Remote_procedure_call) is in essence
nothing else than providing an interface similar to a regular function call to
a component that can't be reached via a regular function call (e.g. because the
object isn't inside the address space of the program). RPC is generally a
request-response type of interaction, and making use of the return-path
facility as I described earlier, all of the functionality of RPC is also
available with the concept of tuple communication.
=head3 Commands, the RPC-way
### Commands, the RPC-way
Take the previous file system example. Instead of just sending the command
tuple to delete the file, the session could indicate that it is interested in
@ -224,19 +202,16 @@ a notification tuple. Of course, it all depends on the application whether this
is necessary, you only have to implement the functionality that is necessary
for your purposes.
=head3 Requesting information
### Requesting information
Another use of RPC, and thus also of the return-path, is to allow sessions to
request information from each other. Using the same example again, the file
system session could register for a pattern such as C<["fs", "list"]>. Upon
system session could register for a pattern such as `["fs", "list"]`. Upon
receiving a tuple matching that pattern, the session would send a list of all
its files over the return-path. Other sessions can then request this list by
simply sending out the right tuple and waiting for the replies.
=head1 Advantages over other systems
# Advantages over other systems
Now that I've hopefully convinced you that my communication concept is powerful
enough to build applications with it, you may be wondering why you should use
@ -246,10 +221,10 @@ systems. Let me present some of the inherent advantages that this system has
compared to others, and why it will help in designing flexible and modular
applications.
=head2 Loose coupling of components
## Loose coupling of components
Sessions (representing the components of a system) do not have to have a lot of
knowledge about each other. Sessions implicitly provide abstracted I<services>
knowledge about each other. Sessions implicitly provide abstracted _services_
using tuple communications, in much the same way as interfaces explicitly do in
OOP.
@ -262,16 +237,16 @@ worry about how long a certain function call blocks the caller's thread. Since
communication between the different sessions is completely asynchronous, these
worries are gone.
=head2 Location independence
## Location independence
Sessions can communicate with other sessions without knowing I<where> they are.
Sessions can communicate with other sessions without knowing _where_ they are.
This has as major advantage that a session can be moved around without having
to change a single line of code in any of the sessions relying on its service.
This allows sessions that communicate a lot with each other to be placed in the
same process, while resource-heavy sessions may be distributed among several
physical devices.
=head2 Programming language independence
## Programming language independence
All communication is solely done with tuples, which can be represented as
abstract objects and serialized and deserialized (or marshalled/unmarshalled,
@ -290,7 +265,7 @@ sessions in a low-level language such as C. Similarly, it allows developers to
hook into your application even when they are not familiar with your favorite
programming language.
=head2 Easy debugging
## Easy debugging
Not only can other applications and/or plugins hook into your application, you
can also connect a simple debugger to the network. The debugger just has to
@ -302,78 +277,67 @@ is being sent over a return-path is generally not visible to anyone but the
receiver of the replies, although a network implementation might allow a
debugging application to look into that as well.
=head1 Where to go from here
# Where to go from here
What I've described above is nothing more than a bunch of ideas. To actually
use this, there's a lot to be done.
=over
Defining a "tuple"
: What types can be used in tuples? Should a tuple have some maximum size or a
maximum number of elements? Should a `NULL` type be included? What about a
boolean type, why not use the integers 1 and 0 for that? Should it be possible
to interchange binary data, or only UTF-8 strings?
=item Defining a "tuple"
What will be the size of an integer that a session can reasonably assume to be
available? Specifying something like "infinite" is going to be either
inefficient in terms of memory and CPU overhead or will require extra overhead
(in terms of code) in usage. Specifying that everything should fit in a 64bit
integer is a lot more practical, but may be somewhat annoying to cope with in
many dynamically typed languages running on 32bit architectures. Specifying
that integers are 32bits will definitely ease the implementation of the network
library in interpreted languages, but lowers the usefulness of the integer type
and is still a pain to use in OCaml (which has 31bit integers).
What types can be used in tuples? Should a tuple have some maximum size or a
maximum number of elements? Should a C<NULL> type be included? What about a
boolean type, why not use the integers 1 and 0 for that? Should it be possible
to interchange binary data, or only UTF-8 strings?
These choices greatly affect the ease of implementing a networking library for
specific programming languages and the ease of using the network to actually
develop an application.
What will be the size of an integer that a session can reasonably assume to be
available? Specifying something like "infinite" is going to be either
inefficient in terms of memory and CPU overhead or will require extra overhead
(in terms of code) in usage. Specifying that everything should fit in a 64bit
integer is a lot more practical, but may be somewhat annoying to cope with in
many dynamically typed languages running on 32bit architectures. Specifying
that integers are 32bits will definitely ease the implementation of the network
library in interpreted languages, but lowers the usefulness of the integer type
and is still a pain to use in OCaml (which has 31bit integers).
The exact semantics of matching
: Somewhat similar to the previous point, the semantics of matching tuples with
patterns should also be defined in some way. Some related questions are whether
values of different types may be equivalent. For example, is the string
`"1234"` equivalent to an integer with that value? What about NULL and/or
boolean types? If there is a floating point type, you probably won't need exact
matching on those values (floating points are too imprecise for that anyway),
but you might still want the floating point number `10.0` to match the integer
`10` to ease the use in dynamic languages where the distinction between
integer and float is blurred.
These choices greatly affect the ease of implementing a networking library for
specific programming languages and the ease of using the network to actually
develop an application.
Defining the protocol(s)
: Making my vision of modularity and ease of use a reality requires that any
session can easily communicate with another session, even if they have a
vastly different implementation. To do this, we need a protocol to connect
multiple processes together, whether they run on a local machine or over a
physical network.
=item The exact semantics of matching
Somewhat similar to the previous point, the semantics of matching tuples with
patterns should also be defined in some way. Some related questions are whether
values of different types may be equivalent. For example, is the string
C<"1234"> equivalent to an integer with that value? What about NULL and/or
boolean types? If there is a floating point type, you probably won't need exact
matching on those values (floating points are too imprecise for that anyway),
but you might still want the floating point number C<10.0> to match the integer
C<10> to ease the use in dynamic languages where the distinction between
integer and float is blurred.
=item Defining the protocol(s)
Making my vision of modularity and ease of use a reality requires that any
session can easily communicate with an other session, even if they have a
vastly different implementation. To do this, we need a protocol to connect
multiple processes together, whether they run on a local machine or over a
physical network.
=item Coding the stuff
Obviously, all of this remains as a mere concept if nothing ever gets
implemented. Easy-to-use libraries are needed for several programming
languages. And more importantly, actual applications will have to be developed
using these libraries.
=back
Coding the stuff
: Obviously, all of this remains as a mere concept if nothing ever gets
implemented. Easy-to-use libraries are needed for several programming
languages. And more importantly, actual applications will have to be developed
using these libraries.
Of course, realizing all of the above is an iterative process. You can't write
an implementation without knowing what data types a tuple is made of, but it is
equally impossible to determine the exact definition of a tuple without having
experimented with an actual implementation.
=head2 What's the plan?
## What's the plan?
I've been working on documenting the basics of the semantics and the
point-to-point communication protocol, and have started on an early
implementation in the Go programming language to experiment with. I've dubbed
the project B<Tanja>, and have published my progress on a
L<git repo|http://g.blicky.net/tanja.git/>.
the project **Tanja**, and have published my progress on a
[git repo](https://g.blicky.net/tanja.git/).
My intention is to also write implementations for C and Perl, experiment with
that, and see if I can refine the semantics to make this concept one that is
@ -389,5 +353,5 @@ how things work out.
In either case, if this article managed to get you interested in this concept
or in project Tanja, and you have any questions, feedback or (gasp!) feel like
helping out, don't hesitate to contact me! I'm available as 'Yorhel' on Direct
Connect at C<adc://blicky.net:2780> and IRC at C<irc.synirc.net>, or just drop
me a mail at C<projects@yorhel.nl>.
Connect at `adc://blicky.net:2780` and IRC at `irc.synirc.net`, or just drop me
a mail at `projects@yorhel.nl`.

View file

@ -1,25 +1,21 @@
Some Measurements on Direct Connect File Lists
% Some Measurements on Direct Connect File Lists
=pod
(Published on **2014-01-09**)
(Published on B<2014-01-09>.)
=head1 Introduction
# Introduction
I've been working on Direct Connect related projects for a while now. This
includes maintaining L<ncdc|https://dev.yorhel.nl/ncdc> and
L<Globster|https://dev.yorhel.nl/globster>, and doing a bit of research into
improving the downloading performance and scalability (to be published at some
later date). Whether I'm writing code or trying to setup experiments for
research, there's one thing that helps a lot in making decisions. Measurements
from an actual network.
includes maintaining [ncdc](/ncdc) and [Globster](/globster), and doing a bit
of research into improving the downloading performance and scalability (to be
published at some later date). Whether I'm writing code or trying to set up
experiments for research, there's one thing that helps a lot in making
decisions: measurements from an actual network.
Because useful measurements are often missing, I decided to do some myself.
There's a lot to measure in an actual P2P network, but I restricted myself to
information that can be gathered quite easily from file lists.
=head1 Obtaining the Data
# Obtaining the Data
Different hubs will likely have totally different patterns in terms of what is
being shared. In order to keep this experiment simple, I limited myself to a
@ -40,7 +36,7 @@ the evening.
One thing I learned from this experience was that the downloading algorithm in
ncdc (1.18.1) does not scale particularly well. Every 60 seconds, it would try
to open a connection with B<all> users listed in the download queue. You can
to open a connection with **all** users listed in the download queue. You can
imagine that trying to connect to 11k users simultaneously put a significantly
heavier load on the hub than would have been necessary. Not good. Not something
a well-behaving netizen would do. Surprisingly enough, the hub didn't seem to
@ -50,7 +46,7 @@ are typically not the most busy days in P2P land. Weekends tend to be busier.
Despite that scalability issue, I successfully managed to download the file
lists of almost everyone who remained online for long enough to finally get
their list downloaded. In total I managed to download 14143 file lists (that's
one list too many for C<10000*sqrt(2)>, I should have stopped the process a bit
one list too many for `10000*sqrt(2)`; I should have stopped the process a bit
earlier). The total bzip2-compressed size of these lists is 6.5 GiB.
For obvious reasons, I won't be sharing my modifications to ncdc. I already
@ -58,15 +54,14 @@ tarnished the reputation of ncdc enough in that single day. If you wish to
repeat this experiment, please do so with a scalable downloading
implementation. :-)
=head1 Obtaining the Stats
# Obtaining the Stats
And then comes the challenge of aggregating statistics on 6.5 GiB of compressed
XML files. This didn't really sound like much of a challenge. After all, all
one needs to do is decompress the file lists, do some XML parsing and update
some values. Most of the CPU time in this process would likely be spent on
bzip2 decompression, so I figured I'd just pipe the output of L<bzcat(1)> to a
Perl script and be done with it.
bzip2 decompression, so I figured I'd just pipe the output of
[bzcat(1)](https://manned.org/bzcat) to a Perl script and be done with it.
To get the statistics on the sizes and the distribution of unique files, a data
structure containing information on all unique files in the lists was
@ -81,73 +76,73 @@ solution was needed.
When faced with such a problem, some people will try to optimize the algorithm,
others will throw extra hardware at it, and I did what I do best: Optimize away
the constants. That is, I rewrote the data analysis program in C. Using the
excellent L<khash|https://github.com/attractivechaos/klib> hash table library
to keep track of the file information and the equally awesome
L<yxml|https://dev.yorhel.nl/yxml> library (a little bit of self-promotion
doesn't hurt, right?) to do the XML parsing, I was able to do all the necessary
processing in 30 minutes using at most 3.6GB of RAM.
excellent [khash](https://github.com/attractivechaos/klib) hash table library
to keep track of the file information and the equally awesome [yxml](/yxml)
library (a little bit of self-promotion doesn't hurt, right?) to do the XML
parsing, I was able to do all the necessary processing in 30 minutes using at
most 3.6GB of RAM.
Long story short, here's my analysis program:
L<dcfilestats.c|http://g.blicky.net/dcstats.git/tree/dcfilestats.c>.
[dcfilestats.c](https://g.blicky.net/dcstats.git/tree/dcfilestats.c).
=head1 A Look at the Stats
# A Look at the Stats
Some lists didn't decompress/parse correctly, so the actual number of file
lists used in these stats is B<14137>. The total compressed size of these lists
is B<6,945,269,469> bytes (6.5 GiB), and uncompressed B<25,533,519,352> bytes
(24 GiB). In total these lists mentioned B<197,413,253> files. After taking
duplicate listings in account, there's still B<84,131,932> unique files.
lists used in these stats is **14137**. The total compressed size of these
lists is **6,945,269,469** bytes (6.5 GiB), and uncompressed **25,533,519,352**
bytes (24 GiB). In total these lists mentioned **197,413,253** files. After
taking duplicate listings into account, there are still **84,131,932** unique
files.
And now for some graphs...
=head2 Size of the File Lists
## Size of the File Lists
Behold, the compressed and uncompressed size of the downloaded file lists:
[img graph dclistsize.png ]
![](/img/dclistsize.png)
Nothing too surprising here, I guess. 100 KiB seems to be a common size for a
compressed file list, but lists of 1 MiB aren't too weird, either. The largest
file list in this set is 34.8 MiB compressed and 120 MiB uncompressed. The
uncompressed size of a list tends to be (*gasp*) a bit larger, but we can't
uncompressed size of a list tends to be (\*gasp\*) a bit larger, but we can't
easily infer the compression ratio from this graph. Hence, another graph:
[img graph dclistcomp.png ]
![](/img/dclistcomp.png)
Most file lists compress to about 24% - 35% of their original size. This seems
to be consistent with L<similar
measurements|http://forum.dcbase.org/viewtopic.php?f=18&t=667> done in 2010.
to be consistent with [similar
measurements](http://forum.dcbase.org/viewtopic.php?f=18&t=667) done in 2010.
The raw data for these graphs is found in
L<dclistsize|http://g.blicky.net/dcstats.git/tree/dclistsize>, which lists the
[dclistsize](https://g.blicky.net/dcstats.git/tree/dclistsize), which lists the
compressed and uncompressed size, respectively, for each file list. The gnuplot
script for the first graph is
L<dclistsize.plot|http://g.blicky.net/dcstats.git/tree/dclistsize.plot> and
L<dclistcomp.plot|http://g.blicky.net/dcstats.git/tree/dclistcomp.plot> for the
[dclistsize.plot](https://g.blicky.net/dcstats.git/tree/dclistsize.plot) and
[dclistcomp.plot](https://g.blicky.net/dcstats.git/tree/dclistcomp.plot) for the
second.
=head2 Number of Files Per List
## Number of Files Per List
So how many files are people sharing? Let's find out.
[img graph dcnumfiles.png ]
![](/img/dcnumfiles.png)
As expected, this graph looks very similar to the one about the size of the
file list. The size of a list tends to be linear in the number of items it
holds, after all.
The raw data for this graph is found in
L<dcnumfiles|http://g.blicky.net/dcstats.git/tree/dcnumfiles>, which lists the
[dcnumfiles](https://g.blicky.net/dcstats.git/tree/dcnumfiles), which lists the
unique and total number of files, respectively, for each file list. The gnuplot
script is
L<dcnumfiles.plot|http://g.blicky.net/dcstats.git/tree/dcnumfiles.plot>.
[dcnumfiles.plot](https://g.blicky.net/dcstats.git/tree/dcnumfiles.plot).
=head2 File Sizes
## File Sizes
And how large are the files being shared? Well,
[img graph dcfilesize.png ]
![](/img/dcfilesize.png)
This graph is fun, and rather hard to explain without knowing what kind of
files we're dealing with. I'm not going to do any further analysis on what kind
@ -169,25 +164,25 @@ information in the file lists, but I don't expect the number of fake files to
be very significant.
The "raw" data for this graph is found in
L<dcfilesize|http://g.blicky.net/dcstats.git/tree/dcfilesize>. Because I wasn't
[dcfilesize](https://g.blicky.net/dcstats.git/tree/dcfilesize). Because I wasn't
interested in dealing with a text file of 84 million lines, the data is already
binned. The first column is the bin number and the second column the number of
unique files in that bin. The file sizes that each bin represents are between
C<2^(bin+9)> and C<2^(bin+10)>, with the exception of bin 0, which starts at a
`2^(bin+9)` and `2^(bin+10)`, with the exception of bin 0, which starts at a
file size of 0. The source of the gnuplot script is
L<dcfilesize.plot|http://g.blicky.net/dcstats.git/tree/dcfilesize.plot>.
[dcfilesize.plot](https://g.blicky.net/dcstats.git/tree/dcfilesize.plot).
=head2 Distribution of Files
## Distribution of Files
Another interesting thing to measure is how often files are shared. That is,
how many users have the same file?
[img graph dcfiledist.png ]
![](/img/dcfiledist.png)
Many files are only available from a single user. That's not really a good sign
when you wish to download such a file, but luckily there are also tons of files
that I<are> available from multiple users. What is interesting in this graph
isn't that it follows the L<power law|https://en.wikipedia.org/wiki/Power_law>,
that _are_ available from multiple users. What is interesting in this graph
isn't that it follows the [power law](https://en.wikipedia.org/wiki/Power_law),
but what those outliers could possibly be. There's a collection
of 269 files that has been shared among 831 users, and there appears to be a
similar group of around 510-515 files that is shared among 20 or so users. I've
@ -201,12 +196,11 @@ the empty file. There are so many ways to get an empty file somewhere in your
filesystem, after all.
The raw data for this graph is found in
L<dcfiledist|http://g.blicky.net/dcstats.git/tree/dcfiledist>, which lists the
[dcfiledist](https://g.blicky.net/dcstats.git/tree/dcfiledist), which lists the
number of times shared and the aggregate number of files. The gnuplot script is
L<dcfiledist.plot|http://g.blicky.net/dcstats.git/tree/dcfiledist.plot>.
[dcfiledist.plot](https://g.blicky.net/dcstats.git/tree/dcfiledist.plot).
=head1 Final Notes
# Final Notes
So, erm, what conclusions can we draw from this? That stats are fun, I guess.
If anyone (including me) is going to repeat this experiment on a fresh data
@ -218,5 +212,4 @@ Furthermore, keep in mind that this is just a snapshot of a single day on a
single hub. The graphs may look very different when the file lists are
harvested at some other time. And it's also quite likely that different hubs
will have very different share profiles. It could be interesting to try and
graph everything, but I don't have I<that> kind of free time.
graph everything, but I don't have _that_ kind of free time.

632
dat/doc/easyipc.md Normal file
View file

@ -0,0 +1,632 @@
% The Sorry State of Convenient IPC
(Published on **2014-07-29**)
# The Problem
How do you implement communication between two or more processes? This is a
question that has been haunting me for at least 6 years now. Of course, this
question is very broad and has many possible answers, depending on your
scenario. So let me get more specific by describing the problem I want to
solve.
What I want is to write a daemon process that runs in the background and can be
controlled from other programs or libraries. The intention is that people can
easily write custom interfaces or quick scripts to control the daemon. The
service that the daemon offers over this communication channel can be thought
of as its primary API, in this way you can think of the daemon as a persistent
programming library. This concept is similar to existing programs such as
[btpd](https://github.com/btpd/btpd), [MPD](http://www.musicpd.org/),
[Transmission](https://www.transmissionbt.com/) and
[Telepathy](http://telepathy.freedesktop.org/wiki/) - I'll get back to these
later.
More specifically, the most recent project I've been working on that follows
this pattern is [Globster](/globster), a remotely controllable Direct Connect
client (if you're not familiar with Direct Connect, think of it as IRC with
some additional file sharing capabilities built in). While the problem I
describe is not specific to Globster, it still serves as an important use case.
I see many other projects with similar IPC requirements.
The IPC mechanism should support two messaging patterns: Request/response and
asynchronous notifications. The request/response pattern is what you typically
get in RPC systems - the client requests something of the daemon and the daemon
then replies with a response. Asynchronous notifications are useful in allowing
the daemon to send asynchronous status updates to the client, such as incoming
chat messages or file transfer status. Lack of support for such notifications
would mean that a client needs to continuously poll for updates, which is
inefficient.
So what I'm looking for is a high-level IPC mechanism that handles this
communication. Solutions are evaluated by the following criteria, in no
particular order.
**Easy**
: By _easy_ I mean _ease of use_. As mentioned above, other people
should be able to write applications and scripts to control the daemon. Not
many people are willing to invest days of work just to figure out how to
communicate with the daemon.
**Simple**
: Simplicity refers to the actual protocol and the complexity of the code
necessary to implement it. Complex protocols require complex code, and complex
code is hard to maintain and will inevitably contain bugs. Note that _simple_
and _easy_ are very different things and often even conflict with each other.
**Small**
: The IPC implementation shouldn't be too large, and shouldn't depend on huge
libraries. If you need several megabytes worth of libraries just to send a few
messages over a socket, you're doing it wrong.
**Language independent**
: Control the daemon with whatever programming language you're familiar with.
**Networked**
: A good solution should be accessible from both the local system (daemon running
on the same machine as the client) and from the network (daemon and client
running on different machines).
**Secure**
: There are three parts to having a secure IPC mechanism. One part is realizing
that IPC operates at a _trust boundary_; the daemon can't blindly trust
everything the client says and vice versa, so message validation and other
mechanisms to prevent DoS or information disclosure on either side are
necessary.
Then there's the matter of _confidentiality_. On a local system, UNIX sockets
will provide all the confidentiality you can get, so that's trivial. Networked
access, on the other hand, requires some form of transport layer security.
And finally, we need some form of _authentication_. There should be some
mechanism to prevent just about anyone from connecting to the daemon. A
coarse-grained solution such as file permissions on a local UNIX socket or a
password-based approach for networked access will do just fine for most
purposes. Really, just keep it simple.
**Fast**
: Although performance isn't really a primary goal, the communication between the
daemon and the clients shouldn't be too slow or heavyweight. For my purposes,
anything that supports about a hundred messages a second on average hardware
will do perfectly fine. And that shouldn't be particularly hard to achieve.
**Proxy support**
: This isn't really a hard requirement either, but it would be nice to allow
other processes (say, plugins of the daemon, or clients connecting to the
daemon) to export services over the same IPC channel as the main daemon. This
is especially useful in implementing a cross-language plugin architecture. But
again, not a hard requirement, because even if the IPC mechanism doesn't
directly support proxying, it's always possible for the daemon to implement
some custom APIs to achieve the same effect. This, however, requires extra work
and may not be as elegant as a built-in solution.
Now let's discuss some existing solutions...
# Custom Protocol
Why use an existing IPC mechanism in the first place when all you need is
UNIX/TCP sockets? This is the approach taken by
[btpd](https://github.com/btpd/btpd), [MPD](http://www.musicpd.org/)
([protocol spec](http://www.musicpd.org/doc/protocol/index.html)) and older
versions of Transmission (see their [1.2x
spec](https://trac.transmissionbt.com/browser/branches/1.2x/doc/ipcproto.txt)).
The btpd developers haven't taken the time to document their protocol format,
suggesting it's not really intended to be used as a convenient API (other than
through their btcli), and Transmission has since changed to a different
protocol. I'll mainly focus on MPD here.
MPD uses a text-based request/response mechanism, where each request is a
simple one-line command and a response consists of one or more lines, ending
with an `OK` or `ACK` line. There's no support for asynchronous
notifications, although that could obviously have been implemented, too. Let's
grade this protocol...
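But first, to make the discussion concrete, here's a minimal sketch of an MPD
session in Haskell, assuming the default TCP port 6600 and the `network`
package. A real client would read until the terminating `OK` line rather than
relying on a single `recv`:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Network.Socket
import Network.Socket.ByteString (sendAll, recv)
import qualified Data.ByteString.Char8 as B

main :: IO ()
main = do
  -- MPD listens on TCP port 6600 by default.
  addr:_ <- getAddrInfo (Just defaultHints { addrSocketType = Stream })
                        (Just "localhost") (Just "6600")
  sock <- socket (addrFamily addr) (addrSocketType addr) (addrProtocol addr)
  connect sock (addrAddress addr)
  recv sock 4096 >>= B.putStr   -- greeting, e.g. "OK MPD 0.19.0"
  sendAll sock "status\n"       -- a request is a single line
  recv sock 4096 >>= B.putStr   -- "key: value" lines, terminated by "OK"
  close sock
```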
**Easy?** Not really.
: Although MPD has conventions for how messages are formatted, each individual
message still requires custom parsing and validation. This can be automated by
designing an
[IDL](https://en.wikipedia.org/wiki/Interface_description_language) and
accompanying code generator, but writing one specific for a single project
doesn't seem like a particularly fun task.
The protocol, despite its apparent simplicity, is painful enough to use that
there is a special _libmpdclient_ library to abstract away the communication
with MPD, and interfaces to this library are available in many programming
languages. If you have access to such an application-specific library for your
language of choice, then sure, using the IPC mechanism is easy enough. But that
applies to literally any IPC mechanism.
Ideally, such a library needs to be written only once for the IPC mechanism in
use, and after that no additional code is needed to communicate with
services/daemons using that particular mechanism. Code re-use among different
projects is great, yo. A custom protocol also doesn't scale very well when
extending the services offered by the daemon: any addition to the API requires
modifications to all client implementations.
**Simple?** Definitely.
: I only needed a quick glance at the MPD protocol reference and I was able to
play a bit with telnet and control my MPD. Writing an implementation doesn't
seem like a complex task. Of course, this doesn't necessarily apply to all
custom protocols, but you can make it as simple or complex as you want it to
be.
**Small?** Sure.
: This obviously depends on how elaborate your protocol is. If you have a
large or complex API, a single generic message parser and validator can easily
pay for itself compared to the custom parsing and validation needed for each
message type. But for a simple API, it's hard to beat a custom protocol in
terms of size.
**Language independent?** Depends.
: Of course, a socket library is available in most programming languages, and
in that sense any IPC mechanism built on sockets is language independent. The
real question is how convenient it is to speak the protocol directly, rather
than through a library that abstracts the protocol away. In the case of MPD,
the text-based protocol seems easy enough to use directly from most languages,
yet for some reason most people prefer language-specific libraries for MPD.
If you design a binary protocol or anything more complex than simple
request/response message types, using your protocol directly is going to be a
pain in certain languages, and people will definitely want a library specific
to your daemon for their favourite programming language. Something you'll want
to avoid, I suppose.
**Networked?** Sure enough.
: Just a switch between UNIX sockets and TCP sockets. Whether a simple solution
like that is a good idea, however, depends on the next point...
**Secure?** Ugh.
: Security is hard to get right, so having an existing infrastructure that takes
care of most security sensitive features will help a lot. Implementing your own
protocol means that you also have to implement your own security, to some
extent at least.
Writing code to parse and validate custom messages is error-prone, and a bug in
this code could make both the daemon and the client vulnerable to crashes and
buffer overflows. A statically-typed abstraction that handles parsing and
validation would help a lot.
For networked communication, you'll need some form of confidentiality. MPD does
not seem to support this, so any networked access to an MPD server is
vulnerable to passive observers and MITM attacks. This may be fine for a local
network (presumably what it is intended to be used for), but certainly doesn't
work for exposing your MPD control interface to the wider internet. Existing
protocols such as TLS or SSH can be used to create a secure channel, but the
libraries that implement them tend to be large and hard to use securely. This
is especially true for TLS, but at least there's
[stunnel](https://www.stunnel.org/) to simplify the implementation - at the
cost of less convenient deployment.
In terms of authentication, you again need to implement this yourself. MPD
supports authentication using a plain-text password. This is fine for a trusted
network, but on an untrusted network you certainly want confidentiality to
prevent a random observer from reading your password.
**Fast?** Sure.
: Existing protocols may have put more effort into profiling and implementing
various optimizations than one would typically do with a custom and
quickly-hacked-together protocol, but still, it probably takes effort to design
a protocol that isn't fast enough.
**Proxy support?** Depends...
: Really depends on how elaborate you want to be. It can be very simple if all
you want is to route some messages, it can get very complex if you want to
ensure that these messages follow some format or if you want to reserve certain
interfaces or namespaces to certain clients. What surprised me about the MPD
protocol is that it actually has [some support for
proxying](http://www.musicpd.org/doc/protocol/ch03s11.html). But considering the
ad-hoc nature of the MPD protocol, the primitiveness and simplicity of this
proxy support wasn't too surprising. Gets the job done, I suppose.
Overall, and as a rather obvious conclusion, a custom protocol really is what
you make of it. In general, though, it's a lot of work, not always easy to use,
and a challenge to get the security part right.
# D-Bus
D-Bus is being used in [Transmission](https://www.transmissionbt.com/) and is
what I used for [Globster](/globster).
On a quick glance, D-Bus looks _perfect_. It is high-level, has the messaging
patterns I described, the [protocol
specification](http://dbus.freedesktop.org/doc/dbus-specification.html) does
not seem _overly_ complex (though it certainly could be simplified), it has
implementations for a number of programming languages, has support for
networking, proxying is part of normal operation, and it seems fast enough for
most purposes. When you actually give it a closer look, however, reality isn't
as rose-colored.
D-Bus is designed for two very specific use-cases. One is to allow local
applications to securely interact with system-level daemons such as
[HAL](https://en.wikipedia.org/wiki/HAL_\(software\)) (now long dead) and
[systemd](http://freedesktop.org/wiki/Software/systemd/), and the other
use-case is to allow communication between different applications inside one
login session. As such, a typical Linux system runs two D-Bus daemons where
applications can export interfaces and through which messages are routed.
These are called the _system bus_ and the _session bus_.
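Before grading, here's a rough sketch of what a simple method call looks like
from code, assuming the API of the Haskell `dbus` package; it lists the names
registered on the session bus. Note how the destination (bus name), object path
and interface are all separate namespaces that have to be spelled out:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import DBus
import DBus.Client

main :: IO ()
main = do
  client <- connectSession
  -- Call org.freedesktop.DBus.ListNames on the bus daemon itself.
  reply <- call_ client
    (methodCall "/org/freedesktop/DBus" "org.freedesktop.DBus" "ListNames")
      { methodCallDestination = Just "org.freedesktop.DBus" }
  print (methodReturnBody reply)
```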
**Easy?** Almost.
: The basic ideas behind D-Bus seem easy enough to use. The fact that it has
type-safe messages, interface descriptions and introspection really helps in
making D-Bus a convenient IPC mechanism.
The main reason D-Bus isn't all that easy to use in practice is the lack of
good introductory documentation and the crappy state of the various D-Bus
implementations. There is a [fairly good
article](https://pythonhosted.org/txdbus/dbus_overview.html) providing a
high-level overview of D-Bus, but there isn't a lot of material that covers how
to actually use D-Bus to interact with applications or to implement a service.
On the implementation side, I have had rather bad experiences with the actual
libraries. I've personally used the official libdbus-1, which markets itself as
a "low-level" library designed to facilitate writing bindings for other
languages. In practice, the functionality that it offers appears to be too
high-level for writing bindings ([GDBus](https://developer.gnome.org/glib/)
doesn't use it for this reason), and it is indeed missing a lot of
functionality to make it convenient to use directly. I've also played around
with Perl's [Net::DBus](http://search.cpan.org/perldoc?Net%3A%3ADBus) and was
highly disappointed. Not only is the documentation rather incomplete, the
actual implementation has more bugs than features. And instead of building on
top of one of the many good event loops for Perl (such as
[AnyEvent](http://search.cpan.org/perldoc?AnyEvent)), it chooses to implement
[its own event
loop](http://search.cpan.org/perldoc?Net%3A%3ADBus%3A%3AReactor). The existence
of several different libraries for Python doesn't incite much confidence,
either.
I was also disappointed by the available tooling to help with the
development, testing and debugging of services. The
[gdbus(1)](http://man.he.net/man1/gdbus) tool is useful
for monitoring messages and scripting some things, but is not all that
convenient because D-Bus has too many namespaces and the terrible Java-like
naming conventions make typing everything out a rather painful experience.
[D-Feet](http://live.gnome.org/DFeet/) offers a great way to explore services,
but lacks functionality for quick debugging sessions. I [made an
attempt](http://g.blicky.net/dbush.git/) to write a convenient command-line
shell, but lost interest halfway. :-(
D-Bus has the potential to be an easy and convenient IPC mechanism, but the
lack of any centralized organization offering good implementations,
documentation and tooling makes it a pain to use in practice.
**Simple?** Not quite.
: D-Bus is conceptually easy and the message protocol is alright, too. Some
aspects of D-Bus, however, are rather more complex than they need to be.
I have once made an attempt to fully understand how D-Bus discovers and
connects to the session bus, but I gave up halfway because there are too
many special cases. To quickly summarize what I found, there's the
`DBUS_SESSION_BUS_ADDRESS` environment variable which could point to the
(filesystem or abstract) path of a UNIX socket or a TCP address. If that
variable isn't set, D-Bus will try to connect to your X server and get the
address from that. In order to avoid linking everything against X
libraries, a separate [dbus-launch](https://metacpan.org/pod/dbus-launch)
utility is spawned instead. Then the bus address could also be obtained
from a file in your `$HOME/.dbus/` directory, with added complexity to
still support a different session bus for each X session. I've no idea how
exactly connection initiation to the system bus works, but my impression is
that a bunch of special cases exist there, too, depending on which init
system your OS happens to use.
As if all the options in connection initiation aren't annoying enough,
there's also work on [kdbus](https://lwn.net/Articles/580194/), a Linux
kernel implementation to get better performance. Not only will kdbus use a
different underlying communication mechanism, it will also switch to a
completely different serialization format. If/when this becomes widespread
you will have to implement and support two completely different protocols
and pray that your application works with both.
On the design aspect there is, in my opinion, needless complexity with
regards to naming and namespaces. First there is a global namespace for
_bus names_, which are probably better called _application names_, because
that's usually what they represent. Then, there is a separate _object_
namespace local to each bus name. Each object has methods and properties,
and these are associated with an _interface name_, in a namespace specific
to the particular object. Despite these different namespaces, the
convention is to use a full and globally unique path for everything that
has a name. For example, to list the IM protocols that Telepathy supports,
you call the `ListProtocols` method in the
`org.freedesktop.Telepathy.ConnectionManager` interface on the
`/org/freedesktop/Telepathy/ConnectionManager` object at the
`org.freedesktop.Telepathy` bus. Fun times indeed. I can understand the
reasoning behind most of these choices, but in my opinion they found the
wrong trade-off.
Another point of complexity that annoys me is the fact that an XML format
is used to describe interfaces. Supporting XML as an IDL format is alright,
but requiring a separate format for an introspection interface gives me the
impression that the message format wasn't powerful enough for such a simple
purpose. The direct effect of this is that any application wishing to use
introspection data will have to link against an XML parser, and almost all
conforming XML parser implementations are as large as the D-Bus
implementation itself.
**Small?** Kind of.
: `libdbus-1.so.3.8.6` on my system is about 240 KiB. It doesn't cover parsing
interface descriptions or implementing a D-Bus daemon, but still covers most of
what is needed to interact with services and to offer services over D-Bus.
It's not _that_ small, but then again, libdbus-1 was not really written with
small size in mind. There's room for optimization.
**Language independent?** Sure.
: D-Bus libraries exist for a number of programming languages.
**Networked?** Half-assed.
: D-Bus _officially_ supports networked connections to a D-Bus daemon. Actually
using this, however, is painful. Convincing
[dbus-daemon(1)](http://man.he.net/man1/dbus-daemon) to accept connections
on a TCP socket involves disabling all authentication (it expects UNIX
credential passing, normally) and requires adding an undocumented
`<allow_anonymous/>` tag in the configuration (I only figured this out from
reading the source code).
Even when you've gotten that to work, there is the problem that D-Bus isn't
totally agnostic to the underlying socket protocol. D-Bus has support for
passing UNIX file descriptors over the connection, and this of course doesn't
work over TCP. While this feature is optional and easily avoided, some services
(I can't find one now) use UNIX fds in order to keep track of processes that
listen to a certain event. Obviously, those services can't be accessed over the
network.
**Secure?** Only locally.
: D-Bus has statically typed messages that can be validated automatically, so
that's a plus.
For local authentication, there is support for standard UNIX permissions and
credential passing for more fine-grained authorization. For remote
authentication, I think there is support for a shared secret cookie, but I
haven't tried to use this yet.
There is, as with MPD, no support at all for confidentiality, so using
networked D-Bus over an untrusted network would be a very bad idea anyway.
**Fast?** Mostly.
: The messaging protocol is fairly lightweight, so no problems there. I do have
to mention two potential performance issues, however.
The first issue is that the normal mode of operation in D-Bus is to proxy all
messages through an intermediate D-Bus daemon. This involves extra context
switches and message parsing passes in order to get one message from
application A to application B. I believe it is _officially_ supported to
bypass this daemon and to communicate directly between two processes, but after
my experience with networking I am wary of trying anything that isn't part of
how D-Bus is _intended_ to be used. This particular performance issue is what
kdbus addresses, so I suppose it won't apply to future Linux systems.
The other issue is that a daemon that provides a service over D-Bus does not
know whether any application is interested in receiving its notifications.
This means that the daemon always has to spend resources to send
out notification messages, even if no application is actually interested in
receiving them. In practice this means that the notification mechanism is
avoided for events that may occur fairly often, and an equally inefficient
polling approach has to be used instead. It is possible for a service provider
to keep track of interested applications, but this is not part of the D-Bus
protocol and not something you would want to implement for each possible event.
I've no idea if kdbus addresses this issue, but it would be stupid not to.
**Proxy support?** Yup.
: It's part of normal operation, even.
D-Bus has many faults; some of them are by design, but many are fixable. I
would have contributed to improving the situation, but I get the feeling that
the goals of the D-Bus maintainers are not at all aligned with mine. My
impression is that they are far too focussed on their own specific needs and
care little about projects with slightly different needs. Especially with the
introduction of kdbus, D-Bus has become too complex to be worth the effort of
improving. Starting from scratch seems like less work.
# JSON/XML RPC
While I haven't extensively used JSON-RPC or XML-RPC myself, it's still an
interesting alternative to study.
[Transmission](https://www.transmissionbt.com/) uses JSON-RPC
([spec](https://trac.transmissionbt.com/browser/trunk/extras/rpc-spec.txt)) as
its primary IPC mechanism, and [RTorrent](http://rakshasa.github.io/rtorrent/)
has support for an optional XML-RPC interface. (Why do I keep referencing
torrent clients? Surely there are other interesting applications? Oh well.)
The main selling point of HTTP-based IPC is that it is accessible from
browser-based applications, assuming everything has been set up correctly. This
is a nice advantage, but lack of this support is not really a deal-breaker for
me. Browser-based applications can still use any other IPC mechanism, as long
as there are browser plugins or some form of proxy server that converts the
messages of the IPC mechanism to something that is usable over HTTP. For
example, both solutions exist for D-Bus, in the form of the [Browser DBus
Bridge](http://sandbox.movial.com/wiki/index.php/Browser_DBus_Bridge) and
[cloudeebus](https://github.com/01org/cloudeebus). Of course, such solutions
typically aren't as convenient as native HTTP support.
Since HTTP is, by design, purely request-response, JSON-RPC and XML-RPC don't
generally support asynchronous notifications. It's possible to still get
asynchronous notifications by using
[WebSockets](https://en.wikipedia.org/wiki/WebSocket) (Ugh, opaque stream
sockets, time to go back to our [custom protocol](#custom-protocol)) or by
having the client implement an HTTP server itself and send its URL to the
service provider (this is known as a
[callback](https://duckduckgo.com/?q=web%20service%20callback) in the
[SOAP](https://en.wikipedia.org/wiki/SOAP) world. I have a lot of respect for
developers who can put up with that crap). As I already hinted, neither
solution is simple or easy.
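For reference, this is roughly what a request/response pair looks like in
generic JSON-RPC 2.0; the method name is made up, and Transmission's dialect
differs in the details:

```
--> {"jsonrpc": "2.0", "method": "torrent-list", "params": {"fields": ["name"]}, "id": 1}
<-- {"jsonrpc": "2.0", "result": [{"name": "some.torrent"}], "id": 1}
```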
Let's move on to the usual grading...
**Easy?** Sure.
: The ubiquity of HTTP, JSON and XML on the internet means that most developers
are already familiar with using them. And even if you aren't, there are so many
easy-to-use and well-documented libraries available that you're ready to go in
a matter of minutes.
Although interface description languages/formats exist for XML-RPC (and
possibly for JSON-RPC, too), I get the impression these are not often used
outside of the SOAP world. As a result, interacting with such a service tends
to be weakly/stringly typed, which, I imagine, is not as convenient in strongly
typed programming languages.
**Simple?** Not really.
: Many people have the impression that HTTP is somehow a simple protocol. Sure,
it may look simple on the wire, but in reality it is a hugely bloated and
complex protocol. I strongly encourage everyone to read through [RFC
2616](https://tools.ietf.org/html/rfc2616) at least once to get an idea of its
size and complexity. To make things worse, there's a lot of recent activity to
standardize on a next generation HTTP
([SPDY](https://en.wikipedia.org/wiki/SPDY) and [HTTP
2.0](https://en.wikipedia.org/wiki/HTTP_2.0)), but I suppose we can ignore these
developments for the foreseeable future for the use case of IPC.
Of course, a lot of the functionality specified for HTTP is optional and can be
ignored for the purpose of IPC, but that doesn't mean that these options don't
exist. When implementing a client, it would be useful to know exactly which
HTTP options the server supports. It would be wasteful to implement compression
support if the server doesn't support it, or keep-alive, or content
negotiation, or ranged requests, or authentication, or correct handling for all
response codes when the server will only ever send 'OK'. What also commonly
happens is that server implementors want to support as much as possible, to the
point that you can have JSON or XML output, depending on what the client
requested.
XML faces a similar problem. The format looks simple, but the specification has
a bunch of features that hardly anyone uses. In contrast to HTTP, however, a
correct XML parser can't just decide to not parse `<!DOCTYPE ..>` stuff,
so it _has_ to implement some of this complexity.
On the upside, JSON is a really simple serialization format, and if you're
careful enough to only implement the functionality necessary for basic HTTP, a
JSON-RPC implementation _can_ be somewhat simple.
**Small?** Not really.
: What typically happens is that implementors take an existing HTTP library and
build on top of that. A generic HTTP library likely implements a lot more than
necessary for IPC, so that's not going to be very small. RTorrent, for example,
makes use of the not-very-small [xmlrpc-c](http://xmlrpc-c.sourceforge.net/),
which in turn uses [libcurl](http://curl.haxx.se/) (400 KiB, excluding TLS
library) and either the bloated [libxml2](http://xmlsoft.org/) (1.5 MiB) or
[libexpat](http://www.libexpat.org/) (170 KiB). In any case, expect your
programs to grow by a megabyte or more if you go this route.
Transmission seems rather less bloated. It uses the HTTP library that is built
into [libevent](http://libevent.org/) (totalling ~500 KiB, but libevent is also
used for other networking parts), and a simple JSON parser can't be that large
either. I'm sure that if you reimplement everything from scratch for the
purpose of building an API, you could get something much smaller. Then again,
even if you manage to shrink the size of the server that way, you can't expect
all your users to do the same.
If HTTPS is to be supported, add ~500 KiB more. TLS isn't the simplest
protocol, either.
**Language independent?** Yes.
: Almost every language has libraries for web stuff.
**Networked?** Definitely.
: In fact, I've never seen anyone use XML/JSON RPC over UNIX sockets.
**Secure?** Alright.
: HTTP has built-in support for authentication, but it also isn't uncommon to use
some other mechanism (based on cookies, I guess?).
Confidentiality can be achieved with HTTPS. There is the problem of verifying
the certificate, since I doubt anyone is going to have certificates of their
local applications signed by a certificate authority, but there's always the
option of trust-on-first-use. Custom applications can also include a
fingerprint of the server certificate in the URL for verification, but this
won't work for web apps.
**Fast?** No.
: JSON/XML RPC messages add significant overhead on the wire and require more
parsing than a simple custom solution or D-Bus. I wouldn't really call it
_fast_, but admittedly, it might still be _fast enough_ for most purposes.
**Proxy support?** Sure.
: HTTP has native support for proxying, and it's always possible to proxy some
URI on the main server to another server, assuming the libraries you use
support that. It's not necessarily simple to implement, however.
The lack of asynchronous notifications and the overhead and complexity of
JSON/XML RPC make me stay away from it, but it certainly is a solution that
many client developers will like because of its ease of use.
# Other Systems
There are more alternatives out there than I have described so far. Most of
those I dismissed early on because they're either incomplete solutions or
specific to a single framework or language. I'll still mention a few here.
## Message Queues
In the context of IPC I see that message queues such as
[RabbitMQ](https://www.rabbitmq.com/) and [ZeroMQ](http://zeromq.org/) are
quite popular. I can't say I have much experience with any of these, but these
MQs don't seem to offer a solution to the problem I described in the
introduction. My impression of MQs is that they offer a higher-level and more
powerful alternative to TCP and UDP. That is, they route messages from one
endpoint to another. The contents of the messages are still completely up to
the application, so you're still on your own in implementing an RPC mechanism
on top of that. And for the purpose of building a simple RPC mechanism, I'm
convinced that plain old UNIX sockets or TCP will do just fine.
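To illustrate what being "on your own" means in practice, here's a sketch in
Haskell of the bookkeeping an RPC layer on top of a bare message transport
needs: correlation ids, a table of pending requests, and a rule to tell replies
apart from notifications. The `send` argument and the wire format are made-up
stand-ins for whatever the MQ library provides:

```haskell
import Control.Concurrent.MVar
import Data.IORef
import qualified Data.Map.Strict as Map

type MsgId   = Int
type Pending = IORef (Map.Map MsgId (MVar String))

-- Issue a request and block until the matching reply arrives.
request :: Pending -> (String -> IO ()) -> MsgId -> String -> IO String
request pending send msgId body = do
  slot <- newEmptyMVar
  modifyIORef' pending (Map.insert msgId slot)
  send (show msgId ++ " " ++ body)   -- made-up wire format: "<id> <payload>"
  takeMVar slot

-- Called from the receive loop: hand a reply to the waiting caller,
-- or treat the message as an unsolicited notification.
dispatch :: Pending -> MsgId -> String -> IO ()
dispatch pending msgId body = do
  waiting <- readIORef pending
  case Map.lookup msgId waiting of
    Just slot -> modifyIORef' pending (Map.delete msgId) >> putMVar slot body
    Nothing   -> putStrLn ("notification: " ++ body)
```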
## Cap'n Proto
I probably should be spending a full chapter on [Cap'n
Proto](http://kentonv.github.io/capnproto/) instead of this tiny little section,
but I'm simply not familiar enough with it to offer any deep insights. I can
still offer my blatantly uninformed impression of it: It looks very promising,
but puts, in my opinion, too much emphasis on performance and too little
emphasis on ease of use. It lacks introspection and requires that clients have
already obtained the schema of the service in order to interact with it. It
also uses a capability system to handle authorization, which, despite being
elegant and powerful, increases complexity and cognitive load (though I
obviously need more experience to quantify this). It still lacks
confidentiality for networked access and the number of bindings to other
programming languages is limited, but these problems can be addressed.
Cap'n Proto seems like the ideal IPC mechanism for internal communication
within a single (distributed) application and offers a bunch of unique features
not found in other RPC systems. But it doesn't feel quite right as an easy API
for others to use.
## CORBA
CORBA has been used by the GNOME project in the past, and was later abandoned
in favour of D-Bus, primarily (I think) because CORBA was deemed too [complex
and incomplete](http://dbus.freedesktop.org/doc/dbus-faq.html#corba). A system
that is deemed more complex than D-Bus is an immediate red flag. The [long and
painful history of CORBA](http://queue.acm.org/detail.cfm?id=1142044) also makes
me want to avoid it, if only because that makes it very hard to judge the
quality and modernity of existing implementations.
## Project Tanja
A bit over two years ago I was researching the same problem, but from a much
more generic angle. The result of that was a project that I called Tanja. I
described its concepts [in an earlier article](/doc/commvis), and wrote an
incomplete [specification](https://g.blicky.net/tanja.git/) along with
implementations in [C](https://g.blicky.net/tanja-c.git/),
[Go](https://g.blicky.net/tanja-go.git/) and
[Perl](https://g.blicky.net/tanja-perl.git/). I consider project Tanja a
failure, primarily because of its genericity. It supported too many
communication models, and the lack of a specification as to which model was
used, together with the lack of any guarantee that the model was actually
followed, made Tanja hard to use in practice. It was a very interesting
experiment, but not
something I would actually use. I learned the hard way that you sometimes have
to move some complexity down into a lower abstraction layer in order to keep
the complexity in check at higher layers of abstraction.
# Conclusions
This must be the longest rant I've written so far.
In any case, there isn't really a perfect IPC mechanism for my use case. A
custom protocol involves reimplementing a lot of stuff, D-Bus is a pain, and
JSON/XML RPC are bloat.
I am still undecided on what to do. I have a lot of ideas as to what a perfect
IPC solution would look like, both in terms of features and in how to implement
it, and I feel like I have enough experience by now to actually develop a
proper solution. Unfortunately, writing a complete IPC system with the required
utilities and language bindings takes **a lot** of time and effort. It's not
really worth it if I am the only one using it.
So here is my plea to you, dear reader: If you know of any existing solutions
I've missed, please tell me. If you empathize with me and want a better
solution to this problem, please get in touch as well! I'd love to hear about
projects which face similar problems and have similar requirements.

517
dat/doc/funcweb.md Normal file
View file

@ -0,0 +1,517 @@
% An Opinionated Survey of Functional Web Development
(Published on **2017-05-28**)
# Intro
TL;DR: In this article I provide an overview of the frameworks and libraries
available for creating websites in statically-typed functional programming
languages.
I recommend you now skip directly to the next section, but if you're interested
in some context and don't mind a rant, feel free to read on. :-)
**&lt;Rant mode>**
When compared to native desktop application development, web development just
sucks. Native development is relatively simple with toolkits such as
[Qt](https://www.qt.io/), [GTK+](https://www.gtk.org/) and others: You have
convenient widget libraries, and you can describe your entire application, from
interface design to all behavioural aspects, in a single programming language.
You're also largely free to structure code in whichever way makes most sense.
You can describe what a certain input field looks like, what happens when the
user interacts with it and what will happen with the input data, all succinctly
in a single file. There are even drag-and-drop UI builders to speed up
development.
Web development is the exact opposite of that. There are several different
technologies you're forced to work with even when creating the most mundane
website, and there's a necessary but annoying split between code that runs on
the server and code that runs in the browser. Creating a simple input field
requires you to consider and maintain several ends:
- The back end (server-side code) that describes how the input field interacts
with the database.
- Some JavaScript code to describe how the user can interact with the input
field.
- Some CSS to describe what the input field looks like.
- And then there's HTML to act as a glue between the above.
In many web development setups, all four of the above technologies are
maintained in different files. If you want to add, remove or modify an input
field, or just about anything else on a page, you'll be editing at least four
different files with different syntax and meaning. I don't know how other
developers deal with this, but the only way I've been able to keep these places
synchronized is to just edit one or two places, test if it works in a browser,
and then edit the other places accordingly to fix whatever issues I find. This
doesn't always work well: I don't get a warning if I remove an HTML element
somewhere and forget to also remove the associated CSS. Heck, in larger
projects I can't even tell whether it's safe to remove or edit a certain line
of CSS because I have no way to know for sure that it's not still being used
elsewhere. Perhaps this particular case can be solved with proper organization
and discipline, but similar problems exist with the other technologies.
Yet despite that, why do I still create websites in my free time? Because it is
the only environment with high portability and low friction - after all, pretty
much anyone can browse the web. I would not have been able to create a useful
"[Visual Novel Database](https://vndb.org/)" any other way than through a
website. And the entire purpose of [Manned.org](https://manned.org/) is to
provide quick access to man pages from anywhere, which is not easily possible
with native applications.
**&lt;/Rant mode>**
Fortunately, I am not the only one who sees the problems with the "classic"
development strategy mentioned above. There are many existing attempts to
improve on that situation. A popular approach to simplify development is the
[Single-page
application](https://en.wikipedia.org/wiki/Single-page_application) (SPA). The
idea is to move as much code as possible to the front end, and keep only a
minimal back end. Both the HTML and the entire behaviour of the page can be
defined in the same language and same file. With libraries such as
[React](https://facebook.github.io/react/) and browser support for [Web
components](https://developer.mozilla.org/en-US/docs/Web/Web_Components), the
split between files described above can be largely eliminated. And if
JavaScript isn't your favorite language, there are many alternative languages
that compile to JavaScript. (See [The JavaScript
Minefield](http://walkercoderanger.com/blog/2014/02/javascript-minefield/) for
an excellent series of articles on that topic).
While that approach certainly has the potential to make web development more
pleasant, it has a very significant drawback: Performance. For some
applications, such as web based email clients or CRM systems, it can be
perfectly acceptable to have a megabyte of JavaScript as part of the initial
page load. But for most other sites, such as this one, or the two sites I
mentioned earlier, or sites like Wikipedia, a slow initial page load is
something I consider to be absolutely unacceptable. The web can be really fast,
and developer laziness is not a valid excuse to ruin it. (If you haven't seen
or read [The Website Obesity
Crisis](http://idlewords.com/talks/website_obesity.htm) yet, please do so now).
I'm much more interested in the opposite approach to SPA: Move as much code as
possible to the back end, and only send a minimal amount of JavaScript to the
browser. This is arguably how web development has always been done in the past,
and there's little reason to deviate from it. The difference, however, is that
people tend to expect much more "interactivity" from web sites nowadays, so the
amount of JavaScript is increasing. And that is alright, so long as the
JavaScript doesn't prevent the initial page from loading quickly. But this
increase in JavaScript does amplify the "multiple files" problem I ranted about
earlier.
So my ideal solution is a framework where I can describe all aspects of a site
in a single language, and organize the code among files in a way that makes
sense to me. That is, I want the same kind of freedom that I get with native
desktop software development. Such a framework should run on the back end, and
automatically generate efficient JavaScript and, optionally, CSS for the front
end. As an additional requirement (or rather, strong preference), all this
should be in a statically-typed language - because I am seemingly incapable of
writing large reliable applications with dynamic typing - and in a language
from functional heritage - because programming in functional languages has
spoiled me.
I'm confident that what I describe is possible, and it's evident that I'm not
the only person to want this, as several (potential) solutions like this do
indeed exist. I've been looking around for these solutions and have
experimented with a few that looked promising. This article provides an
overview of what I have found so far.
# OCaml
My adventure began with [OCaml](https://ocaml.org/). It's been a few years
since I last used OCaml for anything, and development on the language and its
ecosystem has been anything but halted. [Real World
OCaml](https://realworldocaml.org/) has been a great resource to get me up to
speed again.
## Ocsigen
For OCaml there is one project that has it all: [Ocsigen](http://ocsigen.org/).
It comes with an OCaml to JavaScript compiler, a web server, several handy
libraries, and a [framework](http://ocsigen.org/eliom/) to put everything
together. Its [syntax
extension](http://ocsigen.org/eliom/6.2/manual/ppx-syntax) allows you to mix
front and back end code, and you can easily share code between both ends. The
final result is a binary that runs the server and a JavaScript file that
handles everything on the client side.
The framework comes with an embedded DSL with which you can conveniently
generate HTML without actually typing HTML. And best of all, this DSL works on
both the client and the server: On the server side it generates an HTML string
that can be sent to the client, and running the same code on the client side
will result in a DOM element that is ready to be used.
Ocsigen makes heavy use of the OCaml type system to statically guarantee the
correctness of various aspects of the application. The HTML DSL not only
ensures that the generated HTML is well-formed, but also prevents you from
incorrectly nesting certain elements and from using the wrong attributes on the
wrong elements. Similarly, an HTML element generated on the server side can be
referenced from client side code without having to manually assign a unique ID
to the element. This prevents accidental typos in the ID naming and guarantees
that the element that the client side code refers to actually exists. URL
routing and links to internal pages are also checked at compile time.
Ocsigen almost exactly matches what I previously described as the perfect
development framework. Unfortunately, it has a few drawbacks:
- The generated JavaScript is quite large, a bit over 400 KiB for a hello
  world. In my brief experience with the framework, this also results in a
  noticeably slower page load. I don't know if it was done for performance
  reasons, but subsequent page views are by default performed via in-browser
  XHR requests, which don't require all the JavaScript to be re-parsed and
  evaluated, and are thus much faster. This, however, doesn't work well if the
  user opens pages in multiple tabs or reloads a page for whatever reason. And
  as I mentioned, I care a lot about the initial page loading time.
- The framework has a steep learning curve, and the available documentation is
  nowhere near complete enough to help you. I've found myself wondering many
  times how I was supposed to use a certain API and have had to look for
example code for enlightenment. At some point I ended up just reading the
source code instead of going for the documentation. What doesn't help here is
that, because of the heavy use of the type system to ensure code correctness,
most of the function signatures are far from intuitive and are sometimes very
hard to interpret. This problem is made even worse with the generally
unhelpful error messages from the compiler. (A few months with
[Rust](https://www.rust-lang.org/) and its excellent error messages has
really spoiled me on this aspect, I suppose).
- I believe they went a bit too far with the compile-time verification of
certain correctness properties. Apart from making the framework harder to
learn, it also increases the verbosity of the code and removes a lot of
flexibility. For instance, in order for internal links to be checked, you
  have to declare your URLs (or _services_, as they call them) somewhere
  central such that the view part of your application can access them. Then
  elsewhere you
have to register a handler to that service. This adds boilerplate and
enforces a certain code structure. And the gain of all this is, in my
opinion, pretty small: In the 15 years that I have been building web sites, I
don't remember a single occurrence where I mistyped the URL in an internal
link. I do suppose that this feature makes it easy to change URLs without
causing breakage, but there is a trivial counter-argument to that: [Cool URIs
don't change](https://www.w3.org/Provider/Style/URI.html). (Also, somewhat
ironically, I have found more dead internal links on the Ocsigen website than
on any other site I have visited in the past year, so perhaps this was indeed
a problem they considered worth fixing. Too bad it didn't seem to work out so
well for them).
Despite these drawbacks, I am really impressed with what the Ocsigen project
has achieved, and it has set a high bar for the future frameworks that I will
be considering.
# Haskell
I have always seen Haskell as that potentially awesome language that I just
can't seem to wrap my head around, despite several attempts in the past to
learn it. Apparently the only thing I was missing in those attempts was a
proper goal: When I finally started playing around with some web frameworks I
actually managed to get productive in Haskell with relative ease. What also
helped me this time was a practical introductory Haskell reference, [What I
Wish I Knew When Learning Haskell](http://dev.stephendiehl.com/hask/), in
addition to the more theoretical [Learn You A Haskell for Great
Good](http://learnyouahaskell.com/).
Haskell itself already has a few advantages when compared to OCaml: For one, it
has a larger ecosystem, so for any task you can think of there is probably
already at least one existing library. As an example, I was unable to find an
actively maintained SQL DSL for OCaml, while there are several available for
Haskell. Another advantage I found was the much friendlier and more detailed
error messages generated by the Haskell compiler, GHC. In terms of
build systems, Haskell has standardized on
[Cabal](https://www.haskell.org/cabal/), which works alright most of the time.
Packaging is still often complex and messy, but it's certainly improving as
[Stack](http://haskellstack.org/) is gaining more widespread adoption. Finally,
I feel that the Haskell syntax is slightly less verbose, and more easily lends
itself to convenient DSLs.
Despite Haskell's larger web development community, I could not find a single
complete and integrated client/server development framework such as Ocsigen.
Instead, there are a whole bunch of different projects focussing on either the
back end or the front end. I'll explore some of them with the idea that,
perhaps, it's possible to mix and match different libraries and frameworks in
order to get the perfect development environment. And indeed, this seems to be
a common approach in many Haskell projects.
## Server-side
Let's start with a few back end frameworks.
Scotty
: [Scotty](https://github.com/scotty-web/scotty) is a web framework inspired by
[Sinatra](http://www.sinatrarb.com/). I have no experience with (web)
development in Ruby and have never used Sinatra, but it has some similarities
to what I have been using for a long time: [TUWF](https://dev.yorhel.nl/tuwf).
Scotty is a very minimalist framework; it does routing (that is, mapping URLs
to Haskell functions), it has some functions to access request data and some
functions to create and modify a response. That's it. No database handling,
session management, HTML generation, form handling or other niceties. But
that's alright, because there are many generic libraries to help you out there.

Thanks to its minimalism, I found Scotty very easy to learn and get used to.
Even as a Haskell newbie I had a simple website running within a day (a small
sketch follows this list). The documentation is adequate, but the idiomatic way
of combining Scotty with other libraries is through the use of monad
transformers, and a few more examples in this area would certainly have helped.
Spock
: Continuing with the Star Trek franchise, there's
[Spock](https://www.spock.li/). Spock is very similar to Scotty, but comes with
type-safe routing and various other goodies such as session and state
management, [CSRF](https://en.wikipedia.org/wiki/Cross-site_request_forgery)
protection and database helpers.
As with everything that is (supposedly) more convenient, it also comes with a
slightly steeper learning curve. I haven't, for example, figured out yet how to
do regular expression based routing. I don't even know if that's still possible
in the latest version - the documentation isn't very clear. Likewise, it's
unclear to me what the session handling does exactly (Does it store something?
And where? Is there a timeout?) and how that interacts with CSRF protection.
Spock seems useful, but requires more than just a cursory glance.
Servant
: [Servant](http://haskell-servant.github.io/) is another minimalist web
framework, although it is primarily designed for creating RESTful APIs.
Servant distinguishes itself from Scotty and Spock by not just featuring
type-safe routing: it allows you to describe your complete public API as a
type, and to get strongly typed responses for free. This also enables support
for automatically generated documentation and client-side API wrappers.
Servant would be an excellent back end for a SPA, but it does not seem like an
obvious approach to building regular websites.
Happstack / Snap / Yesod
: [Happstack](http://www.happstack.com/), [Yesod](http://www.yesodweb.com/) and
[Snap](http://snapframework.com/) are three large frameworks with many
auxiliary libraries. They all come with a core web server, routing, state and
database management. Many of the libraries are not specific to the framework
and can be used together with other frameworks. I won't go into a detailed
comparison between the three projects because I have no personal experience
with any of them, and fortunately [someone else already wrote a
comparison](http://softwaresimply.blogspot.nl/2012/04/hopefully-fair-and-useful-comparison-of.html)
in 2012 - though I don't know how accurate that still is today.
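As promised, here's what a hello-world with Scotty roughly looks like; a sketch
assuming the Scotty API as I used it, with an arbitrary port and routes:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Web.Scotty
import Data.Monoid ((<>))

main :: IO ()
main = scotty 3000 $ do
  -- Routing: map a URL to a Haskell action.
  get "/" $
    text "Hello, world!"
  -- A captured URL parameter, read back inside the handler.
  get "/greet/:name" $ do
    name <- param "name"
    html $ "<h1>Hello, " <> name <> "</h1>"
```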
So there are a fair number of frameworks to choose from, and they can all work
together with other libraries to implement additional functions. Apart from the
framework, another important aspect of web development is how you generate the
HTML to send to the client. In true Haskell style, there are several answers.
For those who prefer embedded DSLs, there are
[xhtml](http://hackage.haskell.org/package/xhtml),
[BlazeHTML](https://jaspervdj.be/blaze/) and
[Lucid](https://github.com/chrisdone/lucid). The xhtml package is not being
used much nowadays and has been superseded by BlazeHTML, which is both faster
and offers a more readable DSL using Haskell's do-notation. Lucid is heavily
inspired by Blaze, and attempts to [fix several of its
shortcomings](http://chrisdone.com/posts/lucid). Having used Lucid a bit
myself, I can attest that it is easy to get started with and pretty convenient
in use.
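To give an idea, here's a small sketch of Lucid in use: element functions are
suffixed with an underscore, attributes are passed as a list, and `renderText`
produces the final markup:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Lucid
import qualified Data.Text.Lazy.IO as TL

page :: Html ()
page = html_ $ do
  head_ $ title_ "Lucid example"
  body_ $ do
    h1_ "Hello"
    p_ [class_ "intro"] "All of this is plain Haskell."

main :: IO ()
main = TL.putStr (renderText page)
```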
I definitely prefer to generate HTML using DSLs as that keeps the entire
application in a single host language and with consistent syntax, but the
alternative approach, templating, is also fully supported in Haskell. The Snap
framework comes with [Heist](https://github.com/snapframework/heist), which
provides run-time interpreted templates, like similar systems in most other
languages.
Yesod comes with [Shakespeare](http://hackage.haskell.org/package/shakespeare),
which is a type-safe templating system with support for inlining the templates
in Haskell code. Interestingly, Shakespeare also has explicit support for
templating JavaScript code. Too bad that this doesn't take away the need to
write the JavaScript yourself, so I don't see how this is an improvement over
some other JavaScript solution that uses JSON for communication with the back
end.
## Client-side
It is rather unusual to have multiple compiler implementations targeting
JavaScript for the same source language, but Haskell has three of them. All
three can be used to write front end code without touching a single line of
JavaScript, but there are large philosophical differences between the three
projects.
Fay
: [Fay](https://github.com/faylang/fay/wiki) compiles Haskell code directly to
JavaScript. The main advantage of Fay is that it does not come with a large
runtime, resulting in small and efficient JavaScript. The main downside is that it
only [supports a subset of
Haskell](https://github.com/faylang/fay/wiki/What-is-not-supported?). The
result is a development environment that is very browser-friendly, but where
you can't share much code between the front and back ends. You're basically
back to the separated front and back end situation in classic web development,
but at least you can use the same language for both - somewhat.
Fay itself doesn't come with many convenient UI libraries, but
[Cinder](http://crooney.github.io/cinder/index.html) covers that with a
convenient HTML DSL and DOM manipulation library.
Fay is still seeing sporadic development activity, but there is not much of
a lively community around it. Most people have moved on to other solutions.
GHCJS
: [GHCJS](https://github.com/ghcjs/ghcjs) uses GHC itself to compile Haskell to a
low-level intermediate language, and then compiles that language to JavaScript.
This allows GHCJS to achieve excellent compatibility with native Haskell code,
but comes, quite predictably, at the high cost of duplicating a large part of
the Haskell runtime into the JavaScript output. The generated JavaScript code
is typically measured in megabytes rather than kilobytes, which is (in my
opinion) far too large for regular web sites. The upside of this high
compatibility, of course, is that you can re-use a lot of code between the
front and back ends, which will certainly make web development more tolerable.
The community around GHCJS seems to be more active than that of Fay. GHCJS
integrates properly with the Stack package manager, and there are a [whole
bunch](http://hackage.haskell.org/packages/search?terms=ghcjs) of libraries
available.
Haste
: [Haste](https://github.com/valderman/haste-compiler) provides a middle ground
between Fay and GHCJS. Like GHCJS, Haste is based on GHC, but instead of using
low-level compiler output, it uses a higher-level intermediate language. This
results in good compatibility with regular Haskell code while
keeping the output size in check. Haste has a JavaScript runtime of around 60
KiB and the compiled code is roughly as space-efficient as Fay.
While it should be possible to share a fair amount of code between the front
and back ends, not all libraries work well with Haste. I tried to use Lucid
within a Haste application, for example, but that did not work. Apparently one
of its dependencies (probably the UTF-8 codec, as far as I could debug the
problem) performs some low-level performance optimizations that are
incompatible with Haste.
Haste itself still sees sporadic development, but not enough to call the
project alive. The compiler lags behind on the GHC version, and the upcoming
0.6 version has been sitting unreleased, in limbo, in the git repository for at
least four months. The community around Haste is in a similar state. Various
libraries do exist, such as [Shade](https://github.com/takeoutweight/shade)
(HTML DSL, Reactive UI), [Perch](https://github.com/agocorona/haste-perch)
(another HTML DSL), [haste-markup](https://github.com/ajnsit/haste-markup) (yet
another HTML DSL) and
[haste-dome](https://github.com/wilfriedvanasten/haste-dome) (_yet_ another
HTML DSL), but they're all pretty much dead.
Of the three options, only Haste provides enough benefit from code reuse while
remaining efficient enough for the kind of site that I envision. Haste really
deserves more love than it is currently getting.
## More Haskell
In my quest for Haskell web development frameworks and tools, I came across a
few other interesting libraries. One of them is
[Clay](http://fvisser.nl/clay/), a CSS preprocessor as a DSL. This will by
itself not solve the CSS synchronisation problem that I mentioned at the start
of this article, but it could still be used to keep the CSS closer to the code
implementing the rest of the site.
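A tiny sketch of what Clay looks like, as far as I understand its API; `putCss`
prints the rendered stylesheet to stdout:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Clay

site :: Css
site = body ? do
  fontFamily ["Helvetica"] [sansSerif]
  backgroundColor (rgb 245 245 245)

main :: IO ()
main = putCss site
```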
It also would not do to write an article on Haskell web development and not
mention a set of related projects: [MFlow](https://github.com/agocorona/MFlow),
[HPlayground](https://github.com/agocorona/hplayground) and the more recent
[Axiom](https://github.com/transient-haskell/axiom). These are ambitious
efforts at building a very high-level and functional framework for both front
and back end web development. I haven't spent nearly enough time on these
projects to fully understand their scope, but I'm afraid of them being a bit
too high level. This invariably results in reduced flexibility (i.e. too many
opinions being hard-coded in the API) and less efficient JavaScript output.
Axiom being based on GHCJS reinforces the latter concern.
# Other languages
I've covered OCaml and Haskell now, but there are relevant projects in other
languages, too:
PureScript
: [PureScript](http://www.purescript.org/) is the spiritual successor of Fay -
except it does not try to be compatible with Haskell, and in fact
[intentionally deviates from
Haskell](https://github.com/purescript/documentation/blob/master/language/Differences-from-Haskell.md)
at several points. Like Fay, and perhaps even more so, PureScript compiles down
to efficient and small JavaScript.
Being a not-quite-Haskell language, PureScript can't share code between its
front end and a Haskell back end; the differences are simply too large. It is,
however, possible to go in the other direction: PureScript can also run on the
back end in a NodeJS environment. I don't really know how well this
is supported by the language ecosystem, but I'm not sure I'm comfortable with
replacing the excellent quality of Haskell back end frameworks with a fragile
NodeJS back end (or such is my perception, I admittedly don't have too much
faith in most JavaScript-heavy projects).
The PureScript community is very active and many libraries are available in the
[Pursuit](https://pursuit.purescript.org/) package repository. Of note is
[Halogen](https://pursuit.purescript.org/packages/purescript-halogen), a
high-level reactive UI library. One thing to be aware of is that not all
libraries are written with space efficiency as their highest priority, the
simple [Halogen
button](https://github.com/slamdata/purescript-halogen/tree/v2.0.1/examples/basic)
example already compiles down to a hefty 300 KB for me.
Elm
: [Elm](http://elm-lang.org/) is similar to PureScript, but rather than trying to
be a generic something-to-JavaScript compiler, Elm focuses exclusively on
providing a good environment to create web UIs. The reactive UI libraries are
well maintained and part of the core Elm project. Elm has a strong focus on
being easy to learn and comes with good documentation and many examples to get
started with.
Ur/Web
: [Ur/Web](http://www.impredicative.com/ur/) is an ML and Haskell inspired
programming language specifically designed for client/server programming. Based
on its description, Ur/Web is exactly the kind of thing I'm looking for: It
uses a single language for the front and back ends and provides convenient
methods for communication between the two.
This has been a low priority on my to-try list because it seems to be primarily
a one-man effort, and the ecosystem around it is pretty small. Using Ur/Web for
practical applications will likely involve writing your own libraries or
wrappers for many common tasks, such as for image manipulation or advanced text
processing. Nonetheless, I definitely should be giving this a try sometime.
(Besides, who still uses frames in this day and age? :-)
Opa
: I'll be moving out of the functional programming world for a bit.
[Opa](http://opalang.org/) is another language and environment designed for
client/server programming. Opa takes a similar approach to "everything in
PureScript": Just compile everything to JavaScript and run the server-side code
on NodeJS. The main difference with other to-JavaScript compilers is that Opa
supports mixing back end code with front end code, and it can automatically
figure out where the code should be run and how the back and front ends
communicate with each other.
Opa, as a language, is reminiscent of a statically-typed JavaScript with
various syntax extensions. While it does support SQL databases, its database
API seems to strongly favor object-oriented use rather than relational database
access.
GWT
: Previously I compared web development to native GUI application development.
There is no reason why you can't directly apply native development structure
and strategies to the web, and that's exactly what
[GWT](http://www.gwtproject.org/) does. It provides a widget-based programming
environment that eventually runs on the server and compiles the client-side
part to JavaScript. I haven't really considered it further, as Java is not a
language I can be very productive in.
Webtoolkit
: In the same vein, there's [Wt](https://www.webtoolkit.eu/wt). The name might
suggest that it is a web-based clone of Qt, and indeed that's what it looks
like. Wt is written in C++, but there are wrappers for [other
languages](https://www.webtoolkit.eu/wt/other_language). None of the languages
really interest me much, however.
That said, if I had to write a web UI for a resource-constrained device, this
seems like an excellent project to consider.
# To conclude
To be honest, I am a bit overwhelmed at the number of options. On the one hand,
it makes me very happy to see that a lot is happening in this world, and that
alternatives to boring web frameworks do exist. Yet after all this research I
still have no clue what I should use to develop my next website. I do like the
mix and match culture of Haskell, which has the potential to form a development
environment entirely to my own taste and with my own chosen trade-offs. On the
other hand, the client-side Haskell solutions are simply too immature and
integration with the back end frameworks is almost nonexistent.
Almost none of the frameworks I discussed attempt to tackle the CSS problem
that I mentioned in the introduction, so there is clearly room for more
research in this area.
There are a few technologies that I should spend more time familiarizing
myself with. Ur/Web is an obvious candidate here, but perhaps it is possible to
create a Haskell interface to Wt. Or maybe some enhancements to the Haste
ecosystem could be enough to make that a workable solution instead.
576
dat/doc/sqlaccess.md Normal file
@ -0,0 +1,576 @@
% Multi-threaded Access to an SQLite3 Database
(Published on **2011-11-26**)
(Minor 2013-04-06 update: I abstracted my message passing solution from ncdc
and implemented it in a POSIX C library for general use. It's called _sqlasync_
and is part of my [Ylib library collection](/ylib).)
# Introduction
As I was porting [ncdc](/ncdc) over to use SQLite3 as storage backend, I
stumbled on a problem: The program uses a few threads for background jobs, and
it would be nice to give these threads access to the database.
Serializing all database access through the main thread wouldn't have been very
hard to implement in this particular case, but that would have been far from
optimal. The main thread is also responsible for keeping the user interface
responsive and handling most of the network interaction. Overall responsiveness
of the program would significantly improve when the threads could access the
database without involvement of the main thread.
Which brought me to the following questions: What solutions are available for
providing multi-threaded access to an SQLite database? What problems may I run
into? I was unable to find a good overview of this area on the net, so I wrote
this article in the hope of improving that situation.
# SQLite3 and threading
Let's first see what SQLite3 itself has to offer in terms of threading support.
The official documentation mentions threading support several times in various
places, but this information is scattered around and no good overview is given.
Someone has tried to organize this before on a [single
page](http://www.sqlite.org/cvstrac/wiki?p=MultiThreading), and while this
indeed gives a nice overview, it has unfortunately not been updated since 2006.
The advice is therefore a little on the conservative side.
Nonetheless, it is wise to remain portable with different SQLite versions,
especially when writing programs that dynamically link with some random version
installed on someone's system. It should be fairly safe to assume that SQLite
binaries provided by most systems, if not all, are compiled with thread safety
enabled. This doesn't mean all that much, unfortunately: The only thing _thread
safe_ means in this context is that you can use SQLite3 in multiple threads,
but a single database connection should still stay within a single thread.
Since SQLite 3.3.1, which was released in early 2006, it is possible to move a
single database connection between threads. Doing this with older
versions is not advisable, as explained in [the SQLite
FAQ](http://www.sqlite.org/faq.html#q6). But even with 3.3.1 and later there is
an annoying restriction: A connection can only be passed to another thread when
any outstanding statements are closed and finalized. In practice this means
that it is not possible to keep a prepared statement in memory for later
executions.
Since SQLite 3.5.0, released in 2007, a single SQLite connection can be used
from multiple threads simultaneously. SQLite will internally manage locks to
avoid any data corruption. I can't recommend making use of this facility,
however, as there are still many issues with the API. The [error fetching
functions](http://www.sqlite.org/c3ref/errcode.html) and
[sqlite3\_last\_insert\_row\_id()](http://www.sqlite.org/c3ref/last_insert_rowid.html),
among others, are still useless without explicit locking in the application. I
also believe that the previously mentioned restriction on having to finalize
statements has been relaxed in this version, so keeping prepared statements in
memory and passing them among different threads becomes possible.
When using multiple database connections within a single process, SQLite offers
a facility to allow [sharing of its
cache](http://www.sqlite.org/sharedcache.html), in order to reduce memory usage
and disk I/O. The semantics of this feature have changed with different SQLite
versions and appear to have stabilised in 3.5.0. This feature may prove useful
to optimize certain situations, but does not open up new possibilities of
communicating with a shared database.
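For illustration, here is a minimal sketch of enabling this feature. The
wrapper function is my own invention, but `sqlite3_enable_shared_cache()` is
the actual API call, and it has to run before the connections that should
share a cache are opened:

```c
#include <sqlite3.h>

// Enable shared-cache mode for this process. Connections opened after
// this call may share their page cache, reducing memory usage and disk
// I/O when several threads open the same database.
int open_shared(sqlite3 **db) {
  sqlite3_enable_shared_cache(1);
  return sqlite3_open("database.sqlite3", db);
}
```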
# Criteria
Before looking at some available solutions, let's first determine the criteria
we can use to evaluate them.
Implementation size
: Obviously, a solution that requires only a few lines of code to implement is
preferable over one that requires several levels of abstraction in order to be
usable. I won't be giving actual implementations here, so the sizes will be
rough estimates for comparison purposes. The actual size of an implementation
is of course heavily dependent on the programming environment as well.
Memory/CPU overhead
: The most efficient solution for a single-threaded application is to simply have
direct access to a single database connection. Every solution is in principle a
modification or extension of this idea, and will therefore add a certain
overhead. This overhead manifests itself in both increased CPU and memory
usage; how much of each varies between solutions.
Prepared statement re-use
: Is it possible to prepare a statement once and keep using it for the lifetime
of the program? Or will prepared statements have to be thrown away and
recreated every time? Keeping statement handles in memory will result in a nice
performance boost for applications that run the same SQL statement many times
(a short sketch of this pattern follows this list).
Transaction grouping
: A somewhat similar issue to prepared statement re-use: From a performance point
of view, it is very important to try to batch many UPDATE/DELETE/INSERT
statements within a single transaction, as opposed to running each modifying
query separately. Running each query separately will force SQLite to flush the data
to disk separately every time, whereas a single transaction will batch-flush
all the changes to disk in a single go. Some solutions allow for grouping
multiple statements in a single transaction quite easily, while others require
more involved steps.
Background processing
: In certain situations it may be desirable to queue a certain query for later
processing, without explicitly waiting for it to complete. For example, if
something in the database has to be modified as a result of user interaction in
a UI thread, then the application would feel a lot more responsive if the
UPDATE query was simply queued to be processed in a background thread than when
the query had run in the UI thread itself. A database accessing solution with
built-in support for background processing of queries will significantly help
with building a responsive application.
Concurrency
: Concurrency indicates how well the solution allows for concurrent access. The
worst possible concurrency is achieved when a single database connection is
used for all threads, as only a single action can be performed on the database
at any point in time. Maximum concurrency is achieved when each thread has its
own SQLite connection. Note that maximum concurrency doesn't mean that the
database can be accessed in a _fully_ concurrent manner. SQLite uses internal
database-level locks to avoid data corruption, and these will limit the actual
maximum concurrency. I am not too knowledgeable about the inner workings of
these locks, but it is at least possible to have a large number of truly
concurrent database _reads_. Database _writes_ from multiple threads may
still allow for significantly more concurrency than when they are manually
serialized over a single database connection.
Portability
: What is the minimum SQLite version required to implement the solution? Does it
require any special OS features or SQLite compilation settings? As outlined
above, different versions of SQLite offer different features with regards to
threading. Relying on one of the relatively new features will decrease
portability.
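To make the prepared statement criterion a bit more concrete, here is a
minimal sketch of the pattern, assuming `db` is an open connection and `msgs`
an array of `n` strings (error handling omitted):

```c
sqlite3_stmt *stmt;
int i;

// Compile the statement once...
sqlite3_prepare_v2(db, "INSERT INTO log (msg) VALUES (?)", -1, &stmt, NULL);

// ...then execute it many times without re-parsing the SQL.
for(i = 0; i < n; i++) {
  sqlite3_bind_text(stmt, 1, msgs[i], -1, SQLITE_STATIC);
  sqlite3_step(stmt);
  sqlite3_reset(stmt); // make the statement executable again
}
sqlite3_finalize(stmt);
```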
# The Solutions
Here I present four solutions to allow database access from multiple threads.
Note that this list may not be exhaustive, these are just a few solutions that
I am aware of. Also note that none of the solutions presented here are in any
way new. Most of these paradigms date back to the beginnings of concurrent
programming, and have been applied in software for decades.
## Connection sharing
By far the simplest solution to implement: Keep a single database connection
throughout your program and allow every thread to access it. Of course, you
will need to be careful to always put locks around the code where you access
the database handle. An example implementation could look like the following:
```c
// The global SQLite connection
sqlite3 *db;

int main(int argc, char **argv) {
  if(sqlite3_open("database.sqlite3", &db))
    exit(1);
  // start some threads
  // wait until the threads are finished
  sqlite3_close(db);
  return 0;
}

void *some_thread(void *arg) {
  sqlite3_mutex_enter(sqlite3_db_mutex(db));
  // Perform some queries on the database
  sqlite3_mutex_leave(sqlite3_db_mutex(db));
  return NULL;
}
```
Implementation size
: This is where connection sharing shines: There is little extra code required
when compared to using a database connection in a single-threaded context. All
you need to be careful of is to lock the mutex before using the database, and
to unlock it again afterwards.
Memory/CPU overhead
: As the only addition to the single-threaded case are the locks, this solution
has practically no memory overhead. The mutexes are provided by SQLite,
after all. CPU overhead is also as minimal as it can be: mutexes are the most
primitive type provided by threading libraries to serialize access to a shared
resource, and are therefore very efficient.
Prepared statement re-use
: Prepared statements can be safely re-used inside a single enter/leave block.
However, if you want to remain portable with SQLite versions before 3.5.0, then
any prepared statements **must** be freed before the mutex is unlocked. This can
be a major downside if the enter/leave blocks themselves are relatively short
but accessed quite often. If portability with older versions is not an issue,
then this restriction is gone and prepared statements can be re-used easily.
Transaction grouping
: A reliable implementation will not allow transactions to span multiple
enter/leave blocks. So as with prepared statements, transactions need to be
committed to disk before the mutex is unlocked. As with prepared statement
re-use, this limitation may prove to be a significant problem in optimizing
application performance, disk I/O in particular. One way to lessen the effects
of this limitation is to increase the size of a single enter/leave block, thus
allowing for more work to be done in a single transaction (a short sketch
follows after this list). Code restructuring may be required in order to
efficiently implement this. Another way to get around this problem is to allow
a transaction to span multiple enter/leave blocks. Implementing this reliably
may not be an easy task, however, and will most likely require
application-specific knowledge.
Background processing
: Background processing is not natively supported with connection sharing. It is
possible to spawn a background thread to perform database operations each time
that this is desirable. But care should be taken to make sure that these
background threads will execute dependent queries in the correct order. For
example, if thread A spawns a background thread, say B, to execute an UPDATE
query, and later thread A wants to read that same data back, it must first wait
for thread B to finish execution. This may add more inter-thread communication
than is preferable.
Concurrency
: There is no concurrency at all here. Since the database connection is protected
by an exclusive lock, only a single thread can operate on the database at any
point in time. Additionally, one may be tempted to increase the size of an
enter/leave block in order to allow for larger transactions or better re-use of
prepared statements. However, any time spent on performing operations that do
not directly use the database within such an enter/leave block will lower the
maximum possible database concurrency even further.
Portability
: Connection sharing requires at least SQLite 3.3.1 in order to pass the same
database connection around. SQLite must be compiled with threading support
enabled. If prepared statements are kept around outside of an enter/leave
block, then version 3.5.0 or higher will be required.
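As a sketch of the transaction grouping pattern mentioned above: batching all
modifications of a single enter/leave block in one transaction is
straightforward, as long as the COMMIT happens before the mutex is released
(error handling omitted):

```c
sqlite3_mutex_enter(sqlite3_db_mutex(db));
sqlite3_exec(db, "BEGIN", NULL, NULL, NULL);
// ...run many INSERT/UPDATE/DELETE statements here...
sqlite3_exec(db, "COMMIT", NULL, NULL, NULL); // one flush to disk for the batch
sqlite3_mutex_leave(sqlite3_db_mutex(db));
```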
## Message passing
An alternative approach is to allow only a single thread to access the
database. Any other thread that wants to access the database in any way will
then have to communicate with this database thread. This communication is done
by sending messages (_requests_) to the database thread, and, when query
results are required, receiving back one or more _response_ messages.
Message passing schemes and libraries are available for many programming
languages and come in many different forms. For this article, I am going to
assume that an asynchronous and unbounded FIFO queue is used to pass around
messages, but most of the following discussion will apply to bounded queues as
well. I'll try to note the important differences between the two where
applicable.
A very simple and naive implementation of a message passing solution is given
below. Here I assume that `queue_create()` will create a message queue (type
`message_queue`), `queue_get()` will return the next message in the queue, or
block if the queue is empty. `thread_create(func, arg)` will run _func_ in a
newly created thread and pass _arg_ as its argument. Error handling has been
omitted to keep this example concise.
```c
void *db_thread(void *arg) {
  message_queue *q = arg;
  sqlite3 *db;
  if(sqlite3_open("database.sqlite3", &db))
    return ERROR;
  request_msg *m;
  while((m = queue_get(q))) {
    if(m->action == QUIT)
      break;
    if(m->action == EXEC)
      sqlite3_exec(db, m->query, NULL, NULL, NULL);
  }
  sqlite3_close(db);
  return OK;
}

int main(int argc, char **argv) {
  message_queue *db_queue = queue_create();
  thread_create(db_thread, db_queue);
  // Do work.
  return 0;
}
```
This example implementation has a single database thread running in the
background that accepts the messages `QUIT`, to stop processing queries and
close the database, and `EXEC`, to run a certain query on the database. No
support is available yet for passing query results back to the thread that sent
the message. This can be implemented by including a separate `message_queue`
object in the request messages, to which the results can be sent.
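One possible shape for such messages, purely as an illustration - the field
names are mine and not part of any library:

```c
typedef struct {
  int action;              // QUIT or EXEC
  const char *query;       // SQL to run when action == EXEC
  message_queue *reply_to; // if non-NULL, response messages are sent here
} request_msg;

typedef struct {
  int code;                // SQLite result code of the executed query
  // ...copied column values of a single result row...
} response_msg;
```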
Implementation size
: This will largely depend on the programming environment used and the complexity
of the database thread. If your environment already comes with a message queue
implementation, and constructing the request/response messages is relatively
simple, then a simple implementation as shown above will not require much code.
On the other hand, if you have to implement your own message queue or want more
intelligence in the database thread to improve efficiency, then the complete
implementation may be significantly larger than that of connection sharing.
Memory/CPU overhead
: Constructing and passing around messages will incur a CPU overhead, though with
an efficient implementation this should not be significant enough to worry
about. Memory usage is highly dependent on the size of the messages being
passed around and the length of the queue. If messages are queued faster than
they are processed and there is no bound on the queue length, then a process
may quickly run out of memory. On the other hand, if messages are processed
fast enough then the queue will generally not have more than a single message
in it, and the memory overhead will remain fairly small.
Prepared statement re-use
: As the database connection will never leave the database thread, prepared
statements can be kept in memory and re-used without problems.
Transaction grouping
: A naive but robust implementation will handle each message in its own
transaction. A more clever database thread, however, could wait for multiple
messages to be queued and can then batch-execute them in a single transaction.
Correctly implementing this may require some additional information to be
specified along with the request, such as whether the query may be combined in
a single transaction or whether it may only be executed outside of a
transaction. Some threads may want to have confirmation that the data has been
successfully written to disk, in which case responsiveness will not improve if
such actions are queued for later processing. Nonetheless, since the database
thread has all the knowledge about the state of the database and any
outstanding actions, transaction grouping can be implemented quite reliably. A
sketch of such a batching database thread follows this list.
Background processing
: Background processing is supported natively with a message passing
implementation: a thread that isn't interested in query results can simply
queue the action to be performed by the database thread without indicating a
return path for the results. Of course, if a thread queues many messages that
do not require results followed by one that does, it will have to wait for all
earlier messages to be processed before receiving any results for the last one.
In the case that the actions are not dependent on each other, the database
thread may re-order the messages in order to process the last request first.
This requires knowledge about dependencies and may significantly complicate the
implementation, however.
Concurrency
: As with a shared database connection, database access is exclusive: Only a
single action can be performed on the database at a time. Unlike connection
sharing, however, any processing within the application will not further
degrade the maximum attainable concurrency. As long as unbounded asynchronous
queues are used to pass around messages, the database thread will be able to
continue working on the database without waiting for another thread to process
the results.
Portability
: This is where message passing shines: SQLite is only used within the database
thread, no other thread will have a need to call any SQLite function. This
allows any version of SQLite to be used, even those that have not been compiled
with thread safety enabled.
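As promised, a sketch of such a batching database thread. It assumes a
non-blocking `queue_tryget()` that returns NULL when the queue is empty - a
hypothetical addition to the queue API used earlier - and it caps each
transaction at 64 statements:

```c
request_msg *m = queue_get(q); // block until the first message arrives
while(m && m->action != QUIT) {
  int n = 0;
  sqlite3_exec(db, "BEGIN", NULL, NULL, NULL);
  // Drain whatever has queued up in the meantime into a single transaction.
  while(m && m->action == EXEC && n++ < 64) {
    sqlite3_exec(db, m->query, NULL, NULL, NULL);
    m = queue_tryget(q); // NULL when the queue is empty
  }
  sqlite3_exec(db, "COMMIT", NULL, NULL, NULL); // one flush for the whole batch
  if(!m)
    m = queue_get(q); // block again for the next batch
}
```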
## Thread-local connections
A rather different approach to giving each thread access to a single database
is to simply open a new database connection for each thread. This way each
connection will be local to the specific thread, which in turn has the power to
do with it as it likes without worrying about what the other threads do. The
following is a short example to illustrate the idea:
```c
void *some_thread(void *arg) {
  sqlite3 *db;
  if(sqlite3_open("database.sqlite3", &db))
    return ERROR;
  // Do some work on the database
  sqlite3_close(db);
  return OK;
}

int main(int argc, char **argv) {
  int i;
  for(i=0; i<10; i++)
    thread_create(some_thread, NULL);
  // Wait until the threads are done
  return 0;
}
```
Implementation size
: Giving each thread its own connection is practically not much different from
the single-threaded case where there is only a single database connection. And
as the example shows, this can be implemented quite trivially.
Memory/CPU overhead
: If we assume that threads are not created very often and each thread has a
relatively long life, then the CPU and I/O overhead caused by opening a new
connection for each thread will not be very significant. On the other hand, if
threads are created quite often and lead a relatively short life before they
are destroyed again, then opening a new connection each time will soon require
more resources than running the queries themselves.
There is a significant memory overhead: every new database connection requires
memory. If each connection also has a separate cache, then every thread will
quickly require several megabytes only to interact with the database. Since
version 3.5.0, SQLite allows sharing of this cache with the other threads,
which will reduce this memory overhead.
Prepared statement re-use
: Prepared statements can be re-used without limitations within a single thread.
This will allow full re-use of prepared statements if each thread has a
different task, in which case every thread will have different queries and
access patterns anyway. But when every thread runs the same code, and thus also
the same queries, it will still need its own copy of the prepared statement.
Prepared statements are specific to a single database connection, so they can't
be passed around between the threads. The same argument as for CPU overhead
applies here: as long as threads are long-lived, this will not be a very large
problem.
Transaction grouping
: Each thread has full access to its own database connection, so it can easily
batch many queries in a single transaction. It is not possible, however, to
group queries from the other threads in this same transaction as well. The
grouping may therefore not be as optimal as a message passing solution could
provide, but it is still a large improvement compared to connection sharing.
Background processing
: Background processing is not easily possible. While it is possible to spawn a
separate thread for each query that needs to be processed in the background, a
new database connection will have to be opened every time this is done. This
solution will obviously not be very efficient.
Concurrency
: In general, it is not possible to get better concurrency than by providing each
thread with its own database connection. This solution definitely wins in this
area.
Portability
: Thread-local connections are very portable: the only requirement is that SQLite
has been built with threading support enabled. Connections are not passed
around between threads, so any SQLite version will do. In order to make use of
the shared cache feature, however, SQLite 3.5.0 is required.
## Connection pooling
A common approach in server-like applications is to have a connection pool.
When a thread wishes to have access to the database, it requests a database
connection from a pool of (currently) unused database connections. If no unused
connections are available, it can either wait until one becomes available, or
create a new database connection on its own. When a thread is done with a
connection, it will add it back to the pool to allow it to be re-used in
another thread.
The following example illustrates a basic connection pool implementation in
which a thread creates a new database connection when no connections are
available. A global `db_pool` is defined, on which any thread can call
`pool_pop()` to get an SQLite connection if there is one available, and
`pool_push()` can be used to push a connection back to the pool. This pool can
be implemented as any kind of set: a FIFO or a stack could do the trick, as
long as it can be accessed from multiple threads concurrently.
```c
// Some global pool of database connections
pool_t *db_pool;

sqlite3 *get_database() {
  sqlite3 *db = pool_pop(db_pool);
  if(db)
    return db;
  if(sqlite3_open("database.sqlite3", &db))
    return NULL;
  return db;
}

void *some_thread(void *arg) {
  // Do some work
  sqlite3 *db = get_database();
  // Do some work on the database
  pool_push(db_pool, db);
  return NULL;
}

int main(int argc, char **argv) {
  int i;
  for(i=0; i<10; i++)
    thread_create(some_thread, NULL);
  // Wait until the threads are done
  return 0;
}
```
Implementation size
: A connection pool is in essence not very different from thread-local
connections. The only major difference is that the call to sqlite3\_open() is
replaced with a function call to obtain a connection from the pool and
sqlite3\_close() with one to give it back to the pool. As shown above, these
functions can be fairly simple. Note, however, that unlike with thread-local
connections it is advisable to "open" and "close" a connection more often in
long-running threads, in order to give other threads a chance to use the
connection as well.
Memory/CPU overhead
: This mainly depends on the number of connections you allow to be in memory at
any point in time. If this number is not bounded, as in the above example, then
you can assume that after running your program for a certain time, there will
always be enough unused connections available in the pool. Requesting a
connection will then be very fast, since the overhead of creating a new
connection, as would have been done with thread-local connections, is
completely gone.
In terms of memory usage, however, it would be more efficient to put a maximum
limit on the number of open connections, and have the thread wait until another
thread gives a connection back to the pool. Similarly to thread-local
connections, memory usage can be decreased by using SQLite's cache sharing
feature.
Prepared statement re-use
: Unfortunately, this is where connection pooling borrows from connection
sharing. Prepared statements must be cleaned up before passing a connection to
another thread if one aims to be portable. But even if you remove that
portability requirement, prepared statements are always specific to a single
connection. Since you can't assume that you will always get the same connection
from the pool, caching prepared statements is not practical.
On the other hand, a connection pool does allow you to use a single connection
for a longer period of time than with connection sharing without negatively
affecting concurrency. Unless, of course, there is a limit on the number of
open connections, in which case using a connection for a long period of time
may starve another thread.
Transaction grouping
: The same arguments as for re-using prepared statements also apply to
transaction grouping: Transactions should be committed to disk before passing a
connection back to the pool.
Background processing
: This is also where a connection pool shares a lot of similarity with
connection sharing. With thread-local connections, creating a worker thread to
perform database operations in the background would be very inefficient. But
since this inefficiency is tackled by allowing connection re-use with a
connection pool, it's not a problem here. The same warning with regard to
dependent queries still applies, though.
Concurrency
: Connection pooling gives you fine-grained control over how much concurrency
you'd like to have. For maximum concurrency, don't put a limit on the number of
open database connections. If there is a limit, then it will decrease the
maximum concurrency in favor of lower memory usage.
Portability
: Since database connections are being passed among threads, connection pooling
will require at least SQLite 3.3.1 compiled with thread safety enabled. Making
use of its cache sharing capabilities to reduce memory usage will require
SQLite 3.5.0 or higher.
# Final notes
As for what I used for ncdc: I initially chose connection sharing, for its
simplicity. Then, when I noticed that the UI became less responsive than I
found acceptable, I started adding a simple queue for background processing of
queries. Later I stumbled upon the main problem with that solution: I wanted to
read back a value that was written in a background thread, and had no way of
knowing whether the background thread had finished executing that query or not.
I then decided to expand the background thread to allow for passing back query
results, and transformed everything into a full message passing solution. This
appears to be working well at the moment, and my current implementation has
support for both prepared statement re-use and transaction grouping, which
measurably increased performance.
To summarize, there isn't really a _best_ solution that works for every
application. Connection sharing works well for applications where
responsiveness and concurrency aren't of major importance. Message passing works
well for applications that aim to be responsive, and is flexible enough for
optimizing CPU and I/O by re-using prepared statements and grouping queries in
larger transactions. Thread-local connections are suitable for applications
that have a relatively fixed number of threads, whereas connection pooling
works better for applications with a varying number of worker threads.
@ -1,75 +0,0 @@
=pod
I write a lot of miscellaneous little perl/shell scripts and micro-libraries
for the purpose of getting something done. This page is a listing of those I
thought might be useful to others as well.
I also maintain a collection of miscellaneous C micro-libraries. Those are
listed under the collective name of L<Ylib|https://dev.yorhel.nl/ylib>.
=head2 maildir.pl
October 2012. A tiny weechat plugin to display the number of unread emails in a
local Maildir. L<Latest
source|http://www.weechat.org/scripts/source/stable/maildir.pl.html/>
(L<1.0|http://p.blicky.net/wzbzs>).
=head2 ncdc-share-report
December 2011. Playing around with the Go programming language, I wrote another
transfer log parser and statistics generator for ncdc.
L<Example output|http://s.blicky.net/2012/ncdc-share-report.html>.
Download: L<0.3|http://p.blicky.net/h25z8>
(L<0.2|http://p.blicky.net/6yx2d>, L<0.1|http://p.blicky.net/ab4lm>).
=head2 ncdc-transfer-stats
September 2011. L<ncdc|https://dev.yorhel.nl/ncdc> gained transfer logging
features, and I wrote a quick Perl script to fetch some simple statistics from
it. L<source|https://p.blicky.net/4V9Kg59kUJUN> (L<0.2|http://p.blicky.net/eu00a>, L<0.1|http://p.blicky.net/agolr>).
=head2 json.mll
December 2010. I was writing a client for the L<public VNDB
API|http://vndb.org/d11> in OCaml and needed a JSON parser/generator. Since I
wasn't happy with the currently available solutions - they try to do too many
things and have too many dependencies - I decided to write a minimal JSON
library myself. L<source|http://g.blicky.net/serika.git/tree/json.mll>
=head2 vinfo.c
November 2009. The L<public VNDB API|http://vndb.org/d11> was designed to be
easy to use even from low level languages. I wrote this simple program to see
how much work it would be to use the API in C, and as example code for anyone
wishing to use the API for something more useful. Read the comments for more
info. L<source|https://dev.yorhel.nl/download/code/vinfo.c>
=head2 Microdc2 log file parser
June 2007. Simple perl script that parses log files created by
L<microdc2|http://corsair626.no-ip.org/microdc/> and outputs a simple and
ugly html file with all uploaded files. It correctly merges chunked
uploads, calculates average upload speed per file and total bandwidth used
for uploads. L<source|https://dev.yorhel.nl/download/code/mdc2-parse.pl>
B<Note:> for those of you who still use microdc2, please have a look at
L<ncdc|https://dev.yorhel.nl/ncdc>, a modern alternative.
=head2 yapong.c
February 2006. Yet Another Pong, and yet another program written just for
testing/learning purposes. Tested to work with the ncurses or pdcurses
libraries. L<source|https://dev.yorhel.nl/download/code/yapong.c> (L<older
version|https://dev.yorhel.nl/download/code/yapong-0.01.c>).
=head2 echoserv.c
February 2006. A simple non-blocking single-threaded TCP echo server,
displaying how the select() system call can be used to handle multiple
connections. L<source|https://dev.yorhel.nl/download/code/echoserv.c>
=head2 bbcode.c
January 2006. Simple BBCode to HTML converter written in plain C, for learning
purposes. L<source|https://dev.yorhel.nl/download/code/bbcode.c>
@ -1,102 +0,0 @@
=pod
People who run AWStats on large log files have most likely noticed: the data
files can grow quite large, resulting in both a waste of disk space and longer
page generation times for the AWStats pages. I wrote a small script that
analyzes these data files and can remove any information you think is
unnecessary.
B<Download:> L<awshrink|https://dev.yorhel.nl/download/code/awshrink> (copy to
/usr/bin to install).
=head2 Important
Do B<NOT> use this script on data files that are not completed yet (i.e. data
files of the month you're living in). This will result in inaccurate sorting of
visits, pages, referers and whatever other list you're shrinking. Also, keep
in mind that this is just a quickly written perl hack; it is by no means fast
and may hog some memory while shrinking data files.
=head2 Usage
awshrink [-c -s] [-SECTION LINES] [..] datafile
-s Show statistics
-c Overwrite datafile instead of writing to a backupfile (datafile~)
-SECTION LINES
Shrink the selected SECTION to LINES lines. (See example below)
=head2 Typical command-line usage
While awshrink is most useful for monthly cron jobs, here's an example of basic
command line usage to demonstrate what the script can do:
$ wc -c awstats122007.a.txt
29916817 awstats122007.a.txt
$ awshrink -s awstats122007.a.txt
Section Size (Bytes) Lines
SCREENSIZE* 74 0
WORMS 131 0
EMAILRECEIVER 135 0
EMAILSENDER 143 0
CLUSTER* 144 0
LOGIN 155 0
ORIGIN* 178 6
ERRORS* 229 10
SESSION* 236 7
FILETYPES* 340 12
MISC* 341 10
GENERAL* 362 8
OS* 414 29
SEREFERRALS 587 34
TIME* 1270 24
DAY* 1293 31
ROBOT 1644 40
BROWSER 1992 127
DOMAIN 2377 131
UNKNOWNREFERERBROWSER 5439 105
UNKNOWNREFERER 20585 317
SIDER_404 74717 2199
PAGEREFS 130982 2500
KEYWORDS 288189 27036
SIDER 1058723 25470
SEARCHWORDS 5038611 157807
VISITOR 23285662 416084
* = not shrinkable
$ awshrink -s -c -VISITOR 100 -SEARCHWORDS 100 -SIDER 100 awstats122007.a.txt
Section Size (Bytes) Lines
SCREENSIZE* 74 0
WORMS 131 0
EMAILRECEIVER 135 0
EMAILSENDER 143 0
CLUSTER* 144 0
LOGIN 155 0
ORIGIN* 178 6
ERRORS* 229 10
SESSION* 236 7
FILETYPES* 340 12
MISC* 341 10
GENERAL* 362 8
OS* 414 29
SEREFERRALS 587 34
TIME* 1270 24
DAY* 1293 31
ROBOT 1644 40
BROWSER 1992 127
SEARCHWORDS 2289 100
DOMAIN 2377 131
SIDER 3984 100
UNKNOWNREFERERBROWSER 5439 105
VISITOR 5980 100
UNKNOWNREFERER 20585 317
SIDER_404 74717 2199
PAGEREFS 130982 2500
KEYWORDS 288189 27036
* = not shrinkable
$ wc -c awstats122007.a.txt
546074 awstats122007.a.txt
@ -1,40 +0,0 @@
=pod
I<2016-08-16> - btrfs-size.pl is a quick little script to provide an overview
of the disk space used by btrfs subvolumes. It's comparable to
L<btrfs-size.sh|https://poisonpacket.wordpress.com/2015/05/26/btrfs-snapshot-size-disk-usage/>,
but is somewhat faster and has a few options to sort the output.
Honestly, it's still hard to draw any conclusions from the sizes provided by
btrfs, but sadly, L<ncdu|https://dev.yorhel.nl/ncdu> is useless for
snapshot-heavy filesystems.
Only tested with btrfs-progs v4.4.1.
B<Download:> L<btrfs-size.pl|https://p.blicky.net/FNPXpbwMXfTI.txt>
(L<syntax-highlighted version|https://p.blicky.net/FNPXpbwMXfTI>).
=head2 Usage
btrfs-size.pl --help [-nser] <path>
-n Order by path name
-s Order by (total) subvolume size
-e Order by exclusive subvolume size
-r Reverse order
=head2 Example output
# btrfs-size.pl /data
gfbf007/cur 46.32 GiB 16.00 KiB
gfbf007/snap/2016-08-14.08 46.32 GiB 428.00 KiB
gfbf007/snap/2016-08-15.03 46.32 GiB 428.00 KiB
gfbf007/snap/2016-08-16.03 46.32 GiB 16.00 KiB
ggit011/cur 23.92 MiB 16.00 KiB
ggit011/snap/2016-08-14.08 23.90 MiB 300.00 KiB
ggit011/snap/2016-08-15.08 23.92 MiB 16.00 KiB
gman015/cur 3.74 GiB 16.00 KiB
gman015/snap/2016-08-14.08 3.74 GiB 112.00 KiB
gman015/snap/2016-08-15.02 3.74 GiB 96.00 KiB
gman015/snap/2016-08-16.02 3.74 GiB 16.00 KiB
@ -1,53 +0,0 @@
=pod
GRenamR is a GTK+ mass file renamer written in Perl; the functionality is
inspired by the
L<rename|http://search.cpan.org/~rmbarker/File-Rename-0.05/rename.PL> command
that comes with a Perl module.
GRenamR allows multiple file renaming using perl expressions. You can see the
effects of your expression while typing it, and can preview your actions before
applying them. The accepted expressions are mostly the same as the rename
command (see the paragraph above): your expression will be evaluated with $_
set to the filename, and any modifications to this variable will result in the
renaming of the file. There's one other variable that the rename command does
not have: $i, which reflects the file number (starting from 0) in the current
list. This allows expressions such as C<$_=sprintf'%03d.txt',$i>.
B<Download: > L<grenamr|https://dev.yorhel.nl/download/code/grenamr-0.1.pl>
(copy to /usr/bin/ to install)
Requires the Gtk2 Perl module. Most distributions have a perl-gtk2 package.
=head2 Example expressions
y/A-Z/a-z/ # Convert filenames to lowercase
$_=lc # Same
s/\.txt$/.utf8/ # Change all '.txt' extensions to '.utf8'
s/([0-9]+)/sprintf'%04d',$1/eg # Zero-pad all numbers in filenames
# Replace each image filename with a zero-padded number starting from 1
s/^.+\.jpg$/sprintf'%03d.jpg',$i+1/e
=head2 Caveats / bugs / TODO
=over
=item * Calling functions such as 'sleep' or 'exit' in the expression will trash the program
=item * It's currently not possible to manually order the file list, so $i is
not useful in every situation
=item * It's currently not possible to manually rename files or exclude items
from being affected by the expression
=item * The expression isn't executed in the opened directory, so things like
L<-X|http://perldoc.perl.org/functions/-X.html> won't work
=back
=head2 Screenshot
[img scr grenamr.png GRenamR]
@ -1,101 +0,0 @@
=pod
I decided to do some experimentation with how the colours defined in ncurses
are actually displayed in terminals, what the effects are of combining these
colours with other attributes, and how colour schemes of a terminal can affect
the displayed colours. To this end I wrote a small C file and ran it in
different terminals and different configurations. Note that only the 8 basic
NCurses colours are tested, the more flexible init_color() function is not
used.
B<Source code: > L<nccolour.c|https://dev.yorhel.nl/download/code/nccolour.c>
(L<syntax highlighted version|http://p.blicky.net/xu35c>)
=head2 Notes / observations
=over
=item * The most obvious conclusion: the displayed colours do not have the
exact same colour value in every terminal. Some terminals also allow users to
modify these colours.
=item * You can not assume that the default foreground or background colour can
be represented by one of the 8 basic colours defined by NCurses.
=item * Specifying -1 as colour, to indicate the default foreground or
background colour, seems to work fine in any terminal tested so far.
=item * All tested terminals render the foreground colour in a lighter shade
when the A_BOLD attribute is set. This does not apply to the background colour.
The result of this is that the text becomes visible when using A_BOLD when the
foreground and background colour are set to the same value.
=item * Unfortunately, not all terminals are configured in such a way that all
possible colours are readable. So as a developer you'll still have to support
configurable colour schemes in your ncurses application. :-(
=item * On most terminals, setting the foreground and background colour to the
same value without applying the A_BOLD attribute will make the text invisible.
Don't rely on this, however, as this is not the case on OS X.
=back
=head2 Full screenshot
To avoid wasting unnecessary space, the comparison screenshots below only
display the colour table. Here's a screenshot of the full output of the
program, which also explains what each column means.
[img scr nccol-full.png ]
=head2 Screenshots
=over
=item Arch Linux, Roxterm, Default color scheme
[img scr nccol-rox-b.png ]
=item Arch Linux, Roxterm, GTK color scheme
[img scr nccol-rox-w.png ]
=item Arch Linux, Roxterm, Tango color scheme
[img scr nccol-rox-t.png ]
=item Arch Linux, Roxterm, Modified Tango color scheme
[img scr nccol-rox-c.png ]
=item Arch Linux, xterm (default settings)
[img scr nccol-xterm.png ]
=item Ubuntu 11.10, Gnome-terminal
[img scr nccol-ubuntu.png ]
=item Debian Squeeze, VT (default settings)
[img scr nccol-debian.png ]
=item FreeBSD, VT (default settings)
[img scr nccol-fbsd.png ]
=item Mac OS X, Terminal
[img scr nccol-osx-terminal.png ]
=item Mac OS X, iTerm2
[img scr nccol-osx-iterm2.png ]
=item CentOS 6.4
[img scr nccol-centos64.png ]
=back
76
dat/dump.md Normal file
@ -0,0 +1,76 @@
% Code dump
I write a lot of miscellaneous little perl/shell scripts and micro-libraries
for the purpose of getting something done. This page is a listing of those I
thought might be useful to others as well.
I also maintain a collection of miscellaneous C micro-libraries. Those are
listed under the collective name of [Ylib](/ylib).
## maildir.pl
October 2012. A tiny weechat plugin to display the number of unread emails in a
local Maildir. [Latest
source](https://weechat.org/scripts/source/stable/maildir.pl.html/)
([1.0](https://p.blicky.net/wzbzs)).
## ncdc-share-report
December 2011. Playing around with the Go programming language, I wrote another
transfer log parser and statistics generator for ncdc.
[Example output](https://s.blicky.net/2012/ncdc-share-report.html).
Download: [0.3](https://p.blicky.net/h25z8)
([0.2](https://p.blicky.net/6yx2d), [0.1](https://p.blicky.net/ab4lm)).
## ncdc-transfer-stats
September 2011. [ncdc](/ncdc) gained transfer logging features, and I wrote a
quick Perl script to fetch some simple statistics from it.
[source](https://p.blicky.net/4V9Kg59kUJUN)
([0.2](https://p.blicky.net/eu00a), [0.1](https://p.blicky.net/agolr)).
## json.mll
December 2010. I was writing a client for the [public VNDB
API](https://vndb.org/d11) in OCaml and needed a JSON parser/generator. Since I
wasn't happy with the currently available solutions - they try to do too many
things and have too many dependencies - I decided to write a minimal JSON
library myself. [source](https://g.blicky.net/serika.git/tree/json.mll)
## vinfo.c
November 2009. The [public VNDB API](https://vndb.org/d11) was designed to be
easy to use even from low level languages. I wrote this simple program to see
how much work it would be to use the API in C, and as example code for anyone
wishing to use the API for something more useful. Read the comments for more
info. [source](/download/code/vinfo.c)
## Microdc2 log file parser
June 2007. Simple perl script that parses log files created by
[microdc2](http://corsair626.no-ip.org/microdc/) and outputs a simple and ugly
html file with all uploaded files. It correctly merges chunked uploads,
calculates average upload speed per file and total bandwidth used for uploads.
[source](/download/code/mdc2-parse.pl)
**Note:** for those of you who still use microdc2, please have a look at
[ncdc](/ncdc), a modern alternative.
## yapong.c
February 2006. Yet Another Pong, and yet another program written just for
testing/learning purposes. Tested to work with the ncurses or pdcurses
libraries. [source](/download/code/yapong.c) ([older
version](/download/code/yapong-0.01.c)).
## echoserv.c
February 2006. A simple non-blocking single-threaded TCP echo server,
displaying how the select() system call can be used to handle multiple
connections. [source](/download/code/echoserv.c)
## bbcode.c
January 2006. Simple BBCode to HTML converter written in plain C, for learning
purposes. [source](/download/code/bbcode.c)
99
dat/dump/awshrink.md Normal file
@ -0,0 +1,99 @@
% AWStats Data File Shrinker
People who run AWStats on large log files have most likely noticed: the data
files can grow quite large, resulting in both a waste of disk space and longer
page generation times for the AWStats pages. I wrote a small script that
analyzes these data files and can remove any information you think is
unnecessary.
**Download:** [awshrink](/download/code/awshrink) (copy to /usr/bin to
install).
## Important
Do **NOT** use this script on data files that are not completed yet (i.e. data
files of the month you're living in). This will result in inaccurate sorting of
visits, pages, referers and whatever other list you're shrinking. Also, keep
in mind that this is just a quickly written perl hack; it is by no means fast
and may hog some memory while shrinking data files.
## Usage
awshrink [-c -s] [-SECTION LINES] [..] datafile
-s Show statistics
-c Overwrite datafile instead of writing to a backupfile (datafile~)
-SECTION LINES
Shrink the selected SECTION to LINES lines. (See example below)
## Typical command-line usage
While awshrink is most useful for monthly cron jobs, here's an example of basic
command line usage to demonstrate what the script can do:
$ wc -c awstats122007.a.txt
29916817 awstats122007.a.txt
$ awshrink -s awstats122007.a.txt
Section Size (Bytes) Lines
SCREENSIZE* 74 0
WORMS 131 0
EMAILRECEIVER 135 0
EMAILSENDER 143 0
CLUSTER* 144 0
LOGIN 155 0
ORIGIN* 178 6
ERRORS* 229 10
SESSION* 236 7
FILETYPES* 340 12
MISC* 341 10
GENERAL* 362 8
OS* 414 29
SEREFERRALS 587 34
TIME* 1270 24
DAY* 1293 31
ROBOT 1644 40
BROWSER 1992 127
DOMAIN 2377 131
UNKNOWNREFERERBROWSER 5439 105
UNKNOWNREFERER 20585 317
SIDER_404 74717 2199
PAGEREFS 130982 2500
KEYWORDS 288189 27036
SIDER 1058723 25470
SEARCHWORDS 5038611 157807
VISITOR 23285662 416084
* = not shrinkable
$ awshrink -s -c -VISITOR 100 -SEARCHWORDS 100 -SIDER 100 awstats122007.a.txt
Section Size (Bytes) Lines
SCREENSIZE* 74 0
WORMS 131 0
EMAILRECEIVER 135 0
EMAILSENDER 143 0
CLUSTER* 144 0
LOGIN 155 0
ORIGIN* 178 6
ERRORS* 229 10
SESSION* 236 7
FILETYPES* 340 12
MISC* 341 10
GENERAL* 362 8
OS* 414 29
SEREFERRALS 587 34
TIME* 1270 24
DAY* 1293 31
ROBOT 1644 40
BROWSER 1992 127
SEARCHWORDS 2289 100
DOMAIN 2377 131
SIDER 3984 100
UNKNOWNREFERERBROWSER 5439 105
VISITOR 5980 100
UNKNOWNREFERER 20585 317
SIDER_404 74717 2199
PAGEREFS 130982 2500
KEYWORDS 288189 27036
* = not shrinkable
$ wc -c awstats122007.a.txt
546074 awstats122007.a.txt
37
dat/dump/btrfssize.md Normal file
@ -0,0 +1,37 @@
% btrfs-size.pl
_2016-08-16_ - btrfs-size.pl is a quick little script to provide an overview of
the disk space used by btrfs subvolumes. It's comparable to
[btrfs-size.sh](https://poisonpacket.wordpress.com/2015/05/26/btrfs-snapshot-size-disk-usage/),
but is somewhat faster and has a few options to sort the output.
Honestly, it's still hard to draw any conclusions from the sizes provided by
btrfs, but sadly, [ncdu](/ncdu) is useless for snapshot-heavy filesystems.
Only tested with btrfs-progs v4.4.1.
**Download:** [btrfs-size.pl](https://p.blicky.net/FNPXpbwMXfTI.txt)
([syntax-highlighted version](https://p.blicky.net/FNPXpbwMXfTI)).
## Usage
btrfs-size.pl --help [-nser] <path>
-n Order by path name
-s Order by (total) subvolume size
-e Order by exclusive subvolume size
-r Reverse order
## Example output
# btrfs-size.pl /data
gfbf007/cur 46.32 GiB 16.00 KiB
gfbf007/snap/2016-08-14.08 46.32 GiB 428.00 KiB
gfbf007/snap/2016-08-15.03 46.32 GiB 428.00 KiB
gfbf007/snap/2016-08-16.03 46.32 GiB 16.00 KiB
ggit011/cur 23.92 MiB 16.00 KiB
ggit011/snap/2016-08-14.08 23.90 MiB 300.00 KiB
ggit011/snap/2016-08-15.08 23.92 MiB 16.00 KiB
gman015/cur 3.74 GiB 16.00 KiB
gman015/snap/2016-08-14.08 3.74 GiB 112.00 KiB
gman015/snap/2016-08-15.02 3.74 GiB 96.00 KiB
gman015/snap/2016-08-16.02 3.74 GiB 16.00 KiB
@ -1,28 +1,31 @@
=pod
% Demos
Yes, I realise that the title is plural, suggesting there's more than one demo.
That is not quite true, unfortunately. The reason I chose the plural form is
simply in the hope that I do, in fact, write more demos, and that this page
will actually get more content in the future. I still happen to be a huge fan
of the L<demoscene|http://demoscene.info/>, and still wish to contribute to
of the [demoscene](http://demoscene.info/), and still wish to contribute to
it... if only I could find the time and self-discipline to do so. In the
meanwhile, here's one demo I did write some time ago:
meanwhile, here's one demo I did write some time ago.
=head1 Blue Cubes
*(2019 update: Don't get your hopes up, I likely won't ever write another demo.
I don't have the patience for it, I guess.)*
[img right bluecubes.png Blue Cubes.]
# Blue Cubes
![Blue Cubes.](/img/bluecubes.png){.right}
August 2006. My first demo - or more exactly: an intro. Blue Cubes is a 64kB intro
written in OpenGL/SDL with Linux as target OS. I wrote this intro within 10
days without any prior experience in any of the fields of computer generated
graphics or music. So needless to say, it sucks. I am ashamed even of the
thought of releasing it at a respectable demoparty like
L<Evoke|http://www.evoke.eu/2006/>. Still, it didn't feel like I was unwelcome; I
[Evoke](https://www.evoke.eu/2006/). Still, it didn't feel like I was unwelcome; I
did actually receive three prizes: 3rd prize in the 64k competition (there were
only 3 actual entries, but oh well), best non-windows 64k intro (it was the
only one in the competition), and the Digitale Kultur newcomer award, which
actually is something to be proud of, I guess.
L<download|https://dev.yorhel.nl/download/yorhel~bluecubes.zip> -
L<mirror|http://scene.org/file.php?file=/parties/2006/evoke06/in64/yorhel_bluecubes.zip&fileinfo>
[download](/download/yorhel~bluecubes.zip) -
[mirror](https://scene.org/file.php?file=/parties/2006/evoke06/in64/yorhel_bluecubes.zip&fileinfo)
(includes linux binaries, windows port, and sources) -
L<pouet comments|http://pouet.net/prod.php?which=25866>.
[pouet comments](https://pouet.net/prod.php?which=25866).
44
dat/dump/grenamr.md Normal file
@ -0,0 +1,44 @@
% GTK+ Mass File Renamer
GRenamR is a GTK+ mass file renamer written in Perl; the functionality is
inspired by the
[rename](https://search.cpan.org/~rmbarker/File-Rename-0.05/rename.PL) command
that comes with a Perl module.
GRenamR allows multiple file renaming using perl expressions. You can see the
effects of your expression while typing it, and can preview your actions before
applying them. The accepted expressions are mostly the same as the rename
command (see the paragraph above): your expression will be evaluated with `$_`
set to the filename, and any modifications to this variable will result in the
renaming of the file. There's one other variable that the rename command does
not have: `$i`, which reflects the file number (starting from 0) in the current
list. This allows expressions such as `$_=sprintf'%03d.txt',$i`.
**Download:** [grenamr](/download/code/grenamr-0.1.pl)
(copy to /usr/bin/ to install)
Requires the Gtk2 Perl module. Most distributions have a perl-gtk2 package.
## Example expressions
y/A-Z/a-z/ # Convert filenames to lowercase
$_=lc # Same
s/\.txt$/.utf8/ # Change all '.txt' extensions to '.utf8'
s/([0-9]+)/sprintf'%04d',$1/eg # Zero-pad all numbers in filenames
# Replace each image filename with a zero-padded number starting from 1
s/^.+\.jpg$/sprintf'%03d.jpg',$i+1/e
## Caveats / bugs / TODO
- Calling functions such as 'sleep' or 'exit' in the expression will trash the program
- It's currently not possible to manually order the file list, so $i is
not useful in every situation
- It's currently not possible to manually rename files or exclude items
from being affected by the expression
- The expression isn't executed in the opened directory, so things like
[-X](https://perldoc.perl.org/functions/-X.html) won't work
## Screenshot
![GRenamR screenshot](/img/grenamr.png){.scr}
@ -1,6 +1,6 @@
=pod
% Insertion Performance Benchmarks
I<2013-07-05> - One of my favourite data structures in C is the ordered vector
_2013-07-05_ - One of my favourite data structures in C is the ordered vector
(or array, whatever you call them). Incredibly simple to implement, very low
memory overhead, and can provide O(log n) lookup with a simple binary search.
However, ordered vectors have one very weak point: insertion and deletion of
@ -15,17 +15,16 @@ how much worse does insertion performance get compared to more complex data
structures?
For comparison, I chose the B-tree and hash table implementations from
L<klib|https://github.com/attractivechaos/klib> (from commit fff70758, to be
[klib](https://github.com/attractivechaos/klib) (from commit fff70758, to be
precise). My goal wasn't to benchmark the performance of different
implementations, so I simply chose two implementations that I suspect are among
the fastest. The vector implementation in the benchmarks is my own creation:
L<vec.h|http://g.blicky.net/globster.git/tree/src/util/vec.h?id=2c11d2a> from
the L<Globster|https://dev.yorhel.nl/globster> code base.
[vec.h](https://g.blicky.net/globster.git/tree/src/util/vec.h?id=2c11d2a) from
the [Globster](/globster) code base.
B<Source code:> L<ins-bench.c|http://p.blicky.net/r746e>
**Source code:** [ins-bench.c](https://p.blicky.net/r746e)
=head2 Best case & worst case
## Best case & worst case
For a start, I decided to benchmark the best and worst case performance of
inserting elements into a vector. The best case happens when inserting all
@ -39,44 +38,42 @@ search. Actual performance will thus be a bit worse, depending on whether
the final application needs that binary search or whether it can assume its
input to be already sorted.
L<[img graph insbench-bench-thumb.png ]|https://dev.yorhel.nl/img/insbench-bench.png>
[ ![Benchmark results](/img/insbench-bench-thumb.png) ](/img/insbench-bench.png)
Gnuplot script: (The awk(ward) part can likely be done natively in gnuplot as
well, but I was too lazy to figure out how)
set terminal png size 1000, 1500
set output "bench.png"
set logscale xy
set xlabel "number of items"
set ylabel "average time per insert (ms)"
set grid mxtics xtics mytics ytics
plot "< awk '{print $1, $2/$1*1000}' bench-vec" title 'vector, worst case',\
"< awk '{print $1, $2/$1*1000}' bench-best" title 'vector, best case',\
"< awk '{print $1, $2/$1*1000}' bench-hash" title 'khash',\
"< awk '{print $1, $2/$1*1000}' bench-btree" title 'kbtree'
set terminal png size 1000, 1500
set output "bench.png"
set logscale xy
set xlabel "number of items"
set ylabel "average time per insert (ms)"
set grid mxtics xtics mytics ytics
plot "< awk '{print $1, $2/$1*1000}' bench-vec" title 'vector, worst case',\
"< awk '{print $1, $2/$1*1000}' bench-best" title 'vector, best case',\
"< awk '{print $1, $2/$1*1000}' bench-hash" title 'khash',\
"< awk '{print $1, $2/$1*1000}' bench-btree" title 'kbtree'
## Average case
=head2 Average case
For the second benchmark I inserted values created with C<rand()>, which should
For the second benchmark I inserted values created with `rand()`, which should
be a more accurate simulation of some real-world applications. This time I'm
not cheating with the vector implementation, a binary search is performed in
order to insert the items in the correct location.
L<[img graph insbench-rand-thumb.png ]|https://dev.yorhel.nl/img/insbench-rand.png>
[ ![Benchmark results](/img/insbench-rand-thumb.png) ](/img/insbench-rand.png)
set terminal png size 1000, 1500
set output "bench-rand.png"
set logscale xy
set xlabel "number of items"
set ylabel "average time per insert (ms)"
set grid mxtics xtics mytics ytics
plot "< awk '{print $1, $2/$1*1000}' rand-vec" title 'vector',\
"< awk '{print $1, $2/$1*1000}' rand-hash" title 'khash',\
"< awk '{print $1, $2/$1*1000}' rand-btree" title 'kbtree'
set terminal png size 1000, 1500
set output "bench-rand.png"
set logscale xy
set xlabel "number of items"
set ylabel "average time per insert (ms)"
set grid mxtics xtics mytics ytics
plot "< awk '{print $1, $2/$1*1000}' rand-vec" title 'vector',\
"< awk '{print $1, $2/$1*1000}' rand-hash" title 'khash',\
"< awk '{print $1, $2/$1*1000}' rand-btree" title 'kbtree'
=head2 Benchmarking setup
## Benchmarking setup
All benchmarks were performed on a 3 GHz Core Duo E8400 with a 6 MiB cache.
Compiled with the Gentoo-provided gcc 4.6.3 at -O3, linked against glibc 2.15,

86
dat/dump/nccolour.md Normal file
View file

@ -0,0 +1,86 @@
% Colours in NCurses
I decided to do some experimentation with how the colours defined in ncurses
are actually displayed in terminals, what the effects are of combining these
colours with other attributes, and how colour schemes of a terminal can affect
the displayed colours. To this end I wrote a small C file and ran it in
different terminals and different configurations. Note that only the 8 basic
NCurses colours are tested; the more flexible init\_color() function is not
used.
**Source code:** [nccolour.c](/download/code/nccolour.c)
([syntax highlighted version](http://p.blicky.net/xu35c))
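In spirit, the test program boils down to something like this minimal sketch
(illustrative only; the actual nccolour.c linked above does considerably
more):

    #include <ncurses.h>

    int main(void) {
        initscr();
        start_color();
        use_default_colors();           /* allow -1 = terminal default */
        for (short i = 0; i < 8; i++) { /* pair 0 is reserved */
            init_pair(i + 1, i, -1);    /* colour i on default background */
            attron(COLOR_PAIR(i + 1));
            printw("colour %d  ", i);
            attron(A_BOLD);
            printw("bold %d\n", i);
            attroff(A_BOLD);
            attroff(COLOR_PAIR(i + 1));
        }
        refresh();
        getch();
        endwin();
        return 0;
    }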
## Notes / observations
- The most obvious conclusion: the displayed colours do not have the exact same
colour value in every terminal. Some terminals also allow users to modify
these colours.
- You cannot assume that the default foreground or background colour can be
represented by one of the 8 basic colours defined by NCurses.
- Specifying -1 as colour, to indicate the default foreground or background
colour, seems to work fine in any terminal tested so far.
- All tested terminals render the foreground colour in a lighter shade when the
A\_BOLD attribute is set. This does not apply to the background colour. The
result is that the text remains visible with A\_BOLD even when the
foreground and background colour are set to the same value.
- Unfortunately, not all terminals are configured in such a way that all
possible colours are readable. So as a developer you'll still have to support
configurable colour schemes in your ncurses application. :-(
- On most terminals, setting the foreground and background colour to the same
value without applying the A\_BOLD attribute will make the text invisible.
Don't rely on this, however, as this is not the case on OS X.
## Full screenshot
To save space, the comparison screenshots below only
display the colour table. Here's a screenshot of the full output of the
program, which also explains what each column means.
![Full screenshot](/img/nccol-full.png)
## Screenshots
Arch Linux, Roxterm, Default color scheme
![](/img/nccol-rox-b.png)
Arch Linux, Roxterm, GTK color scheme
![](/img/nccol-rox-w.png)
Arch Linux, Roxterm, Tango color scheme
![](/img/nccol-rox-t.png)
Arch Linux, Roxterm, Modified Tango color scheme
![](/img/nccol-rox-c.png)
Arch Linux, xterm (default settings)
![](/img/nccol-xterm.png)
Ubuntu 11.10, Gnome-terminal
![](/img/nccol-ubuntu.png)
Debian Squeeze, VT (default settings)
![](/img/nccol-debian.png)
FreeBSD, VT (default settings)
![](/img/nccol-fbsd.png)
Mac OS X, Terminal
![](/img/nccol-osx-terminal.png)
Mac OS X, iTerm2
![](/img/nccol-osx-iterm2.png)
CentOS 6.4
![](/img/nccol-centos64.png)

View file

@ -1,122 +0,0 @@
=pod
[html]<p><b style="color: #f00">Project Abandoned</b><br />
I've stopped development on Globster. I still believe the overall idea and
architecture of Globster are good, and the DC community would definitely
benefit from a remotely controllable client, but Globster in its current form
wasn't going in the direction I wanted it to. I might restart the project
from scratch (yet again) in the future, but for now... it's as dead as a cute
zombie whale.<br /><br />
</p>
<!-- This code is ugly as hell. -->
<div style="width: 600px; height: 227px; background-image: url(/img/globster.png); margin-bottom: -30px">
<b style="font-size: 14px; position: relative; left: 150px; top: 10px">The Globster What?</b>
<p style="position: relative; left: 150px; top: 20px; width: 420px; text-align: right">
Globster is an efficient file sharing client for the Direct Connect<br />
network. It runs as a background daemon and provides<br />
a convenient and high-level D-Bus API, making<br />
it easy to write scripts, bots and user<br />
interfaces for Direct Connect.
</p></div>
=head1 Adopt your own Globster
=head2 Download
There are no tarballs at the moment. You'll have to get it from the git repo:
git clone --recursive git://g.blicky.net/globster.git
cd globster
autoreconf -i
./configure
make
sudo make install
When doing a C<git pull> to update your version later on, make sure to follow
up with a C<git submodule update> to get the right dependencies, too.
The git repo is available for
L<online browsing|http://g.blicky.net/globster.git/>.
=head2 Requirements
Globster can be compiled with a (moderately recent) GCC or clang. You'll need
the following libraries: L<libdbus|http://dbus.freedesktop.org/>,
L<GnuTLS|http://gnutls.org> and L<zlib|http://zlib.net/>. If your GnuTLS is too
old (<= 2.12), you also need libgcrypt. The globsterctl script requires Perl
and the Net::DBus module.
On Debian and Ubuntu, that boils down to the following:
apt-get install git make gcc libc-dev automake autoconf \
pkg-config libdbus-1-dev libgnutls-dev libnet-dbus-perl
And for Arch Linux:
pacman -S base-devel git perl-net-dbus
I've only tested things on Linux (glibc and L<musl|http://www.musl-libc.org>),
but I intend to support more kinda-sane POSIX systems in the future as well.
Globster will no doubt require some more libraries as more basic features are
being implemented. And, yes, I<of course> we will get static binaries!
=head2 Status
Remember when I called Globster a "file sharing" client? I lied. It doesn't
share or download files yet, since it's currently in an early alpha stage. So
what I<does> it do?
=over
=item * Connect to ADC and NMDC hubs
=item * User list management
=item * Chatting and private messaging
=back
Those features already make it perfectly suitable for writing chat-only bots
and interfaces.
=head2 Usage
Globster isn't particularly hard to use, but usage documentation is currently a
bit lacking. I have every intention to fix that, but for now, you're encouraged
to join the development hub and bug me for help: C<adc://dc.blicky.net:2780/>.
I did already write some
L<API documentation|https://dev.yorhel.nl/globster/api>.
There are at this point not many scripts or interfaces available for Globster:
=over
=item * L<globsterctl|https://dev.yorhel.nl/globster/ctl> - A control script for the daemon, included in the git repo.
=item * L<globster-feedspam.pl|http://p.blicky.net/0z9uw> - An RSS / Atom notification script.
=item * L<globster-mhc.pl|http://p.blicky.net/8y8mv> - A hub chat link script. More useful as an example than anything else.
=item * L<globgraph|http://p.blicky.net/qvg59> - Munin plugin to monitor Direct Connect hubs.
=back
There's more to come. I'd love to have at least a convenient console client (a
weechat or irssi plugin? An ncdc fork?) and perhaps a web-based interface. But
other stuff is welcome, too. Who's going to write all that, you ask? Erm...
well... You, perhaps? :-)
=head2 Final notes
As you've come to expect from me I<(right?)>, Globster is entirely written in C
and available under a liberal MIT license.
Globster incorporates code from
L<libev|http://software.schmorp.de/pkg/libev.html>,
L<freetiger|http://klondike.es/freetiger/>,
L<klib|https://github.com/attractivechaos/klib> and
L<ylib|https://dev.yorhel.nl/ylib>.
Additionally, L<autoconf-lean|https://bitbucket.org/GregorR/autoconf-lean> is
used to keep the configure script fast.

View file

@ -1 +0,0 @@
../../globster/doc/api.pod

View file

@ -1 +0,0 @@
../../globster/doc/globsterctl.pod

View file

@ -1 +0,0 @@
../../globster/doc/globster.pod

View file

@ -1 +0,0 @@
../../globster/doc/globster-launch.pod

105
dat/globster.md Normal file
View file

@ -0,0 +1,105 @@
% The Globster Direct Connect Client
<b style="color: #f00">Project Abandoned</b><br>
I've stopped development on Globster. I still believe the overall idea and
architecture of Globster are good, and the DC community would definitely
benefit from a remotely controllable client, but Globster in its current form
wasn't going in the direction I wanted it to. I might restart the project
from scratch (yet again) in the future, but for now... it's as dead as a cute
zombie whale.
<div style="width: 600px; height: 227px; background-image: url(/img/globster.png); margin-bottom: -30px">
<b style="font-size: 14px; position: relative; left: 150px; top: 10px">The Globster What?</b>
<p style="position: relative; left: 150px; top: 20px; width: 420px; text-align: right">
Globster is an efficient file sharing client for the Direct Connect<br>
network. It runs as a background daemon and provides<br>
a convenient and high-level D-Bus API, making<br>
it easy to write scripts, bots and user<br>
interfaces for Direct Connect.
</p></div>
# Adopt your own Globster
## Download
There are no tarballs at the moment. You'll have to get it from the git repo:
git clone --recursive git://g.blicky.net/globster.git
cd globster
autoreconf -i
./configure
make
sudo make install
When doing a `git pull` to update your version later on, make sure to follow
up with a `git submodule update` to get the right dependencies, too.
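In other words, an update boils down to:

    git pull
    git submodule update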
The git repo is available for
[online browsing](https://g.blicky.net/globster.git/).
## Requirements
Globster can be compiled with a (moderately recent) GCC or clang. You'll need
the following libraries: [libdbus](http://dbus.freedesktop.org/),
[GnuTLS](http://gnutls.org) and [zlib](http://zlib.net/). If your GnuTLS is too
old (<= 2.12), you also need libgcrypt. The globsterctl script requires Perl
and the Net::DBus module.
On Debian and Ubuntu, that boils down to the following:
apt-get install git make gcc libc-dev automake autoconf \
pkg-config libdbus-1-dev libgnutls-dev libnet-dbus-perl
And for Arch Linux:
pacman -S base-devel git perl-net-dbus
I've only tested things on Linux (glibc and [musl](http://www.musl-libc.org)),
but I intend to support more kinda-sane POSIX systems in the future as well.
Globster will no doubt require some more libraries as more basic features are
being implemented. And, yes, _of course_ we will get static binaries!
## Status
Remember when I called Globster a "file sharing" client? I lied. It doesn't
share or download files yet, since it's currently in an early alpha stage. So
what _does_ it do?
- Connect to ADC and NMDC hubs
- User list management
- Chatting and private messaging
Those features already make it perfectly suitable for writing chat-only bots
and interfaces.
## Usage
Globster isn't particularly hard to use, but usage documentation is currently a
bit lacking. I have every intention to fix that, but for now, you're encouraged
to join the development hub and bug me for help: `adc://dc.blicky.net:2780/`.
I did already write some [API documentation](/globster/api).
There are at this point not many scripts or interfaces available for Globster:
- [globsterctl](/globster/ctl) - A control script for the daemon, included in the git repo.
- [globster-feedspam.pl](http://p.blicky.net/0z9uw) - An RSS / Atom notification script.
- [globster-mhc.pl](http://p.blicky.net/8y8mv) - A hub chat link script. More useful as an example than anything else.
- [globgraph](http://p.blicky.net/qvg59) - Munin plugin to monitor Direct Connect hubs.
There's more to come. I'd love to have at least a convenient console client (a
weechat or irssi plugin? An ncdc fork?) and perhaps a web-based interface. But
other stuff is welcome, too. Who's going to write all that, you ask? Erm...
well... You, perhaps? :-)
## Final notes
As you've come to expect from me _(right?)_, Globster is entirely written in C
and available under a liberal MIT license.
Globster incorporates code from
[libev](http://software.schmorp.de/pkg/libev.html),
[freetiger](http://klondike.es/freetiger/),
[klib](https://github.com/attractivechaos/klib) and
[ylib](https://dev.yorhel.nl/ylib).
Additionally, [autoconf-lean](https://bitbucket.org/GregorR/autoconf-lean) is
used to keep the configure script fast.

140
dat/ncdc
View file

@ -1,140 +0,0 @@
=pod
Ncdc is a modern and lightweight direct connect client with a friendly
ncurses interface.
=head2 Get ncdc!
=over
=item Latest version
1.20 ([dllink ncdc-1.20.tar.gz download]
- L<changes|https://dev.yorhel.nl/ncdc/changes>)
Convenient static binaries for Linux:
L<64-bit|https://dev.yorhel.nl/download/ncdc-linux-x86_64-1.20-6-g5111a.tar.gz> -
L<32-bit|https://dev.yorhel.nl/download/ncdc-linux-i486-1.20-6-g5111a.tar.gz> -
L<ARM|https://dev.yorhel.nl/download/ncdc-linux-arm-1.20-6-g5111a.tar.gz>. Check the
L<installation instructions|https://dev.yorhel.nl/ncdc/install> for more info.
=item Development version
The latest development version is available from git and can be cloned using
C<git clone git://g.blicky.net/ncdc.git>. The repository is available for
L<online browsing|http://g.blicky.net/ncdc.git/>.
=item Requirements
The following libraries are required: ncurses, zlib, bzip2, sqlite3, glib2 and
gnutls.
Ncdc is entirely written in C and available under a liberal MIT license.
=item Community
[html]
L<Bug tracker|https://dev.yorhel.nl/ncdc/bug> - For bug reports, feature requests and patches.<br />
C<adcs://dc.blicky.net:2780/> - For real-time chat.
=item Packages and ports
Are available for the following systems:
L<Arch Linux|https://aur.archlinux.org/packages/ncdc/> -
L<Fedora|https://apps.fedoraproject.org/packages/ncdc/overview/> -
L<FreeBSD|http://www.freshports.org/net-p2p/ncdc/> -
L<Frugalware|http://frugalware.org/packages?srch=ncdc&op=pkg&arch=all&ver=all> -
L<Gentoo|http://packages.gentoo.org/package/net-p2p/ncdc> -
L<GNU Guix|https://www.gnu.org/software/guix/package-list.html> -
L<Homebrew|http://braumeister.org/formula/ncdc> -
L<OpenSUSE|http://packman.links2linux.org/package/ncdc> -
L<Source Mage|http://download.sourcemage.org/grimoire/codex/test/ftp/ncdc/>
I have a few old packages on the L<Open Build
Service|https://build.opensuse.org/package/show/home:yorhel/ncdc>,
but these are unmaintained. The static binaries are preferred.
A convenient installer is available for
L<Android|http://code.ivysaur.me/ncdcinstaller.html>.
=back
=head2 Features
Common features all modern DC clients (should) have:
=over
=item * Connecting to multiple hubs at the same time,
=item * Support for both ADC and NMDC protocols,
=item * Chatting and private messaging,
=item * Browsing the user list of a connected hub,
=item * Share management and file uploading,
=item * Connections and download queue management,
=item * File list browsing,
=item * TTH-checked, multi-source and segmented file downloading,
=item * Searching for files,
=item * Secure hub (adcs:// and nmdcs://) and client connections on both protocols,
=item * Bandwidth throttling,
=item * IPv6 support.
=back
And special features not commonly found in other clients:
=over
=item * Different connection settings for each hub,
=item * Encrypted UDP messages (ADC SUDP),
=item * Subdirectory refreshing,
=item * Nick notification and highlighting in chat windows,
=item * Trust on First Use for TLS-enabled hubs,
=item * A single listen port for both TLS and TCP connections,
=item * Efficient file uploads using sendfile(),
=item * Large file lists are opened in a background thread,
=item * Doesn't trash your OS file cache (with the flush_file_cache option enabled),
=item * (Relatively...) low memory usage.
=back
=head2 What doesn't ncdc do?
Since the above list is getting larger and larger every time, it may be more
interesting to list a few features that are (relatively) common in other DC
clients, but which ncdc doesn't do. Yet.
=over
=item * NAT Traversal,
=item * OP features (e.g. client detection, file list scanning and other useful stuff for OPs),
=item * SOCKS support.
=back
Of course, there are many more features that could be implemented or improved.
These will all be addressed in later versions (hopefully :).

View file

@ -1,396 +0,0 @@
1.20 - 2016-12-30
- Support bracketed paste mode in input handling (cologic)
- Add 'geoip_cc4' and 'geoip_cc6' settings
- Add 'log_hubchat' setting
- Add 'local' option to 'active_ip' setting
- Add support for multistream bzip2 filelists
- Disable RC4 ciphers by default from tls_priority
- Fix potential null pointer dereference
- Fix chmod of destination directories (Johannes Beisswenger)
1.19.1 - 2014-04-23
- Fix remote null pointer dereference
- Searching now works in the search results list
- Fix possible file corruption when moving file to destination
- Fix error handling when finalizing a file download
- Fix downloading of 0-byte files
- Fix extremely slow /gc
- Fix sendfile() with large files on 32-bit Linux
- Fix minor display issue with multicolumn characters
1.19 - 2014-02-11
- Add search functionality to the file browser and user list (/,. keys)
- Add geoip support (requires --with-geoip at configure)
- Add 'download_segment' setting to change minimum segment size
- Log hashing progress to stderr.log
- Fix three (potential) security vulnerabilities
- Fix downloading of file lists when other user has no free slots
1.18.1 - 2013-10-05
- Fix crash when downloading files from multiple sources
- Use the yxml library to parse files.xml.bz2 files
- Fix various XML conformance bugs in parsing files.xml.bz2 files
1.18 - 2013-09-25
- Add support for segmented downloading
- Support $MyINFO without flags byte on NMDC hubs
- Don't require pod2man on build
- Fix tab-completion of nick names when full nick is specified
- Fix cursor position on selected line in listings
- Fix bug with schema-less /connect
1.17 - 2013-06-15
- Add 'q' key to user list for matching a user's files with download queue
- Add transfers.log format documentation to manual page
- Consider non-alphanumeric characters as word separators in input line
- Fix outgoing UDP messages to respect local_address setting
- Fix Alt+Backspace on xterm-like terminals
- Fix handling of "." and ".." file/directory names in files.xml.bz2
- Fix possible crash when receiving unexpected encrypted search results
- Fix sendfile() handling to use fallback on EOVERFLOW
- Fix possible crash when logging UDP messages
1.16.1 - 2013-03-23
- Fix crash when opening connection on ADC in passive mode
- Fix documentation of 'd' key in download_exclude setting
1.16 - 2013-03-21
- List of granted users is now remembered across restarts
- Don't throttle users who are granted a slot
- Support CIDs of variable size on ADC
- Log, but otherwise ignore, DSTA messages on ADC
- Fix possible crash with graceful disconnect on C-C connections
- Fix bug with enabling active mode when active_ip is set
- Fix reporting of active mode on NMDC hubs
- Fix bug with the 'X' key on the queue tab
- Fix idle disconnect timeout when a file transfer is active
1.15 - 2013-03-02
- IPv6 support
- Significantly shorten certificate creation time with old GnuTLS versions
- Always enable tls_policy and sudp_policy by default
- Link against libgcrypt if detected GnuTLS is older than 3.0
- Add color_tab_active setting
- Remove active_tls_port setting
- Allow '-', '.' and '_' characters in hub names
- Allow spaces before a command
- Add Alt+backspace as alias for Ctrl+w
- Add throttle for 'CGET tthl' requests
- Don't throw away PMs from unknown users
- Recognize mode field in $MyINFO without tag
- Fix possible crash with C-C TLS and old GnuTLS versions
- Fix old references to the removed ncdc-db-upgrade utility
- Fix loading of file lists from Shareaza 2.6.0.0 and earlier
- Fix handling of tab and carriage return in log window
- Fix changing of download_dir/incoming_dir if either dir has been deleted
- Fix compilation against glib < 2.26
- Fix unclean C-C TLS disconnect on timeout
1.14 - 2012-11-04
- Added BLOM support for ADC ('/hset adc_blom true' to enable it)
- Added section on connection settings to man page
- Fix incorrect char signedness assumption on ARM
- Fix possible crash when downloading small files
- Fix hub counts reported to the hub on login on ADC
- Fix local time display issue when built against musl (0.9.6)
- Removed legacy ncdc-db-upgrade utility
1.13 - 2012-08-16
- zlib library added as a required dependency
- Purge empty directories from share by default
- Added "share_emptydirs" setting
- Disable tls_policy by default when using an old GnuTLS version
- Improved support for group chat
- Honor G_FILENAME_ENCODING for path autocomplete, /share and queued files
- Use a default connection string on NMDC if no 'connection' has been set
- Support ZLIG for partial file list transfers on ADC
- Send more subdirectories in partial file list transfers
- Removed use of system-provided realpath()
- Don't allow /search with an empty string
- Fix segfault on /search command without query
- Fix display of 'sudp_policy' setting if SUDP is not supported
- Fix --enable-git-version when cross-compiling
1.12 - 2012-07-10
- Don't follow symlinks in share by default
- Added 'share_symlink' option
- Added bell notification and 'notify_bell' option
- Added 'sudp_policy' setting
- List all configured hubs on '/open'
- Added '/delhub' command to remove hub configuration
- Added filtering options to connections tab
- Added TLS support indication to user list
- Added Alt+a key to cycle through tabs with recent activity
- Allow binding to ports below 1024
- Add space after autocompleting a command
- Fix uploading chunks of 2GiB and larger (bug #12)
- Fix bug with duplicate directory detection in '/share'
- Fix display of timer on search tab
- ADC: Use shorter search token to save some bandwidth
- Various attempts at cleaning up some code
1.11 - 2012-05-15
- Drop libxml2 in favour of custom XML parser & writer
- Allow using a single listen port for TCP and TLS
- Added support for encrypted UDP messages (ADC SUDP)
- Included 'makeheaders' in the distribution
- Removed GNU-specific extensions from the Makefile
- Fix /disconnect to cancel automatic reconnect
- Fix loading of file lists with invalid UTF-8 sequences
- Fix ncurses detection on OpenIndiana
- Fix use of TLS in passive mode on ADC
- Fix configure warning when git could not be found
1.10 - 2012-05-03
- Rewrote network backend to use plain sockets instead of GIO
- Added GnuTLS as required dependency
- Removed GIO and glib-networking dependencies
- Removed 'ncdc-gen-cert' utility - ncdc can now generate certs by itself
- Enable client-to-client TLS by default
- Added 'tls_priority' setting
- Added 'reconnect_timeout' setting
- Don't quit ncdc on Ctrl+C
- Display age of file list in the title bar
- Don't build the 'ncdc-db-upgrade' tool by default
- Switched to a single top-level Makefile
- Fix '/browse user -f' ('-f' argument after username)
- Fix hub login when it checks for public hubs = 0
- Fix overflow of long tab titles
- Fix loading of microdc2-generated file lists
- Fix loading of file lists with an invalid character
- Fix occasional crash when TLS is enabled
- Fix transfer rate indication and limiting with TLS connections
- Fix small memory leak when 'upload_rate' is set
1.9 - 2012-03-14
- Allow all 'active_' settings to be changed on a per-hub basis
- Allow 'active_ip' to be unset and automatically get IP from hub
- Added 'active_udp_port' and 'active_tcp_port' settings
- Renamed 'active_bind' to 'local_address' and use it for outgoing
connections as well
- Display connection settings in hub info bar
- Added '/listen' command to display currently used ports
- Don't listen on TLS port when tls_policy is disabled
- Added 'disconnect_offline' setting
- Display '(global)' indicator when showing /hset variables
- Don't strip whitespace from /say
- Don't allow directory separator as /share name
- Allow 'global.' and '#hubname.' prefix for /set keys
- Fix display of long IP addresses on user list
1.8 - 2012-02-13
- Added bandwidth limiting (upload_rate and download_rate settings)
- Added hash speed limiting (hash_rate setting)
- Added 'm' key to connection tab to /msg selected user
- Disable client-to-client TLS by default
- Don't throw away some search results on NMDC
- (Partially) fixed uploading of >2GB chunks
- Fixed file descriptor leak when using the backlog feature
- Fixed crash when opening invalid filelist from search twice
- Use POD for the manual pages
- Minor typo fixes
1.7 - 2011-12-30
- Split /set command in a /set (global) and /hset (hub)
- File downloads are performed in a background thread
- Added glob-style matching on /set and /hset keys
- Added UTF-8 locale check
- Added 'sendfile' setting
- Added finer granularity for the flush_file_cache setting
- Allow flush_file_cache to be enabled for downloads
- Fix sending of $MyINFO with wrong public hub count
- Fix incorrect inclusion of gdbm.h
1.6 - 2011-12-07
- Use SQLite3 for storage instead of GDBM
- Converted config.ini to SQLite3 database
- Added ncdc-db-upgrade utility
- Session directory is architecture-independent
- All data is safe against crashes and power failures
- Added support for removing/adding directories without rehashing
- Always match every file list on 'Q' key on TTH search
- Immediately flush log entries to the kernel
- Faster start-up
- Added support for per-hub 'active_ip' settings
- Allow interval notation when setting autorefresh
- Broadcast SF (number of shared files) on ADC hubs
- Combine TTH data for downloaded files to blocks of at least 1MiB
- Increased hash buffer size (10KiB -> 512KiB)
- Fix case-insensitivity of search results
- Fix reporting of user state in pm tabs at hub disconnect
- Fix generation of client certificates with openssl
- Fix segfault with duplicate users on an ADC hub
- Fix segfault when opening of a filelist fails
- Fix base32 decoding bug (fixes login sequence on some ADC hubs)
1.5 - 2011-11-03
- Added filelist_maxage setting
- Added flush_file_cache setting
- Added /ungrant and improved /grant management
- Added key to download queue to clear user state for all files
- Added keys to search results to download file list and match queue
- Select the right user when using the 'q' key in connection tab
- Fixed possible crash when opening file list from search results
- Fixed detection of incompatible session directory version
1.4 - 2011-10-26
- Added sorting functionality to file list
- Added color settings: title, separator, list_default, list_header and
list_select
- Added "blink" color attribute
- Allow /disconnect to be used on the main tab
- Display number of matched and added items when using match queue feature
- Use git-describe to create a version string, if available
- Decreased memory usage for large file lists
- Handle duplicate filenames in other users' file list
- Fixed incorrect setting of the "Incomplete" flag in files.xml.bz2
- Fixed handling of the PM param in MSG commands on ADC
- Fixed user change notifications for PM tabs
1.3 - 2011-10-14
- Added multi-source downloading
- Added user information view and management keys to download queue tab
- Added "search for alternative" key to queue, file browser and search tabs
- Added "match queue" key to file browser and search tabs
- Added ui_time_format setting
- Added chat_only setting
- Changed default value of color_log_time to dark grey
- Improved tracking of a parent for each tab
- Improved portability for Solaris
- Fixed crash when closing a hub tab while it is connecting
- Fixed crash when auto-completing settings without auto-completion
- Fixed bug with file name display if download_dir ends with a slash
- Fixed bug with uploading chunks larger than 2GiB
- Fixed handling of directory search results on ADC
1.2 - 2011-09-25
- Fixed incorrect handling of outgoing NMDC connections
1.1 - 2011-09-25
- Select item in file browser when opened from a search result
- Added active_bind setting
- Added share_exclude setting
- Added download_exclude setting
- Added incoming_dir setting
- Added autocompletion for the previous values of certain settings
- Allow the "connection" setting to be used for ADC as well
- Added IP column to user list
- Allow sorting on description, email, tag and IP columns in user list
- Display upload speeds in the user list of an ADC hub
- Added TLS indication to connection list
- Mark selected items bold in listings
- Allow /reconnect on the main tab to reconnect all hubs
- Added slash to base path in partial file lists
- Added delay of 5 seconds before reconnecting to a hub
- Added recognition of the AP param on ADC
- Added support for UserIP2 on NMDC
- Removed support for unexpected incoming NMDC connections
1.0 - 2011-09-16
- Added ncdc(1) and ncdc-gen-cert(1) manual pages
- Documented settings (/help set <setting>)
- Documented key bindings (/help keys)
- Improved line wrapping algorithm for the log window
- Added support for client-to-client TLS on NMDC
- Added support for the CGFI command on ADC
- Throttle GET requests on the same file + offset
- Fixed glib assertion failure when disabling active mode
- Fixed downloading from clients using $ADCSND with -1 bytes
- Fixed race condition in file uploading code
- Fixed idle time calculation while connecting to another client
- Properly include unistd.h in dl.c
0.9 - 2011-09-03
- Added TLS support (adcs://, nmdcs://, and ADC client-to-client)
- Added tls_policy setting
- Added KEYP support for ADC
- Added warning when a hub changes TLS certificate
- Display exact listen ports when enabling active mode
0.8 - 2011-08-26
- Added transfer log
- Added log_downloads and log_uploads settings
- Added day changed indicators to the log windows
- Added common readline keys to the text input box
- Changed /refresh shortcut from Ctrl+e/u to Alt+r
- Allow join messages to work even when the join completion detection fails
- Select parent tab when closing a userlist, PM or filelist tab
- Re-open log files when receiving SIGUSR1
- Perform a clean shutdown when the terminal is closed
- Fixed bug in formatting the title of a /search tab
- Fixed log indent for non-ASCII nicks
- Fixed log highlighting and indenting for /me messages
0.7 - 2011-08-17
- Added word wrapping for the log window
- Added basic colors and nick highlighting to the log window
- Allow colors to be changed with the /set command
- Added backlog feature and setting
- Added silent building to the configure script
- Automatically re-open log files when they are moved/truncated externally
- Accept 'nmdc://' URLs as alternative to 'dchub://'
- Fixed spamming of useless $MyINFO and BINF commands every 5 minutes
- Fixed minor memory leak when closing/clearing the log window
0.6 - 2011-08-08
- Added file searching, through a /search command
- Added tab to display the search results
- Listen for incoming messages on UDP in active mode
- Allow specifying a hub address with /open
- Fixed case-sensitivity of shared files
- Various bugfixes and other improvements
0.5 - 2011-08-02
- Downloaded files are now TTH-checked
- Added download queue priorities
- Download queue items are automatically disabled on error
- Improved error handling and reporting for downloads
- Added download_slots setting
- Use a separate thread to load other users' file list
- Improved /gc to also clean up download queue related data
- Decreased memory usage for large file lists
- Improved error handling with sendfile()
- Fixed downloading in passive mode on ADC hubs
- Fixed adding a dir to the download queue while connected to the user
- Fixed segfault when the userlist is open while disconnecting from a hub
0.4 - 2011-07-23
- Added file downloading support
WARNING: Downloaded files are not TTH checked at this moment.
- Added persistent download queue
- Added busy indicators on start-up and with /gc
- Added download speed indicator to status bar
- Improved connection list interface
- Improved performance of UI message handling
- Fixed a remote crash
- Fixed incorrect reporting of hub counters
0.3 - 2011-07-15
- Added file list browser
- Added downloading of other people's file list
- Added 'hubname' setting to rename hub tabs
- Added -v, -c and -n commandline options
- Added -n option to /open to prevent an autoconnect
- Added referer notification
- Improved handling of some ADC commands
- Improved logging of debug messages
- Fixed error when uploading an empty file list
- Fixed display of join/quits on ADC hubs
- Fixed several crashes
0.2 - 2011-06-27
- ADC support
- Added slot granting and /grant command
- Added /kick (for NMDC hubs)
- Added /pm and /nick aliases
- Added support for passworded login
- Added /me command (mostly useful for ADC hubs)
- Added /whois command
- Added 'share_hidden' option (default: false)
- Improved minislots support
- Added 'minislots' and 'minislot_size' options
- Slightly improved user list and connection list
- /set displays default values for unset options
0.1 - 2011-06-20
Initial version

View file

@ -1,201 +0,0 @@
=head1 General instructions
=head2 Building from source
In theory, the following instructions should work everywhere:
=over
=item * Install the required dependencies: ncurses, bzip2, zlib, sqlite3, glib2 and gnutls,
=item * Download and extract the source tarball from the L<homepage|https://dev.yorhel.nl/ncdc>,
=item * C<./configure>
=item * C<make>
=item * And then run C<make install> with superuser permissions.
=back
In practice, however, this does not always work and may not always be the
prefered method of installation. On this page I try to collect instructions for
each OS and distribution to make the installation process a bit easier for
everyone.
If your system is missing from this page or if you're still having trouble,
don't hesitate to join the support hub at C<adc://dc.blicky.net/> or send me a
mail at L<projects@yorhel.nl|mailto:projects@yorhel.nl>. Contributions to this
page are of course highly welcomed as well. :-)
=head2 Statically linked binaries
If you just want to get ncdc running without going through the trouble of
compiling and/or installing it, I also offer statically linked binaries:
=over
=item * L<Linux, 64-bit|https://dev.yorhel.nl/download/ncdc-linux-x86_64-1.20-6-g5111a.tar.gz>
=item * L<Linux, 32-bit|https://dev.yorhel.nl/download/ncdc-linux-i486-1.20-6-g5111a.tar.gz>
=item * L<Linux, ARM|https://dev.yorhel.nl/download/ncdc-linux-arm-1.20-6-g5111a.tar.gz>
=back
To use them, simply download and extract the tarball, and then run C<./ncdc> on
the command line.
The binaries include all the required dependencies and are linked against
L<musl|http://www.etalabs.net/musl/>, so they should run on any Linux machine
with the right architecture. If you want binaries for an other OS or
architecture, please bug me and I'll see what I can do.
=head1 System-specific instructions
=head2 Android
A L<convenient installer|http://code.ivysaur.me/ncdcinstaller.html> is
available for Android 2.3 and later, which makes use of the static binary.
=head2 Arch Linux
Ncdc is available on L<AUR|https://aur.archlinux.org/packages/ncdc/>, to
install it you can use your favorite AUR-installer. If you don't have a
favorite, go for the manual approach:
wget https://aur.archlinux.org/cgit/aur.git/snapshot/ncdc.tar.gz
tar -xf ncdc.tar.gz
cd ncdc
makepkg -si
=head2 Fedora
There's a L<package|https://apps.fedoraproject.org/packages/ncdc/overview/>
available for Fedora.
=head2 FreeBSD
Ncdc is available in the Ports Collection. To install, L<make sure your
collection is
up-to-date|http://www.freebsd.org/doc/en_US.ISO8859-1/books/handbook/ports-using.html>
and install the Port as any other:
cd /usr/ports/net-p2p/ncdc
make install clean
=head2 Gentoo
Ncdc is available in the Portage tree, so installation is trivial:
emerge ncdc
=head2 Mac OS X
Ncdc is available in L<Homebrew|http://braumeister.org/formula/ncdc>.
=head2 OpenIndiana
This has been tested on OpenIndiana Build 151a Server, but may work on other
versions as well. Compiling from source is your only option at the moment.
First install some required packages (as root):
pkg install gcc-3 glib2 gnutls gettext header-math perl-510/extra
Then, fetch the ncdc source tarball, extract and build as follows:
wget https://dev.yorhel.nl/download/ncdc-1.20.tar.gz
tar -xf ncdc-1.20.tar.gz
cd ncdc-1.20
export PATH="$PATH:/usr/perl5/5.10.0/bin"
./configure --prefix=/usr LDFLAGS='-L/usr/gnu/lib -R/usr/gnu/lib'
make
And finally, to actually install ncdc, run C<make install> as root. You can
safely revert C<$PATH> back to its previous value if you wish; it was only
necessary in order for C<./configure> and C<make> to find C<pod2man>.
=head2 OpenSUSE
Get the package from L<PackMan|http://packman.links2linux.org/package/ncdc>:
Select your openSUSE release and hit the "1 click install" button.
=head2 Ubuntu & Debian
The preferred way of installing ncdc on Ubuntu or Debian is to use the static
binaries provided above.
Alternatively, you can also try to compile ncdc from source. To do so, first
install the required libraries:
sudo apt-get install libbz2-dev libsqlite3-dev libncurses5-dev \
libncursesw5-dev libglib2.0-dev libgnutls-dev zlib1g-dev
Then run the following commands to download and install ncdc:
wget https://dev.yorhel.nl/download/ncdc-1.20.tar.gz
tar -xf ncdc-1.20.tar.gz
cd ncdc-1.20
./configure --prefix=/usr
make
sudo make install
=head2 Windows (Cygwin)
Surprisingly enough, ncdc can be used even on Windows, thanks to Cygwin. If
you haven't done so already, get C<setup.exe> from the L<Cygwin
website|http://cygwin.com/> and use it to install the following packages:
=over
=item * make
=item * gcc4
=item * perl
=item * pkg-config
=item * wget
=item * zlib-devel
=item * libncursesw-devel
=item * libbz2-devel
=item * libglib2.0-devel
=item * libsqlite3-devel
=item * gnutls-devel
=back
Then open a Cygwin terminal and run the following commands to download,
compile, and install ncdc:
wget https://dev.yorhel.nl/download/ncdc-1.20.tar.gz
tar -xf ncdc-1.20.tar.gz
cd ncdc-1.20
./configure --prefix=/usr
make install

View file

@ -1,936 +0,0 @@
=head1 NAME
ncdc - Ncurses Direct Connect Client
=head1 SYNOPSIS
ncdc [options]
=head1 DESCRIPTION
Ncdc is a modern and lightweight direct connect client with a friendly ncurses
interface.
=head1 GETTING STARTED
This is a basic introduction for those who are new to ncdc. See the chapters
below for a more detailed description of the available functionality.
What you see when starting up ncdc is an input line where you can input
commands and a log window where the results are displayed, much like a regular
terminal. Commands within ncdc start with a slash (e.g. C</help>) and have tab
completion to help you.
The first thing you will want to do after starting ncdc for the first time is
to setup some basic information and settings:
/set nick MyNick
/set description ncdc is awesome!
/set connection 10
/share "My Awesome Files" /path/to/files
And if you have a direct connection to the internet or if your router allows
port forwarding, you may also want to enable active mode:
/set active_port 34194
/set active true
See the help text for each of the commands and settings for more information.
Of course, all of the above settings are saved to the database and will be used
again on the next run.
To connect to a hub, use /open:
/open ncdc adc://dc.blicky.net:2780/
Here I<ncdc> is the personal name you give to the hub, and the second argument is
the URL. This URL will be saved in the database, so the next time you want to
connect to this hub, you can simply do C</open ncdc>. See the help text for
C</open> and C</connect> for more information. If you want to automatically
connect to a hub when ncdc starts up, use the C<autoconnect> setting.
Ncdc uses a tabbed interface: every hub opens in a new tab, and there are
several other kinds of tabs available as well. The type of tab is indicated in
the tab list on the bottom of the screen with a character prefix. Hubs, for
example, are prefixed with a C<#>. If a tab needs your attention, a colored
exclamation mark is displayed before the tab name; different colors are used
for different types of activity.
Everything else should be fairly self-explanatory: To search for files, use the
C</search> command. To browse through the user list of a hub, use C</userlist> or
hit Alt+u. To browse someone's file list, use C</browse> or hit the 'b' key in
the user list. And to monitor your upload and download connections, use
C</connections> or hit Alt+n.
=head1 OPTIONS
=over
=item B<-c, --session-dir=> I<dir>
Use a different session directory. Defaults to the contents of the environment
variable `$NCDC_DIR', or, if that is unset, to `$HOME/.ncdc'.
=item B<-h, --help>
Display summary of options.
=item B<-n, --no-autoconnect>
Don't automatically connect to hubs with the C<autoconnect> option set.
=item B<--no-bracketed-paste>
Disable bracketed pasting.
=item B<-v, --version>
Display ncdc version.
=back
=head1 GETTING CONNECTED
As with most file sharing clients, ncdc supports two modes of being connected:
I<active> and I<passive>. In passive mode (the default), you can connect to the
outside world but nobody can connect (directly) to you. When passive, you will
only be able to transfer files with people who are in active mode. In active
mode, however, you will have some port open to the rest of the network to which
other clients can connect. When active, you will be able to transfer files with
everyone and you may get more and faster search results. Configuring active
mode is therefore recommended.
In many setups, all you need to do to switch to active mode is to set a TCP/UDP
port and enable the C<active> setting:
/set active_port 34194
/set active true
When you connect to a hub, the status bar will tell you whether you are active
or passive on that particular hub, and what IP address is being used to allow
others to connect to you. For most hubs, your IP address will be detected
automatically, but in the event that this fails, you can also set it yourself:
/set active_ip 13.33.33.7
If you are behind a NAT or firewall, you have to ensure that the port you
configured is somehow allowed and/or forwarded. The C<active_port> setting is
used for incoming TCP connections and UDP messages. You can configure a
different UDP port with the C<active_udp_port> setting. Contrary to many other
Direct Connect clients, ncdc only uses a single port for incoming TCP and TLS
connections; there is no separate port for TLS.
The C</listen> command can tell you which ports it expects to be forwarded, and
for which hubs these ports will be used. It only lists hubs on which you are
currently active, so the output will change when you open or close a hub
connection.
If you have multiple network interfaces, you can force ncdc to use only a
single interface by setting the C<local_address> setting to the address of that
interface. This affects both outgoing connections (they will be forced to go
through the configured interface) and incoming connections (the ports will be
bound to the configured interface).
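For example, to bind all connections to one (hypothetical) LAN address:

    /set local_address 192.168.1.5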
All of the previously mentioned settings can be set globally (with C</set>) and
on a per-hub basis (with C</hset>). This allows you to be active on an internet
hub and a LAN-only hub at the same time. It also allows you to be active in one
hub while passive in another, or to use different ports for each hub.
=head1 INTERACTIVE COMMANDS
The following is the list of commands that can be used within ncdc. The /help
command can also be used get a list of available commands and to access this
documentation.
=over
=item B</accept>
Use this command to accept the TLS certificate of a hub. It is only needed when the keyprint of the hub's TLS certificate does not match the keyprint stored in the database.
=item B</browse> [[-f] <user>]
Without arguments, this opens a new tab where you can browse your own file list. Note that changes to your list are not immediately visible in the browser. You need to re-open the tab to get the latest version of your list.
With arguments, the file list of the specified user will be downloaded (if it has not been downloaded already) and the browse tab will open once it's complete. The `-f' flag can be used to force the file list to be (re-)downloaded.
=item B</clear>
Clears the log displayed on the screen. Does not affect the log files in any way. Ctrl+l is a shortcut for this command.
=item B</close>
Close the current tab. When closing a hub tab, you will be disconnected from the hub and all related userlist and PM tabs will also be closed. Alt+c is a shortcut for this command.
=item B</connect> [<address>]
Initiate a connection with a hub. If no address is specified, will connect to the hub you last used on the current tab. The address should be in the form of `protocol://host:port/' or `host:port'. The `:port' part is in both cases optional and defaults to :411. The following protocols are recognized: dchub, nmdc, nmdcs, adc, adcs. When connecting to an nmdcs or adcs hub and the SHA256 keyprint is known, you can attach this to the url as `?kp=SHA256/<base32-encoded-keyprint>'
Note that this command can only be used on hub tabs. If you want to open a new connection to a hub, you need to use /open first. For example:
/open testhub
/connect dchub://dc.some-test-hub.com/
See the /open command for more information.
=item B</connections>
Open the connections tab.
=item B</delhub> <name>
Remove a hub from the configuration
=item B</disconnect>
Disconnect from a hub.
=item B</gc>
Cleans up unused data and reorganizes existing data to allow more efficient storage and usage. Currently, this command removes unused hash data, does a VACUUM on db.sqlite3, removes unused files in inc/ and old files in fl/.
This command may take some time to complete, and will fully block ncdc while it is running. It is recommended to run this command every once in a while. Every month is a good interval. Note that when ncdc says that it has completed this command, it's lying to you. Ncdc will still run a few large queries in the background, which may take up to a minute to complete.
=item B</grant> [-list|<user>]
Grant someone a slot. This allows the user to download from you even if you have no free slots. The slot will remain granted until the /ungrant command is used, even if ncdc has been restarted in the meantime.
To get a list of users whom you have granted a slot, use `/grant' without arguments or with `-list'. Be warned that using `/grant' without arguments on a PM tab will grant the slot to the user you are talking with. Make sure to use `-list' in that case.
Note that a granted slot is specific to a single hub. If the same user is also on other hubs, he/she will not be granted a slot on those hubs.
=item B</help> [<command>|set <key>|keys [<section>]]
To get a list of available commands, use /help without arguments.
To get information on a particular command, use /help <command>.
To get information on a configuration setting, use /help set <setting>.
To get help on key bindings, use /help keys.
=item B</hset> [<key> [<value>]]
Get or set per-hub configuration variables. Works like the `/set' command, but can only be used on hub tabs. Use `/hunset' to reset a variable back to its global value.
=item B</hunset> [<key>]
This command can be used to reset a per-hub configuration variable back to its global value.
=item B</kick> <user>
Kick a user from the hub. This command only works on NMDC hubs, and you need to be an OP to be able to use it.
=item B</listen>
List currently opened ports.
=item B</me> <message>
This allows you to talk in third person. Most clients will display your message as something like:
** Nick is doing something
Note that this command only works correctly on ADC hubs. The NMDC protocol does not have this feature, and your message will be sent as-is, including the /me.
=item B</msg> <user> [<message>]
Send a private message to a user on the currently opened hub. If no message is given, the tab will be opened but no message will be sent.
=item B</nick> [<nick>]
Alias for `/hset nick' on hub tabs, and `/set nick' otherwise.
=item B</open> [-n] [<name>] [<address>]
Without arguments, list all hubs known by the current configuration. Otherwise, this opens a new tab to use for a hub. The name is a (short) personal name you use to identify the hub, and will be used for storing hub-specific configuration.
If you have specified an address or have previously connected to a hub from a tab with the same name, /open will automatically connect to the hub. Use the `-n' flag to disable this behaviour.
See /connect for more information on connecting to a hub.
=item B</password> <password>
This command can be used to send a password to the hub without saving it to the database. If you wish to log in automatically without having to type /password every time, use '/hset password <password>'. Be warned, however, that your password will be saved unencrypted in that case.
=item B</pm> <user> [<message>]
Alias for /msg
=item B</queue>
Open the download queue.
=item B</quit>
Quit ncdc.
=item B</reconnect>
Reconnect to the hub. When your nick or the hub encoding has been changed, the new settings will be used after the reconnect.
This command can also be used on the main tab, in which case all connected hubs will be reconnected.
=item B</refresh> [<path>]
Initiates a share refresh. If no argument is given, the complete list will be refreshed. Otherwise only the specified directory will be refreshed. The path argument can be either an absolute filesystem path or a virtual path within your share.
=item B</say> <message>
Sends a chat message to the current hub or user. You normally don't have to use the /say command explicitly; any command not starting with '/' will automatically imply `/say <command>'. For example, typing `hello.' in the command line is equivalent to `/say hello.'. Using the /say command explicitly may be useful to send a message starting with '/' to the chat, for example `/say /help is what you are looking for'.
=item B</search> [options] <query>
Performs a file search, opening a new tab with the results.
Available options:
-hub Search the current hub only. (default)
-all Search all connected hubs, except those with `chat_only' set.
-le <s> Size of the file must be less than <s>.
-ge <s> Size of the file must be larger than <s>.
-t <t> File must be of type <t>. (see below)
-tth <h> TTH root of this file must match <h>.
File sizes (<s> above) accept the following suffixes: G (GiB), M (MiB) and K (KiB).
The following file types can be used with the -t option:
1 any Any file or directory. (default)
2 audio Audio files.
3 archive (Compressed) archives.
4 doc Text documents.
5 exe Windows executables.
6 img Image files.
7 video Video files.
8 dir Directories.
Note that file type matching is done using file extensions, and is not very reliable.
=item B</set> [<key> [<value>]]
Get or set global configuration variables. Use without arguments to get a list of all global settings and their current value. Glob-style pattern matching on the settings is also possible. Use, for example, `/set color*' to list all color-related settings.
See the `/unset' command to change a setting back to its default, and the `/hset' command to manage configuration on a per-hub basis. Changes to the settings are automatically saved to the database, and will not be lost after restarting ncdc.
To get information on a particular setting, use `/help set <key>'.
=item B</share> [<name> <path>]
Use /share without arguments to get a list of shared directories.
When called with a name and a path, the path will be added to your share. Note that shell escaping may be used in the name. For example, to add a directory with the name `Fun Stuff', you could do the following:
/share "Fun Stuff" /path/to/fun/stuff
Or:
/share Fun\ Stuff /path/to/fun/stuff
The full path to the directory will not be visible to others, only the name you give it will be public. An initial `/refresh' is done automatically on the added directory.
=item B</ungrant> [<user>]
Revoke a granted slot.
=item B</unset> [<key>]
This command can be used to reset a global configuration variable back to its default value.
=item B</unshare> [<name>]
To remove a single directory from your share, use `/unshare <name>', to remove all directories from your share, use `/unshare /'.
Note that the hash data associated with the removed files will remain in the database. This allows you to re-add the files to your share without needing to re-hash them. The downside is that the database file may grow fairly large with unneeded information. See the `/gc' command to clean that up.
=item B</userlist>
Opens the user list of the currently selected hub. Can also be accessed using Alt+u.
=item B</version>
Display version information.
=item B</whois> <user>
This will open the user list and select the given user.
=back
=head1 SETTINGS
The following is a list of configuration settings. These settings can be
changed and queried using the C</set> command for global settings and C</hset>
for hub-local settings. All configuration data is stored in the db.sqlite3 file
in the session directory.
=over
=item B<active> <boolean>
Enables or disables active mode. You may have to configure your router and/or firewall for this to work, see the `active_ip' and `active_port' settings for more information.
=item B<active_ip> <string>
Your public IP address for use in active mode. If this is not set or set to '0.0.0.0' for IPv4 or '::' for IPv6, then ncdc will try to automatically get your IP address from the hub. If you do set this manually, it is important that other clients can reach you using this IP address. If you connect to a hub on the internet, this should be your internet (WAN) IP. Likewise, if you connect to a hub on your LAN, this should be your LAN IP.
Both an IPv4 and an IPv6 address are set by providing two IP addresses separated with a comma. When unset, '0.0.0.0,::' is assumed. Only the IP version used to connect to the hub is used. That is, if you connect to an IPv6 hub, then the configured IPv6 address is used and the IPv4 address is ignored.
When set to the special value `local', ncdc will automatically get your IP address from the local network interface that is used to connect to the hub. This option should only be used if there is no NAT between you and the hub, because this will give the wrong IP if you are behind a NAT.
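For example, a dual-stack configuration could look like this (both addresses are hypothetical):

    /set active_ip 13.33.33.7,2001:db8::7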
=item B<active_port> <integer>
The listen port for incoming connections in active mode. Set to `0' to automatically assign a random port. This setting is by default also used for the UDP port; see the `active_udp_port' setting to change that. If you are behind a router or firewall, make sure that you have configured it to forward and allow these ports.
=item B<active_udp_port> <integer>
The listen port for incoming UDP connections in active mode. Defaults to the `active_port' setting, or to a random number if `active_port' is not set.
=item B<adc_blom> <boolean>
Whether to support the BLOM extension on ADC hubs. This may decrease the bandwidth usage on the hub connection, in exchange for a bit of computational overhead. Some hubs require this setting to be enabled. A reconnect with the hub is required for this setting to take effect.
=item B<autoconnect> <boolean>
Set to true to automatically connect to the current hub when ncdc starts up.
=item B<autorefresh> <interval>
The time between automatic file refreshes. Recognized suffixes are 's' for seconds, 'm' for minutes, 'h' for hours and 'd' for days. Set to 0 to disable automatically refreshing the file list. This setting also determines whether ncdc will perform a refresh on startup. See the `/refresh' command to manually refresh your file list.
=item B<backlog> <integer>
When opening a hub or PM tab, ncdc can load a certain amount of lines from the log file into the log window. Setting this to a positive value enables this feature and configures the number of lines to load. Note that, while this setting can be set on a per-hub basis, PM windows will use the global value (global.backlog).
=item B<chat_only> <boolean>
Set to true to indicate that this hub is only used for chatting. That is, you won't or can't download from it. This setting affects the /search command when it is given the -all option.
=item B<color_*> <color>
The settings starting with the `color_' prefix allow you to change the interface colors. The following is a list of available color settings:
list_default - default item in a list
list_header - header of a list
list_select - selected item in a list
log_default - default log color
log_time - the time prefix in log messages
log_nick - default nick color
log_highlight - nick color of a highlighted line
log_ownnick - color of your own nick
log_join - color of join messages
log_quit - color of quit messages
separator - the list separator/footer bar
tab_active - the active tab in the tab list
tabprio_low - low priority tab notification color
tabprio_med - medium priority tab notification color
tabprio_high - high priority tab notification color
title - the title bar
The actual color value can be set with a comma-separated list of color names and/or attributes. The first color in the list is the foreground color, the second color is used for the background. When the fore- or background color is not specified, the default colors of your terminal will be used.
The following color names can be used: black, blue, cyan, default, green, magenta, red, white and yellow.
The following attributes can be used: bold, blink, reverse and underline.
The actual color values displayed by your terminal may vary. Adding the `bold' attribute usually makes the foreground color appear brighter as well.
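For example, to show highlighted nicks as bright yellow on a blue background (an arbitrary combination, for illustration):

    /set color_log_highlight yellow,blue,bold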
=item B<connection> <string>
Set your upload speed. This is just an indication for other users in the hub so that they know what speed they can expect when downloading from you. The actual format you can use here may vary, but it is recommended to set it to either a plain number for Mbit/s (e.g. `50' for 50 mbit) or a number with a `KiB/s' indicator (e.g. `2300 KiB/s'). On ADC hubs you must use one of the previously mentioned formats, otherwise no upload speed will be broadcasted. This setting is broadcasted as-is on NMDC hubs, to allow for using old-style connection values (e.g. `DSL' or `Cable') on hubs that require this.
This setting is ignored if `upload_rate' has been set. If it is, that value is broadcasted instead.
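For example, both of the following forms are understood (illustrative speeds):

    /set connection 50
    /set connection 2300 KiB/s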
=item B<description> <string>
A short public description that will be displayed in the user list of a hub.
=item B<disconnect_offline> <boolean>
Automatically disconnect any upload or download transfers when a user leaves the hub, or when you leave the hub. Setting this to `true' ensures that you are only connected with people who are online on the same hubs as you are.
=item B<download_dir> <path>
The directory where finished downloads are moved to. Finished downloads are by default stored in <session directory>/dl/. It is possible to set this to a location that is on a different filesystem than the incoming directory, but doing so is not recommended: ncdc will block when moving the completed files to their final destination.
=item B<download_exclude> <regex>
When recursively adding a directory to the download queue - by pressing `d' on a directory in the file list browser - any item in the selected directory with a name that matches this regular expression will not be added to the download queue.
This regex is not checked when adding individual files from either the file list browser or the search results.
=item B<download_rate> <speed>
Maximum combined transfer rate of all downloads. The total download speed will be limited to this value. The suffixes `G', `M' and `K' can be used for GiB/s, MiB/s and KiB/s, respectively. Note that, similar to upload_rate, TCP overhead is not counted towards this limit, so the actual bandwidth usage might be a little higher.
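For example, to cap the combined download rate at 2 MiB/s:

    /set download_rate 2M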
=item B<download_segment> <size>
Minimum segment size to use when requesting file data from another user. Set to 0 to disable segmented downloading.
=item B<download_slots> <integer>
Maximum number of simultaneous downloads.
=item B<email> <string>
Your email address. This will be displayed in the user list of the hub, so only set this if you want it to be public.
=item B<encoding> <string>
The character set/encoding to use for hub and PM messages. This setting is only used on NMDC hubs, ADC always uses UTF-8. Some common values are:
CP1250 (Central Europe)
CP1251 (Cyrillic)
CP1252 (Western Europe)
ISO-8859-7 (Greek)
KOI8-R (Cyrillic)
UTF-8 (International)
=item B<filelist_maxage> <interval>
The maximum age of a downloaded file list. If a file list was downloaded longer ago than the configured interval, it will be removed from the cache (the fl/ directory) and subsequent requests to open the file list will result in the list being downloaded from the user again. Recognized suffixes are 's' for seconds, 'm' for minutes, 'h' for hours and 'd' for days. Set to 0 to disable the cache altogether.
=item B<flush_file_cache> <none|upload|download|hash>[,...]
Tell the OS to flush the file (disk) cache for file contents read while hashing and/or uploading or written to while downloading. On one hand, this will avoid trashing your disk cache with large files and thus improve the overall responsiveness of your system. On the other hand, ncdc may purge any shared files from the cache, even if they are still used by other applications. In general, it is a good idea to enable this if you also use your system for other things besides ncdc, you share large files (>100MB) and people are not constantly downloading the same file from you.
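For example, to flush the cache for hashed and uploaded files while leaving downloaded files cached:

    /set flush_file_cache upload,hash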
=item B<geoip_cc4> <path>|disabled
Path to the GeoIP Country database file for IPv4, or 'disabled' to disable GeoIP lookup for IPv4 addresses.
=item B<geoip_cc6> <path>|disabled
Path to the GeoIP Country database file for IPv6, or 'disabled' to disable GeoIP lookup for IPv6 addresses.
=item B<hash_rate> <speed>
Maximum file hashing speed. See the `download_rate' setting for allowed formats for this setting.
=item B<hubname> <string>
The name of the currently opened hub tab. This is a user-assigned name, and is only used within ncdc itself. This is the same name as given to the `/open' command.
=item B<incoming_dir> <path>
The directory where incomplete downloads are stored. This setting can only be changed when the download queue is empty. Also see the download_dir setting.
=item B<local_address> <string>
Specifies the address of the local network interface to use for connecting to the outside and for accepting incoming connections in active mode. Both an IPv4 and an IPv6 address are set by providing two IP addresses separated with a comma. When unset, '0.0.0.0,::' is assumed.
If no IPv4 address is specified, '0.0.0.0' is added automatically. Similarly, if no IPv6 address is specified, '::' is added automatically. The address that is actually used depends on the IP version actually used. That is, if you're on an IPv6 hub, then ncdc will listen on the specified IPv6 address. Note that, even if the hub you're on is on IPv6, ncdc may still try to connect to another client over IPv4, at which point the socket will be bound to the configured IPv4 address.
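For example, to bind to one specific address per IP version (documentation-range addresses, for illustration only):

    /set local_address 192.0.2.10,2001:db8::10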
=item B<log_debug> <boolean>
Log debug messages to stderr.log in the session directory. It is highly recommended to enable this setting if you wish to debug or hack ncdc. Be warned, however, that this may generate a lot of data if you're connected to a large hub.
=item B<log_downloads> <boolean>
Log downloaded files to transfers.log.
=item B<log_hubchat> <boolean>
Log the main hub chat. Note that changing this requires any affected hub tabs to be closed and reopened before the change is effective.
=item B<log_uploads> <boolean>
Log file uploads to transfers.log.
=item B<minislots> <integer>
Set the number of available minislots. A `minislot' is a special slot that is used when all regular upload slots are in use and someone is requesting your filelist or a small file. In this case, the other client automatically applies for a minislot, and can still download from you as long as not all minislots are in use. What constitutes a `small' file can be changed with the `minislot_size' setting. Also see the `slots' configuration setting and the `/grant' command.
=item B<minislot_size> <integer>
The maximum size of a file that may be downloaded using a `minislot', in KiB. See the `minislots' setting for more information.
=item B<nick> <string>
Your nick. Nick changes are only visible on newly connected hubs, use the `/reconnect' command to use your new nick immediately. Note that it is highly discouraged to change your nick on NMDC hubs. This is because clients downloading from you have no way of knowing that you changed your nick, and therefore can't immediately continue to download from you.
=item B<notify_bell> <disable|low|medium|high>
When enabled, ncdc will send a bell to your terminal when a tab indicates a notification. The notification types are:
high - Messages directed to you (PM or highlight in hub chat),
medium - Regular hub chat,
low - User joins/quits, new search results, etc.
How a "bell" (or "beep" or "alert", whatever you prefer to call it) manifests itself depends on your terminal. In some setups, this generates an audible system bell. In other setups it can makes your terminal window flash or do other annoying things to get your attention. And in some setups it is ignored completely.
=item B<password> <string>
Sets your password for the current hub and enables auto-login on connect. If you just want to login to a hub without saving your password, use the `/password' command instead. Passwords are saved unencrypted in the config file.
=item B<reconnect_timeout> <interval>
The time to wait before automatically reconnecting to a hub. Set to 0 to disable automatic reconnect.
=item B<sendfile> <boolean>
Whether or not to use the sendfile() system call to upload files, if supported. Using sendfile() reduces resource usage while uploading, but may not work well on all systems.
=item B<share_emptydirs> <boolean>
Share empty directories. When disabled (the default), empty directories in your share will not be visible to others. This also affects empty directories containing only empty directories, etc. A file list refresh is required for this setting to be effective.
=item B<share_exclude> <regex>
Any file or directory with a name that matches this regular expression will not be shared. A file list refresh is required for this setting to be effective.
=item B<share_hidden> <boolean>
Whether to share hidden files and directories. A `hidden' file or directory is one whose name starts with a dot (e.g. `.bashrc'). A file list refresh is required for this setting to be effective.
=item B<share_symlinks> <boolean>
Whether to follow symlinks in shared directories. When disabled (default), ncdc will never share any files outside of the directory you specified. When enabled, any symlinks in your shared directories will be followed, even when they point to a directory outside your share.
=item B<show_joinquit> <boolean>
Whether to display join/quit messages in the hub chat.
=item B<slots> <integer>
The number of upload slots. This determines for the most part how many people can download from you simultaneously. It is possible that this limit is exceeded in certain circumstances, see the `minislots' setting and the `/grant' command.
=item B<sudp_policy> <disabled|allow|prefer>
Set the policy for sending or receiving encrypted UDP search results. When set to `disabled', all UDP search results will be sent and received in plain text. Set this to `allow' to let ncdc reply with encrypted search results if the other client requested it. `prefer' will also cause ncdc itself to request encryption.
Note that, regardless of this setting, encrypted UDP search results are only used on ADCS hubs. They will never be sent on NMDC or non-TLS ADC hubs. Also note that, even if you set this to `prefer', encryption is still only used when the client on the other side of the connection also supports it.
=item B<tls_policy> <disabled|allow|prefer>
Set the policy for secure client-to-client connections. Setting this to `disabled' disables TLS support for client connections, but still allows you to connect to TLS-enabled hubs. `allow' will allow the use of TLS if the other client requests this, but ncdc itself will not request TLS when connecting to others. Setting this to `prefer' tells ncdc to also request TLS when connecting to others.
The use of TLS for client connections usually results in less optimal performance when uploading and downloading, but is quite effective at avoiding protocol-specific traffic shaping that some ISPs may do. Also note that, even if you set this to `prefer', TLS will only be used if the connecting party also supports it.
=item B<tls_priority> <string>
Set the GnuTLS priority string used for all TLS-enabled connections. See the "Priority strings" section in the GnuTLS manual for details on what this does and how it works. Currently it is not possible to set a different priority string for different types of connections (e.g. hub or incoming/outgoing client connections).
=item B<ui_time_format> <string>
The format of the time displayed in the lower-left of the screen. Set `-' to not display a time at all. The string is passed to the Glib g_date_time_format() function, which accepts roughly the same formats as strftime(). Check out the strftime(3) man page or the Glib documentation for more information. Note that this setting does not influence the date/time format used in other places, such as the chat window or log files.
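For example, to display a 24-hour clock with seconds, using standard strftime conversions:

    /set ui_time_format %H:%M:%S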
=item B<upload_rate> <speed>
Maximum combined transfer rate of all uploads. See the `download_rate' setting for more information on rate limiting. Note that this setting also overrides any `connection' setting.
=back
=head1 KEY BINDINGS
On any tab without the text input line, you can press `?' to get the key
bindings for that tab. The list of key bindings is available through the
C</help keys> command, and is reproduced below.
=over
=item B<Global key bindings>
Alt+j Open previous tab.
Alt+k Open next tab.
Alt+h Move current tab left.
Alt+l Move current tab right.
Alt+a Open tab with recent activity.
Alt+<num> Open tab with number <num>.
Alt+c Close current tab.
Alt+n Open the connections tab.
Alt+q Open the download queue tab.
Alt+o Open own file list.
Alt+r Refresh file list.
Keys for tabs with a log window:
Ctrl+l Clear current log window.
PgUp Scroll the log backward.
PgDown Scroll the log forward.
Keys for tabs with a text input line:
Left/Right Move cursor one character left or right.
End/Home Move cursor to the end / start of the line.
Up/Down Scroll through the command history.
Tab Auto-complete current command, nick or argument.
Alt+b Move cursor one word backward.
Alt+f Move cursor one word forward.
Backspace Delete character before cursor.
Delete Delete character under cursor.
Ctrl+w Delete to previous space.
Alt+d Delete to next space.
Ctrl+k Delete everything after cursor.
Ctrl+u Delete entire line.
=item B<File browser>
Up/Down Select one item up/down.
k/j Select one item up/down.
PgUp/PgDown Select one page of items up/down.
End/Home Select last/first item in the list.
/ Start incremental regex search (press Return to stop editing).
,/. Search next / previous.
Right/l Open selected directory.
Left/h Open parent directory.
t Toggle sorting directories before files.
s Order by file size.
n Order by file name.
d Add selected file/directory to the download queue.
m Match selected item with the download queue.
M Match entire file list with the download queue.
a Search for alternative download sources.
=item B<Connection list>
Up/Down Select one item up/down.
k/j Select one item up/down.
PgUp/PgDown Select one page of items up/down.
End/Home Select last/first item in the list.
d Disconnect selected connection.
i/Return Toggle information box.
f Find user in user list.
m Send a PM to the selected user.
q Find file in download queue.
=item B<Download queue>
Up/Down Select one item up/down.
k/j Select one item up/down.
PgUp/PgDown Select one page of items up/down.
End/Home Select last/first item in the list.
K/J Select one user up/down.
f Find user in user list.
c Find connection in the connection list.
a Search for alternative download sources.
d Remove selected file from the queue.
+/- Increase/decrease priority.
i/Return Toggle user list.
r Remove selected user for this file.
R Remove selected user from all files in the download queue.
x Clear error state for the selected user for this file.
X Clear error state for the selected user for all files.
Note: when an item in the queue has `ERR' indicated in the
priority column, you have two choices: You can remove the
item from the queue using `d', or attempt to continue the
download by increasing its priority using `+'.
=item B<Search results tab>
Up/Down Select one item up/down.
k/j Select one item up/down.
PgUp/PgDown Select one page of items up/down.
End/Home Select last/first item in the list.
f Find user in user list.
b/B Browse the selected user's list, B to force a redownload.
d Add selected file to the download queue.
h Toggle hub column visibility.
u Order by username.
s Order by file size.
l Order by free slots.
n Order by file name.
m Match selected item with the download queue.
M Match all search results with the download queue.
q Match selected user's list with the download queue.
Q Match all matched users' lists with the download queue.
a Search for alternative download sources.
=item B<User list tab>
Up/Down Select one item up/down.
k/j Select one item up/down.
PgUp/PgDown Select one page of items up/down.
End/Home Select last/first item in the list.
/ Start incremental regex search (press Return to stop editing).
,/. Search next / previous.
o Toggle sorting OPs before others.
s/S Order by share size.
u/U Order by username.
t/T Toggle visibility / order by tag column.
e/E Toggle visibility / order by email column.
c/C Toggle visibility / order by connection column.
p/P Toggle visibility / order by IP column.
i/Return Toggle information box.
m Send a PM to the selected user.
g Grant a slot to the selected user.
b/B Browse the selected user's list, B to force a redownload.
q Match selected user's list with the download queue.
=back
=head1 ENVIRONMENT
$NCDC_DIR is used to determine the session dir; it is only honoured if I<-c> is
not set on the command line.
=head1 FILES
$NCDC_DIR corresponds to the session dir set via I<-c>, environment variable
$NCDC_DIR or $HOME/.ncdc.
=over
=item $NCDC_DIR/cert/
Directory where the client certificates are stored. Must contain a private key
file (client.key) and public certificate (client.crt). These will be generated
automatically when ncdc starts up the first time.
=item $NCDC_DIR/db.sqlite3
The database. This stores all configuration variables, hash data of shared
files, download queue information and other state information. Manually editing
this file with the `sqlite3' commandline tool is possible but discouraged. Any
changes made to the database while ncdc is running will not be read, and may
even get overwritten by ncdc.
=item $NCDC_DIR/dl/
Directory where completed downloads are moved to by default. Can be changed
with the C<download_dir> configuration option.
=item $NCDC_DIR/files.xml.bz2
Filelist containing a listing of all shared files.
=item $NCDC_DIR/fl/
Directory where downloaded file lists from other users are stored. The names of
the files are hex-encoded user IDs that are used internally by ncdc. Old file
lists are deleted automatically after a configurable interval. See the
C<filelist_maxage> configuration option.
=item $NCDC_DIR/history
Command history.
=item $NCDC_DIR/inc/
Default location for incomplete downloads. Can be changed with the
C<incoming_dir> setting. The file names in this directory are the
base32-encoded TTH root of the completed file.
=item $NCDC_DIR/logs/
Directory where all the log files are stored. File names starting with `#' are
hub logs and `~' are user (PM) logs. Special log files are transfers.log and
main.log.
ncdc does not have built-in functionality to rotate or compress log files
automatically. When rotating log files manually (e.g. via a cron job), make
sure to send the SIGUSR1 signal afterwards to force ncdc to flush the old logs
and create or open the new log files.
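A minimal sketch of such a rotation job, assuming a single running instance, the default session directory and the availability of pidof:

    mv ~/.ncdc/logs/main.log ~/.ncdc/logs/main.log.old
    kill -USR1 $(pidof ncdc)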
=item $NCDC_DIR/stderr.log
Error/debug log. This file is cleared every time ncdc starts up.
=item $NCDC_DIR/version
Version of the data directory. This file is locked while an ncdc instance is
running, making sure that no two ncdc instances work with the same session
directory at the same time.
=back
=head2 Format of transfers.log
Uploads and downloads are logged in the transfers.log file. Transfers are
separated by a newline (C<0x0a>). Each log line has the following fields,
separated by a space:
=over
=item 1.
Date/time when the transfer ended, formatted as C<[YYYY-MM-DD HH:MM:SS ZONE]>,
=item 2.
Hub name, including the C<#> prefix,
=item 3.
Base32-encoded CID of the other user for ADC transfers, or a '-' for NMDC,
=item 4.
User name (escaped),
=item 5.
IPv4 or IPv6 address,
=item 6.
Direction, C<u> for upload or C<d> for download,
=item 7.
Whether the transfer completed successfully (C<c>) or has been
interrupted/disconnected before all requested file data has been transferred
(C<i>),
=item 8.
Base32-encoded TTH of the transferred file, or '-' for C<files.xml.bz2>,
=item 9.
Total transfer time, in seconds,
=item 10.
File size, in bytes,
=item 11.
File offset, in bytes,
=item 12.
Transfer size, in bytes,
=item 13.
File path (escaped). Absolute virtual path for uploads, destination path for
downloads.
=back
All fields are encoded in UTF-8. Fields that may contain a space or newline are
escaped as follows: A space is escaped as C<\s>, a newline as C<\n> and a
backslash as C<\\>. The timestamp is not escaped.
Many clients download files in separate (smallish) chunks. Ncdc makes no
attempt to combine multiple chunk requests into a single log entry, so you may
see the same uploaded file several times with a different file offset.
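As an illustration, a fabricated download entry could look as follows, with the base32 fields replaced by placeholders (a real entry is a single line):

    [2019-03-23 13:37:00 CET] #example-hub <base32-cid> alice 192.0.2.1 d c <base32-tth> 42 1048576 0 1048576 /home/alice/dl/song.flac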
=head1 LICENSE
Copyright (C) 2011-2013 Yoran Heling <projects@yorhel.nl>
ncdc is distributed under the MIT license, please read the COPYING file for
more information.
=head1 BUGS
Please report bugs or feature requests to the bug tracker or the mailing list.
Both can be found on the ncdc homepage at L<http://dev.yorhel.nl/ncdc>. There
is also an ADC hub available at C<adc://dc.blicky.net:2780/> for general
support and discussions.
=head1 AUTHOR
ncdc is written by Yoran Heling <projects@yorhel.nl>
Web: L<http://dev.yorhel.nl/ncdc>


@ -1,21 +0,0 @@
=pod
Note: While these screenshots are from version 1.5, the latest version has only
minor visible changes. Let me also apologise for the crappy formatting; I
should take some smaller shots next time...
=head2 Main chat
[img scr ncdchub.png Ncdc in the mainchat.]
=head2 File browser
[img scr ncdcbrowse.png Simple file list browser.]
=head2 User list
[img scr ncdcusers.png Ncdc displaying the userlist of a hub.]
=head2 Built-in help
[img scr ncdchelp.png Ncdc built-in help.]

93
dat/ncdc.md Normal file

@ -0,0 +1,93 @@
% NCurses Direct Connect
Ncdc is a modern and lightweight direct connect client with a friendly ncurses
interface.
## Get ncdc!
Latest version
: 1.20 ([dllink ncdc-1.20.tar.gz]
\- [changes](https://dev.yorhel.nl/ncdc/changes))
Convenient static binaries for Linux:
[64-bit](/download/ncdc-linux-x86_64-1.20-6-g5111a.tar.gz) -
[32-bit](/download/ncdc-linux-i486-1.20-6-g5111a.tar.gz) -
[ARM](/download/ncdc-linux-arm-1.20-6-g5111a.tar.gz). Check the
[installation instructions](/ncdc/install) for more info.
Development version
: The latest development version is available from git and can be cloned using
`git clone git://g.blicky.net/ncdc.git`. The repository is available for
[online browsing](https://g.blicky.net/ncdc.git/).
Requirements
: The following libraries are required: ncurses, zlib, bzip2, sqlite3, glib2 and
gnutls.
Ncdc is entirely written in C and available under a liberal MIT license.
Community
- [Bug tracker](https://dev.yorhel.nl/ncdc/bug) - For bug reports, feature requests and patches.
- `adcs://dc.blicky.net:2780/` - For real-time chat.
Packages and ports
: Are available for the following systems:
[Arch Linux](https://aur.archlinux.org/packages/ncdc/) -
[Fedora](https://apps.fedoraproject.org/packages/ncdc/overview/) -
[FreeBSD](http://www.freshports.org/net-p2p/ncdc/) -
[Frugalware](http://frugalware.org/packages?srch=ncdc&op=pkg&arch=all&ver=all) -
[Gentoo](http://packages.gentoo.org/package/net-p2p/ncdc) -
[GNU Guix](https://www.gnu.org/software/guix/package-list.html) -
[Homebrew](http://braumeister.org/formula/ncdc) -
[OpenSUSE](http://packman.links2linux.org/package/ncdc) -
[Source Mage](http://download.sourcemage.org/grimoire/codex/test/ftp/ncdc/)
I have a few old packages on the [Open Build
Service](https://build.opensuse.org/package/show/home:yorhel/ncdc), but
these are unmaintained. The static binaries are preferred.
A convenient installer is available for
[Android](http://code.ivysaur.me/ncdcinstaller.html).
## Features
Common features that all modern DC clients (should) have:
- Connecting to multiple hubs at the same time,
- Support for both ADC and NMDC protocols,
- Chatting and private messaging,
- Browsing the user list of a connected hub,
- Share management and file uploading,
- Connections and download queue management,
- File list browsing,
- TTH-checked, multi-source and segmented file downloading,
- Searching for files,
- Secure hub (adcs:// and nmdcs://) and client connections on both protocols,
- Bandwidth throttling,
- IPv6 support.
And special features not commonly found in other clients:
- Different connection settings for each hub,
- Encrypted UDP messages (ADC SUDP),
- Subdirectory refreshing,
- Nick notification and highlighting in chat windows,
- Trust on First Use for TLS-enabled hubs,
- A single listen port for both TLS and TCP connections,
- Efficient file uploads using sendfile(),
- Large file lists are opened in a background thread,
- Doesn't trash your OS file cache (with the flush\_file\_cache option enabled),
- (Relatively...) low memory usage.
## What doesn't ncdc do?
Since the above list is getting larger and larger every time, it may be more
interesting to list a few features that are (relatively) common in other DC
clients, but which ncdc doesn't do. Yet.
- NAT Traversal,
- OP features (e.g. client detection, file list scanning and other useful stuff for OPs),
- SOCKS support.
Of course, there are many more features that could be implemented or improved.
These will all be addressed in later versions (hopefully :).


@ -1,13 +1,14 @@
=head1 About ncdc
% Ncdc Q&A
# About ncdc
=head2 What about other text-mode clients?
## What about other text-mode clients?
L<microdc2|http://corsair626.no-ip.org/microdc/> - A rather nice client, yet
[microdc2](http://corsair626.no-ip.org/microdc/) - A rather nice client, yet
not exactly there. It's limited to connecting to a single hub, hasn't been
updated since 2006, and the readline interface is slightly awkward to use.
L<nanodc|http://sourceforge.net/projects/nanodc/> - Can't comment much on this,
[nanodc](http://sourceforge.net/projects/nanodc/) - Can't comment much on this,
except maybe that rocket science is perhaps easier than getting nanodc to
compile.
@ -15,18 +16,17 @@ LDCC - Uses DCTC as backend and an interface based on TurboVision. All
mentioned projects are dead: neither LDCC, DCTC nor TurboVision are seeing any
recent development.
L<ShakesPeer|http://shakespeer.bzero.se/> - Appears to have a commandline
[ShakesPeer](http://shakespeer.bzero.se/) - Appears to have a commandline
interface as well. I haven't personally tried it, but have not heard many
positive things about it. Has not seen any recent development, either.
=head2 Why did you start from scratch? Why not use the DC++ core?
## Why did you start from scratch? Why not use the DC++ core?
There are several reasons why I chose not to use code from existing projects,
but the two most important reasons are the following: 1) I am a control freak,
and 2) personal preferences.
B<Control freak:> I have no idea how to create an interface to a protocol if I
**Control freak:** I have no idea how to create an interface to a protocol if I
don't know the overall design and all the tiny details of the actual protocol
I'm working with. And what's a better way to get used to a protocol than by
writing everything yourself? Then there's some other advantages to
@ -34,20 +34,22 @@ reimplementing everything: I get to choose the library dependencies and the
memory/CPU efficiency trade-offs, and I am not limited by an existing
implementation that needs quite a few modifications to achieve what I want.
Most of the "special features not commonly found in other clients" mentioned on
the L<homepage|https://dev.yorhel.nl/ncdc> are a direct result of this.
the [homepage](/ncdc) are a direct result of this.
B<Personal preferences:> These are simple: I rather dislike C++ and working
**Personal preferences:** These are simple: I rather dislike C++ and working
with other people's code. Working with other people's C++ code isn't exactly
something I wish to spend my free time on.
=head2 Does ncdc support TLS 1.2?
## Does ncdc support TLS 1.2?
Yes, but you need a recent version of GnuTLS. Nobody knows what counts as
"recent", exactly, but I'm guessing any 3.0+ version will do.
## Does ncdc support TLS 1.3?
=head2 What protocol features does ncdc support?
Yes, but you need an even more recent version of GnuTLS.
## What protocol features does ncdc support?
For ADC: BASE, RF, TIGR, BZIP, BLOM, ADCS, KEYP and SUDP.
@ -59,103 +61,89 @@ does not support some of the older NMDC protocol features, like $Get,
$GetZBlock, $CHUNK, $Cancel or non-XML file lists. I am not aware of another
up-to-date client that still uses any of these features.
=head2 What are those flags / character indications in the connection list?
## What are those flags / character indications in the connection list?
Since the manual page doesn't cover those yet, I'll document it here for now:
The header has C<St>, where the C<S> stands for Status and C<t> for whether TLS
encryption is used or not. The status flags can be either B<C>onnecting,
B<H>andshake, B<I>dle, B<D>ownloading, B<U>ploading or B<-> for disconnected.
The header has `St`, where the `S` stands for Status and `t` for whether TLS
encryption is used or not. The status flags can be either **C**onnecting,
**H**andshake, **I**dle, **D**ownloading, **U**ploading or **-** for disconnected.
## ...And what about those in the user list?
=head2 ...And what about those in the user list?
The user list has three boolean flags: **O**perator, **P**assive, and whether the client has **T**LS enabled.
The user list has three boolean flags: B<O>perator, B<P>assive, and whether the client has B<T>LS enabled.
# Troubleshooting
=head1 Troubleshooting
=head2 Luadch: "(error-40) Invalid named parameter in inf: I4"
## Luadch: "(error-40) Invalid named parameter in inf: I4"
This error occurs when connecting to (some?) luadch hubs. The problem here is
that IP address autodetection is broken on these hubs, and you can work around
it by manually setting C<active_ip> to your (public) IP address: C</set
active_ip 1.3.3.7>.
it by manually setting `active_ip` to your (public) IP address: `/set
active_ip 1.3.3.7`.
=head2 The Alt- keys don't work!
## The Alt- keys don't work!
The ncdc manual refers to the "meta" key as Alt-something, but the actual key
to use tends to differ depending on your setup. In almost every setup, you can
press and release the 'Esc' key as a replacement for Alt-something. If you're
on OS X, L<this stackoverflow answer|http://stackoverflow.com/a/438892>
on OS X, [this stackoverflow answer](http://stackoverflow.com/a/438892)
may be helpful.
=head2 Ncdc crashes a lot!
## Ncdc crashes a lot!
Ncdc 1.19.1 has no known bugs that may cause a crash. If you're running an older
Ncdc 1.20 has no known bugs that may cause a crash. If you're running an older
version of ncdc, please upgrade. If your ncdc is up to date and you still have
a crash, please report a bug.
## Ncdc uses too much disk space!
=head2 Ncdc uses too much disk space!
First, look where this disk space goes to (hint: use [ncdu](/ncdu)). If it's
the log files: you can safely delete or rotate them (see next question).
First, look where this disk space goes to (hint: use
L<ncdu|https://dev.yorhel.nl/ncdu>). If it's the log files: you can safely
delete or rotate them (see next question).
The I<db.sqlite3> file can also grow quite large in certain situations. If you
The _db.sqlite3_ file can also grow quite large in certain situations. If you
modify or rename a lot of files in your share and ncdc re-hashes them, the old
hash data associated with the files is not removed from the database, resulting
in wasted disk space. The C</gc> command in ncdc can be used to clean up this
in wasted disk space. The `/gc` command in ncdc can be used to clean up this
unused data. Be warned, however, that this command needs roughly twice the size
of the old db.sqlite3 file for temporary storage, so make sure you have enough
space available. (Note that this behaviour is not specific to ncdc, most other
DC clients do the same.)
=head2 Why doesn't ncdc rotate log files automatically?
## Why doesn't ncdc rotate log files automatically?
Because you can easily do that yourself. You can either use logrotate or a
simple script that runs from a cron. For an example of the latter option,
L<this is the script I use|http://p.blicky.net/s7132>, which is run as a
[this is the script I use](http://p.blicky.net/s7132), which is run as a
monthly cron job.
# Can ncdc...
=head1 Can ncdc...
=head2 Can ncdc run in the background / as a daemon?
## Can ncdc run in the background / as a daemon?
As with most ncurses applications: no. At least, it does not have this
functionality built-in. Ncdc is designed to be used in combination with a
separate terminal multiplexer or detach utility to handle this. Have a look at
L<GNU screen|http://www.gnu.org/s/screen/>,
L<tmux|http://tmux.sourceforge.net/> or L<dtach|http://dtach.sourceforge.net/>.
[GNU screen](http://www.gnu.org/s/screen/),
[tmux](http://tmux.sourceforge.net/) or [dtach](http://dtach.sourceforge.net/).
## Does ncdc support UPnP?
=head2 Does ncdc support UPnP?
Not natively. However, it is possible to use L<this
script|http://www.howtoforge.com/administrating-your-gateway-device-via-upnp>
Not natively. However, it is possible to use [this
script](http://www.howtoforge.com/administrating-your-gateway-device-via-upnp)
and manually keep a port open using a cron job. I have no experience with this
myself, though. I just run ncdc directly on my router. :-)
=head2 Are there any programs available for analyzing the transfers.log file?
## Are there any programs available for analyzing the transfers.log file?
Nothing like that is included in the release yet, but there is a simple Perl
script available: L<ncdc-transfer-stats|http://p.blicky.net/eu00a>, and a short
Go program: L<ncdc-share-report|http://p.blicky.net/h25z8>.
script available: [ncdc-transfer-stats](http://p.blicky.net/eu00a), and a short
Go program: [ncdc-share-report](http://p.blicky.net/h25z8).
=head2 Can ncdc use the hash data or configuration from an existing DC++ installation?
## Can ncdc use the hash data or configuration from an existing DC++ installation?
No, ncdc uses its own configuration and hash storage directory. However, on
popular demand I could write a conversion utility to transfer the hash data
from other clients to ncdc's format. (Contrary to my expectations, there hasn't
been much interest in such a tool ever since I wrote this FAQ entry two years
been much interest in such a tool ever since I wrote this FAQ entry many years
ago. So I guess this isn't really a FAQ).

156
dat/ncdc/install.md Normal file

@ -0,0 +1,156 @@
% Ncdc Installation Instructions
# General instructions
## Building from source
In theory, the following instructions should work everywhere:
- Install the required dependencies: ncurses, bzip2, zlib, sqlite3, glib2 and gnutls,
- Download and extract the source tarball from the [homepage](/ncdc),
- `./configure`
- `make`
- And then run `make install` with superuser permissions.
In practice, however, this does not always work and may not always be the
preferred method of installation. On this page I try to collect instructions for
each OS and distribution to make the installation process a bit easier for
everyone.
If your system is missing from this page or if you're still having trouble,
don't hesitate to join the support hub at `adc://dc.blicky.net/` or send me a
mail at [projects@yorhel.nl](mailto:projects@yorhel.nl). Contributions to this
page are of course highly welcomed as well. :-)
## Statically linked binaries
If you just want to get ncdc running without going through the trouble of
compiling and/or installing it, I also offer statically linked binaries:
- [Linux, 64-bit](/download/ncdc-linux-x86_64-1.20-6-g5111a.tar.gz)
- [Linux, 32-bit](/download/ncdc-linux-i486-1.20-6-g5111a.tar.gz)
- [Linux, ARM](/download/ncdc-linux-arm-1.20-6-g5111a.tar.gz)
To use them, simply download and extract the tarball, and then run `./ncdc` on
the command line.
The binaries include all the required dependencies and are linked against
[musl](http://www.etalabs.net/musl/), so they should run on any Linux machine
with the right architecture. If you want binaries for another OS or
architecture, please bug me and I'll see what I can do.
# System-specific instructions
## Android
A [convenient installer](http://code.ivysaur.me/ncdcinstaller.html) is
available for Android 2.3 and later, which makes use of the static binary.
## Arch Linux
Ncdc is available on [AUR](https://aur.archlinux.org/packages/ncdc/), to
install it you can use your favorite AUR-installer. If you don't have a
favorite, go for the manual approach:
wget https://aur.archlinux.org/cgit/aur.git/snapshot/ncdc.tar.gz
tar -xf ncdc.tar.gz
cd ncdc
makepkg -si
## Fedora
There's a [package](https://apps.fedoraproject.org/packages/ncdc/overview/)
available for Fedora.
## FreeBSD
Ncdc is available in the Ports Collection. To install, [make sure your
collection is
up-to-date](http://www.freebsd.org/doc/en_US.ISO8859-1/books/handbook/ports-using.html)
and install the Port as any other:
cd /usr/ports/net-p2p/ncdc
make install clean
## Gentoo
Ncdc is available in the Portage tree, so installation is trivial:
emerge ncdc
## Mac OS X
Ncdc is available in [Homebrew](http://braumeister.org/formula/ncdc).
## OpenIndiana
This has been tested on OpenIndiana Build 151a Server, but may work on other
versions as well. Compiling from source is your only option at the moment.
First install some required packages (as root):
pkg install gcc-3 glib2 gnutls gettext header-math perl-510/extra
Then, fetch the ncdc source tarball, extract and build as follows:
wget https://dev.yorhel.nl/download/ncdc-1.20.tar.gz
tar -xf ncdc-1.20.tar.gz
cd ncdc-1.20
export PATH="$PATH:/usr/perl5/5.10.0/bin"
./configure --prefix=/usr LDFLAGS='-L/usr/gnu/lib -R/usr/gnu/lib'
make
And finally, to actually install ncdc, run `make install` as root. You can
safely revert `$PATH` back to its previous value if you wish, it was only
necessary in order for `./configure` and `make` to find `pod2man`.
## OpenSUSE
Get the package from [PackMan](http://packman.links2linux.org/package/ncdc):
Select your openSUSE release and hit the "1 click install" button.
## Ubuntu & Debian
The preferred way of installing ncdc on Ubuntu or Debian is to use the static
binaries provided above.
Alternatively, you can also try to compile ncdc from source. To do so, first
install the required libraries:
sudo apt-get install libbz2-dev libsqlite3-dev libncurses5-dev\
libncursesw5-dev libglib2.0-dev libgnutls-dev zlib1g-dev
Then run the following commands to download and install ncdc:
wget https://dev.yorhel.nl/download/ncdc-1.20.tar.gz
tar -xf ncdc-1.20.tar.gz
cd ncdc-1.20
./configure --prefix=/usr
make
sudo make install
## Windows (Cygwin)
Surprisingly enough, ncdc can be used even on Windows, thanks to Cygwin. If
you haven't done so already, get `setup.exe` from the [Cygwin
website](http://cygwin.com/) and use it to install the following packages:
- make
- gcc4
- perl
- pkg-config
- wget
- zlib-devel
- libncursesw-devel
- libbz2-devel
- libglib2.0-devel
- libsqlite3-devel
- gnutls-devel
Then open a Cygwin terminal and run the following commands to download,
compile, and install ncdc:
wget https://dev.yorhel.nl/download/ncdc-1.20.tar.gz
tar -xf ncdc-1.20.tar.gz
cd ncdc-1.20
./configure --prefix=/usr
make install

21
dat/ncdc/scr.md Normal file

@ -0,0 +1,21 @@
% Ncdc Screenshots
Note: While these screenshots are from version 1.5, the latest version has only
minor visible changes. Let me also apologise for the crappy formatting; I
should take some smaller shots next time...
## Main chat
![Ncdc in the mainchat.](/img/ncdchub.png)
## File browser
![Simple file list browser.](/img/ncdcbrowse.png)
## User list
![Ncdc displaying the userlist of a hub.](/img/ncdcusers.png)
## Built-in help
![Ncdc built-in help.](/img/ncdchelp.png)

102
dat/ncdu

@ -1,102 +0,0 @@
=pod
Ncdu is a disk usage analyzer with an ncurses interface. It is designed to find
space hogs on a remote server where you don't have an entire graphical setup
available, but it is a useful tool even on regular desktop systems. Ncdu aims
to be fast, simple and easy to use, and should be able to run in any minimal
POSIX-like environment with ncurses installed.
=head2 Download
=over
=item Latest version
1.14 ([dllink ncdu-1.14.tar.gz download]
- L<changes|https://dev.yorhel.nl/ncdu/changes>)
I also have convenient static binaries for Linux
L<i486|https://dev.yorhel.nl/download/ncdu-linux-i486-1.14.tar.gz> and
L<ARM|https://dev.yorhel.nl/download/ncdu-linux-arm-1.14.tar.gz>. Download,
extract and run; no compilation or installation necessary (uses
L<musl|http://www.musl-libc.org/>).
=item Development version
The most recent code is available on a git repository and can be cloned with
C<git clone git://g.blicky.net/ncdu.git/>. The repository is also available for
L<online browsing|http://g.blicky.net/ncdu.git/>.
=back
Ncdu is entirely written in C and available under a liberal MIT license.
=head2 Packages and ports
Ncdu has been packaged for quite a few systems, here's a list of the ones I am aware of:
L<AIX|http://www.perzl.org/aix/index.php?n=Main.Ncdu> -
L<Alpine Linux|http://pkgs.alpinelinux.org/packages?name=ncdu> -
L<ALT Linux|http://sisyphus.ru/en/srpm/ncdu> -
L<Arch Linux|https://www.archlinux.org/packages/?q=ncdu> -
L<CRUX|https://crux.nu/portdb/?q=ncdu&a=search> -
L<Cygwin|https://cygwin.com/cgi-bin2/package-grep.cgi?grep=ncdu> -
L<Debian|http://packages.debian.org/ncdu> -
L<Fedora|https://apps.fedoraproject.org/packages/ncdu> -
L<FreeBSD|https://www.freebsd.org/cgi/ports.cgi?query=ncdu&stype=all> -
L<Frugalware|http://frugalware.org/packages/?op=pkg&srch=ncdu&arch=all&ver=all> -
L<Gentoo|https://packages.gentoo.org/packages/sys-fs/ncdu> -
L<GNU Guix|https://www.gnu.org/software/guix/package-list.html> -
L<OpenBSD|http://cvsweb.openbsd.org/cgi-bin/cvsweb/ports/sysutils/ncdu/> -
Mac OS X (L<Fink|http://pdb.finkproject.org/pdb/package.php/ncdu> - L<Homebrew|https://formulae.brew.sh/formula/ncdu> - L<MacPorts|http://www.macports.org/ports.php?by=name&substr=ncdu>) -
L<Puppy Linux|http://www.murga-linux.com/puppy/viewtopic.php?t=35024> -
L<Solaris|http://www.opencsw.org/packages/ncdu> -
L<Slackware|http://slackbuilds.org/repository/14.2/system/ncdu/> -
L<Slax Linux|http://www.slax.org/modules.php?detail=ncdu> -
L<Ubuntu|http://packages.ubuntu.com/search?searchon=sourcenames&keywords=ncdu> -
L<Void Linux|https://voidlinux.org/packages/>
Packages for RHEL and (open)SUSE can be found on the
L<Open Build Service|https://software.opensuse.org//download.html?project=utilities&package=ncdu>.
Packages for NetBSD, DragonFlyBSD, MirBSD and others can be found on
L<pkgsrc|http://pkgsrc.se/sysutils/ncdu>.
A port to z/OS is available L<here|https://dovetail.com/community/ncdu.html>.
=head2 Similar projects
=over
=item L<Duc|http://duc.zevv.nl/> - Multiple user interfaces.
=item L<gt5|http://gt5.sourceforge.net/> - Quite similar to ncdu, but a different approach.
=item L<tdu|http://webonastick.com/tdu/> - Another small ncurses-based disk usage visualization utility.
=item L<TreeSize|http://treesize.sourceforge.net/> - GTK, using a treeview.
=item L<Baobab|http://www.marzocca.net/linux/baobab.html> - GTK, using pie-charts, a treeview and a treemap. Comes with GNOME.
=item L<GdMap|http://gdmap.sourceforge.net/> - GTK, with a treemap display.
=item L<Filelight|http://www.methylblue.com/filelight/> - KDE, using pie-charts.
=item L<KDirStat|http://kdirstat.sourceforge.net/> - KDE, with a treemap display.
=item L<QDiskUsage|http://qt-apps.org/content/show.php/QDiskUsage?content=107012> - Qt, using pie-charts.
=item L<xdiskusage|http://xdiskusage.sourceforge.net/> - FLTK, with a treemap display.
=item L<fsv|http://fsv.sourceforge.net/> - 3D visualization.
=item L<Philesight|http://zevv.nl/play/code/philesight/> - Web-based clone of Filelight.
=back


@ -1,159 +0,0 @@
1.14 - 2019-02-04
- Add mtime display and sorting (Alex Wilson)
- Add (limited) --follow-symlinks option (Simon Doppler)
- Display larger file counts in browser UI
- Add -V, --version, and --help alias flags
- Fix crash when attempting to sort an empty directory
- Fix 100% CPU bug when ncdu loses the terminal
- Fix '--color=off' flag
- Fix some typos
1.13 - 2018-01-29
- Add "extended information" mode and -e flag
- Add file mode, modification time and uid/gid to info window with -e
- Add experimental color support and --color flag
- Add -rr option to disable shell spawning
- Remove directory nesting limit on file import
- Fix handling of interrupts during file import
- Fix undefined behavior that triggered crash on OS X
1.12 - 2016-08-24
- Add NCDU_SHELL environment variable
- Add --confirm-quit flag
- Fix compilation due to missing sys/wait.h include
1.11 - 2015-04-05
- Added 'b' key to spawn shell in the current directory
- Support scanning (and refreshing) of empty directories
- Added --si flag for base 10 prefixes
- Fix toggle dirs before files
1.10 - 2013-05-09
- Added 'c' key to display item counts
- Added 'C' key to order by item counts
- Added CACHEDIR.TAG support and --exclude-caches option
- Use locale-dependent thousand separator
- Use pkg-config to detect ncurses
- Clip file/dir sizes to 8 EiB minus one byte
- Fix buffer overflow when formatting huge file sizes
1.9 - 2012-09-27
- Added option to dump scanned directory information to a file (-o)
- Added option to load scanned directory information from a file (-f)
- Added multiple scan and load interfaces (-0,-1,-2)
- Fit loading and error windows to the terminal width (#13)
- Fix symlink resolving bug (#18)
- Fix path display when scanning an empty directory (#15)
- Fix hang when terminal is resized to a too small size while loading
- Use top-level automake build
- Remove useless AUTHORS, INSTALL and NEWS files
- ncdu.1 now uses POD as source format
1.8 - 2011-11-03
- Use hash table to speed up hard link detection
- Added read-only option (-r)
- Use KiB instead of kiB (#3399279)
1.7 - 2010-08-13
- List the detected hard links in file info window
- Count the size of a hard linked file once for each directory it appears in
- Fixed crash on browsing dirs with a small window size (#2991787)
- Fixed buffer overflow when some directories can't be scanned (#2981704)
- Fixed segfault when launched on a nonexistent directory (#3012787)
- Fixed segfault when root dir only contains hidden files
- Improved browsing performance
- More intuitive multi-page browsing
- Display size graph by default
- Various minor fixes
1.6 - 2009-10-23
- Implemented hard link detection
- Properly select the next item after deletion
- Removed reliance on dirfd()
- Fixed non-void return in void delete_process()
- Fixed several tiny memory leaks
- Return to previously opened directory on failed recalculation
- Properly display MiB units instead of MB (IEEE 1541 - bug #2831412)
- Link to ncursesw when available
- Improved support for non-ASCII characters
- VIM keybindings for browsing through the tree (#2788249, #1880622)
1.5 - 2009-05-02
- Fixed incorrect apparent size on directory refresh
- Browsing keys now work while file info window is displayed
- Current directory is assumed when no directory is specified
- Size graph uses the apparent size if that is displayed
- Items are ordered by displayed size rather than disk usage
- Removed switching between powers of 1000/1024
- Don't rely on the availability of suseconds_t
- Correctly handle paths longer than PATH_MAX
- Fixed various bugs related to rpath()
- Major code rewrite
- Fixed line width when displaying 100%
1.4 - 2008-09-10
- Removed the startup window
- Filenames ending with a tilde (~) will now also
be hidden with the 'h'-key
- Fixed buffer overflow when supplying a path longer
than PATH_MAX (patch by Tobias Stoeckmann)
- Used S_BLKSIZE instead of a hardcoded block size of 512
- Fixed display of disk usage and apparent sizes
- Updated ncdu -h
- Included patches for Cygwin
- Cursor now follows the selected item
- Added spaces around path (debian #472194)
- Fixed segfault on empty directory (debian #472294)
- A few code rewrites and improvements
1.3 - 2007-08-05
- Added 'r'-key to refresh the current directory
- Removed option to calculate apparent size: both
the disk usage and the apparent size are calculated.
- Added 'a'-key to switch between showing apparent
size and disk usage.
- Added 'i'-key to display information about the
selected item.
- Small performance improvements
- configure checks for ncurses.h (bug #1764304)
1.2 - 2007-07-24
- Fixed some bugs on cygwin
- Added du-like exclude patterns
- Fixed bug #1758403: large directories work fine now
- Rewrote a large part of the code
- Fixed a bug with wide characters
- Performance improvements when browsing large dirs
1.1 - 2007-04-30
- Deleting files and directories is now possible from
within ncdu.
- The key for sorting directories between files has
changed to 't' instead of 'd'. The 'd'-key is now
used for deleting files.
1.0 - 2007-04-06
- First stable release
- Small code cleanup
- Added a key to toggle between sorting dirs before
files and dirs between files
- Added graphs and percentages to the directory
browser (can be enabled or disabled with the 'g'-key)
0.3 - 2007-03-04
- When browsing back to the previous directory, the
directory you're getting back from will be selected.
- Added directory scanning in quiet mode to save
bandwidth on remote connections.
0.2 - 2007-02-26
- Fixed POSIX compliance: replaced realpath() with my
own implementation, and gettimeofday() is not
required anymore (but highly recommended)
- Added a warning for terminals smaller than 60x16
- Mountpoints (or any other directory pointing to
another filesystem) are now considered to be
directories rather than files.
0.1 - 2007-02-21
- Initial version


@ -1,255 +0,0 @@
=pod
This document describes the file format that ncdu 1.9 and later use for the
export/import feature (the C<-o> and C<-f> options). Check the L<ncdu
manual|https://dev.yorhel.nl/ncdu/man> for a description on how to use that
feature.
=head2 Top-level object
Ncdu uses L<JSON|http://json.org/> notation as its data format. The top-level
object is an array:
[
<majorver>,
<minorver>,
<metadata>,
<directory>
]
=head2 Versioning
The C<< <majorver> >> and C<< <minorver> >> elements indicate the version of
the file format. These are numbers with accepted values in the range of C<< 0
<= version <= 10000 >>. Major version must be C<1>. Minor version is C<0> for
ncdu 1.9 till 1.12, and C<1> since ncdu 1.13 for the addition of the extended
mode. The major version should increase if backwards-incompatible changes are
made (preferably never), the minor version can be increased to indicate
additions to the existing format.
=head2 Metadata
The C<< <metadata> >> element is a JSON object holding whatever (short)
metadata you'd want. This block is currently (1.9-1.13) ignored by ncdu when
importing, but it writes out the following keys when exporting:
=over
=item progname
String, name of the program that generated the file, i.e. C<"ncdu">.
=item progver
String, version of the program that generated the file, e.g. C<"1.10">.
=item timestamp
Number, UNIX timestamp as returned by the POSIX C<time()> function at the time
the file was generated. Note that this may not necessarily be equivalent to when
the directory has been scanned.
=back
=head2 Directory Info
A C<< <directory> >> is represented with a JSON array:
[
<infoblock>,
<directory>, <directory>, <infoblock>, ...
]
That is, the first element of the array must be an C<< <infoblock> >>. If the
directory is empty, that will be its only element. If it isn't, its
subdirectories and files are listed in the remaining elements. Each
subdirectory is represented as a C<< <directory> >> array again, and each file
is represented as just an C<< <infoblock> >> object.
=head2 The Info Object
An C<< <infoblock> >> is a JSON object holding information about a file or
directory. The following fields are supported:
=over
=item name
String I<(required)>. Name of the file/dir. For the top-level directory (that
is, the C<< <directory> >> item in the top-level JSON array), this should be
the full absolute filesystem path, e.g. C<"/media/harddrive">. For any items
below the top-level directory, the name should be just the name of the item.
The name will be in the same encoding as reported by the filesystem (i.e.
L<readdir()|http://manned.org/readdir.3>). The name may not exceed 32768 bytes.
=item asize
Number. The apparent file size, as reported by C<lstat().st_size>. If absent, 0
is assumed. Accepted values are in the range of C<< 0 <= asize < 2^63 >>.
=item dsize
Number. Size of the file, as consumed on the disk. This is obtained through
C<lstat().st_blocks*S_BLKSIZE>. If absent, 0 is assumed. Accepted values are in
the range of C<< 0 <= dsize < 2^63 >>.
=item dev
Number. The device ID. Has to be a unique ID within the context of the exported
dump, but may not have any meaning outside of that. I.e. this can be a
serialization of C<lstat().st_dev>, but also a randomly generated number only
used within this file, as long as it uniquely identifies the device/filesystem
on which this file is stored. This field may be absent, in which case it is
equivalent to that of the parent directory. If this field is absent for the
parent directory, a value of 0 is assumed. Accepted values are in the range of
C<< 0 <= dev < 2^64 >>.
=item ino
Number. Inode number as reported by C<lstat().st_ino>. Together with the Device
ID this uniquely identifies a file in this dump. In the case of hard links, two
objects may appear with the same (C<dev>,C<ino>) combination. A value of 0 is
assumed if this field is absent. This is currently (ncdu 1.9-1.13) not a
problem as long as the C<hlnkc> field is false, otherwise it will consider
everything with the same C<dev> and empty C<ino> values as a single hardlinked
file. Accepted values are in the range of C<< 0 <= ino < 2^64 >>.
=item hlnkc
Boolean. C<true> if this is a file with C<< lstat().st_nlink > 1 >>. If absent,
C<false> is assumed.
=item read_error
Boolean. C<true> if something went wrong while reading this entry. I.e. the
information in this entry may not be complete. For files, this indicates that
the C<lstat()> call failed. For directories, this means that an error occurred
while obtaining the file listing, and some items may be missing. Note that if
C<lstat()> failed, ncdu has no way of knowing whether an item is a file or a
directory, so a file with C<read_error> set might as well be a directory. If
absent, C<false> is assumed.
=item excluded
String. Set if this file or directory is to be excluded from calculation for
some reason. The following values are recognized:
=over
=item C<"pattern">
If the path matched an exclude pattern.
=item C<"otherfs">
If the item is on a different device/filesystem.
=back
Excluded items may still be included in the export, but only by name. C<dsize>,
C<asize> and other information may be absent. If this item was excluded by a
pattern, ncdu will not do an C<lstat()> on it, and may thus report this item as
a file even if it is a directory.
Other values than mentioned above are accepted by ncdu, but are currently
interpreted to be equivalent to "pattern". This field should be absent if the
item has not been excluded from the calculation.
=item notreg
Boolean. This is C<true> if neither S_ISREG() nor S_ISDIR() evaluates to true.
I.e. this is a symlink, character device, block device, FIFO, socket, or
whatever else your system may support. If absent, C<false> is assumed.
=back
=head3 Extended information
In addition, the following fields are exported when I<extended information>
mode is enabled (available since ncdu 1.13). See the C<-e> flag in L<ncdu(1)>
for details.
=over
=item uid
Number, user ID who owns the file. Accepted values are in the range
C<< 0 <= uid < 2^31 >>.
=item gid
Number, group ID who owns the file. Accepted values are in the range
C<< 0 <= gid < 2^31 >>.
=item mode
Number, the raw file mode as returned by L<lstat(3)>. For Linux systems, see
L<inode(7)> for the interpretation of this field. Accepted range:
C<< 0 <= mode < 2^16 >>.
=item mtime
Number, last modification time as a UNIX timestamp. Accepted range:
C<< 0 <= mtime < 2^64 >>.
=back
=head2 Miscellaneous notes
As mentioned above, file/directory names are B<not> converted to any specific
encoding when exporting. If you want the exported info dump to be valid JSON
(and thus valid UTF-8), you'll have to ensure that you have either no non-UTF-8
filenames in your filesystem, or you should process the dump through a
conversion utility such as C<iconv>. When browsing an imported file with ncdu,
you'll usually want to ensure that the filenames are in the same encoding as
what your terminal is expecting. The browsing interface may look garbled or
otherwise ugly if that's not the case.
Another important thing to keep in mind is that an export can be fairly large.
If you write a program that reads a file in this format and you care about
handling directories with several million files, make sure to optimize for
that. For example, prefer the use of a stream-based JSON parser over a JSON
library that reads the entire file in a single generic data structure, and only
keep the minimum amount of data that you care about in memory.
=head2 Example Export
Here's a simple example export that displays the basic structure of the format.
  [
    1,
    0,
    {
      "progname" : "ncdu",
      "progver" : "1.9",
      "timestamp" : 1354477149
    },
    [
      { "name" : "/media/harddrive",
        "dsize" : 4096,
        "asize" : 422,
        "dev" : 39123423,
        "ino" : 29342345
      },
      { "name" : "SomeFile",
        "dsize" : 32768,
        "asize" : 32414,
        "ino" : 91245479284
      },
      [
        { "name" : "EmptyDir",
          "dsize" : 4096,
          "asize" : 10,
          "ino" : 3924
        }
      ]
    ]
  ]
The directory described above has the following structure:
  /media/harddrive
  ├── SomeFile
  └── EmptyDir

View file

@ -1,425 +0,0 @@
=head1 NAME
B<ncdu> - NCurses Disk Usage
=head1 SYNOPSIS
B<ncdu> [I<options>] I<dir>
=head1 DESCRIPTION
ncdu (NCurses Disk Usage) is a curses-based version of the well-known 'du', and
provides a fast way to see what directories are using your disk space.
=head1 OPTIONS
=head2 Mode Selection
=over
=item -h, --help
Print a short help message and quit.
=item -v, -V, --version
Print ncdu version and quit.
=item -f I<FILE>
Load the given file, which has earlier been created with the C<-o> option. If
I<FILE> is equivalent to C<->, the file is read from standard input.
For the sake of preventing a screw-up, the current version of ncdu will assume
that the directory information in the imported file does not represent the
filesystem on which the file is being imported. That is, the refresh, file
deletion and shell spawning options in the browser will be disabled.
=item I<dir>
Scan the given directory.
=item -o I<FILE>
Export all necessary information to I<FILE> instead of opening the browser
interface. If I<FILE> is C<->, the data is written to standard output. See the
examples section below for some handy use cases.
Be warned that the exported data may grow quite large when exporting a
directory with many files. 10,000 files will get you an export in the order of
600 to 700 KiB uncompressed, or a little over 100 KiB when compressed with
gzip. This scales linearly, so be prepared to handle a few tens of megabytes
when dealing with millions of files.
=item -e
Enable extended information mode. This will, in addition to the usual file
information, also read the ownership, permissions and last modification time
for each file. This will result in higher memory usage (by roughly ~30%) and in
a larger output file when exporting.
When using the file export/import function, this flag will need to be added
both when exporting (to make sure the information is added to the export), and
when importing (to read this extra information in memory). This flag has no
effect when importing a file that has been exported without the extended
information.
This enables viewing and sorting by the latest child mtime, or modified time,
using 'm' and 'M', respectively.
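For example, to include the extended information in an export and read it back
later:

  ncdu -e -o export.file /some/dir
  ncdu -e -f export.file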
=back
=head2 Interface options
=over
=item -0
Don't give any feedback while scanning a directory or importing a file, other
than when a fatal error occurs. Ncurses will not be initialized until the scan
is complete. When exporting the data with C<-o>, ncurses will not be
initialized at all. This option is the default when exporting to standard
output.
=item -1
Similar to C<-0>, but does give feedback on the scanning progress with a single
line of output. This option is the default when exporting to a file.
In some cases, the ncurses browser interface which you'll see after the
scan/import is complete may look garbled when using this option. If you're not
exporting to a file, C<-2> is probably a better choice.
=item -2
Provide a full-screen ncurses interface while scanning a directory or importing
a file. This is the only interface that provides feedback on any non-fatal
errors while scanning.
=item -q
Quiet mode. While scanning or importing the directory, ncdu updates the
screen 10 times a second by default; in quiet mode this is decreased to once
every 2 seconds. Use this feature to save bandwidth over remote
connections. This option has no effect when C<-0> is used.
=item -r
Read-only mode. This will disable the built-in file deletion feature. This
option has no effect when C<-o> is used, because there will not be a browser
interface in that case. It has no effect when C<-f> is used, either, because
the deletion feature is disabled in that case anyway.
WARNING: This option will only prevent deletion through the file browser. It is
still possible to spawn a shell from ncdu and delete or modify files from
there. To disable that feature as well, pass the C<-r> option twice (see
C<-rr>).
=item -rr
In addition to C<-r>, this will also disable the shell spawning feature of the
file browser.
=item --si
List sizes using base 10 prefixes, that is, powers of 1000 (KB, MB, etc), as
defined in the International System of Units (SI), instead of the usual base 2
prefixes, that is, powers of 1024 (KiB, MiB, etc).
=item --confirm-quit
Requires a confirmation before quitting ncdu. Very helpful when you
accidentally press 'q' during or after a very long scan.
=item --color I<SCHEME>
Select a color scheme. Currently only two schemes are recognized: I<off> to
disable colors (the default) and I<dark> for a color scheme intended for dark
backgrounds.
=back
=head2 Scan Options
These options affect the scanning progress, and have no effect when importing
directory information from a file.
=over
=item -x
Do not cross filesystem boundaries, i.e. only count files and directories on
the same filesystem as the directory being scanned.
=item --exclude I<PATTERN>
Exclude files that match I<PATTERN>. The files will still be displayed by
default, but are not counted towards the disk usage statistics. This argument
can be added multiple times to add more patterns.
=item -X I<FILE>, --exclude-from I<FILE>
Exclude files that match any pattern in I<FILE>. Patterns should be separated
by a newline.
=item --exclude-caches
Exclude directories containing CACHEDIR.TAG. The directories will still be
displayed, but not their content, and they are not counted towards the disk
usage statistics.
See http://www.brynosaurus.com/cachedir/
=item -L, --follow-symlinks
Follow symlinks and count the size of the file they point to. As of ncdu 1.14,
this option will not follow symlinks to directories and will count each
symlinked file as a unique file (i.e. unlike how hard links are handled). This
is subject to change in later versions.
=back
=head1 KEYS
=over
=item ?
Show help + keys + about screen
=item up, down, j, k
Cycle through the items
=item right, enter, l
Open selected directory
=item left, <, h
Go to parent directory
=item n
Order by filename (press again for descending order)
=item s
Order by filesize (press again for descending order)
=item C
Order by number of items (press again for descending order)
=item a
Toggle between showing disk usage and showing apparent size.
=item M
Order by latest child mtime, or modified time (press again for descending
order). Requires the -e flag.
=item d
Delete the selected file or directory. An error message will be shown when the
contents of the directory do not match or do not exist anymore on the
filesystem.
=item t
Toggle dirs before files when sorting.
=item g
Toggle between showing percentage, graph, both, or none. Percentage is relative
to the size of the current directory, graph is relative to the largest item in
the current directory.
=item c
Toggle display of child item counts.
=item m
Toggle display of latest child mtime, or modified time. Requires the -e flag.
=item e
Show/hide 'hidden' or 'excluded' files and directories. Please note that even
though you can't see the hidden files and directories, they are still there and
they are still included in the directory sizes. If you suspect that the totals
shown at the bottom of the screen are not correct, make sure you haven't
enabled this option.
=item i
Show information about the current selected item.
=item r
Refresh/recalculate the current directory.
=item b
Spawn shell in current directory.
Ncdu will determine your preferred shell from the C<NCDU_SHELL> or C<SHELL>
variable (in that order), or will call C</bin/sh> if neither is set. This
also allows you to configure another command to be run when the 'b' key is
pressed. For example, to spawn the L<vifm(1)> file manager instead of a shell,
run ncdu as follows:
export NCDU_SHELL=vifm
ncdu
=item q
Quit
=back
=head1 FILE FLAGS
Entries in the browser interface may be prefixed by a one-character flag. These
flags have the following meaning:
=over
=item !
An error occurred while reading this directory.
=item .
An error occurred while reading a subdirectory, so the indicated size may not be
correct.
=item <
File or directory is excluded from the statistics by using exclude patterns.
=item >
Directory is on another filesystem.
=item @
This is neither a file nor a folder (symlink, socket, ...).
=item H
Same file was already counted (hard link).
=item e
Empty directory.
=back
=head1 EXAMPLES
To scan and browse the directory you're currently in, all you need is a simple:
ncdu
If you want to scan a full filesystem, your root filesystem, for example, then
you'll want to use C<-x>:
ncdu -x /
Since scanning a large directory may take a while, you can scan a directory and
export the results for later viewing:
ncdu -1xo- / | gzip >export.gz
# ...some time later:
zcat export.gz | ncdu -f-
To export from a cron job, make sure to replace C<-1> with C<-0> to suppress
any unnecessary output.
You can also export a directory and browse it once scanning is done:
ncdu -o- | tee export.file | ./ncdu -f-
The same is possible with gzip compression, but is a bit kludgey:
ncdu -o- | gzip | tee export.gz | gunzip | ./ncdu -f-
To scan a system remotely, but browse through the files locally:
ssh -C user@system ncdu -o- / | ./ncdu -f-
The C<-C> option to ssh enables compression, which will be very useful over
slow links. Remote scanning and local viewing has two major advantages when
compared to running ncdu directly on the remote system: You can browse through
the scanned directory on the local system without any network latency, and ncdu
does not keep the entire directory structure in memory when exporting, so you
won't consume much memory on the remote system.
=head1 HARD LINKS
Every disk usage analysis utility has its own way of (not) counting hard links.
There does not seem to be any universally agreed method of handling hard links,
and it is even inconsistent among different versions of ncdu. This section
explains what each version of ncdu does.
ncdu 1.5 and below does not support any hard link detection at all: each link
is considered a separate inode and its size is counted for every link. This
means that the displayed directory sizes are incorrect when analyzing
directories which contain hard links.
ncdu 1.6 has basic hard link detection: When a link to a previously encountered
inode is detected, the link is considered to have a file size of zero bytes.
Its size is not counted again, and the link is indicated in the browser
interface with a 'H' mark. The displayed directory sizes are only correct when
all links to an inode reside within that directory. When this is not the case,
the sizes may or may not be correct, depending on which links were considered
as "duplicate" and which as "original". The indicated size of the topmost
directory (that is, the one specified on the command line upon starting ncdu)
is always correct.
ncdu 1.7 and later has improved hard link detection. Each file that has more
than two links has the "H" mark visible in the browser interface. Each hard
link is counted exactly once for every directory it appears in. The indicated
size of each directory is therefore, correctly, the sum of the sizes of all
unique inodes that can be found in that directory. Note, however, that this may
not always be the same as the space that will be reclaimed after deleting the
directory, as some inodes may still be accessible from hard links outside it.
=head1 BUGS
Directory hard links are not supported. They will not be detected as being hard
links, and will thus be scanned and counted multiple times.
Some minor glitches may appear when displaying filenames that contain multibyte
or multicolumn characters.
All sizes are internally represented as a signed 64-bit integer. If you have a
directory larger than 8 EiB minus one byte, ncdu will clip its size to 8 EiB
minus one byte. When deleting items in a directory with a clipped size, the
resulting sizes will be incorrect.
Item counts are stored in a signed 32-bit integer without overflow detection.
If you have a directory with more than 2 billion files, quite literally
anything can happen.
Please report any other bugs you may find at the bug tracker, which can be
found on the web site at https://dev.yorhel.nl/ncdu
=head1 AUTHOR
Written by Yoran Heling <projects@yorhel.nl>.
=head1 SEE ALSO
L<du(1)>

View file

@ -1,29 +0,0 @@
=pod
These screenshots were made with ncdu 1.13 using the C<--color=dark> option.
Colors are not available in older versions and (in 1.13) still disabled by
default.
=head2 Scanning...
[img scr ncduscan-2.png Ncdu scanning a large directory.]
=head2 Done scanning
[img scr ncdudone-2.png Ncdu done scanning a large directory.]
=head2 Directory information
[img scr ncduinfo-2.png Ncdu displaying directory information.]
=head2 Delete confirmation
[img scr ncduconfirm-2.png Ncdu asking for confirmation to delete a file.]
=head2 Help screen
[img scr ncduhelp1-2.png Ncdu help screen.]
=head2 About screen
[img scr ncduhelp2-2.png Ncdu about screen.]

73
dat/ncdu.md Normal file
View file

@ -0,0 +1,73 @@
% NCurses Disk Usage
Ncdu is a disk usage analyzer with an ncurses interface. It is designed to find
space hogs on a remote server where you don't have an entire graphical setup
available, but it is a useful tool even on regular desktop systems. Ncdu aims
to be fast, simple and easy to use, and should be able to run in any minimal
POSIX-like environment with ncurses installed.
## Download
Latest version
: 1.14 ([dllink ncdu-1.14.tar.gz] - [changes](/ncdu/changes))
I also have convenient static binaries for Linux
[i486](/download/ncdu-linux-i486-1.14.tar.gz) and
[ARM](/download/ncdu-linux-arm-1.14.tar.gz). Download, extract and run; no
compilation or installation necessary (uses
[musl](http://www.musl-libc.org/)).
Development version
: The most recent code is available on a git repository and can be cloned
with `git clone git://g.blicky.net/ncdu.git/`. The repository is also available
for [online browsing](https://g.blicky.net/ncdu.git/).
Ncdu is entirely written in C and available under a liberal MIT license.
## Packages and ports
Ncdu has been packaged for quite a few systems; here's a list of the ones I am aware of:
[AIX](http://www.perzl.org/aix/index.php?n=Main.Ncdu) -
[Alpine Linux](http://pkgs.alpinelinux.org/packages?name=ncdu) -
[ALT Linux](http://sisyphus.ru/en/srpm/ncdu) -
[Arch Linux](https://www.archlinux.org/packages/?q=ncdu) -
[CRUX](https://crux.nu/portdb/?q=ncdu&a=search) -
[Cygwin](https://cygwin.com/cgi-bin2/package-grep.cgi?grep=ncdu) -
[Debian](http://packages.debian.org/ncdu) -
[Fedora](https://apps.fedoraproject.org/packages/ncdu) -
[FreeBSD](https://www.freebsd.org/cgi/ports.cgi?query=ncdu&stype=all) -
[Frugalware](http://frugalware.org/packages/?op=pkg&srch=ncdu&arch=all&ver=all) -
[Gentoo](https://packages.gentoo.org/packages/sys-fs/ncdu) -
[GNU Guix](https://www.gnu.org/software/guix/package-list.html) -
[OpenBSD](http://cvsweb.openbsd.org/cgi-bin/cvsweb/ports/sysutils/ncdu/) -
Mac OS X ([Fink](http://pdb.finkproject.org/pdb/package.php/ncdu) - [Homebrew](https://formulae.brew.sh/formula/ncdu) - [MacPorts](http://www.macports.org/ports.php?by=name&substr=ncdu)) -
[Puppy Linux](http://www.murga-linux.com/puppy/viewtopic.php?t=35024) -
[Solaris](http://www.opencsw.org/packages/ncdu) -
[Slackware](http://slackbuilds.org/repository/14.2/system/ncdu/) -
[Slax Linux](http://www.slax.org/modules.php?detail=ncdu) -
[Ubuntu](http://packages.ubuntu.com/search?searchon=sourcenames&keywords=ncdu) -
[Void Linux](https://voidlinux.org/packages/)
Packages for RHEL and (open)SUSE can be found on the
[Open Build Service](https://software.opensuse.org//download.html?project=utilities&package=ncdu).
Packages for NetBSD, DragonFlyBSD, MirBSD and others can be found on
[pkgsrc](http://pkgsrc.se/sysutils/ncdu).
A port to z/OS is available [here](https://dovetail.com/community/ncdu.html).
## Similar projects
- [Duc](http://duc.zevv.nl/) - Multiple user interfaces.
- [gt5](http://gt5.sourceforge.net/) - Quite similar to ncdu, but a different approach.
- [tdu](http://webonastick.com/tdu/) - Another small ncurses-based disk usage visualization utility.
- [TreeSize](http://treesize.sourceforge.net/) - GTK, using a treeview.
- [Baobab](http://www.marzocca.net/linux/baobab.html) - GTK, using pie-charts, a treeview and a treemap. Comes with GNOME.
- [GdMap](http://gdmap.sourceforge.net/) - GTK, with a treemap display.
- [Filelight](http://www.methylblue.com/filelight/) - KDE, using pie-charts.
- [KDirStat](http://kdirstat.sourceforge.net/) - KDE, with a treemap display.
- [QDiskUsage](http://qt-apps.org/content/show.php/QDiskUsage?content=107012) - Qt, using pie-charts.
- [xdiskusage](http://xdiskusage.sourceforge.net/) - FLTK, with a treemap display.
- [fsv](http://fsv.sourceforge.net/) - 3D visualization.
- [Philesight](http://zevv.nl/play/code/philesight/) - Web-based clone of Filelight.

220
dat/ncdu/jsonfmt.md Normal file
View file

@ -0,0 +1,220 @@
% Ncdu Export File Format
This document describes the file format that ncdu 1.9 and later use for the
export/import feature (the `-o` and `-f` options). Check the [ncdu
manual](/ncdu/man) for a description of how to use that feature.
## Top-level object
Ncdu uses [JSON](http://json.org/) notation as its data format. The top-level
object is an array:
    [
      <majorver>,
      <minorver>,
      <metadata>,
      <directory>
    ]
## Versioning
The `<majorver>` and `<minorver>` elements indicate the version of
the file format. These are numbers with accepted values in the range of `0
<= version <= 10000`. Major version must be `1`. Minor version is `0` for
ncdu 1.9 till 1.12, and `1` since ncdu 1.13 for the addition of the extended
mode. The major version should increase if backwards-incompatible changes are
made (preferably never), the minor version can be increased to indicate
additions to the existing format.
## Metadata
The `<metadata>` element is a JSON object holding whatever (short)
metadata you'd want. This block is currently (1.9-1.13) ignored by ncdu when
importing, but it writes out the following keys when exporting:
progname
: String, name of the program that generated the file, i.e. `"ncdu"`.
progver
: String, version of the program that generated the file, e.g. `"1.10"`.
timestamp
: Number, UNIX timestamp as returned by the POSIX `time()` function at the time
the file was generated. Note that this may not necessarily be equivalent to when
the directory has been scanned.
## Directory Info
A `<directory>` is represented with a JSON array:
    [
      <infoblock>,
      <directory>, <directory>, <infoblock>, ...
    ]
That is, the first element of the array must be an `<infoblock>`. If the
directory is empty, that will be its only element. If it isn't, its
subdirectories and files are listed in the remaining elements. Each
subdirectory is represented as a `<directory>` array again, and each file
is represented as just an `<infoblock>` object.
## The Info Object
An `<infoblock>` is a JSON object holding information about a file or
directory. The following fields are supported:
name
: String _(required)_. Name of the file/dir. For the top-level directory (that
is, the `<directory>` item in the top-level JSON array), this should be
the full absolute filesystem path, e.g. `"/media/harddrive"`. For any items
below the top-level directory, the name should be just the name of the item.
The name will be in the same encoding as reported by the filesystem (i.e.
[readdir()](http://manned.org/readdir.3)). The name may not exceed 32768 bytes.
asize
: Number. The apparent file size, as reported by `lstat().st_size`. If absent, 0
is assumed. Accepted values are in the range of `0 <= asize < 2^63`.
dsize
: Number. Size of the file, as consumed on the disk. This is obtained through
`lstat().st_blocks*S_BLKSIZE`. If absent, 0 is assumed. Accepted values are in
the range of `0 <= dsize < 2^63`.
dev
: Number. The device ID. This has to be unique within the context of the
exported dump, but need not have any meaning outside of it: it can be a
serialization of `lstat().st_dev`, but also a randomly generated number used
only within this file, as long as it uniquely identifies the device/filesystem
on which this file is stored. This field may be absent, in which case it is
equivalent to that of the parent directory. If this field is absent for the
parent directory, a value of 0 is assumed. Accepted values are in the range of
`0 <= dev < 2^64`.
ino
: Number. Inode number as reported by `lstat().st_ino`. Together with the Device
ID this uniquely identifies a file in this dump. In the case of hard links, two
objects may appear with the same (`dev`,`ino`) combination. A value of 0 is
assumed if this field is absent. This is currently (ncdu 1.9-1.13) not a
problem as long as the `hlnkc` field is false, otherwise it will consider
everything with the same `dev` and empty `ino` values as a single hardlinked
file. Accepted values are in the range of `0 <= ino < 2^64`.
hlnkc
: Boolean. `true` if this is a file with `lstat().st_nlink > 1`. If absent,
`false` is assumed.
read\_error
: Boolean. `true` if something went wrong while reading this entry. I.e. the
information in this entry may not be complete. For files, this indicates that
the `lstat()` call failed. For directories, this means that an error occurred
while obtaining the file listing, and some items may be missing. Note that if
`lstat()` failed, ncdu has no way of knowing whether an item is a file or a
directory, so a file with `read_error` set might as well be a directory. If
absent, `false` is assumed.
excluded
: String. Set if this file or directory is to be excluded from calculation for
some reason. The following values are recognized:
`"pattern"`
: If the path matched an exclude pattern.
`"otherfs"`
: If the item is on a different device/filesystem.
Excluded items may still be included in the export, but only by name. `size`,
`asize` and other information may be absent. If this item was excluded by a
pattern, ncdu will not do an `lstat()` on it, and may thus report this item as
a file even if it is a directory.
Values other than those mentioned above are accepted by ncdu, but are
currently interpreted as equivalent to "pattern". This field should be absent if the
item has not been excluded from the calculation.
notreg
: Boolean. This is `true` if neither S\_ISREG() nor S\_ISDIR() evaluates to true.
I.e. this is a symlink, character device, block device, FIFO, socket, or
whatever else your system may support. If absent, `false` is assumed.
### Extended information
In addition, the following fields are exported when _extended information_ mode
is enabled (available since ncdu 1.13). See the `-e` flag in
[ncdu(1)](/ncdu/man) for details.
uid
: Number, user ID who owns the file. Accepted values are in the range
`0 <= uid < 2^31`.
gid
: Number, group ID who owns the file. Accepted values are in the range
`0 <= gid < 2^31`.
mode
: Number, the raw file mode as returned by
[lstat(3)](https://manned.org/lstat.3). For Linux systems, see
[inode(7)](https://manned.org/inode.7) for the interpretation of this
field. Accepted range: `0 <= mode < 2^16`.
mtime
: Number, last modification time as a UNIX timestamp. Accepted range:
`0 <= mtime < 2^64`.
## Miscellaneous notes
As mentioned above, file/directory names are **not** converted to any specific
encoding when exporting. If you want the exported info dump to be valid JSON
(and thus valid UTF-8), you'll have to ensure that you have either no non-UTF-8
filenames in your filesystem, or you should process the dump through a
conversion utility such as `iconv`. When browsing an imported file with ncdu,
you'll usually want to ensure that the filenames are in the same encoding as
what your terminal is expecting. The browsing interface may look garbled or
otherwise ugly if that's not the case.
Another important thing to keep in mind is that an export can be fairly large.
If you write a program that reads a file in this format and you care about
handling directories with several million files, make sure to optimize for
that. For example, prefer the use of a stream-based JSON parser over a JSON
library that reads the entire file in a single generic data structure, and only
keep the minimum amount of data that you care about in memory.
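As a hedged illustration (jq is a separate tool, not part of ncdu), an export
can be inspected without writing a parser; jq's `--stream` mode avoids loading
the whole document into memory, which matters for large dumps:

    ncdu -o export.json /some/dir
    jq '.[2]' export.json                    # print the metadata block
    jq -c --stream '.' export.json | wc -l   # rough count of parse events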
## Example Export
Here's a simple example export that displays the basic structure of the format.
    [
      1,
      0,
      {
        "progname" : "ncdu",
        "progver" : "1.9",
        "timestamp" : 1354477149
      },
      [
        { "name" : "/media/harddrive",
          "dsize" : 4096,
          "asize" : 422,
          "dev" : 39123423,
          "ino" : 29342345
        },
        { "name" : "SomeFile",
          "dsize" : 32768,
          "asize" : 32414,
          "ino" : 91245479284
        },
        [
          { "name" : "EmptyDir",
            "dsize" : 4096,
            "asize" : 10,
            "ino" : 3924
          }
        ]
      ]
    ]
The directory described above has the following structure:
    /media/harddrive
    ├── SomeFile
    └── EmptyDir

29
dat/ncdu/scr.md Normal file
View file

@ -0,0 +1,29 @@
% Ncdu Screenshots
These screenshots were made with ncdu 1.13 using the `--color=dark` option.
Colors are not available in older versions and (in 1.13) still disabled by
default.
## Scanning...
![Ncdu scanning a large directory.](/img/ncduscan-2.png)
## Done scanning
![Ncdu done scanning a large directory.](/img/ncdudone-2.png)
## Directory information
![Ncdu displaying directory information.](/img/ncduinfo-2.png)
## Delete confirmation
![Ncdu asking for confirmation to delete a file.](/img/ncduconfirm-2.png)
## Help screen
![Ncdu help screen.](/img/ncduhelp1-2.png)
## About screen
![Ncdu about screen.](/img/ncduhelp2-2.png)

View file

@ -1,54 +0,0 @@
=pod
nginx-confgen is a simple preprocessor and macro system for
L<nginx|http://nginx.org/> and nginx-like configuration files. It support
variable substitution, macro expansion and using the output of arbitrary
commands to generate config files.
=head2 Example
  pre_set $certdir /etc/nginx-certificates/;

  # Fetch the 'resolver' from /etc/resolv.conf
  pre_exec $nameserver "grep nameserver /etc/resolv.conf \\
      | head -n 1 | sed 's/^nameserver //'";
  resolver $nameserver;

  # Convenient macro to create a HTTPS virtual host
  macro vhost $domain @aliases &block {
    server {
      listen [::]:443 ssl;
      server_name $domain @aliases;
      ssl_certificate $certdir/$domain/fullchain.pem;
      ssl_certificate_key $certdir/$domain/privkey.pem;
      pre_if -f $certdir/$domain/ocsp.der {
        ssl_stapling_file $certdir/$domain/ocsp.der;
      }
      &block;
    }
  }

  vhost example.com www.example.com {
    root /var/www/example.com;
  }
See the L<manual|https://dev.yorhel.nl/nginx-confgen/man> for more features.
=head2 Download
If you're on an x86_64 Linux system, you can simply use the binary:
curl -s https://dev.yorhel.nl/download/nginx-confgen-linux-amd64-1.2.tar.gz | tar -xzf-
./nginx-confgen <input.conf >output.conf
To compile from source, install L<Haskell Stack|https://haskellstack.org/> and run:
git clone https://code.blicky.net/yorhel/nginx-confgen.git
cd nginx-confgen
stack install
The git repository is also available for L<online
browsing|https://code.blicky.net/yorhel/nginx-confgen>.

View file

@ -1,16 +0,0 @@
1.2 - 2018-02-23
- Preserve original string quoting and variable formatting
- Fix parser to be more lenient with argument formats
- Fix handling of the \\-escape sequence
- Fix handling of quoting & parenthesis in 'if' directive
- Fix handling of empty string in pre_if
- Remove support for parenthesis around pre_if arguments
1.1 - 2018-01-24
- Add pre_warn directive
- Add -i/-o/-v/-h command line arguments
- Add support for custom pre_include search paths (-I flag)
- Fix handling of some common custom block directives (e.g. 'types')
1.0 - 2018-01-19
- Initial version

View file

@ -1,233 +0,0 @@
=pod
=head1 NAME
nginx-confgen - A preprocessor and macro system for nginx(-like) configuration
files.
=head1 SYNOPSIS
nginx-confgen -i input.conf -o output.conf
=head1 DESCRIPTION
nginx-confgen can be used to do pre-processing for nginx configuration files
(and other configuration files with a similar syntax). It has support for
"compile-time" macro expansion and variable interpolation, which should make it
less tedious to maintain large and complex configurations.
nginx-confgen works by parsing the input into a syntax tree, modifying this
tree, and then formatting the tree to generate the output. It is completely
oblivious to nginx contexts and directives, so it is possible to do nonsensical
transformations and generate incorrect configuration files. Comments in the
input file will not be present in the output. See also the L</BUGS & WARTS>
below.
B<WARNING:> Do NOT use nginx-confgen with untrusted input, the C<pre_exec>
directive allows, by design, arbitrary code execution.
=head1 OPTIONS
The following command-line options are supported:
=over
=item -h
Show help text.
=item -V, --version
Show program version.
=item -i I<FILE>
Use the given file name as input file. If this option is not given or set to
C<->, then the file will be read from standard input.
=item -o I<FILE>
Write the output to the given file. If this option is not given or set to C<->,
then the file will be written to standard output.
=item -I I<DIR>
Set the search path for I<pre_include> directives. This option can be given
multiple times to search several directories in order. If this option is not
given, then include files are resolved relative to the directory that
nginx-confgen is run from (i.e. C<-I .>). An example invocation is shown after
this list.
=back
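For example, a hypothetical invocation that searches two include directories
(the paths are illustrative):

  nginx-confgen -I /etc/nginx/includes -I ./snippets -i input.conf -o output.conf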
=head1 DIRECTIVES
nginx-confgen recognizes and interprets the following directives:
=head2 pre_include
Similar to the C<include> directive in nginx, except that the file is included
during preprocessing. The included file may contain any preprocessing
directives supported by nginx-confgen. Variables and macros defined in the
included file will be available in the parent file.
Relative paths are searched for in the directories given with the C<-I> flag.
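A minimal usage sketch, with an illustrative file name:

  pre_include tls-defaults.conf;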
=head2 pre_set
Similar to the C<set> directive in nginx, except that variables defined with
C<pre_set> are resolved during preprocessing. Note that variables defined with
C<pre_set> are only available in the same scope as they are defined, for
example:
  pre_set $var outer;
  location / {
    pre_set $var inner;
    # $var is now "inner" within this location block.
  }
  # $var is "outer" again after the location block.
(This may change in the future)
=head2 pre_exec
Run a shell command, and store the output in a variable. For example, nginx
will not use your system's DNS resolution methods to resolve domain names.
Instead you need to manually set a C<resolver> address. With the following hack
you can fetch the nameserver from C</etc/resolv.conf> and use that as the
C<resolver>:
  pre_exec $nameserver "grep nameserver /etc/resolv.conf \\
      | head -n 1 | sed 's/^nameserver //'";
  resolver $nameserver;
(The C<\\> is necessary, otherwise your shell will consider the newline as a
new command).
=head2 pre_if
Similar to the C<if> directive in nginx, except that this is evaluated during
preprocessing. Also unlike C<if>, parenthesis around the arguments are not
supported. Some examples:
  pre_if -f $certdir/ocsp.der {
    ssl_stapling on;
    ssl_stapling_file $certdir/ocsp.der;
  }

  pre_if !-f $certdir/ocsp.der {
    ssl_stapling off;
  }

  # You can have different configuration depending on the name of
  # the system on which nginx-confgen runs. Like... yeah.
  pre_exec $hostname 'hostname';
  pre_if $hostname ~* ^proxy_for_(.+) {
    proxy_pass http://$1/;
  }
=head2 pre_warn
This directive, when interpreted, will generate a warning to the standard error
of nginx-confgen. Can be used to signal that a special configuration is being
used:
  pre_if -e /etc/offline-mode {
    pre_warn "Putting website in offline mode!";
  }

Or to warn about certain directives:

  macro proxy_cache $var {
    pre_warn "Using proxy_cache with $var violates company policy!";
    # But we can output it anyway.
    proxy_cache $var;
  }
=head2 macro
Define a I<macro>, which is a configuration block that you can later refer to.
The general syntax is as follows:
  macro macro_name $var1 $var2 @remaining_vars &block_var {
    # contents
  }
The optional C<@remaining_vars> argument will capture any number of variables,
and can be passed to another directive inside the macro contents. The optional
C<&block_var> allows the macro to be invoked with a block argument, which will
expand to any number of directives. Some examples:
  macro le {
    location /.well-known/acme-challenge {
      alias /etc/letsencrypt/challenge;
    }
  }
  # Usage:
  le;

  macro redir $path $to {
    location $path {
      return 301 $to;
    }
  }
  # Usage:
  redir / http://blicky.net/;

  macro vhost $primary_name @aliases &block {
    server {
      listen [::]:443 ssl;
      server_name $primary_name @aliases;
      ssl_certificate $crtdir/$primary_name/fullchain.pem;
      ssl_certificate_key $crtdir/$primary_name/privkey.pem;
      &block;
    }
  }
  # Usage:
  vhost example.com {
    root /var/www/example.com;
  }
  vhost example.org alias.example.org {
    root /var/www/example.org;
  }
Note that these are I<hygienic> macros, so variable capture is predictable (but
not necessarily the most useful):
  pre_set $dest /a;

  macro redir {
    # This will be /a, regardless of the context in which this macro is called.
    return 301 $dest;
  }

  # $dest is still '/a' inside the macro after this new variable definition.
  pre_set $dest /b;
  redir;
Similarly, macro arguments will not be available inside C<&block> expansion or
nested macro expansion.
=head1 BUGS & WARTS
nginx-confgen is a quickly written hack to solve a particular use case, it is
quite likely to have some weird behavior and bugs.
Comments and whitespace in the input files are thrown away and ignored. The
generated output is completely reformatted.
The nginx configuration syntax is not as regular as I had hoped. It's possible
for nginx modules to extend the syntax somewhat. A good example is the I<types>
directive in I<ngx_http_core_module>. While nginx-confgen should be able to
handle the I<types> directive just fine, other extensions may cause syntax
errors or will not survive a round-trip through nginx-confgen.
This applies to all I<*_by_lua_block> directives in the I<ngx_http_lua_module>.
The I<_by_lua> directives that accept a string should work just fine.
The error messages given by C<nginx-confgen> aren't always helpful.
=head1 AUTHOR
nginx-confgen is written by Yoran Heling <projects@yorhel.nl>
Web: L<https://dev.yorhel.nl/nginx-confgen>

59
dat/nginx-confgen.md Normal file
View file

@ -0,0 +1,59 @@
% Nginx Configuration Generator
nginx-confgen is a simple preprocessor and macro system for
[nginx](http://nginx.org/) and nginx-like configuration files. It supports
variable substitution, macro expansion and using the output of arbitrary
commands to generate config files.
## Example
```bash
pre_set $certdir /etc/nginx-certificates/;

# Fetch the 'resolver' from /etc/resolv.conf
pre_exec $nameserver "grep nameserver /etc/resolv.conf \\
    | head -n 1 | sed 's/^nameserver //'";
resolver $nameserver;

# Convenient macro to create a HTTPS virtual host
macro vhost $domain @aliases &block {
  server {
    listen [::]:443 ssl;
    server_name $domain @aliases;
    ssl_certificate $certdir/$domain/fullchain.pem;
    ssl_certificate_key $certdir/$domain/privkey.pem;
    pre_if -f $certdir/$domain/ocsp.der {
      ssl_stapling_file $certdir/$domain/ocsp.der;
    }
    &block;
  }
}

vhost example.com www.example.com {
  root /var/www/example.com;
}
```
See the [manual](/nginx-confgen/man) for more features.
## Download
If you're on an x86\_64 Linux system, you can simply use the binary:
```
curl -s https://dev.yorhel.nl/download/nginx-confgen-linux-amd64-1.2.tar.gz | tar -xzf-
./nginx-confgen <input.conf >output.conf
```
To compile from source, install [Haskell Stack](https://haskellstack.org/) and run:
```
git clone https://code.blicky.net/yorhel/nginx-confgen.git
cd nginx-confgen
stack install
```
The git repository is also available for [online
browsing](https://code.blicky.net/yorhel/nginx-confgen).

View file

@ -1,644 +0,0 @@
Multi-threaded Access to an SQLite3 Database
=pod
(Published on B<2011-11-26>. Also available in L<POD|https://dev.yorhel.nl/dat/sqlaccess>.)
(Minor 2013-04-06 update: I abstracted my message passing solution from ncdc
and implemented it in a POSIX C library for general use. It's called
I<sqlasync> and is part of my L<Ylib library collection|https://dev.yorhel.nl/ylib>.)
=head1 Introduction
As I was porting L<ncdc|https://dev.yorhel.nl/ncdc> over to use SQLite3 as
storage backend, I stumbled on a problem: The program uses a few threads for
background jobs, and it would be nice to give these threads access to the
database.
Serializing all database access through the main thread wouldn't have been very
hard to implement in this particular case, but that would have been far from
optimal. The main thread is also responsible for keeping the user interface
responsive and handling most of the network interaction. Overall responsiveness
of the program would significantly improve when the threads could access the
database without involvement of the main thread.
Which brought me to the following questions: What solutions are available for
providing multi-threaded access to an SQLite database? What problems may I run
in to? I was unable to find a good overview in this area on the net, so I wrote
this article in the hope of improving that situation.
=head1 SQLite3 and threading
Let's first see what SQLite3 itself has to offer in terms of threading support.
The official documentation mentions threading support several times in various
places, but this information is scattered around and no good overview is given.
Someone has tried to organize this before on a L<single
page|http://www.sqlite.org/cvstrac/wiki?p=MultiThreading>, and while this
indeed gives a nice overview, it has unfortunately not been updated since 2006.
The advice is therefore a little on the conservative side.
Nonetheless, it is wise to remain portable with different SQLite versions,
especially when writing programs that dynamically link with some random version
installed on someone's system. It should be fairly safe to assume that SQLite
binaries provided by most systems, if not all, are compiled with thread safety
enabled. This doesn't mean all that much, unfortunately: The only thing
I<thread safe> means in this context is that you can use SQLite3 in multiple
threads, but a single database connection should still stay within a single
thread.
Since SQLite 3.3.1, which was released in early 2006, it is possible to move a
single database connection between threads. Doing this with older
versions is not advisable, as explained in L<the SQLite
FAQ|http://www.sqlite.org/faq.html#q6>. But even with 3.3.1 and later there is
an annoying restriction: A connection can only be passed to another thread when
any outstanding statements are closed and finalized. In practice this means
that it is not possible to keep a prepared statement in memory for later
executions.
Since SQLite 3.5.0, released in 2007, a single SQLite connection can be used
from multiple threads simultaneously. SQLite will internally manage locks to
avoid any data corruption. I can't recommend making use of this facility,
however, as there are still many issues with the API. The L<error fetching
functions|http://www.sqlite.org/c3ref/errcode.html> and
L<sqlite3_last_insert_row_id()|http://www.sqlite.org/c3ref/last_insert_rowid.html>,
among others, are still useless without explicit locking in the application. I
also believe that the previously mentioned restriction on having to finalize
statements has been relaxed in this version, so keeping prepared statements in
memory and passing them among different threads becomes possible.
When using multiple database connections within a single process, SQLite offers
a facility to allow L<sharing of its
cache|http://www.sqlite.org/sharedcache.html>, in order to reduce memory usage
and disk I/O. The semantics of this feature have changed with different SQLite
versions and appear to have stabilised in 3.5.0. This feature may prove useful
to optimize certain situations, but does not open up new possibilities of
communicating with a shared database.
=head1 Criteria
Before looking at some available solutions, let's first determine the criteria
we can use to evaluate them.
=over
=item Implementation size
Obviously, a solution that requires only a few lines of code to implement is
preferable over one that requires several levels of abstraction in order to be
usable. I won't be giving actual implementations here, so the sizes will be
rough estimates for comparison purposes. The actual size of an implementation
is of course heavily dependent on the programming environment as well.
=item Memory/CPU overhead
The most efficient solution for a single-threaded application is to simply have
direct access to a single database connection. Every solution is in principle a
modification or extension of this idea, and will therefore add a certain
overhead. This overhead manifests itself in both increased CPU and memory
usage, the magnitude of which varies between solutions.
=item Prepared statement re-use
Is it possible to prepare a statement once and keep using it for the lifetime
of the program? Or will prepared statements have to be thrown away and
recreated every time? Keeping statement handles in memory will result in a nice
performance boost for applications that run the same SQL statement many times.
=item Transaction grouping
A somewhat similar issue to prepared statement re-use: From a performance point
of view, it is very important to try to batch many UPDATE/DELETE/INSERT
statements within a single transaction, as opposed to running each modify query
separately. Running each query separately will force SQLite to flush the data
to disk separately every time, whereas a single transaction will batch-flush
all the changes to disk in a single go. Some solutions allow for grouping
multiple statements in a single transaction quite easily, while others require
more involved steps.
=item Background processing
In certain situations it may be desirable to queue a certain query for later
processing, without explicitly waiting for it to complete. For example, if
something in the database has to be modified as a result of user interaction in
a UI thread, then the application would feel a lot more responsive if the
UPDATE query was simply queued to be processed in a background thread than when
the query had run in the UI thread itself. A database accessing solution with
built-in support for background processing of queries will significantly help
with building a responsive application.
=item Concurrency
Concurrency indicates how well the solution allows for concurrent access. The
worst possible concurrency is achieved when a single database connection is
used for all threads, as only a single action can be performed on the database
at any point in time. Maximum concurrency is achieved when each thread has its
own SQLite connection. Note that maximum concurrency doesn't mean that the
database can be accessed in a I<fully> concurrent manner. SQLite uses internal
database-level locks to avoid data corruption, and these will limit the actual
maximum concurrency. I am not too knowledgeable about the inner workings of
these locks, but it is at least possible to have a large number of truly
concurrent database I<reads>. Database I<writes> from multiple threads may
still allow for significantly more concurrency than when they are manually
serialized over a single database connection.
=item Portability
What is the minimum SQLite version required to implement the solution? Does it
require any special OS features or SQLite compilation settings? As outlined
above, different versions of SQLite offer different features with regards to
threading. Relying on one of the relatively new features will decrease
portability.
=back
=head1 The Solutions
Here I present four solutions to allow database access from multiple threads.
Note that this list may not be exhaustive, these are just a few solutions that
I am aware of. Also note that none of the solutions presented here are in any
way new. Most of these paradigms date back to the beginnings of concurrent
programming, and have been applied in software for decades.
=head2 Connection sharing
By far the simplest solution to implement: Keep a single database connection
throughout your program and allow every thread to access it. Of course, you
will need to be careful to always put locks around the code where you access
the database handler. An example implementation could look like the following:
  // The global SQLite connection
  sqlite3 *db;

  int main(int argc, char **argv) {
    if(sqlite3_open("database.sqlite3", &db))
      exit(1);
    // start some threads
    // wait until the threads are finished
    sqlite3_close(db);
    return 0;
  }

  void *some_thread(void *arg) {
    sqlite3_mutex_enter(sqlite3_db_mutex(db));
    // Perform some queries on the database
    sqlite3_mutex_leave(sqlite3_db_mutex(db));
    return NULL;
  }
=over
=item Implementation size
This is where connection sharing shines: There is little extra code required
when compared to using a database connection in a single-threaded context. All
you need to be careful of is to lock the mutex before using the database, and
to unlock it again afterwards.
=item Memory/CPU overhead
As the locks are the only addition to the single-threaded case, this solution
has practically no memory overhead. The mutexes are provided by SQLite,
after all. CPU overhead is also as minimal as it can be: mutexes are the most
primitive type provided by threading libraries to serialize access to a shared
resource, and are therefore very efficient.
=item Prepared statement re-use
Prepared statements can be safely re-used inside a single enter/leave block.
However, if you want to remain portable with SQLite versions before 3.5.0, then
any prepared statements B<must> be freed before the mutex is unlocked. This can
be a major downside if the enter/leave blocks themselves are relatively short
but accessed quite often. If portability with older versions is not an issue,
then this restriction is gone and prepared statements can be re-used easily. A
short sketch of this restriction follows after this list.
=item Transaction grouping
A reliable implementation will not allow transactions to span multiple
enter/leave blocks. So as with prepared statements, transactions need to be
committed to disk before the mutex is unlocked. Again shared with prepared
statement re-use is that this limitation may prove to be a significant problem
in optimizing application performance, disk I/O in particular. One way to lower
the effects of this limitation is to increase the size of a single enter/leave
block, thus allowing for more work to be done in a single transaction. Code
restructuring may be required in order to efficiently implement this. Another
way to get around this problem is to allow a transaction to span multiple
enter/leave blocks. Implementing this reliably may not be an easy task,
however, and will most likely require application-specific knowledge.
=item Background processing
Background processing is not natively supported with connection sharing. It is
possible to spawn a background thread to perform database operations each time
that this is desirable. But care should be taken to make sure that these
background threads will execute dependent queries in the correct order. For
example, if thread A spawns a background thread, say B, to execute an UPDATE
query, and later thread A wants to read that same data back, it must first wait
for thread B to finish execution. This may add more inter-thread communication
than is preferable.
=item Concurrency
There is no concurrency at all here. Since the database connection is protected
by an exclusive lock, only a single thread can operate on the database at any
point in time. Additionally, one may be tempted to increase the size of an
enter/leave block in order to allow for larger transactions or better re-use of
prepared statements. However, any time spent on performing operations that do
not directly use the database within such an enter/leave block will lower the
maximum possible database concurrency even further.
=item Portability
Connection sharing requires at least SQLite 3.3.1 in order to pass the same
database connection around. SQLite must be compiled with threading support
enabled. If prepared statements are kept around outside of an enter/leave
block, then version 3.5.0 or higher will be required.
=back
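As a hedged illustration of the prepared statement restriction mentioned in the
list above (table and column names are made up), an enter/leave block that
stays portable with pre-3.5.0 versions would finalize its statement before
unlocking:

  sqlite3_mutex_enter(sqlite3_db_mutex(db));
  sqlite3_stmt *st;
  sqlite3_prepare_v2(db, "SELECT name FROM users WHERE id = ?", -1, &st, NULL);
  sqlite3_bind_int(st, 1, 42);
  while(sqlite3_step(st) == SQLITE_ROW) {
    // process sqlite3_column_text(st, 0)
  }
  // With SQLite < 3.5.0, finalize before the mutex is released so the
  // connection can safely move to another thread.
  sqlite3_finalize(st);
  sqlite3_mutex_leave(sqlite3_db_mutex(db));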
=head2 Message passing
An alternative approach is to allow only a single thread to access the
database. Any other thread that wants to access the database in any way will
then have to communicate with this database thread. This communication is done
by sending messages (I<requests>) to the database thread, and, when query
results are required, receiving back one or more I<response> messages.
Message passing schemes and libraries are available for many programming
languages and come in many different forms. For this article, I am going to
assume that an asynchronous and unbounded FIFO queue is used to pass around
messages, but most of the following discussion will apply to bounded queues as
well. I'll try to note the important differences between the two where
applicable.
A very simple and naive implementation of a message passing solution is given
below. Here I assume that C<queue_create()> will create a message queue (type
C<message_queue>), C<queue_get()> will return the next message in the queue, or
block if the queue is empty. C<thread_create(func, arg)> will run I<func> in a
newly created thread and pass I<arg> as its argument. Error handling has been
omitted to keep this example concise.
  void *db_thread(void *arg) {
    message_queue *q = arg;
    sqlite3 *db;
    if(sqlite3_open("database.sqlite3", &db))
      return ERROR;
    request_msg *m;
    while((m = queue_get(q))) {
      if(m->action == QUIT)
        break;
      if(m->action == EXEC)
        sqlite3_exec(db, m->query, NULL, NULL, NULL);
    }
    sqlite3_close(db);
    return OK;
  }

  int main(int argc, char **argv) {
    message_queue *db_queue = queue_create();
    thread_create(db_thread, db_queue);
    // Do work.
    return 0;
  }
This example implementation has a single database thread running in the
background that accepts the messages C<QUIT>, to stop processing queries and
close the database, and C<EXEC>, to run a certain query on the database. No
support is available yet for passing query results back to the thread that sent
the message. This can be implemented by including a separate C<message_queue>
object in the request messages, to which the results can be sent.
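As a hedged sketch, such a request message could carry the reply queue itself.
The field names are illustrative, building on the hypothetical
C<message_queue> primitives from the example above:

  // Sketch: a request message with an optional reply queue.
  typedef struct {
    int action;            // QUIT or EXEC
    const char *query;     // SQL text for EXEC requests
    message_queue *reply;  // if non-NULL, result messages are sent here
  } request_msg;

The database thread pushes its results onto C<< m->reply >> after executing
the query, and the requesting thread blocks on that queue until the response
arrives.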
=over
=item Implementation size
This will largely depend on the used programming environment and the complexity
of the database thread. If your environment already comes with a message queue
implementation, and constructing the request/response messages is relatively
simple, then a simple implementation as shown above will not require much code.
On the other hand, if you have to implement your own message queue or want more
intelligence in the database thread to improve efficiency, then the complete
implementation may be significantly larger than that of connection sharing.
=item Memory/CPU overhead
Constructing and passing around messages will incur a CPU overhead, though with
an efficient implementation this should not be significant enough to worry
about. Memory usage is highly dependent on the size of the messages being
passed around and the length of the queue. If messages are queued faster than
they are processed and there is no bound on the queue length, then a process
may quickly run out of memory. On the other hand, if messages are processed
fast enough then the queue will generally not have more than a single message
in it, and the memory overhead will remain fairly small.
=item Prepared statement re-use
As the database connection will never leave the database thread, prepared
statements can be kept in memory and re-used without problems.
=item Transaction grouping
A naive but robust implementation will handle each message in its own
transaction. A more clever database thread, however, could wait for multiple
messages to be queued and can then batch-execute them in a single transaction.
Correctly implementing this may require some additional information to be
specified along with the request, such as whether the query may be combined in
a single transaction or whether it may only be executed outside of a
transaction. Some threads may want to have confirmation that the data has been
successfully written to disk, in which case responsiveness will not improve if
such actions are queued for later processing. Nonetheless, since the database
thread has all the knowledge about the state of the database and any
outstanding actions, transaction grouping can be implemented quite reliably. A
rough sketch of this idea follows after this list.
=item Background processing
Background processing is supported natively with a message passing
implementation: a thread that isn't interested in query results can simply
queue the action to be performed by the database thread without indicating a
return path for the results. Of course, if a thread queues many messages that
do not require results followed by one that does, it will have to wait for all
earlier messages to be processed before receiving any results for the last one.
In the case that the actions are not dependent on each other, the database
thread may re-order the messages in order to process the last request first.
This requires knowledge about dependencies and may significantly complicate the
implementation, however.
=item Concurrency
As with a shared database connection, database access is exclusive: Only a
single action can be performed on the database at a time. Unlike connection
sharing, however, any processing within the application will not further
degrade the maximum attainable concurrency. As long as unbounded asynchronous
queues are used to pass around messages, the database thread will be able to
continue working on the database without waiting for another thread to process
the results.
=item Portability
This is where message passing shines: SQLite is only used within the database
thread, no other thread will have a need to call any SQLite function. This
allows any version of SQLite to be used, even those that have not been compiled
with thread safety enabled.
=back
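As a rough sketch of the transaction grouping idea from the list above, the
database thread could drain its queue and batch consecutive C<EXEC> messages
into a single transaction. C<queue_try_get()> is a hypothetical non-blocking
variant of C<queue_get()>, and error handling is again omitted:

  // Sketch: batch all queued EXEC messages into one transaction.
  request_msg *m;
  while((m = queue_get(q)) && m->action != QUIT) {
    sqlite3_exec(db, "BEGIN", NULL, NULL, NULL);
    do {
      sqlite3_exec(db, m->query, NULL, NULL, NULL);
    } while((m = queue_try_get(q)) && m->action == EXEC);
    sqlite3_exec(db, "COMMIT", NULL, NULL, NULL);
    if(m && m->action == QUIT)
      break;
  }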
=head2 Thread-local connections
A rather different approach to giving each thread access to a single database
is to simply open a new database connection for each thread. This way each
connection will be local to the specific thread, which in turn has the power to
do with it as it likes without worrying about what the other threads do. The
following is a short example to illustrate the idea:
  void *some_thread(void *arg) {
    sqlite3 *db;
    if(sqlite3_open("database.sqlite3", &db))
      return ERROR;
    // Do some work on the database
    sqlite3_close(db);
    return OK;
  }

  int main(int argc, char **argv) {
    int i;
    for(i=0; i<10; i++)
      thread_create(some_thread, NULL);
    // Wait until the threads are done
    return 0;
  }
=over
=item Implementation size
Giving each thread its own connection is practically not much different from
the single-threaded case where there is only a single database connection. And
as the example shows, this can be implemented quite trivially.
=item Memory/CPU overhead
If we assume that threads are not created very often and each thread has a
relatively long life, then the CPU and I/O overhead caused by opening a new
connection for each thread will not be very significant. On the other hand, if
threads are created quite often and lead a relatively short life before they
are destroyed again, then opening a new connection each time will soon require
more resources than running the queries themselves.
There is a significant memory overhead: every new database connection requires
memory. If each connection also has a separate cache, then every thread will
quickly require several megabytes just to interact with the database. Since
version 3.5.0, SQLite allows this cache to be shared among connections, which
reduces the memory overhead.
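Shared-cache mode can be enabled globally before the connections are opened.
A minimal sketch, using SQLite's C<sqlite3_enable_shared_cache()> (see the
SQLite documentation for the caveats of shared-cache mode):

    sqlite3_enable_shared_cache(1); // Affects all subsequently opened connections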
=item Prepared statement re-use
Prepared statements can be re-used without limitations within a single thread.
This will allow full re-use of prepared statements if each thread has a
different task, in which case every thread will have different queries and
access patterns anyway. But when every thread runs the same code, and thus also
the same queries, it will still need its own copy of the prepared statement.
Prepared statements are specific to a single database connection, so they can't
be passed around between the threads. The same argument as for the CPU
overhead applies here: as long as threads are long-lived, this will not be a
very large problem.
=item Transaction grouping
Each thread has full access to its own database connection, so it can easily
batch many queries in a single transaction. It is not possible, however, to
group queries from the other threads in this same transaction as well. The
grouping may therefore not be as optimal as a message passing solution could
provide, but it is still a large improvement compared to connection sharing.
=item Background processing
Background processing is not easily supported. While it is possible to spawn a
separate thread for each query that needs to be processed in the background, a
new database connection will have to be opened every time this is done, which
is obviously not very efficient.
=item Concurrency
In general, it is not possible to get better concurrency than by providing each
thread with its own database connection. This solution definitely wins in this
area.
=item Portability
Thread-local connections are very portable: the only requirement is that SQLite
has been built with threading support enabled. Connections are not passed
around between threads, so any SQLite version will do. In order to make use of
the shared cache feature, however, SQLite 3.5.0 is required.
=back
=head2 Connection pooling
A common approach in server-like applications is to have a connection pool.
When a thread wishes to have access to the database, it requests a database
connection from a pool of (currently) unused database connections. If no unused
connections are available, it can either wait until one becomes available, or
create a new database connection on its own. When a thread is done with a
connection, it will add it back to the pool to allow it to be re-used in
another thread.
The following example illustrates a basic connection pool implementation in
which a thread creates a new database connection when no connections are
available. A global C<db_pool> is defined, on which any thread can call
C<pool_pop()> to get an SQLite connection if one is available, and
C<pool_push()> to push a connection back to the pool. The pool can be
implemented as any kind of set: a FIFO or a stack will do the trick, as long as
it can be accessed from multiple threads concurrently; a minimal sketch of such
a pool follows the example.
    // Some global pool of database connections
    pool_t *db_pool;
    sqlite3 *get_database() {
        sqlite3 *db = pool_pop(db_pool);
        if(db)
            return db;
        if(sqlite3_open("database.sqlite3", &db))
            return NULL;
        return db;
    }
    void *some_thread(void *arg) {
        // Do some work
        sqlite3 *db = get_database();
        if(!db)
            return NULL; // Handle the error
        // Do some work on the database
        pool_push(db_pool, db);
        return NULL;
    }
    int main(int argc, char **argv) {
        int i;
        for(i=0; i<10; i++)
            thread_create(some_thread, NULL);
        // Wait until the threads are done
        return 0;
    }
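The C<pool_t> above can be implemented in many ways. The following is a
minimal sketch of one option: a stack protected by a pthread mutex. The
C<POOL_MAX> bound and the field names are made up for this example, and the
C<lock> must be initialized with C<pthread_mutex_init()> before use. When the
pool is full, C<pool_push()> simply closes the connection instead of keeping
it around.

    #define POOL_MAX 16
    typedef struct {
        pthread_mutex_t lock;
        sqlite3 *conn[POOL_MAX];
        int len;
    } pool_t;

    sqlite3 *pool_pop(pool_t *p) {
        sqlite3 *db = NULL;
        pthread_mutex_lock(&p->lock);
        if(p->len > 0)
            db = p->conn[--p->len];
        pthread_mutex_unlock(&p->lock);
        return db; // NULL if the pool was empty
    }

    void pool_push(pool_t *p, sqlite3 *db) {
        pthread_mutex_lock(&p->lock);
        if(p->len < POOL_MAX) {
            p->conn[p->len++] = db;
            db = NULL;
        }
        pthread_mutex_unlock(&p->lock);
        if(db) // No room left in the pool
            sqlite3_close(db);
    }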
=over
=item Implementation size
A connection pool is in essence not very different from thread-local
connections. The only major difference is that the call to sqlite3_open() is
replaced with a function call to obtain a connection from the pool and
sqlite3_close() with one to give it back to the pool. As shown above, these
functions can be fairly simple. Note, however, that unlike with thread-local
connections it is advisable to "open" and "close" a connection more often in
long-running threads, in order to give other threads a chance to use the
connection as well.
=item Memory/CPU overhead
This mainly depends on the number of connections you allow to be in memory at
any point in time. If this number is not bounded, as in the above example, then
you can assume that after running your program for a certain time, there will
always be enough unused connections available in the pool. Requesting a
connection will then be very fast, since the overhead of creating a new
connection, as would have been done with thread-local connections, is
completely gone.
In terms of memory usage, however, it would be more efficient to put a maximum
limit on the number of open connections, and have the thread wait until another
thread gives a connection back to the pool. Similarly to thread-local
connections, memory usage can be decreased by using SQLite's cache sharing
feature.
=item Prepared statement re-use
Unfortunately, this is where connection pooling borrows from connection
sharing. Prepared statements must be cleaned up before passing a connection to
another thread if one aims to be portable. But even if you remove that
portability requirement, prepared statements are always specific to a single
connection. Since you can't assume that you will always get the same connection
from the pool, caching prepared statements is not practical.
On the other hand, a connection pool does allow you to use a single connection
for a longer period of time than with connection sharing without negatively
affecting concurrency. Unless, of course, there is a limit on the number of
open connections, in which case using a connection for a long period of time
may starve another thread.
=item Transaction grouping
Pretty much the same arguments as for re-using prepared statements apply to
transaction grouping: transactions should be committed to disk before passing
a connection back to the pool.
=item Background processing
This is also where a connection pool shares a lot of similarity with connection
sharing. With thread-local connections, spawning a worker thread to perform
database operations in the background would be very inefficient, but since a
connection pool allows connections to be re-used, this is not a problem here.
The same warning with regard to dependent queries still applies, though.
=item Concurrency
Connection pooling gives you fine-grained control over how much concurrency
you'd like to have. For maximum concurrency, don't put a limit on the number
of open database connections. If there is a limit, it will decrease the
maximum attainable concurrency in favor of lower memory usage.
=item Portability
Since database connections are being passed among threads, connection pooling
will require at least SQLite 3.3.1 compiled with thread safety enabled. Making
use of its cache sharing capabilities to reduce memory usage will require
SQLite 3.5.0 or higher.
=back
=head1 Final notes
As for what I used for ncdc: I initially chose connection sharing for its
simplicity. Then, when I noticed that the UI became less responsive than I
found acceptable, I added a simple queue for background processing of
queries. Later I stumbled upon the main problem with that solution: I wanted to
read back a value that was written in a background thread, and had no way of
knowing whether the background thread had finished executing that query or not.
I then decided to expand the background thread to allow for passing back query
results, and transformed everything into a full message passing solution. This
appears to be working well at the moment, and my current implementation has
support for both prepared statement re-use and transaction grouping, which
measurably increased performance.
To summarize, there isn't really a I<best> solution that works for every
application. Connection sharing works well for applications where
responsiveness and concurrency aren't of major importance. Message passing works
well for applications that aim to be responsive, and is flexible enough for
optimizing CPU and I/O by re-using prepared statements and grouping queries in
larger transactions. Thread-local connections are suitable for applications
that have a relatively fixed number of threads, whereas connection pooling
works better for applications with a varying number of worker threads.
=cut

View file

@ -1,64 +0,0 @@
=pod
TUWF is a very small and lightweight web development framework for Perl. It has
evolved from being a few abstraction layers in two large websites to a separate
set of modules. While initially designed to be used for large and complex
websites, it is also perfectly suited for small single-file websites.
=head2 Main features
=over
=item * Very small, and no extra modules required for the base functionality,
=item * Easy built-in routing,
=item * Handy form validation functions,
=item * Easy XML/HTML generation,
=item * Response buffering and output compression,
=item * Easy access to request data,
=item * Support for CGI, FastCGI and a built-in web server,
=item * Uses UTF-8 for all text,
=item * Convenient SQL execution functions and correct transaction handling,
=item * Open source (duh!) and available under a liberal MIT license.
=back
Read the L<description|TUWF/DESCRIPTION> in the documentation for more
information and details.
=head2 Download
B<Latest packaged version:> 1.2 ([dllink TUWF-1.2.tar.gz download]
- L<CPAN mirror|https://metacpan.org/release/TUWF>)
TUWF is also available on a git repository at L<https://code.blicky.net/yorhel/tuwf>.
=head2 Websites using TUWF
(Not a whole lot)
=over
=item * L<VNDB.org|https://vndb.org/> (the site that spawned TUWF - L<open source|https://g.blicky.net/vndb.git/>)
=item * L<Manned.org|https://manned.org/> (L<open source|https://g.blicky.net/manned.git/>)
=item * L<This website|https://dev.yorhel.nl/> (L<open source|https://g.blicky.net/yorhel-dev.git/tree/index.cgi>)
=item * L<Blicky.net Pastebin|https://p.blicky.net/> (L<open source|https://g.blicky.net/bpaste.git/tree/index.cgi>)
=item * The website embedded in the L<D&R Axum|http://www.d-r.nl/axum.html> mixing console.
=item * L<333networks|http://333networks.com/>
=back

View file

@ -1,57 +0,0 @@
1.2 - 2018-02-18
- Add tuwf() exported function as alias to $self or $TUWF::OBJ
- Add TUWF::get/put/post/etc() as better alternative to TUWF::register()
- Add TUWF::hook() as better alternative to (pre|post)_request_handler
- Add capture() to access route captures
- Add standalone HTTP dev server (requires HTTP::Server::Simple)
- Add pass() and done() methods to prematurely abort the current handler
- Add 'import_modules' setting
- TUWF::Request: Add reqJSON()
- TUWF::Request: Disallow control characters in HTTP request data
- TUWF::Response: Add resJSON()
- TUWF::Response: Add resBinary()
- TUWF::Response: Add resFile() + mime_types/mime_default settings
- TUWF::Response: Allow setting headers before resRedirect()
- TUWF::Response: resRedirect() now sets a relative 'Location' HTTP header
- TUWF::DB: Add DB query logging and profiling to non-TUWF database functions
- TUWF::DB: Add dbVal()
- TUWF::XML: Add functional-style DSL
- TUWF::XML: Add HTML5 support
- TUWF::XML: Add different naming convention support
- TUWF::XML: Add 'mkclass' utility function
- Improved error pages
- Various documentation improvements
1.1 - 2017-11-26
- Disallow exclamation mark in email address validation
- Add reqProtocol() method
- Add reqFCGI() method
- Remove 'X-Powered-By' header
- Fix handling of space character in load_recursive()
1.0 - 2015-09-17
- !! Some backwards-incompatible changes, marked * !!
- kv_validate() improvements:
- Fix maxcount option
- Fix non-array argument to 'func'
- Added some default templates: num, int, uint, ascii, email, weburl
- * Removed 'min' and 'max' options, these now require the num template
- Add 'inherit' option for template definitions
- Allow templates to provide default values for 'required', 'default',
'rmwhitespace', 'multi', 'mincount' and 'maxcount'
- Add tests
- * reqPath() now includes the leading slash
- * reqGet(), reqPost(), reqParam(), reqUploadMIME() and reqUploadRaw()
now only work in scalar context.
- * Add plural versions of the above methods (reqGets() etc) that only
work in list context.
- Add reqQuery()
- Fix warning with Perl 5.22
0.2 - 2012-01-19
- Fixed bug with in-place utf8_decode() in recent Perls
- Lowered minimum Perl version to 5.8.0
0.1 - 2011-02-07
Initial version

40
dat/tuwf.md Normal file
View file

@ -0,0 +1,40 @@
% The Ultimate Website Framework
TUWF is a very small and lightweight web development framework for Perl. It has
evolved from being a few abstraction layers in two large websites to a separate
set of modules. While initially designed to be used for large and complex
websites, it is also perfectly suited for small single-file websites.
## Main features
- Very small, and no extra modules required for the base functionality,
- Easy built-in routing,
- Handy form validation functions,
- Easy XML/HTML generation,
- Response buffering and output compression,
- Easy access to request data,
- Support for CGI, FastCGI and a built-in web server,
- Uses UTF-8 for all text,
- Convenient SQL execution functions and correct transaction handling,
- Open source (duh!) and available under a liberal MIT license.
Read the [description](/tuwf/man#description) in the documentation for more
information and details.
## Download
**Latest packaged version:** 1.2 ([dllink TUWF-1.2.tar.gz]
\- [CPAN mirror](https://metacpan.org/release/TUWF))
TUWF is also available on a git repository at
[https://code.blicky.net/yorhel/tuwf](https://code.blicky.net/yorhel/tuwf).
## Websites using TUWF
(Not a whole lot)
- [VNDB.org](https://vndb.org/) (the site that spawned TUWF - [open source](https://g.blicky.net/vndb.git/))
- [Manned.org](https://manned.org/) ([open source](https://g.blicky.net/manned.git/))
- [Blicky.net Pastebin](https://p.blicky.net/) ([open source](https://g.blicky.net/bpaste.git/tree/index.cgi))
- The website embedded in the [D&R Axum](http://www.d-r.nl/axum.html) mixing console.
- [333networks](http://333networks.com/)

View file

@ -1 +0,0 @@
../../ylib/

208
dat/yxml
View file

@ -1,208 +0,0 @@
=pod
I<*But see the L<Bugs and Limitations|/Bugs and Limitations> and L<Conformance Issues|/Conformance Issues> below.>
Yxml is a small (C<6 KiB>) L<non-validating|/Validating vs. non-validating> yet
mostly conforming XML parser written in C. Its primary goals are small binary
size, simplicity and correctness. It also happens to be L<pretty
fast|/Comparison>.
The code can be obtained from the L<git repo|http://g.blicky.net/yxml.git> and
is available under a permissive MIT license. The only two files you need are
L<yxml.c|http://g.blicky.net/yxml.git/plain/yxml.c> and
L<yxml.h|http://g.blicky.net/yxml.git/plain/yxml.h>, which can easily be
included and compiled as part of your project. Complete API documentation is
available in L<the manual|https://dev.yorhel.nl/yxml/man>.
The API follows a simple and mostly buffer-less design, and only consists of
three functions:
void yxml_init(yxml_t *x, void *buf, size_t bufsize);
yxml_ret_t yxml_parse(yxml_t *x, int ch);
yxml_ret_t yxml_eof(yxml_t *x);
Be aware that I<simple> is not necessarily I<easy> or I<convenient>. The API is
relatively low-level and designed to integrate into pretty much any application
and for any use case. This includes incrementally parsing data from a socket in
an event-driven fashion and parsing large XML files on memory-restricted
devices. It is possible to implement a more convenient and high-level API on
top of yxml, but I'm not very fond of libraries that do more than what I
strictly need.
There are no tarball releases available at the moment. The API is relatively
stable, but I won't currently promise any ABI stability. Dynamic linking
against yxml is therefore not a very good idea.
=head3 Features
=over
=item * Simple and low-level API.
=item * Does not require C<malloc()>.
=item * Pure C, should be very portable.
=item * Recognizes and consumes the UTF-8 BOM.
=item * Parses entity references (C<&amp;>) and character references (C<&#x26;>).
=item * Verifies most well-formedness constraints, including the correct
nesting of elements.
=item * Parses XML documents in any ASCII-compatible encoding.
=back
But let's not be I<too> optimistic, because there are also...
=head3 Bugs and Limitations
=over
=item * A conditional section in a C<< <!DOCTYPE ..> >> declaration will result
in a parse error.
=item * Allows multiple C<< <!DOCTYPE ..> >> declarations.
=item * Information encoded in the XML and doctype declarations is currently
not available through the API.
=back
I hope to have these issues fixed in the near future.
=head3 Conformance Issues
=over
=item * Does not verify that non-ASCII characters in element names, element
content, attribute names and attribute values are within the allowed Unicode
character ranges.
=item * Does not verify that attribute names within the same element are unique.
=item * Does not verify that the contents of a C<< <!DOCTYPE ..> >> declaration
follow the XML grammar.
=item * Can't parse documents in a non-ASCII-compatible encoding. You'll have
to convert them to UTF-8 or something similar first.
=item * No support for custom entity references, neither through the API nor
using C<< <!ENTITY> >>.
=back
These conformance issues are the result of the byte-oriented and minimal design
of yxml, and I do not intend to fix these directly within the library. The
intention is to make sure that all of the above mentioned issues can be fixed
on top of yxml (by the application, or by a wrapper) if strict conformance is
required, but the required functionality to support custom entity references
and DTD handling has not been implemented yet.
=head3 Non-features
And now follows a list of things that are not part of the core XML
specification and are not directly supported. As with the conformance issues,
these features can be implemented on top of yxml.
=over
=item * No helper functions to deal with namespaces. Yxml will parse XML files
with namespaces just fine, but it's up to the application to do the rest.
=item * No DTD or XML Schema validation.
=item * No XSLT.
=item * No XPath.
=item * Doesn't do your household chores.
=back
=head2 Comparison
The following benchmark compares L<expat|http://expat.sourceforge.net/>,
L<libxml2|http://xmlsoft.org/> and
L<Mini-XML|http://www.msweet.org/projects.php?Z3> with yxml. A L<strlen(3)>
implementation is also included as an indication of the "theoretical" minimum.
                                        SIZE        PERFORMANCE
    LIB      VER    LICENSE           OBJ   STATIC    WIKI  DISCOGS
    strlen                                  25 816    0.16     0.09
    expat    2.1.0  MIT           162 139  194 432    1.47     1.09
    libxml2  2.9.1  MIT           464 328  518 816    2.53     1.75
    mxml     2.7    LGPL2+static   32 733   75 832   12.38     7.80
    yxml     git    MIT             5 971   31 416    1.15     0.74
The code for these benchmarks is available in the
L<bench/|http://g.blicky.net/yxml.git/tree/bench> directory on git. Some
explanatory notes:
=over
=item * C<OBJ> is the total size of all object code of the library, measured
with L<size(1)>.
=item * C<STATIC> is the file size of a minimal statically linked binary when
linked against L<musl|http://www.musl-libc.org/> 0.9.13, measured with
L<wc(1)> after running L<strip(1)>.
=item * The performance is the time, in seconds, to load a large XML file.
C<WIKI> refers to C<enwiki-20130805-abstract5.xml> (162 MiB) from a L<Wikipedia
Dump|http://dumps.wikimedia.org/enwiki/>, C<DISCOGS> refers to
C<discogs_20130801_labels.xml> (94 MiB) from a L<Discogs Data
Dump|http://www.discogs.com/data/>.
=item * Libxml2 has been compiled with most of its features disabled with
C<./configure>, but it still manages to be the very definition of bloat.
=item * Everything has been compiled with gcc 4.8.1 at C<-O2>.
=item * Benchmarks are run on Linux 3.10.7 with a 3 GHz Intel Core 2 Duo
E8400 and 4 GB of RAM.
=back
And just for fun, here's the same comparison when compiled with C<-Os>, i.e.
optimized for small size. Interestingly enough, Mini-XML actually runs faster
with C<-Os> than with C<-O2>.
                                        SIZE        PERFORMANCE
    LIB      VER    LICENSE           OBJ   STATIC    WIKI  DISCOGS
    strlen                                  25 816    0.16     0.09
    expat    2.1.0  MIT           113 314  145 632    1.58     1.20
    libxml2  2.9.1  MIT           356 948  412 256    3.01     2.08
    mxml     2.7    LGPL2+static   27 725   71 704   11.70     7.44
    yxml     git    MIT             4 955   30 392    1.67     1.02
=head2 Validating vs. non-validating
TL;DR: yxml does I<not> accept garbage XML documents; it will correctly handle
and report issues if the input does not strictly follow the XML grammar.
The terms I<validating> and I<non-validating> have specific meanings within the
context of XML. A validating parser is one that reads the doctype declaration
(DTD) associated with a document, and validates that the contents of the
document follow the rules described in the DTD. A DTD may also include
instructions on how to parse the document, including the definition of custom
entity references (C<&whatever;>) and instructions on how attribute values or
element contents should be normalized before passing its data to the
application.
A non-validating parser is one that ignores the DTD and happily parses
documents that do not follow the rules described in that DTD. They (usually)
don't support entity references and will not normalize attribute values or
element contents. A non-validating parser still has to verify that the XML
document follows the XML syntax rules.
It should be noted that a lot of XML documents found in the wild are not
described with a DTD, but instead use an alternative technology such as XML
schema. Wikipedia L<has more
information|https://en.wikipedia.org/wiki/XML#Schemas_and_validation> on this.
Using a validating parser for such documents would only add bloat and may
introduce L<potential security
vulnerabilities|https://en.wikipedia.org/wiki/Billion_laughs>.

View file

@ -1 +0,0 @@
../../yxml/yxml.pod

169
dat/yxml.md Normal file
View file

@ -0,0 +1,169 @@
% Yxml - A small, fast and correct\* XML parser
_\*But see the [Bugs and Limitations](#bugs-and-limitations) and [Conformance Issues](#conformance-issues) below._
Yxml is a small (`6 KiB`) [non-validating](#validating-vs.-non-validating) yet
mostly conforming XML parser written in C. Its primary goals are small binary
size, simplicity and correctness. It also happens to be [pretty
fast](#comparison).
The code can be obtained from the [git repo](https://g.blicky.net/yxml.git) and
is available under a permissive MIT license. The only two files you need are
[yxml.c](https://g.blicky.net/yxml.git/plain/yxml.c) and
[yxml.h](https://g.blicky.net/yxml.git/plain/yxml.h), which can easily be
included and compiled as part of your project. Complete API documentation is
available in [the manual](/yxml/man).
The API follows a simple and mostly buffer-less design, and only consists of
three functions:
```c
void yxml_init(yxml_t *x, void *buf, size_t bufsize);
yxml_ret_t yxml_parse(yxml_t *x, int ch);
yxml_ret_t yxml_eof(yxml_t *x);
```
Be aware that _simple_ is not necessarily _easy_ or _convenient_. The API is
relatively low-level and designed to integrate into pretty much any application
and for any use case. This includes incrementally parsing data from a socket in
an event-driven fashion and parsing large XML files on memory-restricted
devices. It is possible to implement a more convenient and high-level API on
top of yxml, but I'm not very fond of libraries that do more than what I
strictly need.
There are no tarball releases available at the moment. The API is relatively
stable, but I won't currently promise any ABI stability. Dynamic linking
against yxml is therefore not a very good idea.
### Features
- Simple and low-level API.
- Does not require `malloc()`.
- Pure C, should be very portable.
- Recognizes and consumes the UTF-8 BOM.
- Parses entity references (`&amp;`) and character references (`&#x26;`).
- Verifies most well-formedness constraints, including the correct nesting of
elements.
- Parses XML documents in any ASCII-compatible encoding.
But let's not be _too_ optimistic, because there are also...
### Bugs and Limitations
- A conditional section in a `<!DOCTYPE ..>` declaration will result in a parse
error.
- Allows multiple `<!DOCTYPE ..>` declarations.
- Information encoded in the XML and doctype declarations is currently not
available through the API.
I hope to have these issues fixed in the near future.
### Conformance Issues
- Does not verify that non-ASCII characters in element names, element content,
attribute names and attribute values are within the allowed Unicode character
ranges.
- Does not verify that attribute names within the same element are unique.
- Does not verify that the contents of a `<!DOCTYPE ..>` declaration follow the
XML grammar.
- Can't parse documents in a non-ASCII-compatible encoding. You'll have to
  convert them to UTF-8 or something similar first.
- No support for custom entity references, neither through the API nor using
`<!ENTITY>`.
These conformance issues are the result of the byte-oriented and minimal design
of yxml, and I do not intend to fix these directly within the library. The
intention is to make sure that all of the above mentioned issues can be fixed
on top of yxml (by the application, or by a wrapper) if strict conformance is
required, but the required functionality to support custom entity references
and DTD handling has not been implemented yet.
### Non-features
And now follows a list of things that are not part of the core XML
specification and are not directly supported. As with the conformance issues,
these features can be implemented on top of yxml.
- No helper functions to deal with namespaces. Yxml will parse XML files with
namespaces just fine, but it's up to the application to do the rest.
- No DTD or XML Schema validation.
- No XSLT.
- No XPath.
- Doesn't do your household chores.
## Comparison
The following benchmark compares [expat](http://expat.sourceforge.net/),
[libxml2](http://xmlsoft.org/) and
[Mini-XML](http://www.msweet.org/projects.php?Z3) with yxml. A
[strlen(3)](http://man.he.net/man3/strlen) implementation is also included as
an indication of the "theoretical" minimum.
                                        SIZE        PERFORMANCE
    LIB      VER    LICENSE           OBJ   STATIC    WIKI  DISCOGS
    strlen                                  25 816    0.16     0.09
    expat    2.1.0  MIT           162 139  194 432    1.47     1.09
    libxml2  2.9.1  MIT           464 328  518 816    2.53     1.75
    mxml     2.7    LGPL2+static   32 733   75 832   12.38     7.80
    yxml     git    MIT             5 971   31 416    1.15     0.74
The code for these benchmarks is available in the
[bench/](https://g.blicky.net/yxml.git/tree/bench) directory on git. Some
explanatory notes:
- `OBJ` is the total size of all object code of the library, measured with
[size(1)](https://manned.org/size.1).
- `STATIC` is the file size of a minimal statically linked binary when linked
against [musl](http://www.musl-libc.org/) 0.9.13, measured with
[wc(1)](https://manned.org/wc.1) after running
[strip(1)](https://manned.org/strip.1).
- The performance is the time, in seconds, to load a large XML file. `WIKI`
refers to `enwiki-20130805-abstract5.xml` (162 MiB) from a [Wikipedia
Dump](http://dumps.wikimedia.org/enwiki/), `DISCOGS` refers to
`discogs_20130801_labels.xml` (94 MiB) from a [Discogs Data
Dump](http://www.discogs.com/data/).
- Libxml2 has been compiled with most of its features disabled with
`./configure`, but it still manages to be the very definition of bloat.
- Everything has been compiled with gcc 4.8.1 at `-O2`.
- Benchmarks are run on Linux 3.10.7 with a 3 GHz Intel Core 2 Duo E8400 and
  4 GB of RAM.
And just for fun, here's the same comparison when compiled with `-Os`, i.e.
optimized for small size. Interestingly enough, Mini-XML actually runs faster
with `-Os` than with `-O2`.
                                        SIZE        PERFORMANCE
    LIB      VER    LICENSE           OBJ   STATIC    WIKI  DISCOGS
    strlen                                  25 816    0.16     0.09
    expat    2.1.0  MIT           113 314  145 632    1.58     1.20
    libxml2  2.9.1  MIT           356 948  412 256    3.01     2.08
    mxml     2.7    LGPL2+static   27 725   71 704   11.70     7.44
    yxml     git    MIT             4 955   30 392    1.67     1.02
## Validating vs. non-validating
TL;DR: yxml does _not_ accept garbage XML documents; it will correctly handle
and report issues if the input does not strictly follow the XML grammar.
The terms _validating_ and _non-validating_ have specific meanings within the
context of XML. A validating parser is one that reads the doctype declaration
(DTD) associated with a document, and validates that the contents of the
document follow the rules described in the DTD. A DTD may also include
instructions on how to parse the document, including the definition of custom
entity references (`&whatever;`) and instructions on how attribute values or
element contents should be normalized before passing its data to the
application.
A non-validating parser is one that ignores the DTD and happily parses
documents that do not follow the rules described in that DTD. They (usually)
don't support entity references and will not normalize attribute values or
element contents. A non-validating parser still has to verify that the XML
document follows the XML syntax rules.
It should be noted that a lot of XML documents found in the wild are not
described with a DTD, but instead use an alternative technology such as XML
schema. Wikipedia [has more
information](https://en.wikipedia.org/wiki/XML#Schemas_and_validation) on this.
Using a validating parser for such documents would only add bloat and may
introduce [potential security
vulnerabilities](https://en.wikipedia.org/wiki/Billion_laughs).

428
dat/yxml/man.md Normal file
View file

@ -0,0 +1,428 @@
% Yxml Manual
# Introduction
Yxml is a small non-validating and mostly conforming XML parser written in C.
The latest version of yxml and this document can be found on
[https://dev.yorhel.nl/yxml](https://dev.yorhel.nl/yxml).
# Compiling yxml
Due to the small size of yxml, the recommended way to use it is to copy the
[yxml.c](https://g.blicky.net/yxml.git/plain/yxml.c) and
[yxml.h](https://g.blicky.net/yxml.git/plain/yxml.h) from the git repository
into your project directory, and compile and link yxml.c as part of your
program or library.
The git repository also includes a Makefile. Running `make` without specifying
a target will compile a `.a` file for easy static linking. A test suite is
available under `make test`.
# API documentation
## Overview
Yxml is designed to be very flexible and efficient, and thus offers a
relatively low-level stream-based API. The entire API consists of two typedefs
and three functions:
```c
typedef enum { /* .. */ } yxml_ret_t;
typedef struct { /* .. */ } yxml_t;
void yxml_init(yxml_t *x, void *buf, size_t bufsize);
yxml_ret_t yxml_parse(yxml_t *x, int ch);
yxml_ret_t yxml_eof(yxml_t *x);
```
The values of _yxml\_ret\_t_ and the public fields of _yxml\_t_ are explained
in detail below. Parsing a file using yxml involves three steps:
1. Initialization, using `yxml_init()`.
2. Parsing. This is performed in a loop where `yxml_parse()` is called on each
character of the input file.
3. Finalization, using `yxml_eof()`.
## Initialization
```c
#define BUFSIZE 4096
void *buf = malloc(BUFSIZE);
yxml_t x;
yxml_init(&x, buf, BUFSIZE);
```
The parsing state for an input document is remembered in the `yxml_t`
structure. This structure needs to be allocated and initialized before parsing
a new XML document.
Allocating space for the `yxml_t` structure is the responsibility of the
application. Allocation can be done on the stack, but it is also possible to
embed the struct inside a larger object or to allocate space for the struct
separately.
`yxml_init()` takes a pointer to an (uninitialized) `yxml_t` struct as first
argument and performs the necessary initialization. The two additional
arguments specify a pointer to a buffer and the size of this buffer. The given
buffer must be writable, but does not have to be initialized by the
application.
The buffer is used internally by yxml to keep a stack of opened XML element
names, attribute names and PI targets. The size of the buffer determines both
the maximum depth in which XML elements can be nested and the maximum length of
element names, attribute names and PI targets. Each name consumes
`strlen(name)+1` bytes in the buffer, and the first byte of the buffer is
reserved for the `\0` byte. This means that in order to parse an XML document
with an element name of 100 bytes, an attribute name or PI target of 50 bytes
and a nesting depth of 10 levels, the buffer must be at least
`1+10*(100+1)+(50+1)=1062` bytes. Note that attributes and PIs don't nest, so
the `max(PI_target, attribute_name)` only needs to be counted once.
It is not currently possible to dynamically grow the buffer while parsing, so
it is important to choose a buffer size that is large enough to handle all the
XML documents that you want to parse. Since element names, attribute names and
PI targets are typically much shorter than in the previous example, a buffer
size of 4 or 8 KiB will give enough headroom even for documents with deep
nesting.
As a useful hack, it is possible to merge the memory for the `yxml_t` struct
and the stack buffer in a single allocation:
```c
yxml_t *x = malloc(sizeof(yxml_t) + BUFSIZE);
yxml_init(x, x+1, BUFSIZE);
```
This way, the complete parsing state can be passed around with a single
pointer, and both the struct and the buffer can be freed with a single call to
`free(x)`.
## Parsing
```c
yxml_t *x; /* An initialized state */
char *doc; /* The XML document as a zero-terminated string */
for(; *doc; doc++) {
yxml_ret_t r = yxml_parse(x, *doc);
if(r < 0)
exit(1); /* Handle error */
/* Handle any tokens we are interested in */
}
```
The actual parsing of an XML document is facilitated by the `yxml_parse()`
function. It accepts a pointer to an initialized `yxml_t` struct as first
argument and a byte as second argument. The byte is passed as an `int`, and
values in the range of -128 to 255 (both inclusive) are accepted. This way you
can pass either `signed char` or `unsigned char` values, yxml will work fine
with both. To parse a complete document, `yxml_parse()` needs to be called for
each byte of the document in sequence, as done in the above example.
For each byte, `yxml_parse()` will return either _YXML\_OK_ (0), a token (>0)
or an error (<0). _YXML\_OK_ is returned if the given byte has been
parsed/consumed correctly but that otherwise nothing worthy of note has
happened. The application should then continue processing and pass the next
byte of the document.
### Public State Variables
After each call to `yxml_parse()`, a number of interesting fields in the
`yxml_t` struct are updated. The fields documented here are part of the API,
and are considered as extra return values of `yxml_parse()`. All of these
fields should be considered read-only.
`char *elem;`
: Name of the currently opened XML element. Points into the buffer given to
`yxml_init()`. Described in ["Elements"](#elements).
`char *attr;`
: Name of the currently opened attribute. Points into the buffer given to
`yxml_init()`. Described in ["Attributes"](#attributes).
`char *pi;`
: Target of the currently opened PI. Points into the buffer given to
`yxml_init()`. Described in ["Processing Instructions"](#processing-instructions).
`char data[8];`
: Character data of element contents, attribute values or PI contents. Described
in ["Character Data"](#character-data).
`uint32_t line;`
: Number of the line in the XML document that is currently being parsed.
`uint64_t byte;`
: Byte offset into the current line of the XML document.
`uint64_t total;`
: Byte offset into the XML document.
The values of the _elem_, _attr_, _pi_ and _data_ elements depend on the
parsing context, and only remain valid within that context. The exact contexts
in which these fields contain valid information is described in their
respective sections below.
The _line_, _byte_ and _total_ fields are mainly useful for error reporting.
When `yxml_parse()` reports an error, these fields can be used to generate a
useful error message. For example:
```c
printf("Parsing error at %s:%"PRIu32":%"PRIu64" byte offset %"PRIu64"\n",
    filename, x->line, x->byte, x->total); /* PRIu32/PRIu64 need <inttypes.h> */
```
### Error Handling
Errors are not recoverable. No further calls to `yxml_parse()` or `yxml_eof()`
should be performed on the same `yxml_t` struct. Re-initializing the same
struct using `yxml_init()` to start parsing a new document is possible,
however. The following error values may be returned by `yxml_parse()`:
YXML\_EREF
: Invalid character or entity reference. E.g. `&whatever;` or `&#ABC;`.
YXML\_ECLOSE
: Close tag does not match open tag. E.g. `<Tag> .. </SomeOtherTag>`.
YXML\_ESTACK
: Stack overflow. This happens when the buffer given to `yxml_init()` was not
large enough to parse this document. E.g. when elements are too deeply nested
or an element name, attribute name or PI target is too long.
YXML\_ESYN
: Miscellaneous syntax error.
## Handling Tokens
The `yxml_parse()` function will return tokens as they are found. When loading
an XML document, it is important to know which tokens are returned in which
situation and how to handle them.
The following graph shows the (simplified) state machine of the parser to
illustrate the order in which tokens are returned. The labels on the edge
indicate the tokens that are returned by `yxml_parse()`, with their `YXML_`
prefix removed. The special return value `YXML_OK` and error returns are not
displayed.
![](https://dev.yorhel.nl/img/yxml-apistates.png)
Tokens that the application is not interested in can be ignored safely. For
example, if you are not interested in handling processing instructions, then
the `YXML_PISTART`, `YXML_PICONTENT` and `YXML_PIEND` tokens can be handled
exactly as if they were an alias for `YXML_OK`.
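For example, a minimal loop that is only interested in counting elements could
handle the tokens as follows (a sketch, assuming an initialized state `x` and
a zero-terminated document `doc` as in the earlier examples):

```c
unsigned elems = 0;
for(; *doc; doc++) {
    yxml_ret_t r = yxml_parse(x, *doc);
    if(r < 0)
        exit(1);            /* Handle parse error */
    if(r == YXML_ELEMSTART) /* The only token we care about */
        elems++;
    /* All other tokens are safely ignored */
}
if(yxml_eof(x) < 0)
    exit(1);                /* Document ended prematurely */
```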
### Elements
The `YXML_ELEMSTART` and `YXML_ELEMEND` tokens are returned when an XML
element is opened and closed, respectively. When `YXML_ELEMSTART` is returned,
the _elem_ struct field will hold the name of the element. This field will be
valid (i.e. keeps pointing to the name of the opened element) until the end of
the attribute list. That is, until any token other than those described in
["Attributes"](#attributes) is returned. Although the _elem_ pointer itself may be reused
and modified while parsing the contents of the element, the buffer that _elem_
points to will remain valid up to and including the corresponding
`YXML_ELEMEND`.
Yxml will verify that elements properly nest and that the name of each closing
tag properly matches that of the corresponding opening tag. The application may
safely assume that each `YXML_ELEMSTART` is properly matched with a
`YXML_ELEMEND`, or that otherwise an error is returned. Furthermore, only a
single root element is allowed. When the root element is closed, no further
`YXML_ELEMSTART` tokens will be returned.
No distinction is made between self-closing tags and elements with empty
content. For example, both `<a/>` and `<a></a>` will result in the
`YXML_ELEMSTART` token (with `elem="a"`) followed by `YXML_ELEMEND`.
Element contents are returned in the form of the `YXML_CONTENT` token and the
_data_ field. This is described in more detail in ["Character
Data"](#character-data).
### Attributes
Element attributes are passed using the `YXML_ATTRSTART`, `YXML_ATTRVAL` and
`YXML_ATTREND` tokens. The name of the attribute is available in the _attr_
field, which is available when `YXML_ATTRSTART` is returned and valid up to
and including the next `YXML_ATTREND`.
Yxml does not verify that attribute names are unique within a single element.
It is thus possible that the same attribute will appear twice, possibly with a
different value. The correct way to handle this situation is to stop parsing
the rest of the document and to report an error, but if the application is not
interested in all attributes, detecting duplicates in them may complicate the
code and possibly even introduce security vulnerabilities (e.g. algorithmic
complexity attacks in a hash table). As such, the best solution is to report an
error when you can easily detect a duplicate attribute, but ignore duplicates
that require more effort to be detected.
The attribute value is returned with the `YXML_ATTRVAL` token and the _data_
field. This is described in more detail in ["Character Data"](#character-data).
### Processing Instructions
Processing instructions are passed in similar fashion to attributes, and are
passed using `YXML_PISTART`, `YXML_PICONTENT` and `YXML_PIEND`. The target of
the PI is available in the _pi_ field after `YXML_PISTART` and remains valid up
to (but excluding) the next `YXML_PIEND` token.
PI contents are returned as `YXML_PICONTENT` tokens and using the _data_ field,
described in more detail in ["Character Data"](#character-data).
### Character Data
Element contents (`YXML_CONTENT`), attribute values (`YXML_ATTRVAL`) and PI
contents (`YXML_PICONTENT`) are all passed to the application in small chunks
through the _data_ field. Each time that `yxml_parse()` returns one of these
tokens, the _data_ field will contain one or more bytes of the element
contents, attribute value or PI content. The string is zero-terminated, and its
value is only valid until the next call to `yxml_parse()`.
Typically only a single byte is returned after each call, but multiple bytes
can be returned in the following special cases:
- Character references outside of the ASCII character range. When a character
reference is encountered in element contents or in an attribute value, it is
automatically replaced with the referenced character. For example, the XML
string `&#47;` is replaced with the single character "/". If the character
value is above 127, its value is encoded in UTF-8 and then returned as a
multi-byte string in the _data_ field. For example, the character reference
`&#xe9;` is returned as the C string "\\xc3\\xa9", which is the UTF-8
encoding for the character "é". Character references are not expanded in PI
contents.
- The special character "\]" in CDATA sections. When the "\]" character is
encountered inside a CDATA section, yxml can't immediately return it to the
application because it does not know whether the character is part of the
CDATA ending or whether it is still part of its contents. So it remembers the
character for the next call to `yxml_parse()`, and if it then turns out that
the character was part of the CDATA contents, it returns both the "\]"
character and the following byte in the same _data_ string. Similarly, if two
"\]" characters appear in sequence as part of the CDATA content, then the two
characters are returned in a single _data_ string together with the byte that
follows. CDATA sections only appear in element contents, so this does not
happen in attribute values or PI contents.
- The special character "?" in PI contents. This is similar to the issue with
"\]" characters in CDATA sections. Yxml remembers a "?" character while
parsing a PI, and then returns it together with the byte following it if it
turned out to be part of the PI contents.
Note that `yxml_parse()` operates on bytes rather than characters. If the
document is encoded in a multi-byte character encoding such as UTF-8, then each
Unicode character that occupies more than a single byte will be broken up and
its bytes processed individually. As a result, the bytes returned in the
_data_ field may not necessarily represent a single Unicode character. To
ensure that multi-byte characters are not broken up, the application can
concatenate multiple data tokens to a single buffer before attempting to do
further processing on the result.
To make processing easier, an application may want to combine all the tokens
into a single buffer. This can be easily implemented as follows:
```c
SomeString attrval;
while(..) {
yxml_ret_t r = yxml_parse(x, ch);
switch(r) {
case YXML_ATTRSTART:
somestring_initialize(attrval);
break;
case YXML_ATTRVAL:
somestring_append(attrval, x->data);
break;
case YXML_ATTREND:
/* Now we have a full attribute. Its name is in x->attr, and its value is
* in the string 'attrval'. */
somestring_reset(attrval);
break;
}
}
```
The `SomeString` type and `somestring_` functions are stubs for any string
handling library of your choosing. When using Glib, for example, one could use
the [GString](https://developer.gnome.org/glib/stable/glib-Strings.html)
type and the `g_string_new()`, `g_string_append()` and `g_string_free()`
functions. For a lighter-weight string library there is also
[kstring.h in klib](https://github.com/attractivechaos/klib), but the
functionality required in the above example can easily be implemented in a few
lines of pure C, too.
When buffering data into an ever-growing string, as done in the previous
example, one should be careful to protect against memory exhaustion. This can
be done trivially by limiting the size of the total XML document or the maximum
length of the buffer. If you want to extract information from an XML document
that might not fit into memory, but you know that the information you care
about is limited in size and is only stored in specific attributes or elements,
you can choose to ignore data you don't care about. For example, if you only
want to extract the "Size" attribute and you know that its value is never
larger than 63 bytes, you can limit your code to read only that value and store
it into a small pre-allocated buffer:
```c
char sizebuf[64], *sizecur = NULL, *tmp;
while(..) {
yxml_ret_t r = yxml_parse(x, ch);
switch(r) {
case YXML_ATTRSTART:
if(strcmp(x->attr, "Size") == 0)
sizecur = sizebuf;
break;
case YXML_ATTRVAL:
if(!sizecur) /* Are we in the "Size" attribute? */
break;
/* Append x->data to sizecur while there is space */
tmp = x->data;
while(*tmp && sizecur < sizebuf+sizeof(sizebuf))
*(sizecur++) = *(tmp++);
if(sizecur == sizebuf+sizeof(sizebuf))
exit(1); /* Too long attribute value, handle error */
*sizecur = 0;
break;
case YXML_ATTREND:
if(sizecur) {
/* Now we have the value of the "Size" attribute in sizebuf */
sizecur = NULL;
}
break;
}
}
```
## Finalization
```c
yxml_t *x; /* An initialized state */
yxml_ret_t r = yxml_eof(x);
if(r < 0)
exit(1); /* Handle error */
else
/* No errors in the XML document */
```
Because `yxml_parse()` does not know when the end of the XML document has been
reached, it is unable to detect certain errors in the document. This is why,
after successfully parsing a complete document with `yxml_parse()`, the
application should call `yxml_eof()` to perform some extra checks.
`yxml_eof()` will return `YXML_OK` if the parsed XML document is well-formed,
`YXML_EEOF` otherwise. The following errors are not detected by
`yxml_parse()` but will result in an error on `yxml_eof()`:
- The XML document did not contain a root element (e.g. an empty file).
- The XML root element has not been closed (e.g. "`<a> ..`").
- The XML document ended in the middle of a comment or PI (e.g.
"`<a/><!-- ..`").
## Utility functions
```c
size_t yxml_symlen(yxml_t *, const char *);
```
`yxml_symlen()` returns the length of the element name (`x->elem`), attribute
name (`x->attr`), or PI name (`x->pi`). When used correctly, it gives the same
result as `strlen()`, except without having to scan through the string. This
function should **ONLY** be used directly after the `YXML_ELEMSTART`,
`YXML_ATTRSTART` or `YXML_PISTART` (respectively) tokens have been returned by
`yxml_parse()`, calling this function at any other time may not give the
correct results. This function should **NOT** be used on strings other than
`x->elem`, `x->attr` or `x->pi`.
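For example, directly after `YXML_ELEMSTART` the length of the element name
can be obtained without scanning the string. A short sketch:

```c
if(yxml_parse(x, ch) == YXML_ELEMSTART) {
    size_t len = yxml_symlen(x, x->elem);
    /* 'len' is equal to strlen(x->elem) */
    printf("element: %s (%zu bytes)\n", x->elem, len);
}
```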

0
index.cgi Executable file → Normal file
View file

19
mkchangelog.pl Executable file
View file

@ -0,0 +1,19 @@
#!/usr/bin/perl
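# Convert a ChangeLog-formatted file (stdin or file argument) to markdown with
# a pandoc title block; each release links to its tarball if one exists in
# pub/download/. Usage: mkchangelog.pl <project> <title> [changes.log]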
($project) = (shift =~ /^([^ \/]+)/);
$title = shift;
print "---\ntitle: $title\npage-type: changelog\n...\n";
for (split /\n\n/, join '', <>) {
s/^([0-9]+\.[0-9]+(?:\.[0-9]+)?)\s+-\s+([0-9]{4}-[0-9]{2}-[0-9]{2})//;
print "\n- **$1** - $2";
$dl = "$project-$1.tar.gz";
print " - [dllink $dl]" if -f "pub/download/$dl";
print "\n";
for (split /\r?\n\s+-\s+/) {
s/([*_\\])/\\$1/g;
print " - $_\n" if $_;
}
}

55
mkpod.pl Executable file
View file

@ -0,0 +1,55 @@
#!/usr/bin/perl
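# Convert POD on stdin to HTML on stdout, for further conversion to markdown
# with pandoc. Rewrites POD and man page links to local URLs where applicable.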
package POD2HTML;
use Pod::Simple::XHTML;
@ISA = qw/Pod::Simple::XHTML/;
sub new {
bless shift->SUPER::new(@_), __PACKAGE__;
}
sub resolve_pod_page_link {
(undef, $page, $section) = @_;
$lnk = {
'TUWF' => '/tuwf/man',
'TUWF::DB' => '/tuwf/man/db',
'TUWF::Intro' => '/tuwf/man/intro',
'TUWF::Misc' => '/tuwf/man/misc',
'TUWF::Request' => '/tuwf/man/request',
'TUWF::Response' => '/tuwf/man/response',
'TUWF::XML' => '/tuwf/man/xml',
'TUWF::Validate' => '/tuwf/man/validate',
'' => '',
}->{$page||''} // "https://metacpan.org/pod/$page";
$lnk .= '#'.($section =~ s/ /-/gr) if $section;
$lnk
}
sub resolve_man_page_link {
(undef, $page, undef) = @_;
my $lnk = {qw{
globsterctl(1) /globster/ctl
globster-launch(1) /globster/launch
globster(1) /globster/daemon
globster-api(7) /globster/api
ncdu(1) /ncdu/man
}}->{$page||''} || ($page =~ /(.+)\((.)\)/ and "https://manned.org/$1.$2");
$lnk
}
$p = POD2HTML->new();
$html = '';
#$p->anchor_items(1); # pandoc doesn't support this :(
$p->output_string(\$html);
$p->parse_file(\*STDIN);
# Some post-processing to improve the pandoc-generated markdown
$html =~ s/^ //mg;
$html =~ s/<code> //g;
$html =~ s/<li><p>/<li>/g;
print $html;

Some files were not shown because too many files have changed in this diff