From 77755d18cc3d4a09f0db585458d5e71653dc0b5d Mon Sep 17 00:00:00 2001
From: Yorhel <git@yorhel.nl>
Date: Sun, 2 Dec 2012 21:03:46 +0100
Subject: [PATCH] ncdu: Added export file format documentation

---
 dat/ncdu-jsonfmt | 224 +++++++++++++++++++++++++++++++++++++++++++++++
 index.cgi        |   8 +-
 2 files changed, 230 insertions(+), 2 deletions(-)
 create mode 100644 dat/ncdu-jsonfmt
diff --git a/dat/ncdu-jsonfmt b/dat/ncdu-jsonfmt
new file mode 100644
index 0000000..880b2b7
--- /dev/null
+++ b/dat/ncdu-jsonfmt
@@ -0,0 +1,224 @@
+=pod
+
+This document describes the file format that ncdu 1.9 uses for its
+export/import feature (the C<-o> and C<-f> options). Check the L<ncdu
+manual|http://dev.yorhel.nl/ncdu/man> for a description of how to use that
+feature.
+
+=head2 Top-level object
+
+Ncdu uses L<JSON|http://json.org/> notation as its data format. The top-level
+object is an array:
+
+  [
+    <majorver>,
+    <minorver>,
+    <metadata>,
+    <directory>
+  ]
+
+=head2 Versioning
+
+The C<< <majorver> >> and C<< <minorver> >> elements indicate the version of the file
+format. These are numbers with accepted values in the range of
+C<< 0 <= version <= 10000 >>. Major version must be C<1>, minor version is currently C<0>. The
+major version should increase if backwards-incompatible changes are made
+(preferably never), the minor version can be increased to indicate additions to
+the existing format.
+
+=head2 Metadata
+
+The C<< <metadata> >> element is a JSON object holding whatever (short)
+metadata you'd want. This block is currently (1.9) ignored by ncdu when
+importing, but it writes out the following keys when exporting:
+
+=over
+
+=item progname
+
+String, name of the program that generated the file, i.e. C<"ncdu">.
+
+=item progver
+
+String, version of the program that generated the file, e.g. C<"1.9">.
+
+=item timestamp
+
+Number, UNIX timestamp as returned by the POSIX C<time()> function at the time
+the file was generated. Note that this may not necessarily be equivalent to when
+the directory has been scanned.
+
+=back
+
+=head2 Directory Info
+
+A C<< <directory> >> is represented with a JSON array:
+
+  [
+    <infoblock>,
+    <directory>, <directory>, <infoblock>, ...
+  ]
+
+That is, the first element of the array must be an C<< <infoblock> >>. If the
+directory is empty, that will be its only element. If it isn't, its
+subdirectories and files are listed in the remaining elements. Each
+subdirectory is represented as a C<< <directory> >> array again, and each file
+is represented as just an C<< <infoblock> >> object.
+
+=head2 The Info Object
+
+An C<< <infoblock> >> is a JSON object holding information about a file or
+directory.  The following fields are supported:
+
+=over
+
+=item name
+
+String I<(required)>. Name of the file/dir. For the top-level directory (that
+is, the C<< <directory> >> item in the top-level JSON array), this should be
+the full absolute filesystem path, e.g. C<"/media/harddrive">. For any items
+below the top-level directory, the name should be just the name of the item.
+
+The name will be in the same encoding as reported by the filesystem (i.e.
+L<readdir()|http://manned.org/readdir.3>). The name may not exceed 32768
+bytes.
+
+=item asize
+
+Number. The apparent file size, as reported by C<lstat().st_size>. If absent, 0
+is assumed. Accepted values are in the range of C<< 0 <= asize < 2^63 >>.
+
+=item dsize
+
+Number. Size of the file, as consumed on the disk. This is obtained through
+C<lstat().st_blocks*S_BLKSIZE>. If absent, 0 is assumed. Accepted values are in
+the range of C<< 0 <= dsize < 2^63 >>.
+
+=item dev
+
+Number. The device ID. Has to be a unique ID within the context of the exported
+dump, but need not have any meaning outside of that.  I.e. this can be a
+serialization of C<lstat().st_dev>, but also a randomly generated number only
+used within this file, as long as it uniquely identifies the device/filesystem
+on which this file is stored.  This field may be absent, in which case it is
+equivalent to that of the parent directory. If this field is absent for the
+parent directory, a value of 0 is assumed. Accepted values are in the range of
+C<< 0 <= dev < 2^64 >>.
+
+=item ino
+
+Number. Inode number as reported by C<lstat().st_ino>. Together with the Device
+ID this uniquely identifies a file in this dump. In the case of hard links, two
+objects may appear with the same (C<dev>,C<ino>) combination. A value of 0 is
+assumed if this field is absent. This is currently (ncdu 1.9) not a problem as
+long as the C<hlnkc> field is false, otherwise it will consider everything with
+the same C<dev> and empty C<ino> values as a single hardlinked file. Accepted
+values are in the range of C<< 0 <= ino < 2^64 >>.
+
+=item hlnkc
+
+Boolean. C<true> if this is a file with C<< lstat().st_nlink > 1 >>. If absent,
+C<false> is assumed.
+
+=item read_error
+
+Boolean. C<true> if something went wrong while reading this entry. I.e. the
+information in this entry may not be complete. For files, this indicates that
+the C<lstat()> call failed. For directories, this means that an error occurred
+while obtaining the file listing, and some items may be missing. Note that if
+C<lstat()> failed, ncdu has no way of knowing whether an item is a file or a
+directory, so a file with C<read_error> set might as well be a directory. If
+absent, C<false> is assumed.
+
+=item excluded
+
+String. Set if this file or directory is to be excluded from calculation for
+some reason. The following values are recognized:
+
+=over
+
+=item C<"pattern">
+
+If the path matched an exclude pattern.
+
+=item C<"otherfs">
+
+If the item is on a different device/filesystem.
+
+=back
+
+Excluded items may still be included in the export, but only by name. C<dsize>,
+C<asize> and other information may be absent. If this item was excluded by a
+pattern, ncdu will not do an C<lstat()> on it, and may thus report this item as
+a file even if it is a directory.
+
+Other values than mentioned above are accepted by ncdu, but are currently
+interpreted to be equivalent to C<"pattern">. This field should be absent if the
+item has not been excluded from the calculation.
+
+=item notreg
+
+Boolean. This is C<true> if neither C<S_ISREG()> nor C<S_ISDIR()> evaluates to true.
+I.e. this is a symlink, character device, block device, FIFO, socket, or
+whatever else your system may support. If absent, C<false> is assumed.
+
+=back
+
+=head2 Miscellaneous notes
+
+As mentioned above, file/directory names are B<not> converted to any specific
+encoding when exporting. If you want the exported info dump to be valid JSON
+(and thus valid UTF-8), you'll have to ensure that you have either no non-UTF-8
+filenames in your filesystem, or you should process the dump through a
+conversion utility such as C<iconv>. When browsing an imported file with ncdu,
+you'll usually want to ensure that the filenames are in the same encoding as
+what your terminal is expecting. The browsing interface may look garbled or
+otherwise ugly if that's not the case.
+
+Another important thing to keep in mind is that an export can be fairly large.
+If you write a program that reads a file in this format and you care about
+handling directories with several million files, make sure to optimize for
+that. For example, prefer the use of a stream-based JSON parser over a JSON
+library that reads the entire file in a single generic data structure, and only
+keep the minimum amount of data that you care about in memory.
+
+=head2 Example Export
+
+Here's a simple example export that displays the basic structure of the format.
+
+  [
+    1,
+    0,
+    {
+      "progname"  : "ncdu",
+      "progver"   : "1.9",
+      "timestamp" : 1354477149
+    },
+    [
+      { "name"   : "/media/harddrive",
+        "dsize"  : 4096,
+        "asize"  : 422,
+        "dev"    : 39123423,
+        "ino"    : 29342345
+      },
+      { "name"   : "SomeFile",
+        "dsize"  : 32768,
+        "asize"  : 32414,
+        "ino"    : 91245479284
+      },
+      [
+        { "name"   : "EmptyDir",
+          "dsize"  : 4096,
+          "asize"  : 10,
+          "ino"    : 3924
+        }
+      ]
+    ]
+  ]
+
+The directory described above has the following structure:
+
+  /media/harddrive
+  ├── SomeFile
+  └── EmptyDir
+
diff --git a/index.cgi b/index.cgi
index dd7a970..9db7819 100755
--- a/index.cgi
+++ b/index.cgi
@@ -12,6 +12,7 @@ BEGIN { ($ROOT = abs_path $0) =~ s{index\.cgi$}{}; }
 
 
 my @changes = (
+  [ '2012-12-02', '/ncdu/jsonfmt',   'Documented the ncdu export file format' ],
   [ '2012-11-04', '/ncdc',           'ncdc 1.14 released' ],
   [ '2012-10-17', '/dump',           'Added reference to my repo of small C libs to the code dump' ],
   [ '2012-10-07', '/dump#maildir.pl','Added maildir.pl to the code dump' ],
@@ -68,9 +69,10 @@ my @changes = (
 TUWF::register(
   qr{}              => sub { podpage(shift, 'home', '', '', "Yorhel's Projects") },
   qr{ncdu}          => sub { podpage(shift, 'ncdu', 'ncdu', '', 'NCurses Disk Usage') },
-  qr{ncdu/man}      => sub { podpage(shift, 'ncdu-man', 'ncdu', 'man', 'Ncdu Manual') },
+  qr{ncdu/man}      => sub { podpage(shift, 'ncdu-man', 'ncdu', 'man', 'Ncdu Manual', 1) },
   qr{ncdu/changes}  => sub { changelog(shift, 'ncdu-changelog', 'ncdu', 'ncdu', 'changes', 'Ncdu Changelog') },
   qr{ncdu/scr}      => sub { podpage(shift, 'ncdu-scr', 'ncdu', 'scr', 'Ncdu Screenshots') },
+  qr{ncdu/jsonfmt}  => sub { podpage(shift, 'ncdu-jsonfmt', 'ncdu', 'jsonfmt', 'Ncdu Export File Format') },
   qr{ncdc}          => sub { podpage(shift, 'ncdc', 'ncdc', '', 'NCurses Direct Connect') },
   qr{ncdc/faq}      => sub { podpage(shift, 'ncdc-faq', 'ncdc', 'faq', 'Ncdc Q&A', 1) },
   qr{ncdc/scr}      => sub { podpage(shift, 'ncdc-scr', 'ncdc', 'scr', 'Ncdc Screenshots') },
@@ -445,7 +447,9 @@ sub htmlMenu {
   ul;
   if($o{page} eq 'ncdu') {
     $m->('/ncdu',         'Info',        !$o{sec});
-    $m->('/ncdu/man',     'Manual',      $o{sec} eq 'man');
+    $m->('/ncdu/man',     'Manual',      $o{sec} eq 'man', sub {
+      $m->('/ncdu/jsonfmt','File Format', $o{sec} eq 'jsonfmt');
+    });
     $m->('/ncdu/changes', 'Changelog',   $o{sec} eq 'changes');
     $m->('/ncdu/scr',     'Screenshots', $o{sec} eq 'scr');
     $m->('/ncdu/bug',     'Bug tracker', $o{sec} eq 'bug');