From 77755d18cc3d4a09f0db585458d5e71653dc0b5d Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sun, 2 Dec 2012 21:03:46 +0100 Subject: [PATCH] ncdu: Added export file format documentation --- dat/ncdu-jsonfmt | 224 +++++++++++++++++++++++++++++++++++++++++++++++ index.cgi | 8 +- 2 files changed, 230 insertions(+), 2 deletions(-) create mode 100644 dat/ncdu-jsonfmt diff --git a/dat/ncdu-jsonfmt b/dat/ncdu-jsonfmt new file mode 100644 index 0000000..880b2b7 --- /dev/null +++ b/dat/ncdu-jsonfmt @@ -0,0 +1,224 @@ +=pod + +This document describes the file format that ncdu 1.9 uses for its +export/import feature (the C<-o> and C<-f> options). Check the L<ncdu manual|http://dev.yorhel.nl/ncdu/man> for a description on how to use that +feature. + +=head2 Top-level object + +Ncdu uses L<JSON|http://json.org/> notation as its data format. The top-level +object is an array: + + [ + <majorver>, + <minorver>, + <metadata>, + <directory> + ] + +=head2 Versioning + +The C<< <majorver> >> and C<< <minorver> >> elements indicate the version of the file +format. These are numbers with accepted values in the range of +C<< 0 <= version <= 10000 >>. Major version must be C<1>, minor version is currently C<0>. The +major version should increase if backwards-incompatible changes are made +(preferably never), the minor version can be increased to indicate additions to +the existing format. + +=head2 Metadata + +The C<< <metadata> >> element is a JSON object holding whatever (short) +metadata you'd want. This block is currently (1.9) ignored by ncdu when +importing, but it writes out the following keys when exporting: + +=over + +=item progname + +String, name of the program that generated the file, i.e. C<"ncdu">. + +=item progver + +String, version of the program that generated the file, e.g. C<"1.9">. + +=item timestamp + +Number, UNIX timestamp as returned by the POSIX C<time()> function at the time +the file was generated. Note that this may not necessarily be equivalent to when +the directory has been scanned. + +=back + +=head2 Directory Info + +A C<< <directory> >> is represented with a JSON array: + + [ + <infoblock>, + <directory>, <directory>, <infoblock>, ... 
+ ] + +That is, the first element of the array must be an C<< <infoblock> >>. If the +directory is empty, that will be its only element. If it isn't, its +subdirectories and files are listed in the remaining elements. Each +subdirectory is represented as a C<< <directory> >> array again, and each file +is represented as just an C<< <infoblock> >> object. + +=head2 The Info Object + +An C<< <infoblock> >> is a JSON object holding information about a file or +directory. The following fields are supported: + +=over + +=item name + +String I<(required)>. Name of the file/dir. For the top-level directory (that +is, the C<< <directory> >> item in the top-level JSON array), this should be +the full absolute filesystem path, e.g. C<"/media/harddrive">. For any items +below the top-level directory, the name should be just the name of the item. + +The name will be in the same encoding as reported by the filesystem (i.e. +L<readdir(3)>). The name may not exceed 32768 +bytes. + +=item asize + +Number. The apparent file size, as reported by C<lstat().st_size>. If absent, 0 +is assumed. Accepted values are in the range of C<< 0 <= asize < 2^63 >>. + +=item dsize + +Number. Size of the file, as consumed on the disk. This is obtained through +C<lstat().st_blocks*S_BLKSIZE>. If absent, 0 is assumed. Accepted values are in +the range of C<< 0 <= dsize < 2^63 >>. + +=item dev + +Number. The device ID. Has to be a unique ID within the context of the exported +dump, but may not have any meaning outside of that. I.e. this can be a +serialization of C<lstat().st_dev>, but also a randomly generated number only +used within this file. As long as it uniquely identifies the device/filesystem +on which this file is stored. This field may be absent, in which case it is +equivalent to that of the parent directory. If this field is absent for the +parent directory, a value of 0 is assumed. Accepted values are in the range of +C<< 0 <= dev < 2^64 >>. + +=item ino + +Number. Inode number as reported by C<lstat().st_ino>. Together with the Device +ID this uniquely identifies a file in this dump. 
In the case of hard links, two +objects may appear with the same (C<dev>,C<ino>) combination. A value of 0 is +assumed if this field is absent. This is currently (ncdu 1.9) not a problem as +long as the C<hlnkc> field is false, otherwise it will consider everything with +the same C<dev> and empty C<ino> values as a single hardlinked file. Accepted +values are in the range of C<< 0 <= ino < 2^64 >>. + +=item hlnkc + +Boolean. C<true> if this is a file with C<< lstat().st_nlink > 1 >>. If absent, +C<false> is assumed. + +=item read_error + +Boolean. C<true> if something went wrong while reading this entry. I.e. the +information in this entry may not be complete. For files, this indicates that +the C<lstat()> call failed. For directories, this means that an error occurred +while obtaining the file listing, and some items may be missing. Note that if +C<lstat()> failed, ncdu has no way of knowing whether an item is a file or a +directory, so a file with C<read_error> set might as well be a directory. If +absent, C<false> is assumed. + +=item excluded + +String. Set if this file or directory is to be excluded from calculation for +some reason. The following values are recognized: + +=over + +=item C<"pattern"> + +If the path matched an exclude pattern. + +=item C<"otherfs"> + +If the item is on a different device/filesystem. + +=back + +Excluded items may still be included in the export, but only by name. C<asize>, +C<dsize> and other information may be absent. If this item was excluded by a +pattern, ncdu will not do an C<lstat()> on it, and may thus report this item as +a file even if it is a directory. + +Other values than mentioned above are accepted by ncdu, but are currently +interpreted to be equivalent to "pattern". This field should be absent if the +item has not been excluded from the calculation. + +=item notreg + +Boolean. This is C<true> if neither S_ISREG() nor S_ISDIR() evaluates to true. +I.e. this is a symlink, character device, block device, FIFO, socket, or +whatever else your system may support. If absent, C<false> is assumed. 
+ +=back + +=head2 Miscellaneous notes + +As mentioned above, file/directory names are B<not> converted to any specific +encoding when exporting. If you want the exported info dump to be valid JSON +(and thus valid UTF-8), you'll have to ensure that you have either no non-UTF-8 +filenames in your filesystem, or you should process the dump through a +conversion utility such as C<iconv>. When browsing an imported file with ncdu, +you'll usually want to ensure that the filenames are in the same encoding as +what your terminal is expecting. The browsing interface may look garbled or +otherwise ugly if that's not the case. + +Another important thing to keep in mind is that an export can be fairly large. +If you write a program that reads a file in this format and you care about +handling directories with several million files, make sure to optimize for +that. For example, prefer the use of a stream-based JSON parser over a JSON +library that reads the entire file into a single generic data structure, and only +keep the minimum amount of data that you care about in memory. + +=head2 Example Export + +Here's a simple example export that displays the basic structure of the format. 
+ + [ + 1, + 0, + { + "progname" : "ncdu", + "progver" : "1.9", + "timestamp" : 1354477149 + }, + [ + { "name" : "/media/harddrive", + "dsize" : 4096, + "asize" : 422, + "dev" : 39123423, + "ino" : 29342345 + }, + { "name" : "SomeFile", + "dsize" : 32768, + "asize" : 32414, + "ino" : 91245479284 + }, + [ + { "name" : "EmptyDir", + "dsize" : 4096, + "asize" : 10, + "ino" : 3924 + } + ] + ] + ] + +The directory described above has the following structure: + + /media/harddrive + ├── SomeFile + └── EmptyDir + diff --git a/index.cgi b/index.cgi index dd7a970..9db7819 100755 --- a/index.cgi +++ b/index.cgi @@ -12,6 +12,7 @@ BEGIN { ($ROOT = abs_path $0) =~ s{index\.cgi$}{}; } my @changes = ( + [ '2012-12-02', '/ncdc/jsonfmt', 'Documented the ncdu export file format' ], [ '2012-11-04', '/ncdc', 'ncdc 1.14 released' ], [ '2012-10-17', '/dump', 'Added reference to my repo of small C libs to the code dump' ], [ '2012-10-07', '/dump#maildir.pl','Added maildir.pl to the code dump' ], @@ -68,9 +69,10 @@ my @changes = ( TUWF::register( qr{} => sub { podpage(shift, 'home', '', '', "Yorhel's Projects") }, qr{ncdu} => sub { podpage(shift, 'ncdu', 'ncdu', '', 'NCurses Disk Usage') }, - qr{ncdu/man} => sub { podpage(shift, 'ncdu-man', 'ncdu', 'man', 'Ncdu Manual') }, + qr{ncdu/man} => sub { podpage(shift, 'ncdu-man', 'ncdu', 'man', 'Ncdu Manual', 1) }, qr{ncdu/changes} => sub { changelog(shift, 'ncdu-changelog', 'ncdu', 'ncdu', 'changes', 'Ncdu Changelog') }, qr{ncdu/scr} => sub { podpage(shift, 'ncdu-scr', 'ncdu', 'scr', 'Ncdu Screenshots') }, + qr{ncdu/jsonfmt} => sub { podpage(shift, 'ncdu-jsonfmt', 'ncdu', 'jsonfmt', 'Ncdu Export File Format') }, qr{ncdc} => sub { podpage(shift, 'ncdc', 'ncdc', '', 'NCurses Direct Connect') }, qr{ncdc/faq} => sub { podpage(shift, 'ncdc-faq', 'ncdc', 'faq', 'Ncdc Q&A', 1) }, qr{ncdc/scr} => sub { podpage(shift, 'ncdc-scr', 'ncdc', 'scr', 'Ncdc Screenshots') }, @@ -445,7 +447,9 @@ sub htmlMenu { ul; if($o{page} eq 'ncdu') { $m->('/ncdu', 'Info', 
!$o{sec}); - $m->('/ncdu/man', 'Manual', $o{sec} eq 'man'); + $m->('/ncdu/man', 'Manual', $o{sec} eq 'man', sub { + $m->('/ncdu/jsonfmt','File Format', $o{sec} eq 'jsonfmt'); + }); $m->('/ncdu/changes', 'Changelog', $o{sec} eq 'changes'); $m->('/ncdu/scr', 'Screenshots', $o{sec} eq 'scr'); $m->('/ncdu/bug', 'Bug tracker', $o{sec} eq 'bug');