#!/usr/bin/perl

# awshrink v0.1
# 2007-01-15
# Yoran Heling

# License: MIT
                                       


use strict;
use warnings;
use bytes;



# hash for easy lookup
my %shrinkable = ( map { $_ => 1 } qw|
  DOMAIN LOGIN ROBOT WORMS EMAILSENDER EMAILRECEIVER
  BROWSER UNKNOWNREFERER UNKOWNREFERERBROWSER SEREFERRALS
  PAGEREFS SEARCHWORDS KEYWORDS SIDER_404 SIDER VISITOR
  UNKNOWNREFERERBROWSER
|);



sub getstats { # read
  my $fl = shift;
  my @l; 
  seek($fl, 0, 0);

  # get positions
  while(<$fl>) {
    last if /^END_MAP/;
    next if !/^POS_([A-Z0-9_]+)\s+([0-9]+)\s+$/;
    push(@l, [ $1, $2, 0, 0 ]);
  }

  # get sizes (bytes)
  @l = sort { $a->[1] <=> $b->[1] } @l;
  $l[$_][2] = ($l[$_+1] ? $l[$_+1][1] : -s $fl) - $l[$_][1]
    for (0..$#l);

  # get lines
  for (0..$#l) {
    seek $fl, $l[$_][1], 0;
    <$fl> =~ /BEGIN_[A-Z0-9_]+\s+([0-9]+)/;
    $l[$_][3] = $1;
  }

  print  "               Section  Size (Bytes)   Lines\n";
  printf "%22s  %12d %7d\n", $_->[0].($shrinkable{$_->[0]}?' ':'*'), $_->[2], $_->[3]
    for (sort { $a->[2] <=> $b->[2] } @l);
  print "* = not shrinkable\n";
}



sub fixmap { # read, write
  my($fl, $FL) = @_;

  my $map = '';
  my @l;
  seek $fl, 0, 0;
  while(<$fl>) {
    next if !/^BEGIN_([A-Z0-9_]+)/ || $1 eq 'MAP';
    $map .= sprintf "POS_%s %-20d\n", $1, tell($fl)-length($_);
  }

  seek $fl, 0, 0;
  seek $FL, 0, 0;

  my $inmap = 0;
  while(<$fl>) {
    if(!$inmap && !/^BEGIN_MAP/) {
      print $FL $_;
      next;
    }
    $inmap = 1;
    if(/^END_MAP/) {
      printf $FL "BEGIN_MAP 27\n%sEND_MAP\n", $map;
      $inmap = 0;
    }
  }
}


sub shrink { # read, write, %sections->{ key = section, value = lines }
  my($fh, $FH, %sec) = @_;

  seek $fh, 0, 0;
  seek $FH, 0, 0;
  while(<$fh>) {
    if(!/^BEGIN_([A-Z0-9_]+)\s+([0-9]+)/ || !$sec{$1} || $sec{$1} >= $2) {
      print $FH $_;
      next;
    }
    my $sec = $1;

    # read entire section
    my @l;
    while(<$fh>) {
      last if /^END_/;
      s/^[\s\r\n]+//;
      s/[\s\r\n]+$//;
      push @l, [ split / / ];
    }
    
    # sort section
    #  DOMAIN, LOGIN, ROBOT, WORMS, EMAILSENDER. EMAILRECEIVER,
    #  BROWSER, UNKNOWNREFERER, UNKOWNREFERERBROWSER, SEREFERRALS,
    #  PAGEREFS, SEARCHWORDS, KEYWORDS, SIDER_404, SIDER -> 1
    #  VISITOR -> 2
    if($sec{$sec} > 10) { # assume sorted if we only want the first ten
      @l = sort { $b->[1] <=> $a->[1] } @l if $sec ne 'VISITOR';
      @l = sort { $b->[2] <=> $a->[2] } @l if $sec eq 'VISITOR';
    }

    # write section
    printf $FH "BEGIN_%s %d\n", $sec, $sec{$sec};
    print $FH join(' ', @{$l[$_]})."\n"
      for (0..($sec{$sec}-1));
    printf $FH "END_%s\n", $sec;
  }
  1;
}



sub run {
  print "Usage: $@ [options] filename\n" and exit if !@ARGV;

  my $file = pop;
  my $ci = 0;
  my $st = 0;

  my %options;
  while(local $_ = shift) {
    /^-c$/ ? $ci++ :
    /^-s$/ ? $st++ :
    /^-([A-Za-z0-9_]+)$/ && $shrinkable{ uc($1) } ? $options{ uc($1) } = shift :
    printf("Unknown option: %s\n", $_) && exit;
  }
  (!$options{$_} || $options{$_} !~ /^[0-9]+$/) && printf("Invalid value for -%s\n", $_) && exit
    for keys %options;

  print "Nothing to do...\n" and exit if !keys %options && !$st;

  open(my $fl, '<', $file) || die $!;
  my $tmpfile;
  open(my $TMP, '+>', \$tmpfile);
  open(my $FL, '+>', "$file~") || die $! if keys %options;

  shrink($fl, $TMP, %options) && fixmap($TMP, $FL) if keys %options;
  getstats(keys %options ? $FL : $fl) if $st;

  close $fl;
  close $TMP;
  close $FL if keys %options;

  rename "$file~", $file if $ci;
}



run @ARGV;
