#!/usr/bin/env perl

# $Id: split_xml.pl 1306 2009-02-24 13:24:37Z pierre $

# Copyright (c) 2006 Pierre Senellart <pierre@senellart.com>
# 
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the
# following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
# USE OR OTHER DEALINGS IN THE SOFTWARE.

use strict;
use warnings;

use IO::File;

# Exactly one argument is expected: the file listing the snapshot dates.
# The XML dump itself is read from standard input.
die "Usage: 7z x -so toto.xml.7z | $0 dates\n" if $#ARGV!=0;

# Read the snapshot dates, one per line, and turn each of them into a
# midnight timestamp by appending "T00:00:00Z".
# NOTE(review): the rest of the script walks this list forwards only, so
# the dates are presumably expected in ascending order -- confirm.
my $dates_file=$ARGV[0];
open my $dates_fh,'<',$dates_file
  or die "Cannot open dates file '$dates_file': $!\n";
my @snapshot_dates;
while(my $line=<$dates_fh>) {
  # Drop the line terminator (LF or CRLF) and any trailing blanks, so that
  # they end up neither in the timestamps nor in the output file names.
  $line=~s/\s+\z//;
  # A blank line carries no date; skip it instead of creating a bogus
  # "T00:00:00Z" snapshot.
  next if $line eq "";
  push @snapshot_dates,$line."T00:00:00Z";
}
close $dates_fh;

# Remove the dates file from @ARGV so that the later <> reads take the dump
# from standard input rather than from that file.
pop @ARGV;

my $nb_snapshots=@snapshot_dates;

# One output stream per snapshot date, stored in the same order as
# @snapshot_dates.  Each stream is a pipe into gzip, which writes
# "<timestamp>.xml.gz" in the current directory.
my @out;

foreach my $date (@snapshot_dates) {
  my $file="$date.xml.gz";

  # The command line is run through the shell (it contains a redirection),
  # so the file name is single-quoted to keep any blank or shell
  # metacharacter in it from being interpreted.
  (my $quoted=$file)=~s/'/'\\''/g;

  my $out=IO::File->new;
  # Only the creation of the pipe can be checked here; a failure of gzip
  # itself can only be seen when the stream is closed.
  $out->open("| gzip > '$quoted'")
    or die "Cannot start gzip for '$file': $!\n";
  push @out,$out;
}

# Localized names of the template (key 10) and category (key 14)
# namespaces, as declared in the <siteinfo> header of the dump.  They stay
# undefined if the header does not declare them.
my $template_string;
my $category_string;

# Copy the dump header verbatim to every snapshot file, up to and including
# the closing </siteinfo> line, picking up the two namespace names on the
# way.
while(<>) {
  foreach my $out (@out) {
    print $out $_;
  }
  # [^>]* tolerates extra attributes after the key, such as the
  # case="first-letter" attribute found in more recent export schemas.
  if(/<namespace key="10"[^>]*>(.*)</) {
    $template_string=$1;
  } elsif(/<namespace key="14"[^>]*>(.*)</) {
    $category_string=$1;
  }
  last if /^  <\/siteinfo>/;
}

# Number of <page> elements seen so far (shown as a progress counter on
# STDERR).
my $index=0;

# Consume the remainder of the current <page> element from the input.
# Returns true once the closing </page> line has been read, and false if
# the input ended before it was found.
sub skip_to_page_end {
  while(defined(my $line=<>)) {
    return 1 if $line=~/^  <\/page>/;
  }
  return 0;
}

# Main loop: one iteration per <page> element.  For every page kept, each
# snapshot file receives the text of the last revision whose timestamp is
# strictly before the snapshot date.
# NOTE(review): this presumably relies on the revisions of a page appearing
# in chronological order and on @snapshot_dates being ascending -- confirm.
PAGE: while(<>) {
  # Anything other than the start of a page (normally the closing
  # </mediawiki> line) ends the processing.
  last unless /^  <page>/;

  ++$index;
  print STDERR "\r$index" if $index%1000==0;

  # The title is expected on the line right after <page>.
  my $title_line=<>;
  last PAGE unless defined $title_line;
  my ($title)=$title_line=~/^    <title>(.*?)</;
  unless(defined $title) {
    # Malformed page: skip it rather than reuse a stale capture.
    next PAGE if $title_line=~/^  <\/page>/ || skip_to_page_end();
    last PAGE;
  }

  # Ignore pages with a namespace prefix, unless that prefix is the
  # template or the category namespace.
  if($title=~/^([^:]*):[^\s]/ && $1 ne $template_string && $1 ne
    $category_string) {
    next PAGE if skip_to_page_end();
    last PAGE;
  }

  # Page identifier: the first <id> at page level.  Other page-level lines
  # that may precede it (such as <ns> in more recent dumps) are skipped;
  # revision identifiers are indented deeper and therefore never match.
  my $id;
  while(defined(my $line=<>)) {
    next PAGE if $line=~/^  <\/page>/;
    next unless $line=~/^    <id>(.*?)</;
    $id=$1;
    last;
  }
  last PAGE unless defined $id;

  my $current_timestamp;        # timestamp of the revision being read
  my $last_timestamp;           # timestamp of the revision before it
  my $current_text="";          # <text> element of the last revision read
  my $current_snapshot_time=0;  # index of the first snapshot not yet served

  while(<>) {
    if(/^  <\/page>/) {
      # End of the page: the last revision read is the current one for
      # every snapshot that has not been served yet.
      if($current_text ne "") {
        for my $i ($current_snapshot_time..$nb_snapshots-1) {
          print_text($out[$i],$id,$title,$current_timestamp,$current_text);
        }
      }
      next PAGE;
    }

    if(/^      <timestamp>(.*?)</) {
      $last_timestamp=$current_timestamp;
      $current_timestamp=$1;
    }

    if(/^      <text[\s>]/) {
      # A new revision starts.  Every snapshot date that this revision has
      # reached or passed gets the previous revision, if there was one.
      # Timestamps are compared as strings, which orders ISO 8601 dates
      # chronologically.
      while($current_timestamp ge $snapshot_dates[$current_snapshot_time]) {
        if($current_text ne "") {
          print_text($out[$current_snapshot_time],$id,$title,$last_timestamp,
                     $current_text);
        }

        if(++$current_snapshot_time>=$nb_snapshots) {
          # All snapshots are served: the rest of the page is irrelevant.
          next PAGE if skip_to_page_end();
          last PAGE;
        }
      }

      $current_text="";

      # Accumulate the whole <text> element, which spans several lines
      # unless it is self-closing or closed on its first line.
      if(!/^      <text[^>]*\/>/) {
        while(!/<\/text>/) {
          $current_text.=$_;
          $_=<>;
          # Truncated input: stop instead of looping forever on undef.
          last PAGE unless defined $_;
        }
      }

      $current_text.=$_;
    }
  }
}

print STDERR "\r$index\n";

# Terminate every snapshot file with the closing root tag, then close the
# pipe to gzip.  Closing explicitly is what makes write errors and gzip
# failures visible: close on a pipe waits for the child process and returns
# false if it exited with a non-zero status.
my $nb_failed_outputs=0;
foreach my $i (0..$#out) {
  my $out=$out[$i];
  my $ok=print $out "</mediawiki>\n";
  $ok=0 unless $out->close;
  next if $ok;
  # $! is only set for system errors; otherwise $? holds the gzip status.
  my $reason=$! ? "$!" : "gzip exited with status ".($?>>8);
  warn "Error while finishing snapshot $snapshot_dates[$i]: $reason\n";
  ++$nb_failed_outputs;
}
die "$nb_failed_outputs snapshot file(s) could not be written correctly\n"
  if $nb_failed_outputs;

# Write one <page> element holding a single revision to a snapshot stream.
#   $out        output filehandle
#   $id         page identifier
#   $title      page title, written as is (no escaping is done here)
#   $timestamp  timestamp of the revision
#   $text       the revision's <text> element, inserted verbatim right
#               before the closing </revision> tag
# Returns the result of the underlying print.
sub print_text {
  my ($out,$id,$title,$timestamp,$text)=@_;

  my $page_xml="  <page>\n"
    ."    <title>".$title."</title>\n"
    ."    <id>".$id."</id>\n"
    ."    <revision>\n"
    ."      <timestamp>".$timestamp."</timestamp>\n"
    .$text."    </revision>\n"
    ."  </page>\n";

  return print {$out} $page_xml;
}
