#!/usr/bin/env perl

# $Id: ordonne.pl 1306 2009-02-24 13:24:37Z pierre $

# Copyright (c) 2006 Pierre Senellart <pierre@senellart.com>
# 
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the
# following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
# USE OR OTHER DEALINGS IN THE SOFTWARE.

use strict;
use warnings;

use constant EXPAND_TEMPLATES => 0;

$ENV{"LC_ALL"}="C"; # Otherwise sort behaves weirdly

my $suffix="";
$suffix=".".$ARGV[0] if exists($ARGV[0]);

print STDERR "Parsing redirect...\n";

my %redirect;
open REDIRECT, "<redirect$suffix" or die;
while(<REDIRECT>) {
  /^(.*) (.*)$/;
  $redirect{$1}=$2 unless $1 eq $2;
}
close REDIRECT;

print STDERR "Getting ids...\n";
open IDS,"<:utf8","ids$suffix" or die;
my %ids;
while(<IDS>) {
  /^(.*?):(.*)$/;
  $ids{$2}=$1;
}
close IDS;

print STDERR "Enumerating nodes...\n";

open NODES, "<:utf8", "nodes$suffix" or die;
my $index=-1;
my %template;
my %category;

my $EXPAND_TEMPLATES=0;

my %nodes;
while(<NODES>) {
  chomp;
  next if exists($redirect{++$index});
  $nodes{$index}=$_;
  $template{$index}="ok" if /^Template:/;
  $category{$index}=1 if /^Category:/;
}
close NODES;

if(EXPAND_TEMPLATES) {
  print STDERR "Retrieving templates...\n";
  open EDGES,"<","edges$suffix" or die;
  while(<EDGES>) {
    if(exists($template{$1}) && $template{$1} eq "ok") {
      /^(.*?) (.*)$/;
      $template{$1}=$2?$2:"";
    }
  }
  close EDGES;
}

print STDERR "Removing empty articles and redirects...\n";

my @nodes;
my ($key,$value);
for(my $i=0;$i<=$index;++$i) {
  push @nodes, $nodes{$i}
    if exists($nodes{$i})    &&  exists($ids{$nodes{$i}}) &&
      !exists($redirect{$i}) && !exists($template{$i});
}

print STDERR "Sorting nodes...\n";
@nodes = sort @nodes;
                         
print STDERR "Writing index...\n";

open INDEX, ">:utf8", "index$suffix" or die;
my %index;
my %index_by_id;
my $nb=0;
while(@nodes) {
  $_=shift @nodes;
  $index_by_id{$ids{$_}}=$nb;
  $index{$_}=$nb++;
  print INDEX $ids{$_}.":$_\n";
  delete $ids{$_};
}
close INDEX;

%ids=();

print STDERR "Computing and sorting edges...\n";

open EDGES,"sort -n edges$suffix |" or die;
open REINDEX_EDGES,"> edges_reindex$suffix" or die;
while(<EDGES>) {
  chomp;
  my @links=split / /;
  my $origin=shift @links;

  if(!exists($template{$origin})) {
    @links=expand_links($origin,@links);

    my $o=shift @links;

    next if !defined($index{$nodes{$o}});
    
    foreach(grep {exists($category{$_})} @links) {
      print REINDEX_EDGES $index{$nodes{$_}},' ',$index{$nodes{$o}},"\n"
        if $index{$nodes{$_}};
    }

    @links=map {$index{$nodes{$_}}} grep {defined($nodes{$_})} @links;

    @links=grep {defined($_)} @links;
    
    print REINDEX_EDGES $index{$nodes{$o}},' ', join(' ',@links),"\n" if @links;
  }
}
close REINDEX_EDGES;

%template=();
%category=();
%index=();
%nodes=();
%redirect=();

my $get_words=0;
if(-f "words") {
  print STDERR "Computing idf...\n";
  open WORDS, "<:utf8", "words$suffix" or die;
  my %idf;
  while(<WORDS>) {
    my ($id)=m/^(.*?)(?: |$)/;
    next if !exists($index_by_id{$id});

    s/ (.*?)\/([^ \n]*)/$idf{$2}++/eg;
  }
  close WORDS;

  print STDERR "Inversing plain-text index...\n";
  open WORDS, "<:utf8", "words$suffix" or die;
  open INVERTED_WORDS, ">:utf8", "inverted_words$suffix" or die;
  while(<WORDS>) {
    my ($id)=m/^(.*?)(?: |$)/;
    next if !exists($index_by_id{$id});
    my $nb_words=0;
    (my $copy=$_)=~s/ (.*?)\/[^ \n]*/$nb_words+=$1/eg;

    s/ (.*?)\/([^ \n]*)/print INVERTED_WORDS "$2 ".$1\/$nb_words."\/$id\n"/eg;
  }
  close INVERTED_WORDS;
  close WORDS;

  print STDERR "Building plain-text index...\n";
  open PLAIN_TEXT,">:utf8","plain-text$suffix" or die;
  open INVERTED_WORDS, "sort inverted_words$suffix |" or die;
  binmode INVERTED_WORDS,"utf8";
  my $oldterm;
  my %tf;
  while(1) {
    $_=<INVERTED_WORDS>;

    my $term;
    if(defined($_)) {
      ($term)=m/^(.*?) /;
    }

    if(!defined($_) || !defined($oldterm) || $term ne $oldterm) {
      if(defined($oldterm)) {
        my $idf=log($nb/$idf{$oldterm})/log(2);

        print PLAIN_TEXT $oldterm;
        foreach my $document (sort {$index_by_id{$a} <=> $index_by_id{$b} } 
          keys %tf) {
          print PLAIN_TEXT " ".$index_by_id{$document}."/".
                               $tf{$document}*$idf;
        }
        print PLAIN_TEXT "\n";
      }

      $oldterm=$term;
      %tf=();
    }
    
    last if(!defined($_));

    m/ (.*?)\/(.*)/;
    $tf{$2}=$1;
  }
  close INVERTED_WORDS;
  unlink "inverted_words$suffix" or die;
  close PLAIN_TEXT;
}

sub expand_links {
  my @newlinks;

  LOOP: foreach(@_) {
    my $a=$_;
    my $nb_redirections=0;
    while (exists($redirect{$_})) {
      $_=$redirect{$_};
      next LOOP if ++$nb_redirections==20; # Prevent redirection loops
    }
    next if /^_$/;

    if(EXPAND_TEMPLATES && exists($template{$_})) {
      my $template=$_;
      push @newlinks,
        expand_links(grep {!/^$template$/} split(/ /,$template{$_}));
    } else {
      push @newlinks,$_;
    }
  }
  
  return @newlinks;
}
