/* $Id: wikipedia_graph.cpp 1306 2009-02-24 13:24:37Z pierre $ */

/*
 * Copyright (c) 2006 Pierre Senellart <pierre@senellart.com>
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to permit
 * persons to whom the Software is furnished to do so, subject to the
 * following conditions:
 * 
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
 * NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <iostream>
#include <iomanip>
#include <fstream>
#include <vector>
#include <tr1/unordered_map>
#include <map>
#include <set>
#include <utility>

#include <cstdlib>
#include <cstring>

#include <libxml/parser.h>
#include <unicode.h>

// Localized names of the "Category" and "Template" namespaces of the dump
// being parsed. They are filled in by SAXendElement() when a <namespace>
// element whose recorded id is "14" (category) or "10" (template) closes,
// and are used by normalize() to rewrite localized prefixes to the
// canonical "Category:" / "Template:" forms.
std::string category_string;
std::string template_string;

// Size of the static scratch buffer used by porter_stemming().
const int MAX_STRING_SIZE=256;

#ifdef GET_WORDS
// External C stemmer. porter_stemming() calls it as stem(buf,0,size-1) and
// uses the returned value as the index of the last character of the stem.
extern "C" int stem(char * p, int i, int j);
#endif

namespace {
  // Integer type used to number graph nodes.
  typedef unsigned index_t;

  // Set built from stop_words[] by the SAXParserState constructor; words
  // found in it are ignored by getwords().
  std::set<std::string> stop_words_set;
  // Null-terminated table of stop words, defined at the end of this file.
  extern const char * stop_words[];
  
  // State shared by all SAX callbacks (passed to them as the user context).
  struct SAXParserState {
    // Index that will be given to the next newly seen node.
    index_t node_nb;
    // Maps a normalized title to its node index.
    std::tr1::unordered_map<std::string,index_t> nodes;
    // Per-page fields, cleared whenever a <page> element starts.
    std::string title;
    std::string id;
    std::string timestamp;
    std::string last_text;
    // Character data accumulated by SAXcharacters() while mode==m_write.
    std::string characters;
    // Attribute value recorded when a <namespace> element starts.
    std::string namespace_id;
    // m_write: SAXcharacters() appends to `characters`; m_skip: it does not.
    enum { m_skip, m_write } mode;
    // Output streams; the constructor opens them on the files "nodes",
    // "ids", "edges", "redirect" and (only with GET_WORDS) "words".
    std::ostream *NODES;
    std::ostream *IDS;
    std::ostream *EDGES;
    std::ostream *REDIRECT;
    std::ostream *WORDS;
    // Pages fully written out / <page> elements encountered so far.
    unsigned processed_articles;
    unsigned seen_articles;

    // If non-empty, a <text> element is only recorded when the most
    // recently read timestamp is <= this value (string comparison).
    const std::string required_timestamp;

    SAXParserState(const std::string &t);
    ~SAXParserState();
  
    // Prints the processed/seen counters on std::cerr.
    void updateProgression() const;
  };

  // SAX callback table, defined further down in this file.
  extern xmlSAXHandler WikiSaxHandler;
}

/*
 * Returns true when ts has exactly the shape YYYY-MM-DDTHH:MM:SSZ, i.e.
 * decimal digits at every digit position and the literal separators
 * '-', '-', 'T', ':', ':', 'Z' at the others.
 *
 * Digits are checked as well as separators because the timestamp is later
 * compared to the dump's timestamps as a plain string; a value with
 * non-digit characters would silently yield a meaningless comparison.
 */
static bool is_valid_timestamp(const std::string &ts)
{
  static const char pattern[]="dddd-dd-ddTdd:dd:ddZ";

  if(ts.size()!=sizeof(pattern)-1)
    return false;

  for(std::string::size_type i=0;i<ts.size();++i) {
    if(pattern[i]=='d') {
      if(ts[i]<'0' || ts[i]>'9')
        return false;
    } else if(ts[i]!=pattern[i]) {
      return false;
    }
  }

  return true;
}

/*
 * Entry point.
 *
 * Usage: program file.xml[.gz] [YYYY-MM-DDTHH:MM:SSZ]
 *
 * Parses the given file with the SAX handler table WikiSaxHandler. The
 * optional second argument is handed to the parser state as the required
 * timestamp. Returns EXIT_FAILURE on bad usage, on a malformed timestamp
 * or when xmlSAXUserParseFile() returns a non-zero value.
 */
int main(int argc, char **argv)
{
  if(argc!=2 && argc!=3) {
    std::cerr << "Usage: " << argv[0]
                           << " file.xml[.gz] [YYYY-MM-DDTHH:MM:SSZ]\n";
    return EXIT_FAILURE;
  }

  std::string required_timestamp;
  if(argc==3) {
    required_timestamp=argv[2];
    if(!is_valid_timestamp(required_timestamp)) {
      std::cerr << "Incorrect timestamp" << std::endl;
      return EXIT_FAILURE;
    }
  }
  
  SAXParserState my_sax_state(required_timestamp);

  if(xmlSAXUserParseFile(&WikiSaxHandler,
                         &my_sax_state,
                         argv[1]))
      return EXIT_FAILURE;
  
  return EXIT_SUCCESS;
}

namespace {
  // Encodes the code point uc as a UTF-8 byte sequence (1 to 4 bytes).
  //
  // The bytes are assembled in a NUL-terminated buffer and returned through
  // the C-string constructor, so uc==0 yields an empty string. Values of
  // 0x200000 and above also yield an empty string.
  std::string utf8char(unsigned int uc) {
    if (uc >= 0x200000)
      return "";

    char bytes[5] = {0, 0, 0, 0, 0};

    if (uc < 0x80) {
      bytes[0] = uc;
    } else if (uc < 0x800) {
      const unsigned char lead = 0xC0 | (uc >> 6);
      const unsigned char tail = 0x80 | (uc & 0x3F);
      bytes[0] = lead;
      bytes[1] = tail;
    } else if (uc < 0x10000) {
      const unsigned char lead = 0xE0 | (uc >> 12);
      const unsigned char mid  = 0x80 | ((uc >> 6) & 0x3F);
      const unsigned char tail = 0x80 | (uc & 0x3F);
      bytes[0] = lead;
      bytes[1] = mid;
      bytes[2] = tail;
    } else {
      const unsigned char lead = 0xF0 | (uc >> 18);
      const unsigned char hi   = 0x80 | ((uc >> 12) & 0x3F);
      const unsigned char lo   = 0x80 | ((uc >> 6) & 0x3F);
      const unsigned char tail = 0x80 | (uc & 0x3F);
      bytes[0] = lead;
      bytes[1] = hi;
      bytes[2] = lo;
      bytes[3] = tail;
    }

    return bytes;
  }

#ifdef GET_WORDS
  // Returns the stem of s as computed by the external stem() routine, or an
  // empty string when s is empty or does not fit in the scratch buffer.
  //
  // The word is copied into a static buffer of MAX_STRING_SIZE bytes, so
  // this function is not reentrant.
  std::string porter_stemming(std::string s)
  {
    static char buf[MAX_STRING_SIZE];
    const std::string::size_type size=s.size();

    // strcpy() writes size+1 bytes (terminating NUL included), hence a word
    // of exactly MAX_STRING_SIZE characters must be rejected too. An empty
    // word is rejected so that stem() is never given -1 as its end index.
    if(size==0 || size>=static_cast<std::string::size_type>(MAX_STRING_SIZE))
      return "";
    
    strcpy(buf,s.c_str());
    const int end=stem(buf,0,static_cast<int>(size)-1);
    return std::string(buf,end+1);
  }
#endif

  // Encodes the code point uc as a UTF-8 std::string of 1 to 4 bytes.
  //
  // Unlike utf8char(), uc==0 gives a one-character string holding a NUL
  // byte. Values of 0x200000 and above give an empty string.
  std::string unicode2utf8(unsigned int uc)
  {
    if(uc>=0x200000)
      return "";

    std::string encoded;

    if(uc<0x80) {
      encoded+=static_cast<char>(uc);
      return encoded;
    }

    // Number of continuation bytes and marker bits of the leading byte.
    int continuation;
    unsigned int marker;
    if(uc<0x800) {
      continuation=1;
      marker=0xC0;
    } else if(uc<0x10000) {
      continuation=2;
      marker=0xE0;
    } else {
      continuation=3;
      marker=0xF0;
    }

    const unsigned char lead=marker | (uc >> (6*continuation));
    encoded+=lead;

    // Continuation bytes carry 6 bits each, most significant group first.
    for(int shift=6*(continuation-1);shift>=0;shift-=6) {
      const unsigned char next=0x80 | ((uc >> shift) & 0x3F);
      encoded+=next;
    }

    return encoded;
  }

  std::string capitalize(const std::string &s)
  {
    const char *string=s.c_str();
    unicode_char_t c;
    string=unicode_get_utf8(string,&c);
    if(!string) {
      return s;
    }
    unicode_char_t up=unicode_toupper(c);
    if(up)
      return unicode2utf8(up)+string;
    else
      return s;
  }

  std::string lowercase(const std::string &s)
  {
    const char *string=s.c_str();
    std::string result;

    while(*string) {
      unicode_char_t c;
      string=unicode_get_utf8(string,&c);
      if(!string) {
        return result;
      }
      c=unicode_tolower(c);
      result+=std::string(unicode2utf8(c));
    }

    return result;
  }

  // Puts a page title into canonical form, in place:
  //  - '_', newline and tab become spaces, then leading and trailing spaces
  //    are removed (an all-blank title becomes empty and is left as is);
  //  - the first character is capitalized;
  //  - a leading localized template/category namespace name (as recorded in
  //    template_string / category_string) is replaced by "Template:" /
  //    "Category:";
  //  - spaces right after "Category:" / "Template:" are removed and the
  //    first character of the remainder is capitalized.
  void normalize(std::string &s)
  {
    for(std::string::size_type k=s.find_first_of("_\n\t");
        k!=std::string::npos;
        k=s.find_first_of("_\n\t",k+1))
      s[k]=' ';

    // Trim in one pass instead of erasing one leading character at a time.
    const std::string::size_type first=s.find_first_not_of(' ');
    if(first==std::string::npos) {
      s.clear();
      return;
    }
    const std::string::size_type last=s.find_last_not_of(' ');
    s=s.substr(first,last-first+1);

    s=capitalize(s);

    // The localized names are only known once the corresponding <namespace>
    // elements have been read. While one is still empty, skip its rewrite:
    // otherwise every title starting with ':' would be turned into a
    // "Template:" or "Category:" title.
    if(!template_string.empty() &&
       s.substr(0,template_string.size()+1)==template_string+":")
      s="Template:"+s.substr(template_string.size()+1);
    if(!category_string.empty() &&
       s.substr(0,category_string.size()+1)==category_string+":")
      s="Category:"+s.substr(category_string.size()+1);

    if(s.substr(0,10)=="Category: " || s.substr(0,10)=="Template: ") {
      do {
        s.erase(9,1);
      } while(s.size()>9 && s[9]==' ');
    }

    if(s.substr(0,9)=="Category:" || s.substr(0,9)=="Template:") {
      s=s.substr(0,9)+capitalize(s.substr(9));
    }
  }
  
  // Normalizes a link target, in place:
  //  1. numeric character references ("&#NNN;" or "&#xHHH;" / "&#XHHH;")
  //     with a non-zero value are replaced by their UTF-8 encoding as
  //     produced by utf8char();
  //  2. everything from the first '#' on is removed;
  //  3. the result is passed to normalize().
  void fullnormalize(std::string &s) {
    std::string::size_type k=0;

    while((k=s.find("&#",k))!=std::string::npos) {
      const std::string::size_type start=k;

      k+=2;

      // Bounds check first: "&#" may be the very end of the string.
      bool hexa=false;
      if(k<s.size() && (s[k]=='x' || s[k]=='X')) {
        hexa=true;
        ++k;
      }

      unsigned code=0;

      while(k<s.size()) {
        const char ch=s[k];
        unsigned digit;

        if(ch>='0' && ch<='9')
          digit=ch-'0';
        else if(hexa && ch>='A' && ch<='F')
          digit=ch-'A'+10;
        else if(hexa && ch>='a' && ch<='f')
          digit=ch-'a'+10;
        else
          break;

        code=code*(hexa?16:10)+digit;
        // Saturate instead of letting a long run of digits wrap around to
        // an unrelated code point; utf8char() maps any value >= 0x200000
        // to the empty string.
        if(code>0x200000)
          code=0x200000;
        ++k;
      }

      if(k<s.size() && code && s[k]==';') {
        s.replace(start,k-start+1,utf8char(code));
        // Resume after the substituted text so that a decoded '&' is not
        // interpreted as the start of another reference.
        k=start+1;
      }
    }

    k=s.find("#");
    if(k!=std::string::npos)
      s.erase(k);

    normalize(s);
  }

  // Tells whether s can be used as the title of a graph node.
  //
  // Leading spaces are skipped. A title is rejected when
  //  - it is empty or made of spaces only;
  //  - it starts with "anchor#" (local_anchor is then set to true);
  //  - it contains a ':' directly followed by a character other than a
  //    space, unless namespace_ok is true and the text before that ':' is
  //    exactly "Category", or exactly "Template" with template_ok true;
  //  - it starts with '#' (local_anchor is then set to true).
  // In every other case local_anchor is false on return.
  bool is_acceptable_title(const std::string &s, bool &local_anchor,
                           bool namespace_ok, bool template_ok)
  {
    local_anchor=false;

    const std::string::size_type begin=s.find_first_not_of(" ");
    if(begin==std::string::npos)
      return false;

    if(s.compare(begin,7,"anchor#")==0) {
      local_anchor=true;
      return false;
    }

    for(std::string::size_type colon=s.find(':',begin);
        colon!=std::string::npos;
        colon=s.find(':',colon+1)) {
      const std::string::size_type next=colon+1;

      // A ':' at the very end or followed by a space is harmless.
      if(next>=s.size() || s[next]==' ')
        continue;

      if(!namespace_ok)
        return false;

      const std::string prefix=s.substr(begin,colon-begin);
      if(strcmp(prefix.c_str(),"Category")==0)
        continue;
      if(template_ok && strcmp(prefix.c_str(),"Template")==0)
        continue;

      return false;
    }

    if(s[begin]=='#') {
      local_anchor=true;
      return false;
    }

    return true;
  }

  // Returns true when s contains a '#' immediately followed by the word
  // "REDIRECT", compared case-insensitively, anywhere in the text.
  bool is_redirect(const std::string &s)
  {
    // Compare in place: building a substring for each '#' would copy the
    // remainder of the article every time.
    const char *text=s.c_str();

    for(std::string::size_type k=s.find('#');
        k!=std::string::npos;
        k=s.find('#',k+1)) {
      // text+k+1 is at worst the terminating NUL of the string.
      if(strncasecmp(text+k+1,"REDIRECT",8)==0)
        return true;
    }
    
    return false;
  }

  // Removes from s, in place, every <noinclude>...</noinclude> section and
  // then every <!-- ... --> comment, delimiters included. For each kind of
  // section, removal stops at the first opening delimiter that has no
  // matching closing delimiter after it.
  void preprocess_text(std::string &s)
  {
    static const char *const delimiters[2][2]={
      { "<noinclude>", "</noinclude>" },
      { "<!--",        "-->"          }
    };

    for(int pass=0;pass<2;++pass) {
      const std::string opening(delimiters[pass][0]);
      const std::string closing(delimiters[pass][1]);

      std::string::size_type from=0;
      for(;;) {
        from=s.find(opening,from);
        if(from==std::string::npos)
          break;

        const std::string::size_type stop=
          s.find(closing,from+opening.size());
        if(stop==std::string::npos)
          break;

        s.erase(from,stop-from+closing.size());
      }
    }
  }

#ifdef GET_WORDS
  // Counts the words of s into `words` (stem -> number of occurrences).
  //
  // When `wiki` is true, innermost {{...}} template calls and [...] /
  // [...]] bracketed spans are removed from the text first. Words are
  // maximal runs of characters for which isalpha() or isdigit() holds; they
  // are lower-cased with tolower(), ignored when shorter than two
  // characters or present in stop_words_set, and stemmed with
  // porter_stemming() before being counted.
  void getwords(
      std::string s,
      std::map<std::string,int> &words,
      bool wiki)
  {
    std::string::size_type k=0;

    if(wiki) {
      while((k=s.find("{{",k))!=std::string::npos) {
        const std::string::size_type end=s.find_first_of("}{",k+2);
        // Same guard as in getlinks(): a brace that is the last character
        // of the text cannot be part of a closing "}}".
        if(end==std::string::npos || end==s.size()-1)
          break;
        if(s[end]=='{' || s[end+1]!='}') {
          k+=2;
          continue;
        }
        s.erase(k,end-k+2);
      }

      k=0;

      while((k=s.find("[",k))!=std::string::npos) {
        std::string::size_type end=s.find_first_of(']',k+1);
        if(end==std::string::npos)
          break;
        // Swallow a second ']' only if there is a character after `end`.
        if(end+1<s.size() && s[end+1]==']')
          ++end;
        s.erase(k,end-k+1);
      }

      k=0;
    }

    const std::string::size_type size=s.size();

    for(;;) {
      // The <cctype> functions require a value representable as unsigned
      // char; bytes of multi-byte UTF-8 sequences are negative as plain
      // char on many platforms, hence the casts.
      while(k<size) {
        const unsigned char c=static_cast<unsigned char>(s[k]);
        if(isalpha(c) || isdigit(c))
          break;
        ++k;
      }

      if(k==size)
        break;

      std::string word;

      while(k<size) {
        const unsigned char c=static_cast<unsigned char>(s[k]);
        if(!isalpha(c) && !isdigit(c))
          break;
        word+=static_cast<char>(tolower(c));
        ++k;
      }

      if(word.size()>1 &&
         stop_words_set.find(word)==stop_words_set.end())
        ++words[porter_stemming(word)];
    }
  }
#endif

  // Collects into `links` the titles referenced by the wikitext s.
  //
  //  s             wikitext to scan (taken by value; modified locally)
  //  links         output: one entry is appended per reference found
  //  article_title title of the page the text belongs to; links to it are
  //                not recorded
  //  words         word counts, only updated when GET_WORDS is defined
  //  redirect      forwarded as `template_ok` when validating [[...]]
  //                targets, i.e. "Template:" targets are only accepted
  //                when it is true
  //
  // Two passes are made: one over [[...]] links, one over {{...}} template
  // calls. Both only handle innermost constructs (no nested opening
  // delimiter before the closing one).
  void getlinks(
      std::string s,
      std::vector<std::string> &links,
      const std::string &article_title,
      std::map<std::string,int> &words,
      bool redirect)
  {
    std::string::size_type k=0;

    // Remove <nowiki>...</nowiki> sections, delimiters included.
    while((k=s.find("<nowiki>",k))!=std::string::npos) {
      std::string::size_type end=s.find("</nowiki>",k+8);
      if(end==std::string::npos)
        break;
      s.erase(k,end-k+9);
    }
    
    k=0;
    
    // Pass 1: [[target]] and [[target|label]] links.
    while((k=s.find("[[",k))!=std::string::npos) {
      std::string::size_type end=s.find_first_of("][",k+2);
      if(end==std::string::npos || end==s.size()-1)
        break;

      // Nested '[' or a single ']': not a complete innermost link, move on.
      if(s[end]=='[' || s[end+1]!=']') {
        k+=2;
        continue;
      }
      
      std::string::size_type middle=s.find("|",k+2);

      // The target is what precedes the first '|' inside the link, or the
      // whole content when there is none.
      std::string title;
      if(middle!=std::string::npos && middle<end) {
        title=s.substr(k+2,middle-k-2);
      } else {
        title=s.substr(k+2,end-k-2);
      }
      
      fullnormalize(title);
      bool local_anchor;
      if(is_acceptable_title(title,local_anchor,true,redirect)) {
#ifdef GET_WORDS
        // Count the words of the label if there is one, of the target
        // otherwise.
        if(middle!=std::string::npos && middle<end)
          getwords(s.substr(middle+1,end-middle-1),words,false);
        else
          getwords(title,words,false);
#endif
        if(title!=article_title)
          links.push_back(title);
      } else if(local_anchor && strncmp(article_title.c_str(),"Template:",9))
        // Target flagged as a local anchor and the page is not a template:
        // record a link from the page to itself.
        links.push_back(article_title);
      
      k=end+2;
    }
    
    k=0;

    // Pass 2: {{name}} and {{name|param|...}} template calls.
    while((k=s.find("{{",k))!=std::string::npos) {
      std::string::size_type end=s.find_first_of("}{",k+2);
      if(end==std::string::npos || end==s.size()-1)
        break;

      // Nested '{' or a single '}': not a complete innermost call, move on.
      if(s[end]=='{' || s[end+1]!='}') {
        k+=2;
        continue;
      }
      
      std::string::size_type middle=s.find("|",k+2);

      std::string title;
      if(middle!=std::string::npos && middle<end)
        title=s.substr(k+2,middle-k-2);
      else
        title=s.substr(k+2,end-k-2);
      
      fullnormalize(title);
      bool foo;
      // Template names carrying a namespace prefix are not accepted here.
      if(is_acceptable_title(title,foo,false,false)) {

        // For templates whose name starts with one of these words and that
        // have parameters, the parameters themselves are turned into links.
        if(middle!=std::string::npos && middle<end && (
            !strncmp(title.c_str(),"Main",4) ||
            !strncmp(title.c_str(),"See also",8) ||
            !strncmp(title.c_str(),"Further",7) ||
            !strncmp(title.c_str(),"Details",7) ||
            !strncmp(title.c_str(),"Seealso",7) ||
            !strncmp(title.c_str(),"See details",11) ||
            !strncmp(title.c_str(),"See",3)
             )) {
          std::string::size_type middle_end;

          // Walk the parameters: each one runs from `middle`+1 up to the
          // next '|' or '}'. The loop ends at the closing '}' of the call.
          do {
            middle_end=s.find_first_of("|}",middle+1);
            if(middle_end==middle+1) // Empty parameter, let's stop here
              break;

            std::string local_text=s.substr(middle+1,middle_end-middle-1);

            if(!strncmp(title.c_str(),"Further",7)) {
              // The parameter is treated as wikitext and scanned
              // recursively.
              preprocess_text(local_text);
#ifdef GET_WORDS
              getwords(local_text,words,true);
#endif
              getlinks(local_text,links,article_title,words,false);
            } else { 
              // The parameter is treated as a page title.
              fullnormalize(local_text);
              if(is_acceptable_title(local_text,foo,true,false)) {
                if(local_text!=article_title)
                  links.push_back(local_text);
              }
            }

            middle=middle_end;
          } while(middle_end!=end);
        }
          
        // The template itself is always recorded as a link.
        links.push_back("Template:"+title);
      }
      
      k=end+2;
    }
  }
   
  void SAXstartElement(void *ctx, const xmlChar *n, const xmlChar **atts)
  {
    SAXParserState *sax_state=reinterpret_cast<SAXParserState *>(ctx);

    std::string name=reinterpret_cast<const char *>(n);
        
    if(name=="title" || (name=="id" && sax_state->id.empty())
    || (name=="text" && !sax_state->title.empty() && 
        (sax_state->required_timestamp.empty() || 
         sax_state->timestamp<=sax_state->required_timestamp))
    || name=="timestamp") {
      sax_state->mode=SAXParserState::m_write;
      sax_state->characters="";
    } else if(name=="namespace") {
      sax_state->namespace_id=reinterpret_cast<const char*>(atts[1]);
      sax_state->mode=SAXParserState::m_write;
      sax_state->characters="";
    }
      
    if(name=="page") {
      if((sax_state->seen_articles % 1000)==0)
        sax_state->updateProgression();
      
      sax_state->seen_articles++;

      sax_state->title.clear();
      sax_state->id.clear();
      sax_state->timestamp.clear();
      sax_state->last_text.clear();
    }
  }
   
  // SAX callback invoked when an element closes.
  //
  //  ctx  the SAXParserState passed to the parser
  //  n    element name
  //
  // Stores the character data recorded since the matching start tag into
  // the relevant field of the state and, when a <page> with recorded text
  // closes, writes the page to the output streams:
  //  - NODES:    one title per line, in order of first appearance (the
  //              line rank matches the index assigned in `nodes`);
  //  - IDS:      "<page id>:<title>";
  //  - REDIRECT: "<node> <target node>" for redirect pages, "_" standing
  //              for a redirect without any extracted link;
  //  - EDGES:    "<node> <linked node> <linked node> ..." for other pages;
  //  - WORDS:    "<page id> <count>/<stem> ..." (GET_WORDS only).
  void SAXendElement(void *ctx, const xmlChar *n)
  {
    SAXParserState *sax_state=reinterpret_cast<SAXParserState *>(ctx);

    std::string name=reinterpret_cast<const char *>(n);

    if(name=="title") {
      sax_state->mode=SAXParserState::m_skip;
      
      // Titles that are not acceptable leave `title` empty, which in turn
      // prevents SAXstartElement() from recording the page's <text>.
      normalize(sax_state->characters);
      bool foo;
      if(is_acceptable_title(sax_state->characters,foo,true,true)) {
        sax_state->title=sax_state->characters;
      }
    } else if(name=="id" && sax_state->mode==SAXParserState::m_write) {
      sax_state->mode=SAXParserState::m_skip;

      sax_state->id=sax_state->characters;
    } else if(name=="timestamp") {
      sax_state->mode=SAXParserState::m_skip;
      sax_state->timestamp=sax_state->characters;
    } else if(name=="text" && sax_state->mode==SAXParserState::m_write) {
      // Each recorded <text> overwrites the previous one of the same page.
      sax_state->mode=SAXParserState::m_skip;
      sax_state->last_text=sax_state->characters;
    } else if(name=="page" && !sax_state->last_text.empty()) {
      std::vector<std::string> links;
      // Redirect detection is done before <noinclude> sections and
      // comments are stripped from the text.
      bool redirect=is_redirect(sax_state->last_text);

      preprocess_text(sax_state->last_text);

      std::map<std::string,int> words;

#ifdef GET_WORDS
      getwords(sax_state->title,words,true);
      getwords(sax_state->last_text,words,true);
#endif

      getlinks(sax_state->last_text,links,sax_state->title,words,redirect);

#ifdef GET_WORDS
      *sax_state->WORDS << sax_state->id;
      for(std::map<std::string,int>::const_iterator it=words.begin(),
                                                 itend=words.end();
          it!=itend;
          ++it) {
        *sax_state->WORDS << " " << it->second << "/" << it->first;
      }
      *sax_state->WORDS << "\n";
#endif

      // Register the page itself as a node if no earlier page linked to it.
      std::tr1::unordered_map<std::string,index_t>::const_iterator it=
        sax_state->nodes.find(sax_state->title);
      
      if(it==sax_state->nodes.end()) {
        *sax_state->NODES << sax_state->title << "\n";
        sax_state->nodes[sax_state->title]=sax_state->node_nb++;
        it=sax_state->nodes.find(sax_state->title);
      }
        
      *sax_state->IDS << sax_state->id << ":" << sax_state->title << "\n";

      if(redirect) {
        // `it` is only dereferenced here, before any further insertion
        // into `nodes`.
        *sax_state->REDIRECT << it->second << " ";
         if(links.empty())
           *sax_state->REDIRECT << "_";
         else {
           // The redirect target is the first link found in the text.
           // Note that this reference shadows the bool `redirect` above.
           const std::string &redirect=*links.begin();
           if(sax_state->nodes.find(redirect)==sax_state->nodes.end()) {
             *sax_state->NODES << redirect << "\n";
             sax_state->nodes[redirect]=sax_state->node_nb++;
           }
           
           *sax_state->REDIRECT << sax_state->nodes[redirect];
         }
        *sax_state->REDIRECT << "\n";
      } else {
        // Same remark: `it` is used before the loop inserts new nodes.
        *sax_state->EDGES << it->second;
        
        for(std::vector<std::string>::const_iterator jt=links.begin(),
                                                jtend=links.end();
            jt!=jtend;
            ++jt) {
          if(sax_state->nodes.find(*jt)==sax_state->nodes.end()) {
            *sax_state->NODES << *jt << "\n";
            sax_state->nodes[*jt]=sax_state->node_nb++; 
          }
            
          *sax_state->EDGES << " " << sax_state->nodes[*jt];
        }

        *sax_state->EDGES << "\n";
      }

      sax_state->processed_articles++;
    } else if(name=="namespace") {
      // Remember the localized names of the template ("10") and category
      // ("14") namespaces for normalize().
      sax_state->mode=SAXParserState::m_skip;
      if(sax_state->namespace_id=="10")
        template_string=sax_state->characters;
      else if(sax_state->namespace_id=="14")
        category_string=sax_state->characters;
    }
  }

  void SAXcharacters(void *ctx, const xmlChar *c, int len)
  {
    SAXParserState *sax_state=reinterpret_cast<SAXParserState *>(ctx);

    if(sax_state->mode==SAXParserState::m_write)
      sax_state->characters+=std::string(reinterpret_cast<const char *>(c),len);
  }

  // Table of SAX callbacks handed to xmlSAXUserParseFile() by main().
  // Only element start/end and the three character-data events are
  // handled; ignorable whitespace and CDATA blocks are routed to the same
  // function as ordinary characters.
  // NOTE(review): the five trailing unnamed entries presumably correspond
  // to the fields libxml2 declares after externalSubset (initialized,
  // _private, startElementNs, endElementNs, serror) -- confirm against the
  // libxml2 headers in use.
  xmlSAXHandler WikiSaxHandler = {
    0,                             // internalSubset
    0,                             // isStandalone
    0,                             // hasInternalSubset
    0,                             // hasExternalSubset
    0,                             // resolveEntity
    0,                             // getEntity
    0,                             // entityDecl
    0,                             // notationDecl
    0,                             // attributeDecl
    0,                             // elementDecl
    0,                             // unparsedEntityDecl
    0,                             // setDocumentLocator
    0,                             // startDocument
    0,                             // endDocument
    SAXstartElement,               // startElement
    SAXendElement,                 // endElement
    0,                             // reference
    SAXcharacters,                 // characters
    SAXcharacters,                 // ignorableWhitespace
    0,                             // processingInstruction
    0,                             // comment
    0,                             // warning
    0,                             // error
    0,                             // fatalError
    0,                             // getParameterEntity
    SAXcharacters,                 // cdataBlock
    0,                             // externalSubset
    0,
    0,
    0,
    0,
    0
  };

  // Opens `path` for writing and returns the new stream, which the caller
  // owns. A warning is printed on std::cerr when the file cannot be
  // opened; the (failed) stream is still returned so that later writes are
  // harmless no-ops instead of null-pointer dereferences.
  std::ostream *open_output(const char *path)
  {
    std::ostream *out=new std::ofstream(path);
    if(!*out)
      std::cerr << "Warning: cannot open output file '" << path
                << "' for writing\n";
    return out;
  }

  // Builds the parser state: counters start at zero, the mode is m_skip,
  // and the output files "nodes", "ids", "edges", "redirect" (plus "words"
  // when GET_WORDS is defined) are created in the current directory.
  // The stop word set is filled from stop_words[].
  //
  //  t  required timestamp, or an empty string for no constraint
  SAXParserState::SAXParserState(const std::string &t) :
    node_nb(0),mode(m_skip),
    NODES(0),IDS(0),EDGES(0),REDIRECT(0),
    WORDS(0),  // stays null unless GET_WORDS is defined
    processed_articles(0),seen_articles(0),
    required_timestamp(t)
  {
    NODES=open_output("nodes");
    IDS=open_output("ids");
    EDGES=open_output("edges");
    REDIRECT=open_output("redirect");
#ifdef GET_WORDS
    WORDS=open_output("words");
#endif
    for(const char **p=stop_words;*p!=0;++p)
      stop_words_set.insert(*p);
  }

  // Prints the final progression line followed by a newline on std::cerr,
  // then destroys the output streams created by the constructor.
  SAXParserState::~SAXParserState()
  {
    updateProgression();
    std::cerr << "\n";

    std::ostream *const always_open[]={ NODES, IDS, EDGES, REDIRECT };
    const unsigned count=sizeof(always_open)/sizeof(always_open[0]);
    for(unsigned i=0;i<count;++i)
      delete always_open[i];

#ifdef GET_WORDS
    delete WORDS;
#endif
  }

  // Rewrites the current terminal line on std::cerr as
  // "<processed> / <seen>", each counter right-aligned on 7 columns.
  void SAXParserState::updateProgression() const {
    std::ostream &err=std::cerr;

    err << "\r";
    err << std::setw(7) << processed_articles;
    err << " / ";
    err << std::setw(7) << seen_articles;
  }

  // Words ignored by getwords(), in lower case. The table is terminated by
  // a null pointer and is copied into stop_words_set by the
  // SAXParserState constructor.
  // NOTE(review): "amoungst" and "fify" look like misspellings (of
  // "amongst" and "fifty"); they are kept as is since changing the table
  // changes which words get counted.
  const char * stop_words[]={
    "0",
    "1",
    "10",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    "a",
    "about",
    "above",
    "across",
    "after",
    "afterwards",
    "again",
    "against",
    "all",
    "almost",
    "alone",
    "along",
    "already",
    "also",
    "although",
    "always",
    "am",
    "among",
    "amongst",
    "amoungst",
    "amount",
    "an",
    "and",
    "another",
    "any",
    "anyhow",
    "anyone",
    "anything",
    "anyway",
    "anywhere",
    "are",
    "around",
    "as",
    "at",
    "back",
    "be",
    "became",
    "because",
    "become",
    "becomes",
    "becoming",
    "been",
    "before",
    "beforehand",
    "behind",
    "being",
    "below",
    "beside",
    "besides",
    "between",
    "beyond",
    "bill",
    "both",
    "bottom",
    "but",
    "by",
    "call",
    "can",
    "cannot",
    "cant",
    "co",
    "computer",
    "con",
    "could",
    "couldnt",
    "cry",
    "de",
    "describe",
    "detail",
    "do",
    "done",
    "down",
    "due",
    "during",
    "each",
    "eg",
    "eight",
    "either",
    "eleven",
    "else",
    "elsewhere",
    "empty",
    "enough",
    "etc",
    "even",
    "ever",
    "every",
    "everyone",
    "everything",
    "everywhere",
    "except",
    "few",
    "fifteen",
    "fify",
    "fill",
    "find",
    "fire",
    "first",
    "five",
    "for",
    "former",
    "formerly",
    "forty",
    "found",
    "four",
    "from",
    "front",
    "full",
    "further",
    "get",
    "give",
    "go",
    "had",
    "has",
    "hasnt",
    "have",
    "he",
    "hence",
    "her",
    "here",
    "hereafter",
    "hereby",
    "herein",
    "hereupon",
    "hers",
    "herself",
    "him",
    "himself",
    "his",
    "how",
    "however",
    "hundred",
    "i",
    "ie",
    "if",
    "in",
    "inc",
    "indeed",
    "interest",
    "into",
    "is",
    "it",
    "its",
    "itself",
    "keep",
    "last",
    "latter",
    "latterly",
    "least",
    "less",
    "ltd",
    "made",
    "many",
    "may",
    "me",
    "meanwhile",
    "might",
    "mill",
    "mine",
    "more",
    "moreover",
    "most",
    "mostly",
    "move",
    "much",
    "must",
    "my",
    "myself",
    "name",
    "namely",
    "neither",
    "never",
    "nevertheless",
    "next",
    "nine",
    "no",
    "nobody",
    "none",
    "noone",
    "nor",
    "not",
    "nothing",
    "now",
    "nowhere",
    "of",
    "off",
    "often",
    "on",
    "once",
    "one",
    "only",
    "onto",
    "or",
    "other",
    "others",
    "otherwise",
    "our",
    "ours",
    "ourselves",
    "out",
    "over",
    "own",
    "part",
    "per",
    "perhaps",
    "please",
    "put",
    "rather",
    "re",
    "reuter",
    "reuters",
    "said",
    "same",
    "see",
    "seem",
    "seemed",
    "seeming",
    "seems",
    "serious",
    "seven",
    "several",
    "she",
    "should",
    "show",
    "side",
    "since",
    "sincere",
    "six",
    "sixty",
    "so",
    "some",
    "somehow",
    "someone",
    "something",
    "sometime",
    "sometimes",
    "somewhere",
    "still",
    "such",
    "system",
    "take",
    "ten",
    "than",
    "that",
    "the",
    "their",
    "them",
    "themselves",
    "then",
    "thence",
    "there",
    "thereafter",
    "thereby",
    "therefore",
    "therein",
    "thereupon",
    "these",
    "they",
    "thick",
    "thin",
    "third",
    "this",
    "those",
    "though",
    "three",
    "through",
    "throughout",
    "thru",
    "thus",
    "to",
    "together",
    "told",
    "too",
    "top",
    "toward",
    "towards",
    "twelve",
    "twenty",
    "two",
    "un",
    "under",
    "until",
    "up",
    "upon",
    "us",
    "very",
    "via",
    "was",
    "we",
    "well",
    "were",
    "what",
    "whatever",
    "when",
    "whence",
    "whenever",
    "where",
    "whereafter",
    "whereas",
    "whereby",
    "wherein",
    "whereupon",
    "wherever",
    "whether",
    "which",
    "while",
    "whither",
    "who",
    "whoever",
    "whole",
    "whom",
    "whose",
    "why",
    "will",
    "with",
    "within",
    "without",
    "would",
    "yet",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "zero",
    0
  };
}

