// Copyright 2000 by Kevin Atkinson under the terms of the LGPL

// suggest.cc Suggestion code for Aspell

// The magic behind my spell checker comes from merging Lawrence
// Philips' excellent metaphone algorithm and Ispell's near miss
// strategy, which is inserting a space or hyphen, interchanging two
// adjacent letters, changing one letter, deleting a letter, or adding
// a letter.
// 
// The process goes something like this.
// 
// 1.     Convert the misspelled word to its soundslike equivalent (its
//        metaphone for English words).
// 
// 2.     Find words that have the same soundslike pattern.
//
// 3.     Find words that have similar soundslike patterns. A similar
//        soundslike pattern is a pattern that is obtained by
//        interchanging two adjacent letters, changing one letter,
//        deleting a letter, or adding a letter.
//
// 4.     Score the result list and return the words with the lowest
//        score. The score is roughly the weighted average of the edit
//        distance of the word to the misspelled word, the soundslike
//        equivalent of the two words, and the phoneme of the two words.
//        The edit distance is the weighted total of the number of
//        deletions, insertions, exchanges, or adjacent swaps needed to
//        make one string equivalent to the other.
//
// Please note that the soundslike equivalent is a rough approximation
// of how the word sounds. It is not the phoneme of the word by any
// means.  For more information on the metaphone algorithm please see
// the file metaphone.cc, which includes a detailed description of it.


// NOTES TO SELF
// Consider jump starting the distance algo. when two words have the same
// prefix
// Consider short circuiting the distance algo. when two words have the same
// suffix

#include "manager.hh"
#include "amanager.hh"
#include "suggest.hh"
#include "asuggest.hh"
#include "editdist.hh"
#include "leditdist.hh"
#include "editdist2.hh"
#include "language.hh"
#include <slist>
#include <hash_set>
#include <vector>
#include "hash_string_s.hh"
#include "data.hh"
#include "config.hh"

#include "clone_ptr-t.hh"

//#define DEBUG_SUGGEST

namespace aspell_default_suggest {

  using namespace aspell;
  using namespace autil;
  
  // Container that receives the finished suggestions: Working::transfer
  // appends to it and SuggestionListImpl stores one as its result list.
  typedef vector<string> NearMissesFinal;

  // Returns an iterator to the element following `i`.  `i` is taken by
  // value, so the caller's iterator is left where it was.
  template <class Iterator>
  inline Iterator preview_next (Iterator i) {
    Iterator following = i;
    ++following;
    return following;
  }
  
  //
  // OrignalWord stores information about the original misspelled word
  //   for convenience and speed.
  //
  struct OrignalWord {
    string   word;        // the word as it was handed in
    string   word_lower;  // lower-cased form (empty unless the 5-arg ctor is used)
    string   soundslike;  // soundslike form of the word
    string   phoneme;     // phoneme form of the word
    Language::CasePattern  case_pattern;  // case pattern of `word`

    // case_pattern is value-initialized in the constructors that are not
    // given one, so it never holds an indeterminate value.
    OrignalWord() : case_pattern() {}
    OrignalWord (const string &w, const string &sl, const string &p)
      : word(w), soundslike(sl), phoneme(p), case_pattern() {}
    OrignalWord (const string &w, const string &sl, const string &p,
                 const string &l, Language::CasePattern cp)
      : word(w), word_lower(l), soundslike(sl), phoneme(p), case_pattern(cp) 
    {}
  };

  //
  // struct ScoreWordSound - used for storing the possible words while
  //   they are being processed.
  //

  struct ScoreWordSound {
    const char *  word;
    const char *  word_lower;
    Language::CasePattern  case_pattern;
    int           score;
    int           soundslike_score;
    bool          count;
    ReplacementList::VirEmul * repl_list;
    ScoreWordSound() {repl_list = 0;}
    ~ScoreWordSound() {delete repl_list;}
  private:
    ScoreWordSound & operator=(const ScoreWordSound &);
  };

  // Three-way comparison of two candidates: ordered by score first and,
  // when the scores are equal, by strcmp on the word.  The result is
  // negative, zero or positive in the manner of strcmp.
  inline int compare (const ScoreWordSound &lhs, 
                      const ScoreWordSound &rhs) 
  {
    const int score_delta = lhs.score - rhs.score;
    return score_delta != 0 ? score_delta : strcmp(lhs.word, rhs.word);
  }

  // True when lhs orders strictly before rhs according to compare().
  inline bool operator < (const ScoreWordSound & lhs, 
                          const ScoreWordSound & rhs) {
    const int order = compare(lhs, rhs);
    return order < 0;
  }

  // True when lhs orders before, or equal to, rhs according to compare().
  inline bool operator <= (const ScoreWordSound & lhs, 
                           const ScoreWordSound & rhs) {
    const int order = compare(lhs, rhs);
    return order <= 0;
  }

  // True when compare() considers both entries equal (same score and
  // the same word text).
  inline bool operator == (const ScoreWordSound & lhs, 
                           const ScoreWordSound & rhs) {
    const int order = compare(lhs, rhs);
    return order == 0;
  }

  // Singly linked list of candidates.  Working keeps two of these
  // (scored and not yet scored) and moves nodes between them with
  // splice_after.
  typedef slist<ScoreWordSound> NearMisses;
 
  class Score {
  protected:
    const Language * lang;
    OrignalWord      orignal_word;
    SuggestParms     parms;

  public:
    Score(const Language *l, const string &w, const SuggestParms & p)
      : lang(l), orignal_word(w, lang->to_soundslike(w.c_str()), 
			      lang->to_phoneme(w.c_str()),
			      lang->to_lower(w.c_str()), 
			      lang->case_pattern(w.c_str())),
      parms(p)
    {
      //double total = parms.word_weight + parms.soundslike_weight;
      //if (lang->have_phoneme()) 
      //total += parms.phoneme_weight;

      //parms.word_weight = parms.word_weight/total;
      //parms.soundslike_weight = parms.soundslike_weight/total;
      //if (lang->have_phoneme()) 
      //parms.phoneme_weight = parms.phoneme_weight/total;
      //else
      //parms.phoneme_weight = 0;
    }
    string fix_case(const string & word) {
      return lang->fix_case(orignal_word.case_pattern,word.c_str());
    }
  };

  class Working : public Score {

    int threshold;

    const Manager    * manager;
    NearMisses         scored_near_misses;
    NearMisses         near_misses;
    NearMissesFinal  * near_misses_final;

    slist<string>      strings;

    static const bool do_count = true;
    static const bool dont_count = false;
    static const bool do_need_alloc = true;
    static const bool dont_need_alloc = false;

    void try_sound(const char *, int ms);
    void add_nearmiss(const char * word, int ms, bool count, 
		      bool need_alloc, ReplacementList::VirEmul * rl = 0) {
      near_misses.push_front(ScoreWordSound());
      ScoreWordSound & d = near_misses.front();
      if (need_alloc) {
	strings.push_front(word);
	d.word = strings.front().c_str();
      } else {
	d.word = word;
      }
      d.case_pattern = lang->case_pattern(word);
      if (d.case_pattern != Language::all_lower) {
	strings.push_front(lang->to_lower(word));
	d.word_lower = strings.front().c_str();
      } else {
	d.word_lower = d.word;
      }
      d.soundslike_score = ms;
      d.count = count;
      d.repl_list = rl;
    }
    int needed_level(int want, int soundslike_score) {
      int n = (100*want - parms.soundslike_weight*soundslike_score)
	/(parms.word_weight*parms.edit_distance_weights.min);
      return n > 0 ? n : 0;
    }
    int weighted_average(int soundslike_score, int word_score) {
      return (parms.word_weight*word_score 
	      + parms.soundslike_weight*soundslike_score)/100;
    }

    void try_others();
    void score_list();
    void transfer();
  public:
    Working(const Manager * m, const Language *l,
	    const string & w, const SuggestParms & p)
      : Score(l,w,p), threshold(1), manager(m) {}
    void get_suggestions(NearMissesFinal &sug);
    void get_suggestions_ultra(NearMissesFinal &sug);
  };

  //
  // try_sound - tries the soundslike string; if there is a match, adds
  //    the possible words to near_misses
  //

  void Working::try_sound (const char * m, int ms)  
  {
    // sound is the object in the list which is a lot smaller than m

    for (Manager::DataSetCollection::const_iterator i 
	   = manager->word_set_collection().begin();
	 i != manager->word_set_collection().end();
	 ++i) {
      
      if (!i->use_to_suggest) continue;

      if (i->word_set->basic_type == DataSet::basic_word_set) {

	Emulation<const char *> e = static_cast<const BasicWordSet *>
	  (i->word_set)->words_w_soundslike(m);
	const char * word;
	while ((word = e.next()) != 0) 
	  add_nearmiss(word, ms, do_count, dont_need_alloc);
	
      } else {

	Emulation<ReplacementList> e 
	  = static_cast<const BasicReplacementSet *>
	  (i->word_set)->repls_w_soundslike(m);
	ReplacementList repl;
	while (! (repl = e.next()).empty() )
	  add_nearmiss(repl.misspelled_word, ms, 
		       dont_count, dont_need_alloc, repl.elements);	  
      }
    }
  }

  //
  // try_others - tries to come up with possible suggestions
  //
  
  void Working::try_others () {

    const char *replace_list = lang->soundslike_chars();
    
    const string & word       = orignal_word.word;
    const string & soundslike = orignal_word.soundslike;
    
    string::size_type i;
    
    string new_soundslike;
    new_soundslike.reserve(soundslike.size() + 1);

    char a,b;
    const char * c;

    // Insert a space or hyphone

    if (word.size() >= 4) {

      char * new_word = new char[word.size() + 2];
      strncpy(new_word, word.data(), word.size());
      new_word[word.size() + 1] = '\0';
      new_word[word.size() + 0] = new_word[word.size() - 1];

      for (i = word.size() - 2; i >= 2; --i) {
	new_word[i+1] = new_word[i];
	new_word[i] = '\0';
	
	if (manager->check(new_word) && manager->check(new_word + i + 1)) {
	  new_word[i] = ' ';
	  add_nearmiss(new_word, parms.edit_distance_weights.del2,
		       dont_count, do_need_alloc);

	  new_word[i] = '-';
	  add_nearmiss(new_word, parms.edit_distance_weights.del2,
		       dont_count, do_need_alloc);
	}
      }
      
      delete[] new_word;
    }

    if (parms.soundslike_level == 1) {
      
      try_sound(soundslike.c_str(), 0);

      // Change one letter
      
      new_soundslike = soundslike;

      for (i = 0; i != soundslike.size(); ++i) {
	for (c=replace_list; *c; ++c) {
	  if (*c == soundslike[i]) continue;
	  new_soundslike[i] = *c;
	  try_sound(new_soundslike.c_str(),parms.edit_distance_weights.sub);
	}
	new_soundslike[i] = soundslike[i];
      }

      // Interchange two adjacent letters.

      for (i = 0; i+1 != soundslike.size(); ++i) {
	a = new_soundslike[i];
	b = new_soundslike[i+1];
	new_soundslike[i] = b;
	new_soundslike[i+1] = a;
	try_sound(new_soundslike.c_str(),parms.edit_distance_weights.swap);
	new_soundslike[i] = a;
	new_soundslike[i+1] = b;
      }

      // Add one letter

      new_soundslike += ' ';
      i = new_soundslike.size()-1;
      while(true) {
	for (c=replace_list; *c; ++c) {
	  new_soundslike[i] = *c;
	  try_sound(new_soundslike.c_str(),parms.edit_distance_weights.del1);
	}
	if (i == 0) break;
	new_soundslike[i] = new_soundslike[i-1];
	--i;
      }
    
      // Delete one letter

      if (soundslike.size() > 1) {
	new_soundslike = soundslike;
	a = new_soundslike[new_soundslike.size() - 1];
	new_soundslike.resize(new_soundslike.size() - 1);
	i = new_soundslike.size();
	while (true) {
	  try_sound(new_soundslike.c_str(),parms.edit_distance_weights.del2);
	  if (i == 0) break;
	  b = a;
	  a = new_soundslike[i-1];
	  new_soundslike[i-1] = b;
	  --i;
	}
      }

    } else {

      const char * orignal_soundslike = orignal_word.soundslike.c_str();

      for (Manager::DataSetCollection::const_iterator i 
	     = manager->word_set_collection().begin();
	   i != manager->word_set_collection().end();
	   ++i) {

	if (!i->use_to_suggest) continue;
      
	if (i->word_set->basic_type == DataSet::basic_word_set) {

	  const BasicWordSet * word_set 
	    = static_cast<const BasicWordSet *>(i->word_set);

	  Emulation<SoundslikeWord> els = word_set->soundslike_elements();
    
	  SoundslikeWord w;
	  while ( (w = els.next()) == true) {
	    int score = limit2_edit_distance(orignal_soundslike, 
					     w.soundslike,
					     parms.edit_distance_weights);
	    if (score < LARGE_NUM) {
	      Emulation<const char *> e = word_set->words_w_soundslike(w);
	      const char * word;
	      while ((word = e.next()) != 0)
		add_nearmiss(word, score, do_count, dont_need_alloc);
	    }
	  }

	} else {
	
	  const BasicReplacementSet * repl_set
	    = static_cast<const BasicReplacementSet *>(i->word_set);

	  Emulation<SoundslikeWord> els = repl_set->soundslike_elements();
	
	  SoundslikeWord w;
	  while ( (w = els.next()) == true) {
	    int score = limit2_edit_distance(orignal_soundslike, 
					     w.soundslike,
					     parms.edit_distance_weights);
	  
	    if (score < LARGE_NUM) {
	      Emulation<ReplacementList> e = repl_set->repls_w_soundslike(w);
	      ReplacementList repl;
	      while (! (repl = e.next()).empty() )
		add_nearmiss(repl.misspelled_word, score, 
			     dont_count, dont_need_alloc, repl.elements);
	    }
	  
	  }
	
	}
      }
    }
  }

  // Scores the candidates in near_misses and moves the ones that are
  // good enough into scored_near_misses, which ends up sorted by score.
  //
  // Phase one raises the target score try_for step by step, scoring
  // candidates against it, until enough counted candidates fall within
  // try_for or nothing is left to score.  The result fixes `threshold`.
  // Phase two re-examines the remaining candidates with the wider limit
  // implied by `threshold`.
  void Working::score_list() {
    if (near_misses.empty()) return;

    parms.set_original_word_size(orignal_word.word.size());

    NearMisses::iterator i;
    NearMisses::iterator prev;
    int word_score;

    // Both lists get a sentinel node at the front so that there is always
    // a valid "previous" position to hand to splice_after.
    near_misses.push_front(ScoreWordSound());
    // the first item will NEVER be looked at.
    scored_near_misses.push_front(ScoreWordSound());
    scored_near_misses.front().score = -1;
    // this item will only be looked at when sorting so 
    // make it a small value to keep it at the front.

    // One step of the target score; the loop below adds another step on
    // every pass, including the first.
    int try_for = (parms.word_weight*parms.edit_distance_weights.max)/100;
    while (true) {
      try_for += (parms.word_weight*parms.edit_distance_weights.max)/100;

      // move every candidate whose word edit distance can be computed
      // within the level allowed by try_for into the scored list

      // prev always trails i by one node, as splice_after needs the
      // position before the node to move.
      prev = near_misses.begin();
      i = prev;
      ++i;
      while (i != near_misses.end()) {

	int level = needed_level(try_for,  i->soundslike_score);
	
	// only bother with the edit distance when the allowed level is at
	// least the soundslike score expressed in minimum-weight edits
	if (level >= int(i->soundslike_score/parms.edit_distance_weights.min))
	  word_score = edit_distance(orignal_word.word_lower.c_str(),
				     i->word_lower,
				     level, level,
				     parms.edit_distance_weights);
	else
	  word_score = LARGE_NUM;

	if (word_score < LARGE_NUM) {
	  i->score = weighted_average(i->soundslike_score, word_score);
	  
	  // advance i first: the node after prev (the one just scored) is
	  // about to leave this list; prev itself stays where it is
	  ++i;
	  scored_near_misses.splice_after(scored_near_misses.begin(),prev);
	  
	} else {

	  prev = i;
	  ++i;
	}
      }

      scored_near_misses.sort();

      // step over the sentinel
      i = scored_near_misses.begin();
      ++i;

      // nothing scored yet: try again with a larger try_for
      if (i == scored_near_misses.end()) continue;
      
      // Walk forward until parms.skip counted entries have been passed
      // (entries whose count flag is false are stepped over without being
      // counted) or the last entry is reached.
      int k = 0;
      while (preview_next(i) != scored_near_misses.end()) 
      // skip over the first couple of items as they should
      // not be counted in the threshold score.
	{
	  if (!i->count) {
	    ++i;
	  } else if (k == parms.skip) {
	    break;
	  } else {
	    ++k;
	    ++i;
	  }
	}

      // prev still equal to begin() means no unscored node was left
      // behind by the pass above
      if ((k == parms.skip && i->score <= try_for) 
	  || prev == near_misses.begin() ) // or no more left in near_misses
	break;
    }
    
    // The cut-off is the score of the entry reached above plus parms.span,
    // but never less than the largest single edit weight.
    threshold = i->score + parms.span;
    if (threshold < parms.edit_distance_weights.max)
      threshold = parms.edit_distance_weights.max;

#  ifdef DEBUG_SUGGEST
    cout << "Threshold is: " << threshold << endl;
    cout << "try_for: " << try_for << endl;
    cout << "Size of scored: " << scored_near_misses.size() << endl;
    cout << "Size of ! scored: " << near_misses.size() << endl;
#  endif

    //if (threshold - try_for <=  parms.edit_distance_weights.max/2) return;

    // Phase two: give the candidates that are still unscored another
    // chance, this time between the level already tried (try_for) and the
    // level allowed by threshold.
    prev = near_misses.begin();
    i = prev;
    ++i;
    while (i != near_misses.end()) {

      int initial_level = needed_level(try_for, i->soundslike_score);
      int max_level = needed_level(threshold, i->soundslike_score);
      
      if (initial_level < max_level)
	word_score = edit_distance(orignal_word.word_lower.c_str(),
				   i->word_lower,
				   initial_level+1,max_level,
				   parms.edit_distance_weights);
      else
	word_score = LARGE_NUM;

      if (word_score < LARGE_NUM) {
	i->score = weighted_average(i->soundslike_score, word_score);
	
	++i;
	scored_near_misses.splice_after(scored_near_misses.begin(),prev);
      
      } else {

	prev = i;
	++i;

      }
    }

    // drop the sentinel (still at the front, as the splices above insert
    // after it) and put the additions in order
    scored_near_misses.pop_front();
    scored_near_misses.sort();

  }

  // Thin wrapper around string that adds nothing but a distinct type
  // name; it is used as the element type of the duplicate-check hash
  // set in Working::transfer.
  class String : public string {
  public:
    String() : string() {}
    String(const char * s) : string(s) {}
    String(const string & s) : string(s) {}
  };

  void Working::transfer() {

#  ifdef DEBUG_SUGGEST
    cout << endl << endl 
       << orignal_word.word << '\t' 
       << orignal_word.soundslike << '\t'
       << endl;
#  endif
    int c = 1;
    hash_set<String,HashString<String> > duplicates_check;
    string final_word;
    pair<hash_set<String,HashString<String> >::iterator, bool> dup_pair;
    for (NearMisses::const_iterator i = scored_near_misses.begin();
	 i != scored_near_misses.end() && c <= parms.limit
	   && ( i->score <= threshold || c <= 3 );
	 ++i, ++c) {
#    ifdef DEBUG_SUGGEST
      cout << i->word << '\t' << i->score 
           << '\t' << lang->to_soundslike(i->word) << endl;
#    endif
      if (i->repl_list != 0) {
	const char * word;
	while((word = i->repl_list->next()) != 0) {
	  dup_pair = duplicates_check.insert(fix_case(word));
	  if (dup_pair.second)
	    near_misses_final->push_back(*dup_pair.first);
	}
      } else {
	dup_pair = duplicates_check.insert(fix_case(i->word));
	if (dup_pair.second)
	  near_misses_final->push_back(*dup_pair.first);
      }
    }
  }
  
  // Entry point: fills sug with suggestions for the original word.
  // Nothing is added when the word has an empty soundslike form.
  void Working::get_suggestions(NearMissesFinal & sug) {
    near_misses_final = & sug;
    if (!orignal_word.soundslike.empty()) {
      try_others();   // gather the candidates
      score_list();   // score them and work out the threshold
      transfer();     // copy the best ones into sug
    }
  }
  
  //
  // class SuggestionListImpl - SuggestionList backed by a
  //   NearMissesFinal container.
  //
  class SuggestionListImpl : public SuggestionList {

    // Policy handed to MakeVirEmulation: walks a NearMissesFinal up to
    // `end`, yielding each entry as a C string and 0 once finished.
    struct Parms {
      typedef const char *                    Value;
      typedef NearMissesFinal::const_iterator Iterator;
      Iterator end;
      Parms(Iterator e) : end(e) {}
      bool endf(Iterator e) const {
        return e == end;
      }
      Value end_state() const {
        return 0;
      }
      Value deref(Iterator i) const {
        return i->c_str();
      }
    };

  public:
    NearMissesFinal suggestions;

    // Returns a heap-allocated copy of this list.
    SuggestionList * clone() const {
      return new SuggestionListImpl(*this);
    }

    // Copies the contents of other, which is treated as a
    // SuggestionListImpl without any run-time check.
    void assign(const SuggestionList * other) {
      const SuggestionListImpl * src
        = static_cast<const SuggestionListImpl *>(other);
      *this = *src;
    }

    bool empty() const { 
      return suggestions.empty(); 
    }

    Size size() const { 
      return suggestions.size(); 
    }

    // Returns a newly allocated emulation over the suggestions.
    VirEmul * elements() const {
      const Parms stop_at(suggestions.end());
      return new MakeVirEmulation<Parms>(suggestions.begin(), stop_at);
    }
  };

  //
  // class SuggestImpl - the Suggest implementation of this file.  It
  //   owns the list object that suggest() returns by reference, so
  //   every call to suggest() overwrites the previous result.
  //
  class SuggestImpl : public Suggest {
    const Manager * manager_;
    SuggestionListImpl  suggestion_list;
    SuggestParms parms_;
  public:
    // Builds the parameters from the "sug-mode" configuration value.
    SuggestImpl(const Manager * m) 
      : manager_(m), parms_(m->config().retrieve("sug-mode")) {}

    // Uses the given parameters as they are.
    SuggestImpl(const Manager * m, const SuggestParms & p) 
      : manager_(m), parms_(p) {}

    // Not implemented: always returns -1.  (The body once built a Score
    // for `base` and scored `other` against it; that code is disabled.)
    double score(const char *base, const char *other) {
      return -1;
    }

    // Computes the suggestions for word and returns a reference to the
    // internal list holding them.
    SuggestionList & suggest(const char * word) { 
#    ifdef DEBUG_SUGGEST
      cout << "=========== begin suggest " << word << " ===========\n";
#    endif
      parms_.set_original_word_size(strlen(word));

      NearMissesFinal & results = suggestion_list.suggestions;
      results.resize(0);

      Working worker(manager_, &manager_->lang(), word, parms_);
      worker.get_suggestions(results);
#    ifdef DEBUG_SUGGEST
      cout << "^^^^^^^^^^^  end suggest " << word << "  ^^^^^^^^^^^\n";
#    endif
      return suggestion_list;
    }
  };
}

namespace aspell {

  // Creates the default suggestion engine for manager m, with the
  // parameters taken from m's configuration.  The result is allocated
  // with new.
  Suggest * new_default_suggest(const Manager * m) {
    typedef aspell_default_suggest::SuggestImpl Impl;
    Suggest * engine = new Impl(m);
    return engine;
  }

  // Creates the default suggestion engine for manager m with explicitly
  // supplied parameters.  The result is allocated with new.
  Suggest * new_default_suggest(const Manager * m, const SuggestParms & p) {
    typedef aspell_default_suggest::SuggestImpl Impl;
    Suggest * engine = new Impl(m, p);
    return engine;
  }
}
