/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer

  $Id: dictionary_rewriter.cpp,v 1.5 2006/07/09 13:34:22 taku-ku Exp $;

  Copyright (C) 2001-2006 Taku Kudo <taku@chasen.org>
  Copyright (C) 2004-2006 Nippon Telegraph and Telephone Corporation

*/
#include <cstring>
#include <string>
#include <vector>
#include <map>
#include <fstream>

#include "utils.h"
#include "common.h"
#include "dictionary_rewriter.h"

namespace MeCab {

  // Tests whether the column value `str` satisfies the rule element `pat`.
  //
  // `pat` matches when
  //   - its first character is '*',
  //   - it is byte-for-byte equal to `str`, or
  //   - it has the form "(a|b|c)" and `str` equals one of the alternatives
  //     listed between the parentheses.
  //
  // Returns true on a match and false otherwise.  A parenthesised pattern
  // that does not fit into the local work buffers triggers CHECK_DIE.
  bool RewritePattern::match(const char *pat,
                             const char *str)
  {
    if (pat[0] == '*' || std::strcmp(pat, str) == 0)
      return true;

    const size_t len = std::strlen(pat);
    if (len >= 3 && pat[0] == '(' && pat[len-1] == ')') {
      char buf[BUF_SIZE];
      char *col[BUF_SIZE];
      // Capacity of `col` in elements.  sizeof(col) alone is a byte count,
      // i.e. sizeof(char *) times larger than the number of usable slots.
      const size_t col_size = sizeof(col) / sizeof(col[0]);
      CHECK_DIE(len < sizeof(buf) - 3) << "too long parameter";
      // Copy everything after '(' and cut the trailing ')' off.
      std::strcpy(buf, pat + 1);
      buf[len-2] = '\0';
      const size_t n = tokenize(buf, "|", col, col_size);
      CHECK_DIE(n < col_size) << "too long OR nodes";
      for (size_t i = 0; i < n; ++i) {
        if (std::strcmp(str, col[i]) == 0) return true;
      }
    }

    return false;
  }

  bool RewritePattern::set_pattern(const char *src,
                                   const char *dst)
  {
    char buf[BUF_SIZE];
    spat_.clear();
    dpat_.clear();
    std::strncpy(buf, src, sizeof(buf));
    tokenizeCSV(buf, back_inserter(spat_), 512);
    std::strncpy(buf, dst, sizeof(buf));
    tokenizeCSV(buf, back_inserter(dpat_), 512);
    return (spat_.size() && dpat_.size());
  }

  // Applies this rule to the `size` columns held in `input`.
  //
  // The rule applies when there are at least as many input columns as
  // match patterns and every pattern matches the column at the same index.
  // In that case `output` is overwritten with the comma-joined template:
  // a template element of the form "$N" is replaced by input column N
  // (1-based), any other element is copied as is, and each element is
  // passed through escape_csv_element() before being appended.
  //
  // Returns false, leaving `output` untouched, when the rule does not
  // apply; returns true otherwise.  A "$N" outside of the input range
  // triggers CHECK_DIE.
  bool RewritePattern::rewrite(size_t size,
                               char **input,
                               std::string &output)
  {
    const size_t num_src = spat_.size();
    if (num_src > size) return false;

    for (size_t k = 0; k < num_src; ++k) {
      if (! match(spat_[k].c_str(), input[k])) return false;
    }

    output.clear();
    const size_t num_dst = dpat_.size();
    for (size_t k = 0; k < num_dst; ++k) {
      const std::string &tmpl = dpat_[k];
      std::string field;
      if (tmpl.size() >= 2 && tmpl[0] == '$') {
        // A reference of 0 wraps around in (pos - 1) and is rejected by the
        // same range check as a too large one.
        const size_t pos = std::atoi(tmpl.c_str() + 1);
        CHECK_DIE((pos - 1) < size)
          << " out of range: " << tmpl << std::endl;
        field = input[pos - 1];
      } else {
        field = tmpl;
      }
      CHECK_DIE(escape_csv_element(field));
      output += field;
      // separator between elements, none after the last one
      if (k + 1 != num_dst) output += ",";
    }

    return true;
  }

  // Tries the stored rules in index order and stops at the first one whose
  // rewrite() succeeds for the `s` columns in `input`.
  //
  // Returns true when some rule succeeded (its result is in `output`),
  // false when none did.
  bool RewriteRules::rewrite(size_t s,
                             char **input,
                             std::string &output)
  {
    size_t idx = 0;
    while (idx < this->size()) {
      if ((*this)[idx].rewrite(s, input, output)) {
        return true;
      }
      ++idx;
    }
    return false;
  }

  // Parses one rule line and appends it to `r`.
  //
  // `str` is split in place with tokenize2() on spaces and tabs and must
  // yield exactly two fields: the source pattern and the destination
  // pattern.  Anything else triggers CHECK_DIE with "format error".
  void DictionaryRewriter::append(RewriteRules &r, char* str)
  {
    char *field[2];
    const size_t num_fields = tokenize2(str, " \t", field, 2);
    CHECK_DIE(num_fields == 2) << "format error: " << str;

    // grow by one element and fill in the new last slot
    const size_t slot = r.size();
    r.resize(slot + 1);
    r[slot].set_pattern(field[0], field[1]);
  }

  void DictionaryRewriter::clear()
  {
    cache_.clear();
  }

  bool DictionaryRewriter::open(const char *filename)
  {
    std::ifstream ifs(filename);
    CHECK_DIE(ifs) << "no such file or directory: " << filename;
    char line[BUF_SIZE];
    int append_to = 0;
    while (ifs.getline(line, sizeof(line))) {
      if (std::strlen(line) == 0 || line[0] == '#') continue;
      if (std::strcmp(line, "[unigram rewrite]") == 0) {
        append_to = 1;
      } else if (std::strcmp(line, "[left rewrite]") == 0) {
        append_to = 2;
      } else if (std::strcmp(line, "[right rewrite]") == 0) {
        append_to = 3;
      } else {
        CHECK_DIE(append_to != 0) << "no sections found";
        switch (append_to) {
        case 1: append(unigram_rewrite_, line); break;
        case 2: append(left_rewrite_,    line); break;
        case 3: append(right_rewrite_,   line); break;
        }
      }
    }
    return true;
  }

  // Rewrites `feature` without consulting the cache.
  //
  // `feature` is copied into a local buffer, split with tokenizeCSV(), and
  // the columns are run through the unigram, left and right rule sets in
  // that order, writing to `ufeature`, `lfeature` and `rfeature`.
  // Evaluation stops at the first rule set without a matching rule.
  //
  // Returns true only when all three rule sets produced a result.
  // Triggers CHECK_DIE when `feature` does not fit into the buffer.
  bool DictionaryRewriter::rewrite(const std::string &feature,
                                   std::string &ufeature,
                                   std::string &lfeature,
                                   std::string &rfeature) {
    char buf[BUF_SIZE];
    char *col[BUF_SIZE];
    // Capacity of `col` in elements.  sizeof(col) alone is a byte count,
    // i.e. sizeof(char *) times larger than the number of usable slots.
    const size_t col_size = sizeof(col) / sizeof(col[0]);

    CHECK_DIE(feature.size() < sizeof(buf) - 1) << "too long feature";
    // The length check above guarantees that strncpy() also copies the
    // terminating '\0'.
    std::strncpy(buf, feature.c_str(), sizeof(buf) - 1);
    const size_t n = tokenizeCSV(buf, col, col_size);
    CHECK_DIE(n < col_size) << "too long CSV entities";
    return (unigram_rewrite_.rewrite(n, col, ufeature) &&
            left_rewrite_.rewrite(n, col, lfeature) &&
            right_rewrite_.rewrite(n, col, rfeature));
  }

  // Rewrites `feature`, using the cache.
  //
  // On a cache hit the three stored strings are copied to the output
  // arguments.  On a miss rewrite() is called; a successful result is
  // stored in the cache under `feature`, a failed one is not cached.
  //
  // Returns false when rewrite() failed, true otherwise.
  bool DictionaryRewriter::rewrite2(const std::string &feature,
                                    std::string &ufeature,
                                    std::string &lfeature,
                                    std::string &rfeature) {
    std::map<std::string, FeatureSet>::iterator it = cache_.find(feature);
    if (it == cache_.end()) {
      if (! rewrite(feature, ufeature, lfeature, rfeature)) return false;
      FeatureSet f;
      f.ufeature = ufeature;
      f.lfeature = lfeature;
      f.rfeature = rfeature;
      // Construct the pair directly: std::make_pair with explicit template
      // arguments takes rvalue references since C++11 and does not accept
      // these lvalues.
      cache_.insert(std::pair<std::string, FeatureSet>(feature, f));
    } else {
      ufeature = it->second.ufeature;
      lfeature = it->second.lfeature;
      rfeature = it->second.rfeature;
    }

    return true;
  }

  // Loads the id rules from the text file `filename`.
  //
  // Every line is split in place with tokenize2() on spaces and tabs and
  // must yield exactly two fields: a pattern and an id consisting only of
  // the characters '0'..'9'.  Each line is appended to rewrite_ as one rule.
  //
  // Always returns true.  Triggers CHECK_DIE when the file cannot be
  // opened, when a line is malformed, or when reading stops before the end
  // of the file.
  bool POSIDGenerator::open(const char *filename)
  {
    std::ifstream ifs(filename);
    CHECK_DIE(ifs) << "no such file or directory: " << filename;
    char line[BUF_SIZE];
    while (ifs.getline(line, sizeof(line))) {
      char *col[2];
      const size_t n = tokenize2(line, " \t", col, 2);
      CHECK_DIE(n == 2) << "format error: " << line;
      for (const char *p = col[1]; *p; ++p) {
        CHECK_DIE(*p >= '0' && *p <= '9') << "not a number: " << col[1];
      }
      rewrite_.resize(rewrite_.size() + 1);
      rewrite_.back().set_pattern(col[0], col[1]);
    }

    // getline() sets failbit without eofbit when a line does not fit into
    // `line`.  Without this check the loop would just end there and all
    // remaining rules would be dropped silently.
    CHECK_DIE(ifs.eof()) << "too long line or read error: " << filename;
    return true;
  }

  // Looks up the id for `feature`.
  //
  // `feature` is copied into a local buffer, split with tokenizeCSV() and
  // run through rewrite_.
  //
  // Returns -1 when no rule matches, otherwise the rule's output converted
  // with std::atoi().  Triggers CHECK_DIE when `feature` does not fit into
  // the buffer.
  int POSIDGenerator::id(const char *feature) {
    char buf[BUF_SIZE];
    char *col[BUF_SIZE];
    // Capacity of `col` in elements.  sizeof(col) alone is a byte count,
    // i.e. sizeof(char *) times larger than the number of usable slots.
    const size_t col_size = sizeof(col) / sizeof(col[0]);

    CHECK_DIE(std::strlen(feature) < sizeof(buf) - 1) << "too long feature";
    // The length check above guarantees that strncpy() also copies the
    // terminating '\0'.
    std::strncpy(buf, feature, sizeof(buf) - 1);
    const size_t n = tokenizeCSV(buf, col, col_size);
    CHECK_DIE(n < col_size) << "too long CSV entities";
    std::string tmp;
    if (!rewrite_.rewrite(n, col, tmp)) return -1;
    return std::atoi(tmp.c_str());
  }
}
