spell_correction_n_gram/app_2.py
from flask import Flask
from flask_restplus import Resource, Api
from datetime import datetime
from nltk.tokenize import word_tokenize
import re
from lm.lm import KenLM
import unidecode
import editdistance

app = Flask(__name__)
api = Api(app)

# Load the dictionary of valid words. `dictionary` replaces the original
# variable name `dict`, which shadowed the built-in type.
with open('./checkpoint/new_dictionary.txt') as f:
    dictionary = f.read()
dict_not_tone = unidecode.unidecode(dictionary)
dict_ls = word_tokenize(dictionary)
dict_words = set(dict_ls)  # set of known words for exact-match lookups
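
# Assumption: new_dictionary.txt is a plain-text vocabulary file (whitespace or
# newline separated); dict_ls keeps the tokenized entries in order and
# dict_words holds the same entries as a set for fast membership tests.
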
def preprocessing(text):
    text = text.lower()
    text = text.replace('\n', '')
    text = text.strip()
    return text

def check_real_word(word):
    # Exact lookup against the dictionary word set. (The original substring
    # test on the raw dictionary text would also match fragments of longer
    # words and so miss some non-word errors.)
    return word in dict_words

def similarities_by_edit_distance(word, distance=2):
    """
    Generate a list of candidate corrections whose edit distance to the
    non-word error is at most `distance`.
    :param word: non-word error
    :param distance: maximum edit distance (1, 2, 3, ...)
    :return: a list of candidate words from the dictionary
    """
    ls_temp = []
    for i in dict_ls:
        if editdistance.eval(i, word) <= distance:
            ls_temp.append(i)
    return ls_temp
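
# Illustrative usage (the actual candidates depend on the contents of
# new_dictionary.txt):
#   candidates = similarities_by_edit_distance('xinn')
#   # -> every dictionary word within edit distance 2 of 'xinn'
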
class Predict:
    def __init__(self, wlm_path):
        print('Loading language model ...')
        self.wlm = KenLM(wlm_path)

    def beam_lm(self, predicted_seq, k=20):
        # replace non-word errors with '*'
        sentence = preprocessing(predicted_seq)
        words = word_tokenize(sentence)
        for index, word in enumerate(words):
            if not check_real_word(word):
                print('Non-Word-Error:', word)
                words[index] = '*'
        predicted_seq_copy = ' '.join(words)
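
        # beam_lm_ resolves the '*' placeholders one at a time, left to right:
        # for each placeholder it scores every edit-distance candidate in its
        # left/right context with the language model, keeps the best-scoring
        # one, and recurses on the partially corrected sentence until no '*'
        # remains.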
        def beam_lm_(predicted_seq, predicted_seq_uncertain):
            uncertainties = [m.span() for m in re.finditer(r'\*+', predicted_seq_uncertain)]
            if len(uncertainties) == 0:
                return predicted_seq_uncertain
            ls_words_predict_seq = word_tokenize(predicted_seq)
            ls_words_uncertain = word_tokenize(predicted_seq_uncertain)
            for i, v in enumerate(ls_words_uncertain):
                if v != '*':
                    continue
                # generate list of similar words
                c = ls_words_predict_seq[i]
                c_list = similarities_by_edit_distance(c)
                # return the old text if there are no similar words
                if len(c_list) == 0:
                    return predicted_seq_uncertain
                # left context
                left_context = ' '.join(ls_words_uncertain[0:i])
                if i == 0:
                    left_context = ''
                # right context and remaining context (everything after the next '*')
                remain_context = ''
                right_context = ''
                if i < len(ls_words_uncertain) - 1:
                    ls_words_right_context = []
                    for index, w in enumerate(ls_words_uncertain[i + 1:]):
                        if w == '*':
                            if index < len(ls_words_uncertain) - 1:
                                remain_context = ' '.join(ls_words_uncertain[index + i + 1:])
                            break
                        ls_words_right_context.append(w)
                    right_context = ' '.join(ls_words_right_context)
                # score the sentence with each similar word substituted in
                candidates = []
                for ch in c_list:
                    candidate = left_context + ' ' + ch + ' ' + right_context
                    score = self.score(candidate)
                    candidates.append({'candidate': candidate, 'score': score})
                candidates = sorted(candidates, key=lambda item: item['score'], reverse=True)
                best_candidate = candidates[0]['candidate'] + ' ' + remain_context
                return beam_lm_(predicted_seq, best_candidate)

        return beam_lm_(sentence, predicted_seq_copy)
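
    # Note: assuming the KenLM wrapper returns the usual log10 sentence
    # probability, higher (less negative) scores mean more fluent sentences,
    # which is why the candidate list above is sorted in descending order.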
    def score(self, candidate):
        return self.wlm.score(candidate)

wlm_path = "/home/minh/projects/aivivn-tone/lm/corpus-wplm-4g-v2.binary"
predict = Predict(wlm_path=wlm_path)


@api.route('/spell/<string:text>')
class Spell(Resource):
    def get(self, text):
        start = datetime.now()
        predicted_seq = predict.beam_lm(predicted_seq=text)
        duration = datetime.now() - start
        return {'duration': str(duration), 'result': predicted_seq}
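
# Example request (illustrative; assumes the server is running with the
# default settings below):
#   curl "http://localhost:8080/spell/toi%20di%20hoc"
# The response carries the corrected text under 'result' and the processing
# time under 'duration'.
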
if __name__ == '__main__':
    app.run(debug=True, port=8080)