spell_correction_n_gram/app_2.py
from flask import Flask
from flask_restplus import Resource, Api
from datetime import datetime
from nltk.tokenize import word_tokenize
import re
from lm.lm import KenLM
import unidecode
import editdistance

app = Flask(__name__)
api = Api(app)

# Load the dictionary of valid words. `dictionary` replaces the original
# variable name `dict`, which shadowed the built-in type.
with open('./checkpoint/new_dictionary.txt') as f:
    dictionary = f.read()
dict_not_tone = unidecode.unidecode(dictionary)
dict_ls = word_tokenize(dictionary)
dict_words = set(dict_ls)  # set of known words for exact-match lookups
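
# Assumption: new_dictionary.txt is a plain-text vocabulary file (whitespace or
# newline separated); dict_ls keeps the tokenized entries in order and
# dict_words holds the same entries as a set for fast membership tests.
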
def preprocessing(text):
    text = text.lower()
    text = text.replace('\n', '')
    text = text.strip()
    return text

def check_real_word(word):
    # Exact lookup against the dictionary word set. (The original substring
    # test on the raw dictionary text would also match fragments of longer
    # words and so miss some non-word errors.)
    return word in dict_words

def similarities_by_edit_distance(word, distance=2):
    """
    Generate a list of candidate corrections whose edit distance to the
    non-word error is at most `distance`.
    :param word: non-word error
    :param distance: maximum edit distance (1, 2, 3, ...)
    :return: a list of candidate words from the dictionary
    """
    ls_temp = []
    for i in dict_ls:
        if editdistance.eval(i, word) <= distance:
            ls_temp.append(i)
    return ls_temp
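
# Illustrative usage (the actual candidates depend on the contents of
# new_dictionary.txt):
#   candidates = similarities_by_edit_distance('xinn')
#   # -> every dictionary word within edit distance 2 of 'xinn'
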
class Predict:
    def __init__(self, wlm_path):
        print('Loading language model ...')
        self.wlm = KenLM(wlm_path)

    def beam_lm(self, predicted_seq, k=20):
        # replace non-word errors with '*'
        sentence = preprocessing(predicted_seq)
        words = word_tokenize(sentence)
        for index, word in enumerate(words):
            if not check_real_word(word):
                print('Non-Word-Error:', word)
                words[index] = '*'
        predicted_seq_copy = ' '.join(words)
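
        # beam_lm_ resolves the '*' placeholders one at a time, left to right:
        # for each placeholder it scores every edit-distance candidate in its
        # left/right context with the language model, keeps the best-scoring
        # one, and recurses on the partially corrected sentence until no '*'
        # remains.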
        def beam_lm_(predicted_seq, predicted_seq_uncertain):
            uncertainties = [m.span() for m in re.finditer(r'\*+', predicted_seq_uncertain)]
            if len(uncertainties) == 0:
                return predicted_seq_uncertain
            ls_words_predict_seq = word_tokenize(predicted_seq)
            ls_words_uncertain = word_tokenize(predicted_seq_uncertain)
            for i, v in enumerate(ls_words_uncertain):
                if v != '*':
                    continue
                # generate list of similar words
                c = ls_words_predict_seq[i]
                c_list = similarities_by_edit_distance(c)
                # return the old text if there are no similar words
                if len(c_list) == 0:
                    return predicted_seq_uncertain
                # left context
                left_context = ' '.join(ls_words_uncertain[0:i])
                if i == 0:
                    left_context = ''
                # right context and remaining context (everything after the next '*')
                remain_context = ''
                right_context = ''
                if i < len(ls_words_uncertain) - 1:
                    ls_words_right_context = []
                    for index, w in enumerate(ls_words_uncertain[i + 1:]):
                        if w == '*':
                            if index < len(ls_words_uncertain) - 1:
                                remain_context = ' '.join(ls_words_uncertain[index + i + 1:])
                            break
                        ls_words_right_context.append(w)
                    right_context = ' '.join(ls_words_right_context)
                # score the sentence with each similar word substituted in
                candidates = []
                for ch in c_list:
                    candidate = left_context + ' ' + ch + ' ' + right_context
                    score = self.score(candidate)
                    candidates.append({'candidate': candidate, 'score': score})
                candidates = sorted(candidates, key=lambda item: item['score'], reverse=True)
                best_candidate = candidates[0]['candidate'] + ' ' + remain_context
                return beam_lm_(predicted_seq, best_candidate)

        return beam_lm_(sentence, predicted_seq_copy)
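
    # Note: assuming the KenLM wrapper returns the usual log10 sentence
    # probability, higher (less negative) scores mean more fluent sentences,
    # which is why the candidate list above is sorted in descending order.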
    def score(self, candidate):
        return self.wlm.score(candidate)

wlm_path = "/home/minh/projects/aivivn-tone/lm/corpus-wplm-4g-v2.binary"
predict = Predict(wlm_path=wlm_path)


@api.route('/spell/<string:text>')
class Spell(Resource):
    def get(self, text):
        start = datetime.now()
        predicted_seq = predict.beam_lm(predicted_seq=text)
        duration = datetime.now() - start
        return {'duration': str(duration), 'result': predicted_seq}
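
# Example request (illustrative; assumes the server is running with the
# default settings below):
#   curl "http://localhost:8080/spell/toi%20di%20hoc"
# The response carries the corrected text under 'result' and the processing
# time under 'duration'.
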
if __name__ == '__main__':
    app.run(debug=True, port=8080)