120 lines
4.4 KiB
Python
120 lines
4.4 KiB
Python
from flask import Flask
|
|
from flask_restplus import Resource, Api
|
|
from datetime import datetime
|
|
from nltk.tokenize import word_tokenize
|
|
import re
|
|
from lm.lm import KenLM
|
|
import unidecode
|
|
import editdistance
|
|
# Flask application and its flask_restplus API wrapper (routes are registered
# on `api`, not `app`).
# NOTE(review): flask_restplus is unmaintained; flask-restx is the drop-in
# successor — TODO confirm before upgrading.
app = Flask(__name__)
api = Api(app)
|
|
|
|
# Load the known-word dictionary once at import time.
# NOTE(review): the name `dict` shadows the builtin — kept as-is because the
# sibling functions below reference it; rename module-wide when safe.
# Explicit UTF-8 is required: the dictionary contains Vietnamese diacritics
# and must not depend on the platform's default encoding.
with open('./checkpoint/new_dictionary.txt', encoding='utf-8') as f:
    dict = f.read()

# Accent-stripped copy of the dictionary text (not used in this file —
# presumably consumed elsewhere; verify before removing).
dict_not_tone = unidecode.unidecode(dict)
# Tokenized word list used for membership tests and edit-distance candidates.
# word_tokenize already returns a list, so the former identity comprehension
# `[i for i in word_tokenize(dict)]` was a redundant copy.
dict_ls = word_tokenize(dict)
|
|
|
|
def preprocessing(text):
    """Normalize raw input text: lowercase, remove newlines, trim edges.

    :param text: raw input sentence
    :return: normalized string
    """
    normalized = text.lower().replace('\n', '')
    return normalized.strip()
|
|
|
|
def check_real_word(word):
    """Return True if *word* is a known dictionary token.

    Membership is tested against the tokenized word list ``dict_ls`` rather
    than the raw dictionary text: the previous substring test
    (``word in dict``) matched any fragment of a longer dictionary word
    (e.g. a prefix or infix), yielding false positives that let misspellings
    slip past correction.

    NOTE(review): ``dict_ls`` is a list, so this is O(V) per call; hoisting a
    module-level ``set(dict_ls)`` would make it O(1) — left for a module-wide
    change.
    """
    return word in dict_ls
|
|
|
|
def similarities_by_edit_distance(word, distance=2):
    """
    Generate candidate corrections for a non-word error.

    :param word: non-word error
    :param distance: maximum edit distance a candidate may have {1, 2, 3, ...}
    :return: list of dictionary words within *distance* edits of *word*
    """
    return [
        candidate
        for candidate in dict_ls
        if editdistance.eval(candidate, word) <= distance
    ]
|
|
|
|
class Predict:
    """Spelling corrector backed by a KenLM word language model.

    Detects out-of-dictionary tokens, masks them with '*', and greedily
    replaces each mask with the edit-distance candidate the language model
    scores highest.
    """

    def __init__(self, wlm_path):
        # wlm_path: filesystem path to a binary KenLM model.
        print('Loading language model ...')
        self.wlm = KenLM(wlm_path)

    def beam_lm(self, predicted_seq, k=20):
        """Correct every non-word error in *predicted_seq*.

        :param predicted_seq: raw input sentence
        :param k: NOTE(review): unused — presumably an intended beam width;
            confirm before removing.
        :return: corrected sentence (lowercased, whitespace-normalized)
        """
        # replace non-word errors with '*'
        sentence = preprocessing(predicted_seq)
        words = word_tokenize(sentence)
        for index, word in enumerate(words):
            if not check_real_word(word):
                print('Non-Word-Error:', word)
                words[index] = '*'
        predicted_seq_copy = ' '.join(words)

        def beam_lm_(predicted_seq, predicted_seq_uncertain):
            # Recursive helper: resolve the leftmost '*' placeholder, then
            # recurse on the partially-corrected sentence until none remain.
            #   predicted_seq: preprocessed original — source of the
            #       misspelled surface forms used to generate candidates.
            #   predicted_seq_uncertain: working copy with unresolved
            #       errors marked '*'.
            uncertainties = [m.span() for m in re.finditer('\\*+', predicted_seq_uncertain)]
            if len(uncertainties)==0:
                # No placeholders left: sentence is fully corrected.
                return predicted_seq_uncertain
            ls_words_predict_seq = word_tokenize(predicted_seq)
            ls_words_uncertain = word_tokenize(predicted_seq_uncertain)
            # topk_fwd = [predicted_seq_uncertain[0:uncertainties[0][0]]]
            for i, v in enumerate(ls_words_uncertain):
                if v != '*':
                    continue
                # generate list of similar words
                # NOTE(review): assumes both tokenizations stay index-aligned
                # — TODO confirm word_tokenize never splits '*' runs
                # differently from the original tokens.
                c = ls_words_predict_seq[i]
                c_list = similarities_by_edit_distance(c)
                # return old text if not have any similar words
                if len(c_list) == 0:
                    return predicted_seq_uncertain
                # left context
                left_context = ' '.join(ls_words_uncertain[0:i])
                if i == 0:
                    left_context = ''
                # right context, remain context
                remain_context = ''
                right_context = ''
                if i < len(ls_words_uncertain)-1:
                    # Collect right context up to (not including) the next
                    # '*'; everything from that '*' onward is deferred into
                    # remain_context for the recursive call.
                    ls_words_right_context = []
                    for index, w in enumerate(ls_words_uncertain[i+1:]):
                        if w == '*':
                            # NOTE(review): `index` is relative to the slice
                            # starting at i+1, so comparing it against
                            # len(ls_words_uncertain)-1 looks off by the
                            # offset i+1 — confirm the intended boundary.
                            if index<len(ls_words_uncertain)-1:
                                remain_context = ' '.join(ls_words_uncertain[index+i+1:])
                            break
                        ls_words_right_context.append(w)
                    right_context = ' '.join(ls_words_right_context)
                # get score of sentences with replacing similar words
                candidates = []
                for ch in c_list:
                    candidate = left_context + ' ' + ch + ' ' + right_context
                    score = self.score(candidate)
                    candidates.append({'candidate': candidate, 'score': score})
                # Highest LM score first; take the best and re-attach the
                # deferred tail before recursing.
                candidates = sorted(candidates, key = lambda i: i['score'], reverse=True)
                best_candidate = candidates[0]['candidate'] + ' ' + remain_context
                return beam_lm_(predicted_seq, best_candidate)

        return beam_lm_(sentence, predicted_seq_copy)

    def score(self, candidate):
        # Language-model score for a candidate sentence (higher is better,
        # per the reverse-sorted ranking above).
        return self.wlm.score(candidate)
|
|
|
|
# Absolute path to the 4-gram binary KenLM model.
# NOTE(review): machine-specific hard-coded path — TODO move to a config
# file or environment variable.
wlm_path = "/home/minh/projects/aivivn-tone/lm/corpus-wplm-4g-v2.binary"

# Module-level corrector shared by all requests; the model is loaded once
# at import time.
predict = Predict(wlm_path=wlm_path)
|
|
|
|
@api.route('/spell/<string:text>')
class Spell(Resource):
    """REST endpoint: spell-correct *text* taken from the URL path."""

    def get(self, text):
        """Run the corrector on *text* and report the elapsed wall time."""
        started_at = datetime.now()
        corrected = predict.beam_lm(predicted_seq=text)
        elapsed = datetime.now() - started_at
        return {'duration': str(elapsed), 'result': corrected}
|
|
|
|
|
|
if __name__ == '__main__':
    # Removed dead Python-2 relics (`reload(sys)` /
    # `sys.setdefaultencoding('utf-8')`) — Python 3 source is UTF-8 natively.
    # NOTE(review): debug=True enables the Werkzeug interactive debugger
    # (arbitrary code execution) — must be disabled outside local development.
    app.run(debug=True, port=8080)
|