# pro_spell/pro_check.py
from nltk.tokenize import word_tokenize
import re
from datetime import datetime
from lm.lm import KenLM
import unidecode
import editdistance
from alphabet import LEGAL, PUNCT

# load the list of common Vietnamese syllables used as the spelling dictionary
with open('./checkpoint/common-vietnamese-syllables.txt') as f:
    dict_text = f.read()
dict_not_tone = unidecode.unidecode(dict_text)  # tone-stripped copy of the dictionary
dict_ls = word_tokenize(dict_text)              # dictionary syllables as a list
dict_set = set(dict_ls)                         # set for fast exact-word lookups

def preprocessing(text):
    text = text.strip().lower()
    text = text.replace('\n', '')
    text = ''.join(c if c not in PUNCT else '-' for c in text)  # replace all punctuation with '-'
    text = ''.join(c if c in LEGAL else '?' for c in text)      # replace unknown characters with '?'
    return text
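
# Example of the normalisation above (an assumption for illustration: PUNCT
# holds ASCII punctuation and LEGAL includes the lowercase Vietnamese
# alphabet, space and '-'):
#   preprocessing('Xin chào!')  ->  'xin chào-'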

def check_real_word(word):
    # a word is "real" iff it is an exact entry in the syllable dictionary;
    # a substring test against the raw dictionary text would accept non-words
    return word in dict_set

def similarities_by_edit_distance(word, distance=2):
    """
    Generate a list of candidates whose edit distance to the non-word error
    is <= `distance`.
    :param word: non-word error
    :param distance: maximum edit distance {1, 2, 3, 4, ...}
    :return: a list of candidates
    """
    return [i for i in dict_ls if editdistance.eval(i, word) <= distance]
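
# editdistance.eval is a plain Levenshtein distance, counting insertions,
# deletions and substitutions; e.g. editdistance.eval('chao', 'chào') == 1
# (one substitution), so mildly garbled syllables stay within the default
# threshold of 2.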

class Predict:
    def __init__(self, wlm_path):
        print('Loading language model ...')
        self.wlm = KenLM(wlm_path)

    def beam_lm(self, predicted_seq, k=20):  # k is currently unused
        print('Here it is:', predicted_seq)
        # replace non-word errors with '*'
        sentence = preprocessing(predicted_seq)
        words = word_tokenize(sentence)
        ls_non_word_errors = []
        for index, word in enumerate(words):
            if not check_real_word(word):
                ls_non_word_errors.append(words[index])
                print('Non-Word-Error:', word)
                words[index] = '*'
        predicted_seq_copy = ' '.join(words)

        def beam_lm_(predicted_seq, predicted_seq_uncertain):
            uncertainties = [m.span() for m in re.finditer(r'\*+', predicted_seq_uncertain)]
            if len(uncertainties) == 0:
                return predicted_seq_uncertain, ls_non_word_errors
            ls_words_predict_seq = word_tokenize(predicted_seq)
            ls_words_uncertain = word_tokenize(predicted_seq_uncertain)
            topk_fwd = [predicted_seq_uncertain[0:uncertainties[0][0]]]  # currently unused
            for i, v in enumerate(ls_words_uncertain):
                if v != '*':
                    continue
                # generate the list of similar words for the original token
                c = ls_words_predict_seq[i]
                c_list = similarities_by_edit_distance(c)
                # keep the old text (and the error list) if there are no similar words
                if len(c_list) == 0:
                    return predicted_seq_uncertain, ls_non_word_errors
                # left context
                left_context = ' '.join(ls_words_uncertain[0:i])
                if i == 0:
                    left_context = ''
                # right context runs up to the next '*'; everything after it is the remaining context
                remain_context = ''
                right_context = ''
                if i < len(ls_words_uncertain) - 1:
                    ls_words_right_context = []
                    for index, w in enumerate(ls_words_uncertain[i + 1:]):
                        if w == '*':
                            if index < len(ls_words_uncertain) - 1:
                                remain_context = ' '.join(ls_words_uncertain[index + i + 1:])
                            break
                        ls_words_right_context.append(w)
                    right_context = ' '.join(ls_words_right_context)
                # score every candidate sentence with a similar word substituted in
                candidates = []
                for ch in c_list:
                    candidate = left_context + ' ' + ch + ' ' + right_context
                    score = self.score(candidate)
                    candidates.append({'candidate': candidate, 'score': score})
                candidates = sorted(candidates, key=lambda item: item['score'], reverse=True)
                best_candidate = candidates[0]['candidate'] + ' ' + remain_context
                # recurse to resolve the remaining '*' slots one at a time
                return beam_lm_(predicted_seq, best_candidate)
        return beam_lm_(sentence, predicted_seq_copy)
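
    # Sketch of the loop above: each non-word token is masked to '*';
    # dictionary entries within edit distance 2 of the original token are
    # substituted into its context and scored by the language model, and the
    # best-scoring sentence is recursed on until no '*' remains.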

    def beam_lm2(self, predicted_seq):
        # beam_lm2 duplicated beam_lm verbatim; the shared implementation
        # lives in beam_lm
        return self.beam_lm(predicted_seq)

    def score(self, candidate):
        return self.wlm.score(candidate)
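
    # Note (an assumption about the lm.lm.KenLM wrapper): KenLM models
    # conventionally return log10 probabilities, so a higher (less negative)
    # score means a more fluent candidate, which is what the reverse=True
    # sort in beam_lm relies on.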

if __name__ == '__main__':
    start = datetime.now()
    wlm_path = "/home/minh/projects/aivivn-tone/lm/corpus-wplm-4g-v2.binary"
    predict = Predict(wlm_path=wlm_path)
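    # Usage sketch -- the sample sentence below is an assumption for
    # illustration, not part of the original script:
    corrected, errors = predict.beam_lm('toi yêu việt nam')
    print('Corrected:', corrected)
    print('Non-word errors:', errors)
    print('Elapsed:', datetime.now() - start)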