# pro_spell/pro_check.py
from nltk.tokenize import word_tokenize
import re
from datetime import datetime
from lm.lm import KenLM
import unidecode
import editdistance
from alphabet import LEGAL, PUNCT

# load the list of common Vietnamese syllables used as the spelling dictionary
with open('./checkpoint/common-vietnamese-syllables.txt') as f:
    dict_text = f.read()
dict_not_tone = unidecode.unidecode(dict_text)  # tone-stripped copy of the dictionary
dict_ls = word_tokenize(dict_text)              # dictionary syllables as a list
dict_set = set(dict_ls)                         # set for fast exact-word lookups

def preprocessing(text):
    text = text.strip().lower()
    text = text.replace('\n', '')
    text = ''.join(c if c not in PUNCT else '-' for c in text)  # replace all punctuation with '-'
    text = ''.join(c if c in LEGAL else '?' for c in text)      # replace unknown characters with '?'
    return text
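
# Example of the normalisation above (an assumption for illustration: PUNCT
# holds ASCII punctuation and LEGAL includes the lowercase Vietnamese
# alphabet, space and '-'):
#   preprocessing('Xin chào!')  ->  'xin chào-'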

def check_real_word(word):
    # a word is "real" iff it is an exact entry in the syllable dictionary;
    # a substring test against the raw dictionary text would accept non-words
    return word in dict_set

def similarities_by_edit_distance(word, distance=2):
    """
    Generate a list of candidates whose edit distance to the non-word error
    is <= `distance`.
    :param word: non-word error
    :param distance: maximum edit distance {1, 2, 3, 4, ...}
    :return: a list of candidates
    """
    return [i for i in dict_ls if editdistance.eval(i, word) <= distance]
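
# editdistance.eval is a plain Levenshtein distance, counting insertions,
# deletions and substitutions; e.g. editdistance.eval('chao', 'chào') == 1
# (one substitution), so mildly garbled syllables stay within the default
# threshold of 2.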

class Predict:
    def __init__(self, wlm_path):
        print('Loading language model ...')
        self.wlm = KenLM(wlm_path)

    def beam_lm(self, predicted_seq, k=20):  # k is currently unused
        print('Here it is:', predicted_seq)
        # replace non-word errors with '*'
        sentence = preprocessing(predicted_seq)
        words = word_tokenize(sentence)
        ls_non_word_errors = []
        for index, word in enumerate(words):
            if not check_real_word(word):
                ls_non_word_errors.append(words[index])
                print('Non-Word-Error:', word)
                words[index] = '*'
        predicted_seq_copy = ' '.join(words)

        def beam_lm_(predicted_seq, predicted_seq_uncertain):
            uncertainties = [m.span() for m in re.finditer(r'\*+', predicted_seq_uncertain)]
            if len(uncertainties) == 0:
                return predicted_seq_uncertain, ls_non_word_errors
            ls_words_predict_seq = word_tokenize(predicted_seq)
            ls_words_uncertain = word_tokenize(predicted_seq_uncertain)
            topk_fwd = [predicted_seq_uncertain[0:uncertainties[0][0]]]  # currently unused
            for i, v in enumerate(ls_words_uncertain):
                if v != '*':
                    continue
                # generate the list of similar words for the original token
                c = ls_words_predict_seq[i]
                c_list = similarities_by_edit_distance(c)
                # keep the old text (and the error list) if there are no similar words
                if len(c_list) == 0:
                    return predicted_seq_uncertain, ls_non_word_errors
                # left context
                left_context = ' '.join(ls_words_uncertain[0:i])
                if i == 0:
                    left_context = ''
                # right context runs up to the next '*'; everything after it is the remaining context
                remain_context = ''
                right_context = ''
                if i < len(ls_words_uncertain) - 1:
                    ls_words_right_context = []
                    for index, w in enumerate(ls_words_uncertain[i + 1:]):
                        if w == '*':
                            if index < len(ls_words_uncertain) - 1:
                                remain_context = ' '.join(ls_words_uncertain[index + i + 1:])
                            break
                        ls_words_right_context.append(w)
                    right_context = ' '.join(ls_words_right_context)
                # score every candidate sentence with a similar word substituted in
                candidates = []
                for ch in c_list:
                    candidate = left_context + ' ' + ch + ' ' + right_context
                    score = self.score(candidate)
                    candidates.append({'candidate': candidate, 'score': score})
                candidates = sorted(candidates, key=lambda item: item['score'], reverse=True)
                best_candidate = candidates[0]['candidate'] + ' ' + remain_context
                # recurse to resolve the remaining '*' slots one at a time
                return beam_lm_(predicted_seq, best_candidate)
        return beam_lm_(sentence, predicted_seq_copy)
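
    # Sketch of the loop above: each non-word token is masked to '*';
    # dictionary entries within edit distance 2 of the original token are
    # substituted into its context and scored by the language model, and the
    # best-scoring sentence is recursed on until no '*' remains.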

    def beam_lm2(self, predicted_seq):
        # beam_lm2 duplicated beam_lm verbatim; the shared implementation
        # lives in beam_lm
        return self.beam_lm(predicted_seq)

    def score(self, candidate):
        return self.wlm.score(candidate)
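
    # Note (an assumption about the lm.lm.KenLM wrapper): KenLM models
    # conventionally return log10 probabilities, so a higher (less negative)
    # score means a more fluent candidate, which is what the reverse=True
    # sort in beam_lm relies on.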

if __name__ == '__main__':
    start = datetime.now()
    wlm_path = "/home/minh/projects/aivivn-tone/lm/corpus-wplm-4g-v2.binary"
    predict = Predict(wlm_path=wlm_path)
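    # Usage sketch -- the sample sentence below is an assumption for
    # illustration, not part of the original script:
    corrected, errors = predict.beam_lm('toi yêu việt nam')
    print('Corrected:', corrected)
    print('Non-word errors:', errors)
    print('Elapsed:', datetime.now() - start)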