# noise_data/acronym.py

# encoding = 'utf-8'
from tool.engine_crawling_thread import *
from time import sleep
from model.models import *
from random import choice
from datetime import datetime
# Load every source record up front and print how long the query took.
start = datetime.now()
# Disabled alternative query (presumably to resume from a given id); left for reference.
# standard0s = Standard2.query.filter(Standard0.id>=353055).all()
standard0s = NeuSrc.query.all()
print("Time to query:", datetime.now()-start)
# Reference rows whose ``value`` strings unit() looks for inside each word.
# NOTE(review): Vowel0 / Consonant presumably come from model.models via the
# wildcard import — confirm.
vowel0s = Vowel0.query.all()
consonants = Consonant.query.all()

# Shared accumulator: unit() adds every accepted word to this list.
ls_rs = []
def unit(standard0):
    """Collect the vowel-less, digit-free words of one record.

    The record's ``value`` is stripped of newline characters, lower-cased and
    split on single spaces. A word is accepted when it contains no ASCII
    digit, none of the ``value`` strings of the module-level ``vowel0s`` rows,
    and at least one ``value`` string of the module-level ``consonants`` rows.

    Every accepted word is printed, appended to ``acronym.txt`` (one word per
    line, UTF-8) and added to the module-level ``ls_rs`` list.

    Args:
        standard0: Object exposing a ``value`` attribute holding the text to
            scan. A ``None`` or empty value is skipped.

    Returns:
        None. Results are reported only through the side effects above.
    """
    raw = standard0.value
    if not raw:
        # Nothing to scan; also avoids AttributeError on a None value.
        return

    digits = '0123456789'
    accepted = []
    for word in raw.replace(u'\n', '').lower().split(' '):
        if word == '':
            # Consecutive spaces produce empty fragments.
            continue
        # Cheapest test first: any ASCII digit disqualifies the word.
        if any(ch in digits for ch in word):
            continue
        if any(v.value in word for v in vowel0s):
            continue
        if not any(c.value in word for c in consonants):
            continue
        print(word)
        accepted.append(word)

    if not accepted:
        return

    # Open the output once per record instead of once per word, and pin the
    # encoding so non-ASCII words do not depend on the platform default.
    with open('acronym.txt', 'a', newline='', encoding='utf-8') as f:
        f.write(''.join(w + '\n' for w in accepted))
    ls_rs.extend(accepted)


# Sequential alternative to the threaded run below; left disabled.
# for i in standard0s:
#     unit(i)

# Apply unit() to every loaded record through the threaded helper, asking for 5 threads.
# NOTE(review): the de-duplication below assumes this call returns only after
# all workers have finished — confirm in tool.engine_crawling_thread.
engine_crawling_thread(standard0s, unit=unit, num_threads=5)

# Collapse duplicates (set() does not keep the original order) and print the
# number of distinct words collected.
ls_rs = list(set(ls_rs))
print(len(ls_rs))