noise_data/acronym.py
2019-10-01 15:17:38 +07:00

50 lines
1.4 KiB
Python

# -*- coding: utf-8 -*-
from tool.engine_crawling_thread import *
from time import sleep
from model.models import *
from random import choice
from datetime import datetime
# Timestamp taken only to report how long the first query below takes.
start = datetime.now()
# Earlier variant of the source query, kept for reference:
# standard0s = Standard2.query.filter(Standard0.id>=353055).all()
# Rows to scan; unit() reads the `.value` text of each one.
# NOTE(review): NeuSrc presumably comes from model.models -- confirm.
standard0s = NeuSrc.query.all()
print("Time to query:", datetime.now()-start)
# Lookup rows whose `.value` strings unit() searches for inside each word.
vowel0s = Vowel0.query.all()
consonants = Consonant.query.all()
# Shared accumulator: unit() appends every acronym candidate it finds here.
ls_rs = []
def unit(standard0):
    """Record every acronym-like word found in ``standard0.value``.

    The text has its newlines removed (not replaced by a space, so the
    words on either side of a line break are joined), is lower-cased and
    is then split on single spaces. A word is an acronym candidate when it
    contains at least one consonant, no vowel and no ASCII digit. Vowels
    and consonants are the ``value`` strings of the module-level
    ``vowel0s`` and ``consonants`` rows, matched as substrings.

    Each candidate is printed, appended as one line to ``acronym.txt`` in
    the current working directory, and added to the module-level
    ``ls_rs`` list. Duplicates are not filtered here.

    Args:
        standard0: Object exposing the text to scan as ``value``. A
            ``None`` or empty ``value`` is skipped.

    Returns:
        None.
    """
    raw = standard0.value
    if not raw:
        # Nothing to scan; also avoids AttributeError on a None value.
        return

    digits = '0123456789'
    found = []
    for word in raw.replace(u'\n', '').lower().split(' '):
        if not word:
            # Consecutive spaces produce empty tokens.
            continue
        if any(v.value in word for v in vowel0s):
            continue
        if any(d in word for d in digits):
            continue
        if not any(c.value in word for c in consonants):
            # Pure punctuation / symbols are not acronyms.
            continue
        print(word)
        found.append(word)

    if not found:
        return

    # One append per call instead of reopening the file for every word.
    # The explicit encoding keeps non-ASCII words from failing on
    # platforms whose default encoding is not UTF-8; newline='' writes
    # '\n' untranslated, as before.
    with open('acronym.txt', 'a', newline='', encoding='utf-8') as f:
        f.writelines(word + '\n' for word in found)
    ls_rs.extend(found)
# Plain loop over the same rows, kept for reference (presumably the
# sequential equivalent of the call below -- confirm):
# for i in standard0s:
#     unit(i)
# NOTE(review): engine_crawling_thread is a project helper not visible here;
# presumably it calls unit() on each row using 5 worker threads -- confirm
# in tool.engine_crawling_thread.
engine_crawling_thread(standard0s, unit=unit, num_threads=5)
# Collapse duplicates collected by unit(); element order is not preserved.
# Only this in-memory list is de-duplicated -- acronym.txt still holds one
# line per occurrence.
ls_rs = list(set(ls_rs))
# Number of distinct acronym candidates found.
print(len(ls_rs))