noise_data/to_raw_telex.py

97 lines
3.4 KiB
Python

# encoding = 'utf-8'
from tool.engine_crawling_thread import *
from time import sleep
from model.models import *
from random import choice
from datetime import datetime
# Load every StandardVal row once, up front, and report how long that took.
# Disabled alternative that restricts the rows by id (note it mixes the
# Standard2 and Standard0 names as written):
# standard0s = Standard2.query.filter(Standard0.id>=353055).all()
start = datetime.now()
standard0s = StandardVal.query.all()
query_duration = datetime.now() - start
print("Time to query:", query_duration)

# Vowel rows; unit() reads their `value` and `raw_telex*` attributes.
vowels = Vowel.query.all()
def _expand_leading_d_stroke(word):
    """Rewrite every 'đ' in *word* with one of two variants when it starts with 'đ'.

    Variant 0 replaces each 'đ' with 'dd'; variant 1 replaces each 'đ' with
    'd' and appends a single 'd' to the end of the word. Words that do not
    start with 'đ' (including the empty string) are returned unchanged.
    """
    # Drawn unconditionally, as before, so the sequence of random draws per
    # word does not depend on whether the word starts with 'đ'.
    variant = choice([0, 1])
    if word[:1] != 'đ':
        return word
    if variant == 0:
        return word.replace('đ', 'dd')
    return word.replace('đ', 'd') + 'd'


def _apply_vowel_variant(word):
    """Replace the first matching vowel entry found in *word*.

    Scans the module-level ``vowels`` in order. For the first entry whose
    ``value`` occurs in *word*, one of its three (``raw_telexN``,
    ``raw_telexN_2``) pairs is picked at random: ``value`` is replaced by
    ``raw_telexN`` and ``raw_telexN_2`` is appended to the word. Only that
    first matching entry is applied; without a match the word is returned
    unchanged.
    """
    for v in vowels:
        if v.value in word:
            replacement, suffix = choice([
                (v.raw_telex1, v.raw_telex1_2),
                (v.raw_telex2, v.raw_telex2_2),
                (v.raw_telex3, v.raw_telex3_2),
            ])
            return word.replace(v.value, replacement) + suffix
    return word


def _word_to_raw_telex(word):
    """Convert one space-separated token, keeping a trailing '-' in place."""
    if word == '':
        # Empty tokens come from consecutive spaces; keep them so spacing
        # in the output matches the input.
        return word
    hyphenated = word.endswith('-')
    # Strip the hyphen BEFORE expanding 'đ'. Otherwise the 'd' appended by
    # the second 'đ' variant lands after the hyphen and is then cut off in
    # place of the hyphen (e.g. 'đi-' became 'di--').
    stem = word[:-1] if hyphenated else word
    stem = _expand_leading_d_stroke(stem)
    stem = _apply_vowel_variant(stem)
    return stem + '-' if hyphenated else stem


def unit(standard0):
    """Build the raw-telex text for one row and store it as a RawTelexVal.

    Args:
        standard0: object exposing ``id`` and a string ``value``.

    Newlines are removed from ``standard0.value`` and the text is split on
    single spaces. Each token is converted independently, the tokens are
    re-joined with single spaces, and a trailing newline is appended. The
    result is added to ``db.session`` and committed immediately.
    """
    print(standard0.id, ':', standard0.value)
    words = standard0.value.replace(u'\n', '').split(' ')
    text = ' '.join(_word_to_raw_telex(word) for word in words) + '\n'
    # NOTE(review): this function is handed to engine_crawling_thread with
    # num_threads=5; confirm that db.session is safe to use from that runner.
    raw_telex = RawTelexVal(standard_id=standard0.id, raw_telex=text)
    db.session.add(raw_telex)
    db.session.commit()
# Sequential equivalent, left disabled:
# for i in standard0s:
#     unit(i)

# Hand every loaded row to unit() through the project's runner, requesting
# five threads (presumably it calls unit once per item; confirm in
# tool.engine_crawling_thread).
engine_crawling_thread(
    standard0s,
    unit=unit,
    num_threads=5,
)