-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetData.py
125 lines (121 loc) · 3.33 KB
/
getData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# encoding=utf-8
import os,sys,csv
import numpy as np
import pandas as pd
import codecs
import tensorflow as tf
from modules import *
# Translation table built once at import time: the ideographic space maps to
# an ASCII space, and each full-width ASCII variant (U+FF01..U+FF5E) maps to
# the code point 0xFEE0 below it.
_FULL_TO_HALF_TABLE = {0x3000: 0x20}
_FULL_TO_HALF_TABLE.update(
    (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5E + 1)
)


def full_to_half(s):
    """Convert full-width characters in ``s`` to their half-width forms.

    The ideographic space (U+3000) becomes an ASCII space (U+0020) and every
    character in U+FF01..U+FF5E is shifted down by 0xFEE0. Every other
    character is returned unchanged.

    Args:
        s: The text to convert.

    Returns:
        A new string of the same length with the conversions applied.
    """
    return s.translate(_FULL_TO_HALF_TABLE)
def replace_html(s):
    """Decode common HTML entities and drop decorative punctuation.

    The entities ``&quot;``, ``&lt;``, ``&gt;``, ``&nbsp;`` and ``&amp;`` are
    replaced by the characters they stand for. Curly double quotes
    (U+201C, U+201D) and em dashes (U+2014) are removed, and a literal
    no-break space (U+00A0) becomes a plain space.

    Args:
        s: The text to clean.

    Returns:
        The cleaned string.
    """
    entity_map = (
        ('&quot;', '"'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ' '),
        # '&amp;' is decoded last so that text such as '&amp;lt;' yields the
        # literal '&lt;' instead of being unescaped twice into '<'.
        ('&amp;', '&'),
    )
    for entity, replacement in entity_map:
        s = s.replace(entity, replacement)

    # Curly quotes and em dashes are deleted outright.
    for unwanted in ('\u201c', '\u201d', '\u2014'):
        s = s.replace(unwanted, '')

    return s.replace('\xa0', ' ')
# Characters deleted by setdata(): the ideographic full stop, the full-width
# question mark / exclamation mark / comma, their ASCII counterparts plus the
# ASCII full stop, and curly double quotes. The non-ASCII characters are
# written as escapes so the intended code points are unambiguous.
_SETDATA_DELETIONS = str.maketrans(
    '', '', '\u3002\uff1f\uff01\uff0c.,?!\u201c\u201d'
)


def setdata(line):
    """Remove sentence punctuation from ``line``.

    Both the full-width (Chinese) and the ASCII forms of the full stop,
    question mark, exclamation mark and comma are removed, together with
    curly double quotes. All other characters are kept in order.

    Args:
        line: The text to clean.

    Returns:
        The text with the punctuation characters deleted.
    """
    return line.translate(_SETDATA_DELETIONS)
'''
y = tf.constant([[4,2,3,4,5,6,7,8,9]])
enc = embedding(y,
vocab_size=20,
num_units=8,
scale=True,
scope="enc_embed")
key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(enc), axis=-1)), -1)
with tf.Session() as sess:
initall = tf.global_variables_initializer()
sess.run(initall)
print(sess.run(key_masks))
'''
# Update the character-frequency vocabulary.
#
# Existing counts are loaded from data/vocab.tsv, incremented by one for every
# character found in data/train.answer.tsv, and written back to data/vocab.tsv.
# NOTE(review): the counts start from the existing file rather than from an
# empty dict, so repeated runs keep adding to them - confirm this is intended.
# NOTE(review): this section runs before the code further down that rewrites
# data/train.answer.tsv, so it counts the file left by a previous run - confirm.
vocab = {}
with codecs.open('data/vocab.tsv', 'r', 'utf-8') as vocab_file:
    for line in vocab_file.read().splitlines():
        fields = line.rsplit(None, 1)
        if not fields:
            # Blank line: nothing to load.
            continue
        if len(fields) == 2:
            token, count = fields
        else:
            # Only the count survived whitespace splitting, which happens when
            # the token itself is a whitespace character (written below as
            # token + ' ' + count). Recover the token as everything before the
            # single separator character that precedes the count.
            count = fields[0]
            token = line[:max(line.rindex(count) - 1, 0)]
            if not token:
                raise ValueError('malformed vocab line: %r' % line)
        vocab[token] = int(count)

with codecs.open('data/train.answer.tsv', 'r',
                 encoding='utf-8-sig') as answer_file:
    answer_lines = answer_file.read().split('\n')

for sentence in answer_lines:
    # Leading and trailing whitespace is dropped; interior whitespace is
    # counted like any other character.
    for ch in sentence.strip():
        vocab[ch] = vocab.get(ch, 0) + 1

with open('data/vocab.tsv', 'w', encoding='utf-8') as fa:
    for token, count in vocab.items():
        fa.write(token + ' ' + str(count) + '\n')
'''
fp = codecs.open('data/xiaohuangji50w_nofenci.conv','r',encoding='utf-8')
i = 1
asks = []
answers = []
sentence = []
for k,w in enumerate(fp):
w = w.strip()
if k > 0:
if "M" not in w and w != 'E':
continue
if i%3 == 0:
sentence[1] = sentence[1].replace(' ','')
sentence[2] = sentence[2].replace(' ','')
if sentence[1][1:] != '' and sentence[2][1:] != '':
asks.append(sentence[1][1:])
answers.append(sentence[2][1:])
sentence = []
i = 1
sentence.append(w)
else:
i += 1
sentence.append(w)
else:
sentence.append(w)
asks = list(filter(None,asks))
answers = list(filter(None,answers))
'''
# Build the parallel ask/answer training files from data/123.txt.
#
# Lines are taken in alternation: even-numbered lines (0-based) are questions,
# odd-numbered lines are answers. Each line is stripped, converted to
# half-width, HTML-unescaped and de-punctuated before being stored.
# NOTE(review): if the input has an odd number of lines, the ask file ends up
# one line longer than the answer file - confirm the input is always paired.
asks = []
answers = []
with codecs.open('data/123.txt', 'r', encoding='utf-8-sig') as fp:
    for k, w in enumerate(fp):
        w = setdata(replace_html(full_to_half(w.strip())))
        if k % 2 == 0:
            asks.append(w)
        else:
            answers.append(w)

with open('data/train.ask.tsv', 'w', encoding='utf-8') as fa:
    fa.writelines(w + '\n' for w in asks)

with open('data/train.answer.tsv', 'w', encoding='utf-8') as fs:
    fs.writelines(w + '\n' for w in answers)

print('ok')