-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_helper.py
73 lines (64 loc) · 2.63 KB
/
data_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 15 14:29:03 2018
@author: wangxindi
This file is used to preprocess text
1. Percentages (XX%) are converted to "Percentage"
2. Chemical names (words containing both digits and letters) are converted to "Chemical"
3. All remaining numbers are converted to "Num"
4. Mathematical symbols (=, <, >, >/=, </=) are converted to words
5. "-" is replaced with "_"
6. Punctuation is removed
7. Text is converted to lowercase
"""
import re
import numpy as np
from keras.preprocessing import sequence
def text_preprocess(string):
    """Normalise raw text into a lowercase, single-space-separated string.

    The substitutions are applied in this order:

    1. Percentages such as ``12%`` or ``3.5%`` become ``Percentage``.
    2. Words mixing letters and digits (e.g. ``h2o``) become ``Chemical``.
    3. Any remaining run of digits becomes ``Num``.
    4. Comparison symbols become words: ``>/=`` -> ``GreaterAndEqual``,
       ``</=`` -> ``LessAndEqual``, ``=`` -> ``Equal``, ``>`` -> ``Greater``,
       ``<`` -> ``Less``.
    5. Remaining punctuation is replaced by spaces, apostrophes are dropped,
       and runs of whitespace are collapsed.
    6. The result is stripped and lowercased.

    Args:
        string: The text to normalise.

    Returns:
        The normalised text as a ``str``.
    """
    string = re.sub(r"\d+(\.\d+)?%", "Percentage", string)
    string = re.sub(
        r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9]*)", "Chemical", string
    )
    string = re.sub(r"[0-9]+", "Num", string)

    # The compound operators must be handled before their single-character
    # pieces; otherwise "=", ">" and "<" are consumed first and the compound
    # patterns can never match.
    string = string.replace(">/=", "GreaterAndEqual")
    string = string.replace("</=", "LessAndEqual")
    string = string.replace("=", "Equal")
    string = string.replace(">", "Greater")
    string = string.replace("<", "Less")

    string = string.replace("-", "_")
    # NOTE(review): "_" is not in the allowed set below, so the underscore
    # produced above is immediately turned into a space. Kept as-is to avoid
    # changing tokenisation of hyphenated words -- confirm the intended result.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"[.,?;*!%^&+():\[\]{}]", " ", string)

    # Quotes and slashes were already replaced by spaces above; of these
    # removals only the apostrophe one still has an effect.
    for char in ('"', '/', '\\', "'"):
        string = string.replace(char, '')

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
def padding(input_data, maxlen):
    """Return ``input_data`` passed through Keras ``sequence.pad_sequences``.

    Args:
        input_data: The sequences to pad, forwarded unchanged.
        maxlen: Forwarded as the second positional argument of
            ``pad_sequences``.

    Returns:
        Whatever ``sequence.pad_sequences(input_data, maxlen)`` returns.
    """
    return sequence.pad_sequences(input_data, maxlen)
def batch_iter(input_x, input_y, input_y_index, batch_size, num_epochs, max_seq_len, shuffle=True):
    """Generate padded mini-batches over a dataset for several epochs.

    Args:
        input_x: The input sequences; converted with ``np.array``.
        input_y: The targets aligned with ``input_x``. Must support slicing
            and, when ``shuffle`` is true, indexing by an integer array.
        input_y_index: Extra per-sample data aligned with ``input_x``; same
            indexing requirements as ``input_y``.
        batch_size: Maximum number of samples per batch.
        num_epochs: Number of passes over the data.
        max_seq_len: Length passed to ``padding`` for every batch of
            ``input_x``.
        shuffle: If true, draw a new random permutation of the samples at the
            start of every epoch.

    Yields:
        A tuple ``(padded_x, batch_y, batch_y_index)`` for each batch. The
        last batch of an epoch may hold fewer than ``batch_size`` samples.
        An empty dataset yields nothing.
    """
    input_x = np.array(input_x)
    data_size = len(input_x)
    # Integer ceiling division, so an empty dataset gives zero batches.
    num_batches_per_epoch = -(-data_size // batch_size)

    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_x = input_x[shuffle_indices]
            epoch_y = input_y[shuffle_indices]
            epoch_y_index = input_y_index[shuffle_indices]
        else:
            epoch_x = input_x
            epoch_y = input_y
            epoch_y_index = input_y_index

        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            # Slice into separate per-batch names: the epoch-level arrays
            # must stay intact so later batches are cut from the full data,
            # not from the previous batch.
            batch_x = epoch_x[start_index:end_index]
            batch_y = epoch_y[start_index:end_index]
            batch_y_index = epoch_y_index[start_index:end_index]
            yield padding(batch_x, max_seq_len), batch_y, batch_y_index