-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathSequenceMaker.py
49 lines (35 loc) · 1.34 KB
/
SequenceMaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import numpy as np
import json
tokenizer = Tokenizer()
text_file_path = 'text'
def get_raw_data_from_file( path ):
text = str()
with open(path, "r") as fd:
text += fd.read()
return text
raw_text = get_raw_data_from_file( text_file_path )
corpus = raw_text.split( "\n\n" )
tokenizer.fit_on_texts(corpus)
total_words = len( tokenizer.word_index ) + 1
input_sequences = []
for line in corpus:
token_list = tokenizer.texts_to_sequences([line])[0]
for i in range(1, len(token_list)):
n_gram_sequence = token_list[:i + 1]
input_sequences.append(n_gram_sequence)
sequence_lengths = list()
for x in input_sequences:
sequence_lengths.append( len( x ) )
max_sequence_len = max( sequence_lengths )
input_sequences = np.array(pad_sequences(input_sequences,
maxlen=max_sequence_len+1, padding='pre'))
x, y = input_sequences[:, :-1], input_sequences[:, -1]
y = keras.utils.to_categorical(y, num_classes=total_words)
np.save( 'data/x.npy', x)
np.save( 'data/y.npy', y)
config = { 'MAX_SEQUENCE_LEN' : max_sequence_len , 'TOTAL_WORDS' : total_words }
with open( 'data/config.json' , 'w' ) as file :
json.dump( config , file )