-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_dataset.py
104 lines (83 loc) · 3.54 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# This script takes as an argument the path to the folder which
# contains folders of images.
# It is assumed that the name of each folder with images is
# the label for the images, that is, all the images in each folder belong
# to the same class, and the name of that class is the name of the folder.
# Images are assumed to be in the .png format. It is also assumed that
# each folder has the same number of images. It is NOT assumed that all images
# have the same dimensionality, but all the images will be rescaled to the
# size given by SHAPE (96x96) before being saved into the dataset file.
# The total number of images is assumed to be divisible by 10.
# The script will produce a file named "characters_dataset" which will
# contain the train/test datasets and labels in numpy arrays.
# The file will also contain the names of all the
# image folders in alphabetical order.
import os
import sys
from scipy import misc
import numpy as np
from sklearn.model_selection import train_test_split
# The path that you have your image folders in.
# Exit with a usage message instead of an IndexError traceback when the
# argument is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s <path-to-image-folders>" % sys.argv[0])
path = sys.argv[1]

# We rescale each image to be of size "SHAPE"
SHAPE = (96, 96)

# folder_names is a sorted list containing names of all the folders with
# images; plain files that sit directly under "path" are skipped.
folder_names = [
    name for name in sorted(os.listdir(path))
    if os.path.isdir(os.path.join(path, name))
]
# Without at least one class folder, folder_files[0] below would raise an
# IndexError, so stop here with a readable message.
if not folder_names:
    sys.exit("no image folders found in %r" % path)

# Each element of folder_files is a sorted list of file names
# that are contained within a folder from folder_names
folder_files = [
    sorted(os.listdir(os.path.join(path, folder_name)))
    for folder_name in folder_names
]

number_of_classes = len(folder_names)

# We assume that all classes have the same number of elements, so the first
# folder decides how many files are taken from every class.
number_of_examples_per_class = len(folder_files[0])
if number_of_examples_per_class == 0:
    sys.exit("folder %r contains no files" % folder_names[0])

# Every folder is indexed up to number_of_examples_per_class when the images
# are loaded, so a folder with fewer files than the first one cannot be
# processed. Report it up front rather than failing part-way through loading.
short_folders = [
    folder_name
    for folder_name, files in zip(folder_names, folder_files)
    if len(files) < number_of_examples_per_class
]
if short_folders:
    sys.exit(
        "expected at least %d files per folder, but these have fewer: %s"
        % (number_of_examples_per_class, ", ".join(short_folders)))
# Accumulators for the image samples (X) and their integer class labels (y).
X, y = [], []

# Walk the class folders in sorted order; a sample's label is the position of
# its folder in folder_names. Only the first number_of_examples_per_class
# files of each folder are read.
# NOTE(review): scipy.misc.imread and scipy.misc.imresize were deprecated in
# SciPy 1.0 and removed in later releases, so this needs an old SciPy
# installation to run - confirm the target environment.
for label, (class_name, file_names) in enumerate(
        zip(folder_names, folder_files)):
    for sample_index in range(number_of_examples_per_class):
        image_location = os.path.join(
            path, class_name, file_names[sample_index])
        pixels = misc.imread(image_location)
        resized = misc.imresize(
            pixels, size=SHAPE, interp='bilinear', mode=None)
        X.append(resized)
        y.append(label)

# Stack the samples into one array, divide by 255 and store the result as
# float32 (for use with GPU); labels become int32.
X = (np.array(X) / 255.0).astype(np.float32)
y = np.array(y).astype(np.int32)

# Class names (the folder names), in label order.
hex_codes = np.array(folder_names)
def _interleave_classes(samples, examples_per_class):
    """Reorder class-grouped samples so that classes alternate.

    "samples" is expected to hold examples_per_class consecutive entries for
    each class. The result first lists entry 0 of every class, then entry 1
    of every class, and so on.
    """
    return np.concatenate(
        [samples[offset::examples_per_class]
         for offset in range(examples_per_class)])


# Reorder the data so that every consecutive run of "number_of_classes"
# samples contains exactly one sample of each class. A sequential cut of the
# reordered data at a multiple of "number_of_classes" is therefore balanced
# with respect to classes.
X = _interleave_classes(X, number_of_examples_per_class)
y = _interleave_classes(y, number_of_examples_per_class)

dataset_size = number_of_classes * number_of_examples_per_class
# Split the data 80% / 20% into train and test parts with a fixed seed so the
# split is reproducible.
# NOTE(review): train_test_split shuffles by default, so the class-balanced
# ordering prepared before this call is not carried into the split - confirm
# whether shuffle=False or stratify=y was intended.
split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

# Dump the sizes, class names and resulting arrays for a quick visual check.
print(dataset_size)
print(hex_codes)
for caption, values in (
        ("Xtrain", X_train),
        ("X test", X_test),
        ("ytrain", y_train),
        ("y test", y_test)):
    print(caption, values)
# Write all arrays back-to-back into one file, in this order:
# X_train, y_train, X_test, y_test, hex_codes.
# The "with" block closes the file even if one of the np.save calls raises.
with open("characters_dataset", "wb") as f:
    np.save(f, X_train)
    np.save(f, y_train)
    np.save(f, X_test)
    np.save(f, y_test)
    np.save(f, hex_codes)  # hex codes of each class (same as folder names)