import requests
import pandas as pd
from path import Path
- from src.parameters import *
+ from parameters import *


- def downloadData(data_path='/input/speech_commands/'):
+ def downloadData(data_path="/input/speech_commands/"):
    """
    Downloads Google Speech Commands dataset (version 0.01)
    :param data_path: Path to download dataset
@@ -15,10 +15,10 @@ def downloadData(data_path='/input/speech_commands/'):

    dataset_path = Path(os.path.abspath(__file__)).parent.parent + data_path

-     datasets = ['train', 'test']
+     datasets = ["train", "test"]
    urls = [
-         'http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
-         'http://download.tensorflow.org/data/speech_commands_test_set_v0.01.tar.gz'
+         "http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz",
+         "http://download.tensorflow.org/data/speech_commands_test_set_v0.01.tar.gz",
    ]

    for dataset, url in zip(datasets, urls):
@@ -27,7 +27,7 @@ def downloadData(data_path='/input/speech_commands/'):
        # Check if we need to extract the dataset
        if not os.path.isdir(dataset_directory):
            os.makedirs(dataset_directory)
-             file_name = dataset_path + dataset + '.tar.gz'
+             file_name = dataset_path + dataset + ".tar.gz"

            # Check if the dataset has been downloaded, else download it
            if os.path.isfile(file_name):
@@ -36,7 +36,7 @@ def downloadData(data_path='/input/speech_commands/'):
                print("Downloading '{}' into '{}' file".format(url, file_name))

                data_request = requests.get(url)
-                 with open(file_name, 'wb') as file:
+                 with open(file_name, "wb") as file:
                    file.write(data_request.content)

            # Extract downloaded file
@@ -54,7 +54,7 @@ def downloadData(data_path='/input/speech_commands/'):
    print("Input data setup successful.")


- def getDataDict(data_path='/input/speech_commands/'):
+ def getDataDict(data_path="/input/speech_commands/"):
    """
    Creates a dictionary with train, validation, dev and test file names and labels.
    :param data_path: Path to the downloaded dataset
@@ -64,24 +64,24 @@ def getDataDict(data_path='/input/speech_commands/'):
    data_path = Path(os.path.abspath(__file__)).parent.parent + data_path

    # Get the validation files
-     validation_files = open(data_path + 'train/validation_list.txt').read().splitlines()
-     validation_files = [data_path + 'train/' + file_name for file_name in validation_files]
+     validation_files = open(data_path + "train/validation_list.txt").read().splitlines()
+     validation_files = [data_path + "train/" + file_name for file_name in validation_files]

    # Get the dev files
-     dev_files = open(data_path + 'train/testing_list.txt').read().splitlines()
-     dev_files = [data_path + 'train/' + file_name for file_name in dev_files]
+     dev_files = open(data_path + "train/testing_list.txt").read().splitlines()
+     dev_files = [data_path + "train/" + file_name for file_name in dev_files]

    # Find train_files as allFiles - {validation_files, dev_files}
    all_files = []
-     for root, dirs, files in os.walk(data_path + 'train/'):
-         all_files += [root + '/' + file_name for file_name in files if file_name.endswith('.wav')]
+     for root, dirs, files in os.walk(data_path + "train/"):
+         all_files += [root + "/" + file_name for file_name in files if file_name.endswith(".wav")]

    train_files = list(set(all_files) - set(validation_files) - set(dev_files))

    # Get the test files
    test_files = list()
-     for root, dirs, files in os.walk(data_path + 'test/'):
-         test_files += [root + '/' + file_name for file_name in files if file_name.endswith('.wav')]
+     for root, dirs, files in os.walk(data_path + "test/"):
+         test_files += [root + "/" + file_name for file_name in files if file_name.endswith(".wav")]

    # Get labels
    validation_file_labels = [getLabel(wav) for wav in validation_files]
@@ -90,17 +90,12 @@ def getDataDict(data_path='/input/speech_commands/'):
    test_file_labels = [getLabel(wav) for wav in test_files]

    # Create dictionaries containing (file, labels)
-     trainData = {'files': train_files, 'labels': train_file_labels}
-     valData = {'files': validation_files, 'labels': validation_file_labels}
-     devData = {'files': dev_files, 'labels': dev_file_labels}
-     testData = {'files': test_files, 'labels': test_file_labels}
-
-     dataDict = {
-         'train': trainData,
-         'val': valData,
-         'dev': devData,
-         'test': testData
-     }
+     trainData = {"files": train_files, "labels": train_file_labels}
+     valData = {"files": validation_files, "labels": validation_file_labels}
+     devData = {"files": dev_files, "labels": dev_file_labels}
+     testData = {"files": test_files, "labels": test_file_labels}
+
+     dataDict = {"train": trainData, "val": valData, "dev": devData, "test": testData}

    return dataDict
@@ -112,8 +107,8 @@ def getLabel(file_name):
    :return: Class label
    """

-     category = file_name.split('/')[-2]
-     label = categories.get(category, categories['_background_noise_'])
+     category = file_name.split("/")[-2]
+     label = categories.get(category, categories["_background_noise_"])

    return label
@@ -127,9 +122,9 @@ def getDataframe(data, include_unknown=False):
    """

    df = pd.DataFrame(data)
-     df['category'] = df.apply(lambda row: inv_categories[row['labels']], axis=1)
+     df["category"] = df.apply(lambda row: inv_categories[row["labels"]], axis=1)

    if not include_unknown:
-         df = df.loc[df['category'] != '_background_noise_', :]
+         df = df.loc[df["category"] != "_background_noise_", :]

    return df
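
For reference, a minimal usage sketch of the helpers touched by this diff. The module name `dataUtils` and the call order are assumptions for illustration, not part of the commit; `parameters` is expected to define `categories` and `inv_categories` as used by `getLabel` and `getDataframe`.

# Hypothetical usage sketch -- module name and flow assumed, not taken from this commit
from dataUtils import downloadData, getDataDict, getDataframe

downloadData()                              # download and extract the train/test archives
data_dict = getDataDict()                   # {"train"/"val"/"dev"/"test": {"files": [...], "labels": [...]}}
train_df = getDataframe(data_dict["train"])                        # drops _background_noise_ rows
test_df = getDataframe(data_dict["test"], include_unknown=True)    # keeps them
print(train_df[["files", "labels", "category"]].head())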