forked from modelscope/FunASR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_data.py
executable file
·60 lines (45 loc) · 1.51 KB
/
split_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import sys
import random
# Command-line arguments:
#   argv[1]  in_dir    - input directory; "<in_dir>/wav.scp" and "<in_dir>/text"
#                        are read further down in this script.
#   argv[2]  out_dir   - output root; splits are written below
#                        "<out_dir>/split<num_split>/<index>/".
#   argv[3]  num_split - number of splits; kept as a string here and converted
#                        with int() right before it is used.
if len(sys.argv) < 4:
    # Exit with a usage message instead of a bare IndexError traceback.
    sys.exit("Usage: {} <in_dir> <out_dir> <num_split>".format(sys.argv[0]))
in_dir = sys.argv[1]
out_dir = sys.argv[2]
num_split = sys.argv[3]
def split_scp(scp, num):
    """Split a sequence into ``num`` consecutive chunks.

    The first ``num - 1`` chunks each hold ``len(scp) // num`` items; the
    last chunk takes everything that is left, so it absorbs the remainder
    and may be larger than the others.

    Args:
        scp: Sliceable sequence (e.g. list or tuple) of entries to split.
        num: Number of chunks to produce; must satisfy
            ``1 <= num <= len(scp)``.

    Returns:
        A list of ``num`` slices of ``scp``, preserving the input order.

    Raises:
        ValueError: If ``num`` is smaller than 1 or larger than ``len(scp)``.
    """
    # Explicit check instead of ``assert``: asserts are stripped under
    # ``python -O``, which would silently produce empty chunks, and a
    # non-positive ``num`` would otherwise divide by zero or return ``[]``.
    if num < 1 or len(scp) < num:
        raise ValueError(
            "cannot split {} entries into {} parts".format(len(scp), num)
        )
    avg = len(scp) // num
    # Equal-sized chunks for all but the last split ...
    chunks = [scp[i * avg:(i + 1) * avg] for i in range(num - 1)]
    # ... and the last split gets the tail, including any remainder.
    chunks.append(scp[(num - 1) * avg:])
    return chunks
# Script body: read the paired wav.scp / text files, shuffle them together so
# that line i of one file still corresponds to line i of the other, cut the
# result into `num_split` parts and write each part to its own directory.
wav_scp_path = "{}/wav.scp".format(in_dir)
text_path = "{}/text".format(in_dir)

# The existence checks used to be evaluated and thrown away; enforce them so
# a missing input is reported with a clear message.
for required_path in (wav_scp_path, text_path):
    if not os.path.exists(required_path):
        raise FileNotFoundError(
            "required input file not found: {}".format(required_path)
        )

# Explicit encoding so the result does not depend on the platform locale.
# NOTE(review): assumes the data files are UTF-8 - confirm for your data.
with open(wav_scp_path, 'r', encoding="utf-8") as infile:
    wav_list = infile.readlines()
with open(text_path, 'r', encoding="utf-8") as infile:
    text_list = infile.readlines()

if len(wav_list) != len(text_list):
    raise ValueError(
        "wav.scp has {} lines but text has {} lines".format(
            len(wav_list), len(text_list)
        )
    )
if not wav_list:
    raise ValueError("no entries found in {}".format(wav_scp_path))

# Only the final line of a file can lack a newline. Terminate it so that,
# once shuffled into the middle of an output file, it does not fuse with
# the entry written after it.
for lines in (wav_list, text_list):
    if not lines[-1].endswith("\n"):
        lines[-1] += "\n"

# Shuffle the pairs as a unit to keep wav and text entries aligned.
pairs = list(zip(wav_list, text_list))
random.shuffle(pairs)
wav_shuffle_list, text_shuffle_list = zip(*pairs)

num_split = int(num_split)
wav_split_list = split_scp(wav_shuffle_list, num_split)
text_split_list = split_scp(text_shuffle_list, num_split)

# Both split lists have the same chunk boundaries, so one pass writes the
# matching wav.scp and text files for each 1-based split index.
split_root = os.path.join(out_dir, "split{}".format(num_split))
for idx, (wav_chunk, text_chunk) in enumerate(
        zip(wav_split_list, text_split_list), 1):
    path = os.path.join(split_root, str(idx))
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, "wav.scp"), 'w', encoding="utf-8") as wav_writer:
        wav_writer.writelines(wav_chunk)
    with open(os.path.join(path, "text"), 'w', encoding="utf-8") as text_writer:
        text_writer.writelines(text_chunk)