-
Notifications
You must be signed in to change notification settings - Fork 0
/
match_text.py
52 lines (39 loc) · 1.63 KB
/
match_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas
import os
import io
import numpy as np
# Transcription listings for the two speaker groups. Each line is expected to
# look like "<ID>_<text> - <transcription>" (layout inferred from the parsing
# below -- confirm against the data files).
path_hc = "./data/textHC.dat"
path_pd = "./data/textPD.dat"
# Directory of the matching audio files (not used in this part of the script)
path_audio = "./data/audios"
# Directory that receives one .txt transcription per listing line
path_transcriptions = "data/transcriptions"


def parse_transcription_line(line, id_marker):
    """Split one listing line into an output file name and its transcription.

    Listing IDs look like ``NEHCXXXX_text`` while the corresponding audio
    files are named ``HC_text_XXXX.wav``, so the ID parts are reordered into
    ``HC_text_XXXX.txt``.

    Args:
        line: One raw line of a listing file, with or without its newline.
        id_marker: Letter that directly precedes the four-character speaker
            number inside the ID ("C" for HC listings, "D" for PD listings).

    Returns:
        A ``(file_name, transcription)`` tuple of strings.

    Raises:
        IndexError: If the line lacks the "_" separator, the "-" separator
            or the ``id_marker`` letter.
    """
    # The ID is everything before the first space.
    file_id = line.split(" ")[0]
    # Group code: characters 2-3 of the prefix before the first "0"
    # (e.g. "HC" in "NEHC0001_text").
    group = file_id.split("0")[0][2:4]
    text_label = file_id.split("_")[1]
    speaker = file_id.split(id_marker)[1][:4]
    file_name = group + "_" + text_label + "_" + speaker + ".txt"

    # Split only on the first "-" so hyphens inside the transcription survive.
    body = line.rstrip("\n").split("-", 1)[1]
    # Drop the separator space after "-" and the final character of the line
    # (presumably closing punctuation -- confirm against the data). Removing
    # the newline first keeps a last line without "\n" from losing an extra
    # character.
    transcription = body[1:-1]
    return file_name, transcription


def export_transcriptions(listing_path, id_marker, out_dir=path_transcriptions):
    """Write every transcription of a listing file to its own text file.

    Args:
        listing_path: Path of the listing file; it is decoded as latin1
            because the transcriptions contain accented characters.
        id_marker: Forwarded to ``parse_transcription_line``.
        out_dir: Destination directory; created when it does not exist.

    Returns:
        The list of written file paths, in listing order.
    """
    os.makedirs(out_dir, exist_ok=True)
    written = []
    with open(listing_path, "r", encoding="latin1") as listing:
        for line in listing:
            # Blank lines (e.g. a trailing empty line) carry no entry.
            if not line.strip():
                continue
            file_name, transcription = parse_transcription_line(line, id_marker)
            out_path = os.path.join(out_dir, file_name)
            # Explicit encoding: the accented text must not depend on the
            # platform's default locale encoding.
            with open(out_path, "w", encoding="utf-8") as out_file:
                out_file.write(transcription)
            written.append(out_path)
    return written


def main():
    """Export the HC transcriptions, then the PD transcriptions."""
    export_transcriptions(path_hc, "C")
    export_transcriptions(path_pd, "D")


if __name__ == "__main__":
    main()