-
Notifications
You must be signed in to change notification settings - Fork 1
/
parseTcofSync_10sec.py
288 lines (285 loc) · 12.2 KB
/
parseTcofSync_10sec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET
from sys import argv
from num2words import num2words
from unidecode import unidecode
import re
import os.path
import sys
def transformation_text(text):
    """Normalise one raw transcript line into Kaldi-ready text.

    The line is rejected (returned untouched) when it contains ``###``, a
    ``[...]`` span, or a truncated word, i.e. a hyphen glued to only one
    side of a word (``bon-`` or ``-jour``).  Otherwise the transcription
    mark-up is rewritten:

    * ``4x4`` -> ``4 4`` and ``10h30`` -> ``10 heure 30``;
    * ``/a, b/`` (several candidate transcriptions) -> the first candidate
      that contains neither ``*`` nor a digit;
    * ``{...}`` and ``(...)`` annotations, ``=``, ``$``, ``"``, ``:``, ``?``,
      ``!``, ``<``, ``>``, ``#`` and stray ``/`` are removed; ``,`` and ``.``
      become spaces;
    * ``+`` and ``///`` (pauses) -> ``!SIL``;
    * a ``¤...¤`` span -> ``<NOISE>``; runs of ``*`` -> ``<SPOKEN_NOISE>``;
    * integers preceded by a space are spelled out in French with
      ``num2words``;
    * ``|TAG|`` markers -> ``<TAG>``;
    * whitespace runs are collapsed, one leading space is dropped and the
      result is lower-cased.

    Args:
        text: the raw transcript line.

    Returns:
        A ``(keep, text)`` tuple.  ``keep`` is ``False`` when the line must
        be discarded, in which case ``text`` is the unmodified input;
        otherwise ``text`` is the normalised line.
    """
    # The stdlib ``re`` module has no ``\p{L}`` (it is a pattern error on
    # Python 3); ``[^\W\d_]`` is the equivalent "any letter" class.
    letter = r"[^\W\d_]"
    non_letter = r"[\W\d_]"
    truncated_word = r"{L}+-{N}|{L}+-$|{N}-{L}+|^-{L}+".format(
        L=letter, N=non_letter)

    if ("###" in text
            or re.search(r"\[.+\]", text)
            or re.search(truncated_word, text)):
        return False, text

    def first_usable_variant(match):
        # Keep the first candidate that is neither unintelligible (*) nor
        # numeric; if none qualifies keep the whole content, slashes dropped.
        for variant in match.group(1).split(','):
            if not re.search(r"[*\d]", variant):
                return variant
        return match.group(1)

    def spell_out(match):
        return " " + str(num2words(int(match.group(1)), lang='fr')) + " "

    # "4x4": only an "x" sitting between two digits is a separator.
    text = re.sub(r"(?<=\d)x(?=\d)", " ", text)
    # "10h30" -> "10 heure 30"
    text = re.sub(r"(\d+)h(\d+)", r"\1 heure \2", text)
    # Resolve /a, b/ alternatives while the commas and slashes still exist.
    # The look-arounds keep the "///" pause mark from being read as one.
    text = re.sub(r"(?<!/)/([^/]+)/(?!/)", first_usable_variant, text)
    text = text.replace(",", " ")
    # Non-standard liaison mark.
    text = text.replace("=", "")
    # Transcriber comments; non-greedy so two comments on one line do not
    # swallow the words between them.
    text = re.sub(r"\{[^}]+\}", "", text)
    # NOTE(review): "(" closed by "}" is kept exactly as in the original
    # pattern; presumably it tolerates mistyped comments - confirm.
    text = re.sub(r"\(.+\}", "", text)
    # Undecidable variants such as "on (n') a".
    text = re.sub(r"\([^)]+\)", "", text)
    # Literal "O.K." / "O.K" only (dots escaped).
    text = re.sub(r"O\.K\.?", "ok", text)
    text = text.replace(".", " ")
    # Punctuation is stripped first so the "!" of "!SIL" below survives.
    text = re.sub(r"[:?!<>]|#+", "", text)
    # Short (+) and long (///) pauses.
    text = re.sub(r"\+|///", "!SIL", text)
    text = text.replace("/", "")
    # Beep placed over names of places and persons.
    text = re.sub(r"(¤.+¤)", "<NOISE>", text)
    # Unintelligible syllables.
    text = re.sub(r"\*+", "<SPOKEN_NOISE>", text)
    # Recording cut marks.
    text = re.sub(r"\$+", "", text)
    text = text.replace('"', "")
    # "t 'avais" -> "t avais"; then detach the clitic: "c'est" -> "c' est".
    text = re.sub(r"[ ]'", " ", text)
    text = text.replace("'", "' ")
    # Spell out each number in a single pass so that " 1" is never replaced
    # inside " 10".
    text = re.sub(r" (\d+)", spell_out, text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"^ ", "", text)
    # |TAG| -> <TAG>, one marker at a time.
    text = re.sub(r"\|([^|]+)\|", r"<\1>", text)
    return True, text.lower()
def _kaldi_speaker_id(basename, spkr):
    """Return ``<basename>_spk-NNN`` for a Transcriber speaker id such as ``spk3``."""
    return str(basename) + '_spk-%03d' % int(spkr.split('spk')[1])


def _kaldi_segment_id(basename, spkr, turn_count, count):
    """Return the utterance id ``<basename>_spk-NNN_Turn-NNN_seg-NNNNNNN``."""
    return str(basename) + '_spk-%03d_Turn-%03d_seg-%07d' % (
        int(spkr.split('spk')[1]), int(turn_count), int(count))


def _read_speaker_genders(trsdoc, metadoc):
    """Pair every described speaker with a gender.

    Speaker ids and names come from the ``Speaker`` elements of the
    transcription; genders come from the ``locuteur`` elements of the
    metadata, matched on the unidecoded, lower-cased, space-free name.
    A speaker without a ``sexe`` element, or with an empty one, is given 'm'.

    Returns:
        A list of ``[speaker_id, gender]`` pairs.

    Raises:
        ValueError: a metadata speaker has no counterpart in the transcription.
    """
    speaker_id = []
    namespk = []
    for spk in trsdoc.iter('Speaker'):
        speaker_id.append(spk.get('id').replace(" ", ""))
        namespk.append(unidecode(spk.get('name')).lower().replace(" ", ""))

    speaker_gender = []
    for loc in metadoc.iter('locuteur'):
        if not loc.attrib:
            continue
        name_loc = unidecode(loc.get('identifiant')).replace(" ", "").lower()
        sexe = loc.find('sexe')
        if sexe is None or sexe.text is None:
            gender = 'm'
        else:
            gender = sexe.text.lower()
        speaker_gender.append([speaker_id[namespk.index(name_loc)], gender])
    return speaker_gender


def main(file_trs, outdir, duration=5):
    """Append the Kaldi data files for one Transcriber recording.

    Reads ``file_trs`` and the ``.xml`` metadata file sharing its stem, then
    appends to ``segments``, ``utt2spk``, ``text``, ``wav.scp`` and
    ``spk2gender`` inside ``outdir``.  Transcript pieces are accumulated and
    written as one utterance once the running ``seg_duration`` reaches
    ``duration``.

    Args:
        file_trs: path of the ``.trs`` transcription.
        outdir: directory receiving the Kaldi files (opened in append mode).
        duration: threshold compared against the accumulated duration.
    """
    # splitext keeps dots that belong to the directory part ("./corpus/x.trs").
    stem = os.path.splitext(file_trs)[0]
    basename = os.path.basename(stem)
    file_meta = stem + '.xml'

    trsdoc = ET.parse(file_trs).getroot()
    metadoc = ET.parse(file_meta).getroot()
    speaker_gender = _read_speaker_genders(trsdoc, metadoc)

    text = ""
    Turn_count = 0
    count = 0
    has_attrib_speaker = False
    # Speakers owning at least one written utterance.
    Spk_that_contribute_to_meeting = set()
    start_utt = 0
    seg_duration = 0
    spkr = None
    endTime = None
    element = None

    with open(os.path.join(outdir, 'segments'), 'a') as segments_file, \
            open(os.path.join(outdir, 'utt2spk'), 'a') as utt2spk_file, \
            open(os.path.join(outdir, 'text'), 'a') as text_file, \
            open(os.path.join(outdir, 'wav.scp'), 'a') as wav_scp, \
            open(os.path.join(outdir, 'spk2gender'), 'a') as spk2gender:

        def emit(raw_text, speaker, turn, seg_count, start, end):
            """Normalise raw_text and write one utterance; return True if written."""
            keep, cleaned = transformation_text(raw_text)
            if not keep or cleaned == "":
                return False
            seg_id = _kaldi_segment_id(basename, speaker, turn, seg_count)
            Spk_that_contribute_to_meeting.add(speaker)
            segments_file.write(seg_id + " " + basename + " " + str(start) + " " + str(end) + "\n")
            utt2spk_file.write(seg_id + " " + _kaldi_speaker_id(basename, speaker) + "\n")
            text_file.write(seg_id + " " + cleaned + "\n")
            return True

        # NOTE(review): seg_duration adds (boundary - start_utt) at every
        # boundary although start_utt only moves when something is written,
        # pending text is carried across speaker changes, and rejected text
        # is never dropped.  Kept as in the original - confirm intent.
        for element in trsdoc.iter():
            tag = element.tag
            if tag == "Turn" and element.get('speaker') is None:
                has_attrib_speaker = False
            elif tag == "Turn":
                # Close the pending utterance of the previous speaker turn.
                if Turn_count > 0:
                    count = 0
                    seg_duration = seg_duration + (float(endTime) - float(start_utt))
                    # The raw text stays in `text` until it is written, so it
                    # is never normalised twice.
                    if seg_duration >= duration and emit(
                            text, spkr, Turn_count, count, start_utt, endTime):
                        start_utt = endTime
                        seg_duration = 0
                        text = ""
                has_attrib_speaker = True
                # Only the first speaker of a multi-speaker turn is used.
                spkr = element.get('speaker').split()[0]
                endTime = element.get('endTime')
                Turn_count = Turn_count + 1
            elif tag == "Sync" and has_attrib_speaker:
                Time_start_current_sync = element.get('time')
                if count > 0:
                    seg_duration = seg_duration + (
                        float(Time_start_current_sync) - float(start_utt))
                    if seg_duration >= duration and emit(
                            text, spkr, Turn_count, count, start_utt,
                            Time_start_current_sync):
                        start_utt = Time_start_current_sync
                        text = ""
                        seg_duration = 0
                # A Sync followed directly by another element has no tail.
                text = text + " " + (element.tail or "").replace('\n', '')
                count = count + 1
            elif tag == "Comment" and has_attrib_speaker and element.tail is not None:
                text = text + " " + element.tail.replace('\n', '')
            elif tag == "Event" and has_attrib_speaker and element.tail is not None:
                if element.get('type') == 'noise':
                    if element.get('desc') == 'rire':
                        marker = " |LAUGH| "
                    else:
                        marker = " |NOISE| "
                elif element.get('type') == 'pronounce':
                    marker = " |SPOKEN_NOISE| "
                else:
                    marker = " |NOISE| "
                text = text + marker + element.tail.replace('\n', '')
            elif tag == "Who" and has_attrib_speaker and element.tail is not None:
                text = text + " " + element.tail.replace('\n', '')

        # Last turn: write what is still pending.
        # NOTE(review): this test is strict (>) while the in-loop tests use
        # >=; kept as in the original - confirm which is intended.
        if (count > 0 and has_attrib_speaker and element.tail is not None
                and seg_duration > duration):
            emit(text, spkr, Turn_count, count, start_utt, endTime)

        for spk in speaker_gender:
            if spk[0] in Spk_that_contribute_to_meeting:
                spk2gender.write(_kaldi_speaker_id(basename, spk[0]) + " " + spk[1] + "\n")

        # os.path.join does not turn a bare file name into "/name.wav".
        wav_path = os.path.join(os.path.dirname(file_trs), basename + '.wav')
        wav_scp.write(basename + " sox " + wav_path + " -t wav -r 16000 -c 1 - |\n")


if __name__ == "__main__":
    if len(argv) < 3:
        sys.exit("usage: " + argv[0] + " <file.trs> <outdir>")
    main(argv[1], argv[2])