refine.py

import os
import json
import regex as re
from tqdm import tqdm
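
# NOTE: the third-party `regex` module is imported as `re` because the
# variable-width lookbehind `(?<=^|\s)` used in preprocess_text below is
# not supported by the standard-library `re` module.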
# Define the preprocessing functions
def preprocess_text(text):
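    """Clean a single utterance string.

    The numbered steps below normalize whitespace, collapse repeated jamo
    and punctuation, strip special characters and greetings, rename the
    "name1"/"name2" placeholders to speaker labels, remove jamo-only runs,
    and blank out non-Korean noise. The stripped result may be an empty
    string.
    """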
    text = str(text)
    # 1. Replace '\n' and double spaces '  ' with a single space
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    # 2. Replace standalone runs of two or more 'ㅇ' with '응'
    text = re.sub(r'(?<=^|\s)ㅇ{2,}(?=$|\s)', '응', text)
    # 3. Collapse repeated punctuation
    text = re.sub(r"\.{4,}", '...', text)
    text = re.sub(r"\?{2,}", '??', text)
    # 4. Remove special characters
    special_characters = '~!><@_/-α’:;()^'
    for char in special_characters:
        text = text.replace(char, '')
    # 5. Remove greeting words matching regex patterns
    text = re.sub(r'\b안녕\w*', '', text)
    text = re.sub(r'\b반갑습\w*', '', text)
    # 6. Replace "name1" with "화자1" and "name2" with "화자2"
    text = text.replace('name1', '화자1').replace('name2', '화자2')
    # 7. Remove consonant/vowel-only text like "ㅋㅋㅋ", "ㅎㅎㅎ", "ㅠㅠㅠ"
    text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]+', '', text)
    # 8. Blank out text that contains no Hangul but does contain special characters
    if not re.search(r'[가-힣]', text) and re.search(r'[^\w\s]', text):
        text = ''
    return text.strip()
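
# Illustrative checks (literals assume the reconstructed patterns above):
# preprocess_text('안녕하세요!! ㅋㅋㅋ 반가워요') -> '반가워요'
# preprocess_text('ㅇㅇ') -> '응'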
# Function to merge consecutive utterances of the same speaker
def merge_consecutive_utterances(conversation, reference_id):
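    """Merge consecutive utterances spoken by the same speaker.

    An utterance is merged only when neither its own id nor the most
    recently accumulated id appears in `reference_id`, so referenced
    utterances always survive as separate turns. Merged turns carry a
    list of utterance_ids; unmerged turns keep a single scalar id.
    """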
    merged_conversation = []
    previous_speaker = None
    previous_utterance = ''
    previous_utterance_ids = []
    for utterance in conversation:
        speaker = utterance['speaker']
        text = utterance['utterance']
        utterance_id = utterance['utterance_id']
        if (speaker == previous_speaker
                and utterance_id not in reference_id
                and previous_utterance_ids[-1] not in reference_id):
            # Merge with previous utterance
            previous_utterance += ' ' + text
            previous_utterance_ids.append(utterance_id)
        else:
            if previous_speaker is not None and previous_utterance.strip() != '':
                # Append the previous utterance to the merged conversation
                merged_conversation.append({
                    'speaker': previous_speaker,
                    'utterance': previous_utterance.strip(),
                    'utterance_id': previous_utterance_ids if len(previous_utterance_ids) > 1 else previous_utterance_ids[0]
                })
            # Start a new utterance
            previous_speaker = speaker
            previous_utterance = text
            previous_utterance_ids = [utterance_id]
    # Add the last accumulated utterance
    if previous_speaker is not None and previous_utterance.strip() != '':
        merged_conversation.append({
            'speaker': previous_speaker,
            'utterance': previous_utterance.strip(),
            'utterance_id': previous_utterance_ids if len(previous_utterance_ids) > 1 else previous_utterance_ids[0]
        })
    return merged_conversation
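
# Illustrative behaviour (hypothetical records): with reference_id=[],
# [{'speaker': 'P1', 'utterance': '응', 'utterance_id': 'u1'},
#  {'speaker': 'P1', 'utterance': '맞아', 'utterance_id': 'u2'}]
# merges into one turn {'speaker': 'P1', 'utterance': '응 맞아',
# 'utterance_id': ['u1', 'u2']}; with reference_id=['u2'] the two turns
# stay separate and keep their scalar ids.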
# Define input and output directories
input_data_dir = 'resource/data'
output_data_dir = 'resource/refined_data'
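
# Assumed input schema (inferred from the loop below): each JSON file holds a
# list of records of the form
# {'input': {'conversation': [{'speaker': ..., 'utterance': ..., 'utterance_id': ...}, ...],
#            'reference_id': [...]},
#  ...}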
if __name__ == "__main__":
    # Ensure the output directory exists
    os.makedirs(output_data_dir, exist_ok=True)
    for filename in tqdm(os.listdir(input_data_dir)):
        if filename.endswith('.json'):
            filepath = os.path.join(input_data_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
            output_json_data = []
            for datum in json_data:
                # Preprocess each conversation's utterances and remove empty utterances
                preprocessed_conversation = []
                for conversation in datum['input']['conversation']:
                    conversation['utterance'] = preprocess_text(conversation['utterance'])
                    if conversation['utterance'] != '':
                        preprocessed_conversation.append(conversation)
                # Merge consecutive utterances
                datum['input']['conversation'] = merge_consecutive_utterances(
                    preprocessed_conversation, datum['input']['reference_id'])
                output_json_data.append(datum)
            # Define the output file path
            output_filepath = os.path.join(output_data_dir, filename)
            # Write the processed data to the output file
            with open(output_filepath, 'w', encoding='utf-8') as file:
                json.dump(output_json_data, file, ensure_ascii=False, indent=4)
    print("Preprocessing, merging, and filtering complete.")