-
Notifications
You must be signed in to change notification settings - Fork 0
/
textprocess.py
165 lines (135 loc) · 6.32 KB
/
textprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os
import re
from pathlib import Path
def preprocessText(post_id):
    """
    Clean the saved text of a post in place so it reads well through TTS.

    Reads ``posts/<post_id>.txt`` as UTF-8, removes links and markdown
    punctuation, converts typographic punctuation to its ASCII equivalent,
    drops every remaining non-ASCII character, dots out a few acronyms and
    writes the result back to the same file.

    Args:
        post_id: Identifier used to build the file name under ``posts/``.

    Raises:
        OSError: Propagated from ``open()`` if the file cannot be read or
            written (e.g. it does not exist).
    """
    file_path = os.path.abspath(f'posts/{post_id}.txt')
    with open(file_path, 'r', encoding='utf-8') as f:
        file_contents = f.read()

    # Links go first: later steps rewrite characters that are legal inside a
    # URL (such as '_'), which would split the link and leave residue behind.
    file_contents = re.sub(r'http\S+', '', file_contents)  # links starting with http
    file_contents = re.sub(r'www\.\S+', '', file_contents)  # links starting with www

    # Single-character clean-up in one pass. This must run BEFORE the
    # non-ASCII filter below, otherwise the typographic quotes are deleted
    # instead of converted ("don’t" would become "dont").
    file_contents = file_contents.translate(str.maketrans({
        '*': None,      # markdown emphasis
        '(': None,
        ')': None,
        '’': "'",       # curly single quotes ruin the TTS
        '‘': "'",
        '“': '"',
        '”': '"',
        ';': '.',
        '_': ' ',
        '…': '...',     # keep the pause as plain ASCII dots
    }))

    # Emojis become a space so the neighbouring words do not get glued together.
    file_contents = file_contents.replace('\u2665\ufe0f', ' ')  # heart + variation selector
    file_contents = file_contents.replace('\U0001F602', ' ')    # face with tears of joy

    # Whatever non-ASCII is left has no ASCII equivalent here; drop it.
    file_contents = removeUnicode(file_contents)

    # Dot out acronyms for the TTS. The lookbehind keeps words that merely
    # contain the letters (e.g. "BEAUTIFUL") intact.
    file_contents = re.sub(r'(?<![A-Za-z])TIFU', 'T.I.F.U', file_contents)
    file_contents = re.sub(r'(?<![A-Za-z])AITA', 'A.I.T.A', file_contents)
    file_contents = re.sub(r'(?<![A-Za-z])WIBTA', 'W.I.B.T.A', file_contents)
    # No trailing space in the pattern, so the space after the acronym survives.
    file_contents = file_contents.replace('A-I-T-A', 'A.I.T.A')

    # Write the updated contents back to the file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(file_contents)
def preprocessString(text):
    """
    Return a copy of ``text`` cleaned up for text-to-speech.

    Removes asterisks, links and parentheses, converts curly single quotes to
    straight ones, turns semicolons into full stops and dots out the
    acronyms TIFU, AITA and WIBTA.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Replace all asterisks with empty string
    text = text.replace('*', '')
    text = re.sub(r'http\S+', '', text)  # Remove any links starting with http
    text = re.sub(r'www\.\S+', '', text)  # Remove any links starting with www

    # Remaining single-character clean-up in one pass.
    text = text.translate(str.maketrans({
        '(': None,
        ')': None,
        '’': "'",   # curly single quotes ruin the TTS
        '‘': "'",
        ';': '.',
    }))

    # Replace words for TTS to process easier and more consistently.
    # The lookbehind stops the acronym from matching in the middle of another
    # word, e.g. "BEAUTIFUL" contains "TIFU" and must stay untouched.
    text = re.sub(r'(?<![A-Za-z])TIFU', 'T.I.F.U', text)
    text = re.sub(r'(?<![A-Za-z])AITA', 'A.I.T.A', text)
    text = re.sub(r'(?<![A-Za-z])WIBTA', 'W.I.B.T.A', text)
    # Hyphen-spelled variants; both spellings are accepted.
    text = text.replace('A-I-T-A', 'A.I.T.A')
    text = text.replace('A-A-T-A', 'A.I.T.A')
    return text
def removeUnicode(text):
    """
    Return ``text`` with every non-ASCII character removed.

    A character is kept only if its code point is below 128; everything else
    (accented letters, typographic quotes, emojis, ...) is dropped without a
    replacement.

    Args:
        text: The string to filter.

    Returns:
        A new string containing only the ASCII characters of ``text``, in
        their original order.
    """
    return "".join(char for char in text if ord(char) <= 127)
def processSRT(post_id):
    """
    Fix up the subtitle file of a post in place.

    Reads ``srt/<post_id>.srt`` as UTF-8, rewrites the known mis-spellings of
    the acronyms AITA, WIBTA and TIFU back to the acronym, removes
    parentheses, converts typographic quotes to straight ones, turns
    semicolons into full stops and writes the result back to the same file.

    Args:
        post_id: Identifier used to build the file name under ``srt/``.

    Raises:
        OSError: Propagated from ``open()`` if the file cannot be read or
            written (e.g. it does not exist).
    """
    file_path = os.path.abspath(f'srt/{post_id}.srt')
    with open(file_path, 'r', encoding='utf-8') as f:
        file_contents = f.read()

    # Mis-spelled acronyms. Whole-word matching only, so ordinary words that
    # merely contain one of the spellings (e.g. "Idaho") are left alone.
    file_contents = re.sub(r'\b(?:aita|aida|Ida|AATA)\b', 'AITA', file_contents)
    file_contents = re.sub(r'\bWipta\b', 'WIBTA', file_contents)
    file_contents = re.sub(r'\bTyphoo\b', 'TIFU', file_contents)
    # Hyphen-spelled forms "A-A-T-A ", "A-I-T-A " and "A-T-A ". The trailing
    # space is put back so the acronym is not glued to the next word.
    file_contents = re.sub(r'\bA-(?:[AI]-)?T-A ', 'AITA ', file_contents)
    # Dotted form; the optional leading "A." stops "A.I.T.A." from turning
    # into "A.AITA".
    file_contents = re.sub(r'(?:\bA\.)?I\.T\.A\.', 'AITA', file_contents)

    # Punctuation clean-up in one pass.
    file_contents = file_contents.translate(str.maketrans({
        '(': None,
        ')': None,
        '’': "'",   # curly single quotes -> straight
        '‘': "'",
        '“': '"',   # curly double quotes -> straight
        '”': '"',
        ';': '.',
    }))

    # Write the updated contents back to the file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(file_contents)
def createPostTextFile(title, body, author, post_id, input_text):
    """
    Creates a text file with post information for text-to-speech processing.

    Writes ``input_text`` to ``posts/<post_id>.txt`` as UTF-8. If that file
    already exists it is left untouched and a notice is printed instead. The
    ``posts`` directory is created when missing.

    Args:
        title: Not used by this function; kept for the existing call signature.
        body: Not used by this function; kept for the existing call signature.
        author: Not used by this function; kept for the existing call signature.
        post_id: Identifier used as the file name (without extension).
        input_text: The exact text written to the file.

    Returns:
        None, whether the file was written or skipped.
    """
    path = Path("posts") / f"{post_id}.txt"

    if path.exists():
        print(f"{post_id} - Skipping TXT creation for {post_id}. File already exists.")
        return None

    path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the file is read back as UTF-8 later, and the platform
    # default encoding may not be able to represent every character.
    with open(path, "w", encoding="utf-8") as file:
        file.write(input_text)
    print(f"Saved post with ID {post_id} to {path}")
def identifyUnicode(text):
    """
    Print how often each non-ASCII, non-letter character occurs in ``text``.

    Letters are skipped (including non-ASCII ones, since ``str.isalpha`` is
    true for them), so the report covers symbols such as typographic quotes,
    dashes and emojis. One line is printed per distinct character, in order
    of first appearance; if there are none, a single notice is printed.

    Args:
        text: The string to inspect. It is analysed as-is; stripping the
            non-ASCII characters first would leave nothing to report.

    Returns:
        None. The result is printed to standard output.
    """
    # Count of each non-ASCII character, keyed by the character itself.
    char_counts = {}
    for char in text:
        # Ignore letters (capital or lowercase)
        if char.isalpha():
            continue
        # Only characters outside the ASCII range are of interest.
        if ord(char) > 127:
            char_counts[char] = char_counts.get(char, 0) + 1

    # Output the characters and their counts in the desired format
    if char_counts:
        for char, count in char_counts.items():
            print(f"Unicode ({ord(char)}), {char}, shown ({count}) times")
    else:
        print("No Unicode characters were found outside of the ASCII range")