-
Notifications
You must be signed in to change notification settings - Fork 0
/
translate-gpt.py
135 lines (108 loc) · 5.21 KB
/
translate-gpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import argparse
import sys
import re
import unicodedata
from charset_normalizer import from_path
import concurrent.futures
import openai
from GlotScript import get_script_predictor
# Module-level script predictor; extract_unicode_blocks() calls it on each
# candidate chunk and inspects the first element of the result.
sp = get_script_predictor()
import openai  # NOTE(review): duplicates the `import openai` a few lines above; harmless.
# Read the API key from the environment; os.getenv yields None when
# OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")
def translate_with_chatgpt(batch):
    """Translate a batch of text fragments to English with the OpenAI chat API.

    The fragments are sent as one newline-joined user message. The reply is
    streamed, echoed to stdout as it arrives, and then split back into lines
    that are paired positionally with the inputs.

    Args:
        batch: Iterable of strings to translate, one per expected output line.
            Presumably none of them contains a newline (the caller derives
            them from single source lines) -- TODO confirm for other callers.

    Returns:
        dict mapping each input string to its translated line. Empty when
        ``batch`` is empty (no API call is made in that case).

    Raises:
        ValueError: if the reply does not contain exactly one non-blank line
            per input. Positional pairing would otherwise attach translations
            to the wrong fragments, which then get written into source files.
    """
    batch = list(batch)
    if not batch:
        # Nothing to translate; skip the API round-trip entirely.
        return {}

    system_prompt = """
Your role involves translating non-English text from source code into concise English. These translations will replace the original text in the code, so brevity is key. Focus on translating GUI labels concisely, treating them as labels or proper nouns rather than detailed descriptions. If possible, use acronyms, omit standard stopwords, and use shorter English equivalents to keep the translations compact.
Maintain any existing punctuation or symbols in the translations, as they are critical for preserving code context and functionality.
Given the compact nature of Chinese, be mindful of the layout impact when translating brief Chinese phrases into potentially longer English phrases.
Present your translations line-by-line, ensuring that the number of output lines matches the number of input lines exactly. This alignment is crucial for integrating the translations back into the source code seamlessly."""

    response = openai.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "\n".join(batch)},
        ],
        top_p=1,
        frequency_penalty=0,
        stream=True,
    )

    pieces = []
    for chunk in response:
        if not chunk.choices:
            # Some stream events carry no choices; there is nothing to read.
            continue
        delta_content = chunk.choices[0].delta.content
        if delta_content:
            print(delta_content, end="", flush=True)
            pieces.append(delta_content)
    # Emitted once, after the stream is exhausted.
    print("done!")

    # Drop blank lines and surrounding whitespace: a leading newline in the
    # reply would otherwise shift every translation by one position.
    translations = [
        line.strip() for line in "".join(pieces).splitlines() if line.strip()
    ]
    if len(translations) != len(batch):
        raise ValueError(
            "expected %d translated lines, got %d"
            % (len(batch), len(translations))
        )
    return dict(zip(batch, translations))
def get_punctuation_by_category(categories):
    """Return every Unicode character whose general category is in *categories*.

    Despite the name, any general-category code works, not only the
    punctuation ("P*") ones.

    Args:
        categories: Container of Unicode general-category codes as returned
            by ``unicodedata.category`` (for example ``["Pf", "Pi"]``).

    Returns:
        set of single-character strings.
    """
    # sys.maxunicode is the last valid code point, so the +1 makes the scan
    # inclusive; range(0x10FFFF) would stop one code point short.
    return {
        char
        for char in map(chr, range(sys.maxunicode + 1))
        if unicodedata.category(char) in categories
    }
# Delimiter characters used to split source lines into chunks: all Unicode
# initial/final quote punctuation (categories Pi and Pf) plus a few ASCII
# quoting and markup symbols.
fs = get_punctuation_by_category(["Pf", "Pi"]) | set("><\"`'/${}*|;")
def is_punctuation(s):
    """Return True when every character of *s* is in a Unicode "P*" category.

    An empty string yields True, because no character fails the check.
    """
    for char in s:
        if not unicodedata.category(char).startswith('P'):
            return False
    return True
# Presumably the letters that follow a backslash in common escape sequences.
# NOTE(review): not referenced anywhere in the visible source -- confirm
# whether it is still needed.
escapes = ['n','r','t','b','f','x']
def extract_unicode_blocks(text):
    """Collect the distinct non-ASCII text fragments found in *text*.

    Each line is split on the delimiter characters in the module-level ``fs``
    set. A chunk is kept when its NFKC-normalized form is neither pure ASCII
    nor pure punctuation, it is non-empty after trimming, and the module-level
    ``sp`` predictor does not report "Zyyy" as its first result element.

    Args:
        text: Contents of a source file as a single string.

    Returns:
        set of trimmed, non-empty chunk strings.
    """
    # Compiled once per call instead of once per line. ``fs`` holds single
    # characters, so the order of the alternatives does not matter.
    splitter = re.compile("|".join(map(re.escape, fs)))
    edge_non_word = re.compile(r"(^[^\w]+)|([^\w]+$)")
    # A leading n/r/t followed by a non-lowercase character; presumably the
    # letter left behind by an escape such as "\n" once the backslash has
    # been trimmed.
    escape_residue = re.compile(r"^[nrt][^a-z]")

    blocks = set()
    for line in text.splitlines():
        for raw in splitter.split(line.strip()):
            norm = unicodedata.normalize('NFKC', raw.strip())
            # Cheap rejections first so the script predictor only runs on
            # chunks that can actually qualify.
            if norm.isascii() or is_punctuation(norm):
                continue
            chunk = edge_non_word.sub("", raw)
            if escape_residue.match(chunk):
                chunk = chunk[1:]
            chunk = chunk.strip()
            if not chunk:
                # Symbol-only chunks trim down to nothing; an empty key would
                # later turn str.replace into an insert-everywhere operation.
                continue
            if sp(chunk)[0] == "Zyyy":
                continue
            blocks.add(chunk)
    return blocks
def translate_file(path, translated_chunks):
    """Translate the non-English fragments of one file and rewrite it in place.

    The file is decoded with charset_normalizer, its fragments are found with
    ``extract_unicode_blocks``, fragments missing from ``translated_chunks``
    are sent to ``translate_with_chatgpt``, and every fragment with a known
    translation is then replaced in the file text.

    Args:
        path: Path of the file to translate; it is overwritten.
        translated_chunks: dict mapping original fragment to translation,
            shared across calls as a cache. It is updated in place with any
            new translations.

    Returns:
        None. Any exception is caught and printed, so one failing file does
        not stop the remaining ones.
    """
    try:
        print(path)
        best_match = from_path(path).best()
        if best_match is None:
            # No plausible decoding; str(None) would otherwise be processed
            # as if it were the file's text.
            print("%s: could not detect encoding, skipped" % path)
            return
        file_contents = str(best_match)

        blocks_in_file = extract_unicode_blocks(file_contents)
        if not blocks_in_file:
            return

        blocks_to_translate = [
            b for b in blocks_in_file if b not in translated_chunks
        ]
        cached = len(blocks_in_file) - len(blocks_to_translate)
        if cached > 0:
            print("%d cached" % cached)

        # Only call the API for fragments not seen before. A file whose
        # fragments are all cached still falls through to the replacement
        # step below instead of being left untranslated.
        if blocks_to_translate:
            print("%d blocks to translate %s" % (len(blocks_to_translate), path))
            translated_chunks.update(translate_with_chatgpt(blocks_to_translate))

        # Longest fragments first, so a short fragment that is a substring of
        # a longer one cannot break the longer one's match.
        for key in sorted(blocks_in_file, key=len, reverse=True):
            translation = translated_chunks.get(key)
            if not key or translation is None:
                continue
            file_contents = file_contents.replace(key, translation)
            print(key, " => ", translation)

        # Explicit UTF-8 avoids locale-dependent encode errors; newline=""
        # writes the original line endings back unchanged.
        with open(path, "w", encoding="utf-8", newline="") as f:
            f.write(file_contents)
    except Exception as e:
        # Deliberately best-effort: report the failure and move on.
        print("%s: %s" % (path, e))
def main(args):
    """Translate every requested file using a thread pool.

    Args:
        args: Parsed command-line namespace with ``paths`` (list of file
            paths, possibly empty) and ``threads`` (worker count). When
            ``paths`` is empty, paths are read from stdin, one per line.

    All workers share one translation cache dict, so a fragment translated
    for one file is reused for later files. NOTE(review): the dict is not
    protected by a lock; confirm this is acceptable when threads > 1.
    """
    translated_chunks = {}
    if args.paths:
        paths = args.paths
    else:
        # Skip blank lines so they are not treated as (empty) file paths.
        paths = [line.strip() for line in sys.stdin if line.strip()]

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved, so drain the iterator instead of discarding it.
        list(executor.map(lambda p: translate_file(p, translated_chunks), paths))
if __name__ == "__main__":
    # Command-line entry point: an optional worker count plus zero or more
    # file paths (main() reads paths from stdin when none are given).
    parser = argparse.ArgumentParser(description='Translate source code')
    parser.add_argument(
        '--threads',
        type=int,
        default=1,
        help='Number of threads to use (default 1)',
    )
    parser.add_argument(
        'paths',
        nargs="*",
        help='Paths to translate',
    )
    args = parser.parse_args()
    main(args)