-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr_test.py
105 lines (81 loc) · 3.86 KB
/
ocr_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from PIL import Image
import pytesseract
import numpy as np
from translate import Translator
from colorama import Fore
from tqdm import tqdm
# Read the directory that holds the sample images from 'my_path.txt'.
# The context manager closes the file as soon as it has been read, and
# strip() removes the trailing newline most editors append, which would
# otherwise end up in the middle of every filename built from this path.
with open("my_path.txt") as path_file:
    my_path = path_file.read().strip()
# List of languages:
# ['afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_sim_vert', 'chi_tra', 'chi_tra_vert', 'chr', 'cos', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'enm', 'epo', 'equ', 'est', 'eus', 'fao', 'fas', 'fil', 'fin', 'fra', 'frk', 'frm', 'fry', 'gla', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'hye', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn', 'jpn_vert', 'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kmr', 'kor', 'lao', 'lat', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mkd', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nep', 'nld', 'nor', 'oci', 'ori', 'osd', 'pan', 'pol', 'por', 'pus', 'que', 'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'snd', 'spa', 'spa_old', 'sqi', 'srp', 'srp_latn', 'sun', 'swa', 'swe', 'syr', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie', 'yid', 'yor']
def translateWord(word):
    """Translate a single word with the module-level ``translator``.

    Args:
        word: The word to translate.

    Returns:
        Whatever ``translator.translate(word)`` returns, or the placeholder
        ``"-"`` when that call raises.
    """
    try:
        return translator.translate(word)
    except Exception:
        # Best-effort translation: "-" is the failure marker that compare()
        # checks for. Catching Exception rather than using a bare ``except``
        # lets KeyboardInterrupt and SystemExit propagate, so Ctrl-C still
        # stops a long translation run.
        return "-"
def translateText(text):
    """Translate ``text`` one whitespace-separated token at a time.

    Only the part of each token before its first comma is sent to
    ``translateWord``. A tqdm progress bar labelled "Translating" tracks the
    loop.

    Args:
        text: The text to translate.

    Returns:
        The per-token translations joined with single spaces.
    """
    tokens = text.split()
    pieces = [
        translateWord(token.split(sep=",")[0])
        for token in tqdm(tokens, "Translating")
    ]
    return " ".join(pieces)
def compare(text, translated_text):
    """Print the OCR text and its translation, colour-coded by success.

    Both strings are split on whitespace and matched up by position. A
    translated token equal to ``'-'`` marks a failed translation and is shown
    in red, together with the OCR word at the same position; everything else
    is shown in green. OCR words with no translated token at their position
    are not printed.

    Args:
        text: The OCR output.
        translated_text: The space-joined translation of ``text``.

    Returns:
        None. All output goes to stdout.
    """
    word_array = text.split()
    translated_word_array = translated_text.split()

    # Print OCR words highlighting translation successes.
    print(Fore.LIGHTCYAN_EX, "OCR " + source_lang.capitalize() + ":\n")
    # Pair words by position. Looking the position up with
    # word_array.index(word) always finds the FIRST occurrence, so a repeated
    # word would be coloured with the wrong translation status (and each
    # lookup rescans the list). zip() stops at the shorter list, which keeps
    # the rule that words without a translated counterpart are skipped.
    for word, translated_word in zip(word_array, translated_word_array):
        colour = Fore.RED if translated_word == '-' else Fore.GREEN
        print(colour, word, end='')
    print("\n")

    # Print translated words highlighting translation successes.
    print(Fore.LIGHTCYAN_EX, "Translated " + target_lang.capitalize() + ":\n")
    for translated_word in translated_word_array:
        colour = Fore.RED if translated_word == '-' else Fore.GREEN
        print(colour, translated_word, end='')
    print("\n")
    print(Fore.WHITE, "END\n")
# Language codes used so far:
# latin = 'lat', russian = 'rus', english = 'en', ukrainian = 'ukr', greek = 'grc', hebrew = 'heb'
# source_lang and target_lang are module-level because compare() reads them
# for its headings.
# NOTE(review): source_lang is passed both to the Translator (from_lang) and
# to Tesseract (lang=) further down - confirm that both accept the same code.
source_lang = 'heb'
target_lang = 'en'
# Initialize the translator; translateWord() uses this module-level instance.
translator = Translator(to_lang=target_lang, from_lang=source_lang)
# Uncomment to list the languages the installed Tesseract currently supports:
# print(pytesseract.get_languages())
# Directory read from 'my_path.txt'. It is concatenated directly with the
# file name below, so it must already end with a path separator.
root = my_path
# Alternative sample images; exactly one `filename` assignment should be active.
# filename = root + 'latin_uxorem.png'
# filename = root + 'Screenshot_20231229_130556_Instagram.jpg'
# filename = root + 'greek_3.png'
# filename = root + 'october.png'
# filename = root + 'greek_1.PNG'
# filename = root + 'latin_1_utopia.png'
# filename = root + 'Russian_Protest_snip_20240101_041809_PBS_Civilizations_Ep9.jpg'
filename = root + 'Bodleian_Kennicott_bible_fol_439r.PNG'
# Open the image file and convert it to a NumPy array.
img1 = np.array(Image.open(filename))
# Extract the text with Tesseract, using source_lang as the OCR language.
text = pytesseract.image_to_string(img1, lang=source_lang)
# Print the raw OCR text.
print("\nOCR " + source_lang.capitalize() + ":")
print(text)
# Translate the OCR text word by word and print the result
# (the heading for it is currently disabled).
# print("Translated " + target_lang.capitalize() + ":")
translated_text = translateText(text)
print(translated_text + "\n")
# Print both texts again, colour-coded by per-word translation success or
# failure; words are matched by their position in each text.
compare(text,translated_text)