-
Notifications
You must be signed in to change notification settings - Fork 0
/
do-ocr.py
64 lines (46 loc) · 1.52 KB
/
do-ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import sys
from os.path import isfile, join
from subprocess import Popen, PIPE
from os import listdir
import os
from PIL import Image
import easyocr
from PIL import ImageFile
# Stride used to spread gallery ids across jobs (presumably the number of
# parallel workers -- confirm against whatever launches this script).
JOB_COUNT = 5

# Galleries with more downloaded files than this are skipped without OCR.
MAX_PAGES = 300


def select_target_id(ids, job_index, job_id, job_count=JOB_COUNT):
    """Return the gallery id this job run should process, or None.

    The id list is consumed from its end: the offset
    ``job_index * job_count + job_id`` is counted backwards from the last
    entry of *ids*.

    Args:
        ids: Sequence of id strings (surrounding whitespace is stripped
            from the selected entry).
        job_index: Zero-based run counter of this job.
        job_id: Identifier of this job.
        job_count: Stride between consecutive runs of the same job.

    Returns:
        The stripped id string, or None when the computed offset falls
        outside *ids*.
    """
    offset = job_index * job_count + job_id
    # Without this guard an offset >= len(ids) becomes a negative list index
    # and silently wraps around to an unrelated id.
    if not 0 <= offset < len(ids):
        return None
    return ids[len(ids) - offset - 1].strip()


def main():
    """Download one gallery with gallery-dl and OCR every page with EasyOCR.

    Reads the job id from ``sys.argv[1]``, the run counter from
    ``workspace/current_job<job_id>`` and the id list from ``ids.txt``.
    Writes ``result/<target_id>.txt`` containing the gallery id, the page
    count, and then one line per page holding ``str()`` of the EasyOCR
    result for that page.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: do-ocr.py JOB_ID")

    # Let Pillow decode partially downloaded images instead of raising.
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    job_id = int(sys.argv[1])
    with open("workspace/current_job" + str(job_id), "r") as fh:
        # The stored counter is one higher than the zero-based index used here.
        job_index = int(fh.read().strip()) - 1
    with open("ids.txt", "r") as fh:
        ids = fh.read().split(',')

    target_id = select_target_id(ids, job_index, job_id)
    if target_id is None:
        sys.exit("no gallery id available for job %d (run index %d)"
                 % (job_id, job_index))

    # Assigning to os.environ also updates the process environment, so child
    # processes such as gallery-dl inherit it; a separate os.putenv() call is
    # redundant.
    os.environ["CURRENTID"] = target_id

    Popen(['gallery-dl', '-D', './image', '-f', '{num}.{extension}',
           'https://hitomi.la/galleries/' + target_id + '.html']).wait()

    if not os.path.isdir('image'):
        # Nothing was downloaded; treated as a normal (status 0) exit.
        sys.exit()

    # NOTE(review): this is a lexicographic sort, so "10.jpg" precedes
    # "2.jpg". Kept as-is because the line order of the result file depends
    # on it -- confirm whether numeric page order is what consumers expect.
    files = sorted(f for f in listdir('image') if isfile(join('image', f)))
    if len(files) > MAX_PAGES:
        sys.exit()

    lines = [target_id, str(len(files))]
    # Build the reader once: constructing it loads the OCR models, which is
    # loop-invariant and expensive. Skipped entirely when there are no pages.
    reader = easyocr.Reader(['ko'], gpu=False) if files else None
    for page, name in enumerate(files, start=1):
        src = join('image', name)
        jpg = src + '.jpg'
        # Normalise every page to an RGB JPEG before handing it to EasyOCR.
        with Image.open(src) as im:
            im.convert("RGB").save(jpg, "jpeg")
        lines.append(str(reader.readtext(jpg)))
        print("progress: " + str(page) + "/" + str(len(files)))

    os.makedirs('result', exist_ok=True)
    # OCR output is Korean text; write UTF-8 explicitly rather than relying
    # on the locale's default encoding.
    with open(join('result', target_id + ".txt"), "w", encoding="utf-8") as out:
        out.write("\n".join(lines) + "\n")


if __name__ == "__main__":
    main()