|
| 1 | +import os, json, sys, operator, urllib, urllib2, subprocess, tempfile, multiprocessing |
| 2 | +from pyquery import PyQuery as pq |
| 3 | + |
def write_file(json_dir, old_meta, new_meta):
    """Write a metadata record to <json_dir>/<id>.json.

    Starts from a copy of old_meta, overlays new_meta (so new_meta's 'id'
    and 'text' win), and serializes the merged dict as indented JSON.
    Neither input dict is mutated.
    """
    meta = dict(old_meta)
    meta.update(new_meta)
    # "w" (text) mode: json.dump emits str; the context manager guarantees
    # the handle is closed even if serialization raises.
    with open(os.path.join(json_dir, "%s.json" % meta['id']), "w") as outf:
        json.dump(meta, outf, indent=4)
| 10 | + |
def all_files(dir):
    """Return the paths of all files under *dir*, recursively.

    Replaces the Python-2-only ``reduce(operator.add, ...)`` flattening,
    which also raised TypeError on an empty walk and concatenated lists
    quadratically; a flattening comprehension is linear and returns []
    for a directory tree with no files.
    """
    # NOTE: parameter name `dir` (shadowing the builtin) kept for
    # backward compatibility with keyword callers.
    return [os.path.join(root, name)
            for root, _subdirs, names in os.walk(dir)
            for name in names]
| 13 | + |
def process_json(json_file):
    """Extract text for one metadata record and write derived JSON files.

    Given <dir>/<id>.json, this:
      1. finds PDFs in the same directory named <id>*.pdf; runs pdftotext
         on the first one and writes its text as <id>-1.json;
      2. OCRs every page image in <dir>/<id>_png/ with tesseract, writing
         page i as <id>-(i+2).json.

    Requires the ``pdftotext`` and ``tesseract`` binaries on PATH.
    """
    print(json_file)
    # Close the metadata file promptly instead of leaking the handle.
    with open(json_file) as f:
        meta = json.load(f)

    # find the pdfs belonging to this record
    json_dir = os.path.dirname(json_file)
    pdfs = sorted(p for p in os.listdir(json_dir)
                  if p.startswith(meta['id']) and p.endswith('.pdf'))
    if not pdfs:
        # Previously an IndexError; skip the record with a message instead.
        print("No PDFs found for %s" % meta['id'])
        return

    # The first pdf (sorted for determinism; listdir order is arbitrary)
    # has extractable text, so pdftotext can be used straight.
    print("Extracting %s..." % pdfs[0])
    proc = subprocess.Popen(
        ['pdftotext', os.path.join(json_dir, pdfs[0]), '-'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE,
        universal_newlines=True)  # text output, JSON-serializable on py3
    text = proc.communicate()[0]
    write_file(json_dir, meta, {'id': meta['id'] + '-1', 'text': text})

    # The second document was pre-split into page PNGs that need OCR.
    print("Extracting pages...")
    png_dir = os.path.join(json_dir, meta['id'] + "_png")

    # Page files look like <something>-<page>.<ext>; sort numerically by page.
    pages = sorted(os.listdir(png_dir),
                   key=lambda name: int(name.split("-")[1].split(".")[0]))
    for i, page in enumerate(pages):
        print("Extracting %s (OCR)..." % page)

        proc = subprocess.Popen(
            ['tesseract', os.path.join(png_dir, page), '-'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            universal_newlines=True)
        text = proc.communicate()[0]
        write_file(json_dir, meta, {'id': meta['id'] + '-' + str(i + 2),
                                    'text': text})
| 38 | + |
if __name__ == "__main__":
    # Usage: script.py <metadata.json> | <directory-of-records>
    if len(sys.argv) < 2:
        # Fail with a usage message instead of an IndexError traceback.
        sys.exit("usage: %s <json-file-or-directory>" % sys.argv[0])
    if sys.argv[1].endswith('.json'):
        # Single-record mode: process one metadata file directly.
        process_json(sys.argv[1])
    else:
        # Directory mode: top-level records only — derived files contain
        # '-' in their basename (<id>-N.json) and are skipped.
        json_files = [f for f in all_files(sys.argv[1])
                      if f.endswith('.json') and '-' not in os.path.basename(f)]
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(process_json, json_files)
        # Shut the workers down cleanly once all records are processed.
        pool.close()
        pool.join()
0 commit comments