This repository was archived by the owner on May 15, 2019. It is now read-only.

Commit 97bde7b

A bunch more credo manipulation stuff.

1 parent a2dc96b · commit 97bde7b

3 files changed: 99 additions, 0 deletions
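
Taken together, the three new scripts form a pipeline: credo_download.py fetches each comment's attachment PDFs from ECFS, credo_split.py rasterizes the second attachment into per-page PNGs with Ghostscript, and credo_extract.py runs pdftotext and tesseract over the results and writes per-attachment JSON. A hypothetical end-to-end driver, sketched under the assumption of a "/data/credo" metadata directory (the path is made up for illustration):

import subprocess

# Order matters: credo_extract.py reads the <id>_png/ directories
# that credo_split.py creates. "/data/credo" is a hypothetical path.
for script in ["credo_download.py", "credo_split.py", "credo_extract.py"]:
    subprocess.check_call(["python", "fcc_split/" + script, "/data/credo"])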

fcc_split/credo_download.py

Lines changed: 19 additions & 0 deletions
import os, json, sys, operator, urllib, urllib2
from pyquery import PyQuery as pq

def all_files(dir):
    # Flatten os.walk() into a single list of full paths for every file under dir.
    return reduce(operator.add, [[os.path.join(x[0], y) for y in x[2]] for x in os.walk(dir)])

credo_dir = sys.argv[1]

if __name__ == "__main__":
    # Top-level metadata only: derived files are written as "<id>-<n>.json"/".pdf",
    # so anything with a '-' in its basename is skipped.
    json_files = [f for f in all_files(credo_dir) if f.endswith('.json') and '-' not in os.path.basename(f)]
    for filename in json_files:
        meta = json.load(open(filename))

        # Scrape the ECFS comment page for links to the attached documents.
        page = pq(urllib2.urlopen("http://apps.fcc.gov/ecfs/comment/view?id=" + meta['id']).read())

        print 'Processing %s...' % filename
        for number, href in enumerate([a.attr('href').strip() for a in page.find('.tableDiv a[href*=document]').items()]):
            print "Downloading %s..." % number
            urllib.urlretrieve("http://apps.fcc.gov" + str(href), os.path.join(os.path.dirname(filename), str(meta['id']) + "-" + str(number) + ".pdf"))
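
The script assumes one metadata file per comment, containing at least an "id" field that matches an ECFS comment id; each attachment is then saved beside the metadata as "<id>-<n>.pdf". A minimal sketch of that layout (the id and paths are made up for illustration):

import json, os

# Hypothetical single-comment directory, as credo_download.py expects it.
os.makedirs("credo/60001031234")
outf = open("credo/60001031234/60001031234.json", "w")
json.dump({"id": "60001031234"}, outf)
outf.close()

# After "python credo_download.py credo", attachments would land as:
#   credo/60001031234/60001031234-0.pdf
#   credo/60001031234/60001031234-1.pdf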

fcc_split/credo_extract.py

Lines changed: 45 additions & 0 deletions
import os, json, sys, operator, urllib, urllib2, subprocess, tempfile, multiprocessing
from pyquery import PyQuery as pq

def write_file(json_dir, old_meta, new_meta):
    # Merge the original comment metadata with the extracted fields and
    # write the result as "<new id>.json" next to the source file.
    meta = dict(old_meta)
    meta.update(new_meta)
    outf = open(os.path.join(json_dir, "%s.json" % meta['id']), "wb")
    json.dump(meta, outf, indent=4)
    outf.close()

def all_files(dir):
    # Flatten os.walk() into a single list of full paths for every file under dir.
    return reduce(operator.add, [[os.path.join(x[0], y) for y in x[2]] for x in os.walk(dir)])

def process_json(json_file):
    print json_file
    meta = json.load(open(json_file))

    # find the pdfs
    json_dir = os.path.dirname(json_file)
    pdfs = [f for f in os.listdir(json_dir) if f.startswith(meta['id']) and f.endswith('.pdf')]

    # the first pdf can be used straight
    print "Extracting %s..." % pdfs[0]
    text = subprocess.Popen(['pdftotext', os.path.join(json_dir, pdfs[0]), '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0]
    write_file(json_dir, meta, {'id': meta['id'] + '-1', 'text': text})

    # the second one needs to be split and re-OCR'ed: credo_split.py has already
    # rasterized it into <id>_png/pdf-<page>.png, so OCR each page in order.
    print "Extracting pages..."
    png_dir = os.path.join(json_dir, meta['id'] + "_png")

    pages = sorted(os.listdir(png_dir), key=lambda f: int(f.split("-")[1].split(".")[0]))
    for i, page in enumerate(pages):
        print "Extracting %s (OCR)..." % page

        text = subprocess.Popen(['tesseract', os.path.join(png_dir, page), '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0]
        write_file(json_dir, meta, {'id': meta['id'] + '-' + str(i + 2), 'text': text})

if __name__ == "__main__":
    if sys.argv[1].endswith('.json'):
        process_json(sys.argv[1])
    else:
        json_files = [f for f in all_files(sys.argv[1]) if f.endswith('.json') and '-' not in os.path.basename(f)]
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(process_json, json_files)
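
For reference, write_file merges the original comment metadata with the extracted text under a suffixed id, so each attachment becomes its own record. A minimal sketch of that merge with made-up values:

import json

old_meta = {"id": "12345", "applicant": "Jane Doe"}    # hypothetical source record
new_meta = {"id": "12345-1", "text": "Dear FCC, ..."}  # "-1" marks the first PDF's text
merged = dict(old_meta)
merged.update(new_meta)
print json.dumps(merged, indent=4)
# Written to 12345-1.json; per-page OCR records follow as 12345-2.json, 12345-3.json, ...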

fcc_split/credo_split.py

Lines changed: 35 additions & 0 deletions
import os, json, sys, operator, urllib, urllib2, subprocess, tempfile, shutil, multiprocessing
from pyquery import PyQuery as pq

def all_files(dir):
    # Flatten os.walk() into a single list of full paths for every file under dir.
    return reduce(operator.add, [[os.path.join(x[0], y) for y in x[2]] for x in os.walk(dir)])

def process_json(json_file):
    print json_file
    meta = json.load(open(json_file))

    # find the pdfs
    json_dir = os.path.dirname(json_file)
    pdfs = [f for f in os.listdir(json_dir) if f.startswith(meta['id']) and f.endswith('.pdf')]

    # the first pdf can be used straight
    text = subprocess.Popen(['pdftotext', os.path.join(json_dir, pdfs[0]), '-'], stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0]
    print text

    # the second one needs to be split and re-OCR'ed: rasterize it to one
    # grayscale 300dpi PNG per page under <id>_png/ for credo_extract.py.
    print "Extracting pages...",
    png_dir = os.path.join(json_dir, meta['id'] + "_png")
    if not os.path.exists(png_dir):
        os.mkdir(png_dir)

    # .replace("%s", ...) keeps Ghostscript's %00d page placeholder out of
    # Python's %-formatting; gs expands it to the page number itself.
    subprocess.Popen(['gs', '-dSAFER', '-dBATCH', '-dNOPAUSE', '-sDEVICE=pnggray', '-r300', "-sOutputFile=%s/pdf-%00d.png".replace("%s", png_dir), os.path.join(json_dir, pdfs[1])], stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()
    print "done."

if __name__ == "__main__":
    if sys.argv[1].endswith('.json'):
        process_json(sys.argv[1])
    else:
        json_files = [f for f in all_files(sys.argv[1]) if f.endswith('.json') and '-' not in os.path.basename(f)]
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(process_json, json_files)
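
One detail worth calling out: the -sOutputFile argument is built with .replace("%s", png_dir) rather than %-formatting, so Ghostscript's own %00d page counter survives into the command line; gs then emits pdf-1.png, pdf-2.png, ..., which credo_extract.py re-orders numerically. A small sketch of that round-trip (the directory and filenames are illustrative):

pattern = "-sOutputFile=%s/pdf-%00d.png".replace("%s", "/tmp/12345_png")
print pattern  # -sOutputFile=/tmp/12345_png/pdf-%00d.png -- gs fills in %00d per page

pages = ["pdf-10.png", "pdf-2.png", "pdf-1.png"]
print sorted(pages, key=lambda f: int(f.split("-")[1].split(".")[0]))
# ['pdf-1.png', 'pdf-2.png', 'pdf-10.png'] -- numeric sort keeps pages in order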
