Skip to content

Commit

Permalink
Merge branch 'add_scanner'
Browse files Browse the repository at this point in the history
This branch adds a few files to deal with parsing scanned pages. Not
everything's working yet.

Work to be done still:
- Scanning of pages using SANE or something similar
- Parsing of scanned pages
- Generating test pages to verify the code
  • Loading branch information
philsc committed May 19, 2013
2 parents b424c36 + 5361cf3 commit fef0490
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 0 deletions.
36 changes: 36 additions & 0 deletions read_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python2

import os
import sys
from argparse import ArgumentParser
from scantron import ScantronParser

# Command-line entry point: load a list of Field entries from a user
# supplied Python module and hand it to ScantronParser together with a
# scanned page.

# Make this file easier to use by adding nice arguments
parser = ArgumentParser(description='Parse scanned scantron sheets.')
parser.add_argument(
    'data',
    metavar='input_data',
    help='File where the field data is stored. '
         'This must be a python script with an array called "data" '
         'of Field entries.')

args = parser.parse_args()

# If the file has a .py extension, we should still accept it
if args.data.endswith('.py'):
    args.data = os.path.splitext(args.data)[0]

# A module name cannot contain a directory part, so when a path such as
# "forms/exam.py" was given, make its directory importable and import
# the bare name instead.
module_dir, module_name = os.path.split(args.data)
if module_dir:
    sys.path.insert(0, module_dir)

# Import the file specified on the command line.  Only the import itself
# is guarded here so that an AttributeError raised *inside* the imported
# module is not mistaken for a missing "data" array below.
try:
    __import__(module_name)
except (ImportError, ValueError):
    # ValueError covers an empty module name (e.g. a trailing slash).
    print('Failed to import %s.' % args.data)
    # sys.exit rather than quit(): quit is only injected by the site
    # module for interactive use and is not guaranteed to exist.
    sys.exit(1)

# __import__ returns the top-level package for dotted names, so fetch
# the module that was actually requested from sys.modules.
try:
    data = sys.modules[module_name].data
except AttributeError:
    print('Could not find data array.')
    sys.exit(1)

# If everything went well, proceed to parse the filled scantron
# NOTE(review): the page path is hard-coded; presumably a placeholder
# until page scanning is implemented.
st = ScantronParser()
st.scan(data, 'pages/page-1.jpg')
60 changes: 60 additions & 0 deletions scantron.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,65 @@
from reportlab.pdfgen import canvas
from qrcode import *

import numpy as np
import scipy as sp
from scipy import ndimage
from PIL import Image, ImageDraw
import math


class ScantronParser:
    """Locate the filled square page markers on a scanned scantron sheet.

    The detection thresholds are class attributes so they can be tuned
    (for example in a subclass) without editing ``scan``.
    """

    # Grey values above this count as paper, everything else as ink.
    INK_THRESHOLD = 128
    # Minimum fraction of a blob's bounding box that must be ink.
    MIN_FILL = 0.95
    # Maximum allowed deviation of the height/width ratio from 1.0.
    MAX_ASPECT_ERROR = 0.1
    # Blobs whose bounding box is this many rows tall or fewer are ignored.
    MIN_HEIGHT = 14
    # Number of markers that must be found on a page.
    NUM_MARKERS = 3

    def __init__(self):
        pass

    def scan(self, data, path):
        """Find the page markers in the image at ``path`` and print the
        page rotation derived from them.

        Parameters:
            data: the field definitions for the sheet.  Not used yet by
                this method.
            path: path of the scanned page image, opened with PIL.

        Returns:
            None.  The marker candidates and the rotation (in radians)
            are printed.

        Raises:
            ValueError: if the number of marker candidates is not
                exactly ``NUM_MARKERS``.
        """
        img = Image.open(path).convert('RGB')

        # PIL mode 'F' is 32-bit float greyscale.  This is the same
        # conversion scipy.misc.fromimage(img, flatten=True) performed,
        # without depending on that removed SciPy helper.
        grey = np.asarray(img.convert('F'))

        # Binarise: 1 where there is ink, 0 where there is paper.
        ink = np.where(grey > self.INK_THRESHOLD, 0, 1)

        # An all-ones 3x3 structure makes diagonal neighbours connected.
        label_im, num = ndimage.label(ink, structure=np.ones((3, 3)))
        centroids = ndimage.center_of_mass(ink, label_im,
                                           np.arange(1, num + 1))
        slices = ndimage.find_objects(label_im)

        # The outlines drawn below are a debugging aid only; the
        # annotated image is currently neither saved nor returned.
        draw = ImageDraw.Draw(img)

        squares = []
        for i, bbox in enumerate(slices):
            # Label values start at 1 while list indices start at 0.
            # Masking by label keeps other blobs that overlap this
            # bounding box from being counted.
            blob = np.where(label_im[bbox] == i + 1, 1, 0)
            height, width = blob.shape

            ratio = float(height) / float(width)
            darkness = float(np.sum(blob)) / float(blob.size)

            is_marker = (darkness > self.MIN_FILL
                         and abs(ratio - 1.0) < self.MAX_ASPECT_ERROR
                         and height > self.MIN_HEIGHT)
            if not is_marker:
                continue

            # Axis 0 of the array is y (rows), axis 1 is x (columns).
            x1, x2 = bbox[1].start, bbox[1].stop
            y1, y2 = bbox[0].start, bbox[0].stop
            draw.rectangle([x1, y1, x2, y2], outline='blue')

            squares.append(i)

        del draw

        if len(squares) != self.NUM_MARKERS:
            raise ValueError(
                'Could not uniquely identify the three page markers '
                '(found %d candidates).' % len(squares))

        # Order the markers by the sum of their centroid coordinates.
        ranked = sorted(((s, sum(centroids[s])) for s in squares),
                        key=lambda pair: pair[1])

        for s in ranked:
            print('square ' + str(s))

        # NOTE(review): after sorting, the markers are taken to be
        # top-left, bottom-left and bottom-right in that order; confirm
        # against the sheet layout.  The third one is not needed yet.
        tl = centroids[ranked[0][0]]
        bl = centroids[ranked[1][0]]

        # Centroids are (row, column), so this is the angle of the
        # tl -> bl edge measured from the row axis.
        rotation = math.atan2(bl[1] - tl[1], bl[0] - tl[0])
        print('rotation: ' + str(rotation))


class Scantron:
def __init__(self, filename, spacing=0.3*inch):
Expand Down Expand Up @@ -104,6 +163,7 @@ def set_box_sizes(self, box_size, box_spacing):
def add_sheet(self, data, match=1, position=1):
# Draw boxes for determining boundaries
self.draw_box(1*inch, 1.2*inch, size=0.4*inch, filled=True)
self.draw_box(1*inch, (10.2-0.4)*inch, size=0.4*inch, filled=True)
self.draw_box((7.5-0.4)*inch, (10.2-0.4)*inch, size=0.4*inch,
filled=True)

Expand Down
51 changes: 51 additions & 0 deletions testcase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python2

from scantron import *
import PythonMagick
from pyPdf import PdfFileReader
from PIL import Image

# Test-page generator: renders a scantron PDF, rasterises every page to
# a PNG, then distorts the PNGs.

# Field definitions placed on the generated sheet.
data = [
    Field('foo', 'Foo foo foo', int),
    Field('bar', 'Bar bar bar', int),
    Field('baz', 'Baz baz baz', int),
    Field('laber', 'Laber laber', bool),
]

# Generate PDF
st = Scantron('test.pdf')
st.set_box_sizes(box_size=0.2*inch, box_spacing=0.3*inch)
st.populate(data, matches=1, collate='no')
st.save()

# Convert PDF to a series of pictures.  Only the page count is needed
# from pyPdf, so the file is closed again as soon as it has been read.
with open('test.pdf', 'rb') as pdf_file:
    num_pages = PdfFileReader(pdf_file).getNumPages()

pages = []

for page in range(num_pages):
    name = 'test_image_%d.png' % page

    im = PythonMagick.Image()
    # The density is set before read() so it is in effect when the page
    # is rasterised (presumably dots per inch -- confirm).
    im.density('200')
    # "file[n]" selects page n of the PDF, counting from 0.
    im.read('test.pdf[%d]' % page)
    im.write(name)

    pages.append(name)

# Create a series of transformations to apply
transformations = [
    lambda x: x.rotate(10, expand=False),
    lambda x: x.rotate(-10, expand=False),
]

# Take all pictures and modify them in different ways, cycling through
# the transformations when there are more pages than transformations.
for index, page in enumerate(pages):
    transform = transformations[index % len(transformations)]

    im = Image.open(page).convert('L')
    im = transform(im)
    # Overwrite the rendered page with its distorted version.
    im.save(page)

0 comments on commit fef0490

Please sign in to comment.