Merge branch 'add_scanner'

This branch adds a few files to deal with parsing scanned pages. Not everything's working yet. Work still to be done:
- Scanning of pages using SANE or something similar
- Parsing of scanned pages
- Generating test pages to verify the code
spartonics · May 19, 2013 · fef0490 · fef0490
2 parents b424c36 + 5361cf3
commit fef0490
Show file tree

Hide file tree

Showing 3 changed files with 147 additions and 0 deletions.
diff --git a/read_pdf.py b/read_pdf.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python2
+
+import os
+import sys
+from argparse import ArgumentParser
+from scantron import ScantronParser
+
+# Command-line interface: one positional argument names the field data.
+parser = ArgumentParser(description='Parse scanned scantron sheets.')
+parser.add_argument(
+        'data',
+        metavar='input_data',
+        help='File where the field data is stored. ' +
+                'This must be a python script with an array called "data" ' +
+                'of Field entries.')
+
+args = parser.parse_args()
+
+# The argument is used as a module name, so strip a trailing '.py' if present
+if args.data.endswith('.py'):
+    args.data = os.path.splitext(args.data)[0]
+
+# Import the module named on the command line and fetch its "data" array.
+# sys.exit(message) prints the message to stderr and exits with status 1;
+# unlike quit(), it does not rely on the site module being loaded.
+try:
+    __import__(args.data)
+    data = sys.modules[args.data].data
+except ImportError:
+    sys.exit('Failed to import %s.' % args.data)
+except AttributeError:
+    sys.exit('Could not find data array.')
+
+# If everything went well, proceed to parse the filled scantron
+st = ScantronParser()
+st.scan(data, 'pages/page-1.jpg')
diff --git a/scantron.py b/scantron.py
@@ -5,6 +5,65 @@
 from reportlab.pdfgen import canvas
 from qrcode import *
 
+import numpy as np
+import scipy as sp
+from scipy import ndimage
+from PIL import Image, ImageDraw
+import math
+
+
+class ScantronParser:
+    """Locates the three filled marker squares on a scanned page."""
+    def __init__(self):
+        pass
+
+    def scan(self, data, path):
+        """Find the page markers in the image at `path`, print the rotation.
+
+        `data` is not used yet. Raises Exception unless exactly three
+        marker squares are found.
+        """
+        img = Image.open(path).convert('RGB')
+        # Binarise: pixels brighter than 128 become 0, all others become 1.
+        im = sp.misc.fromimage(img, flatten=True)
+        im = np.where(im > 128, 0, 1)
+        # Label 8-connected regions of ones; labels run from 1 to num.
+        label_im, num = ndimage.label(im, structure=np.ones((3, 3)).tolist())
+        centroids = ndimage.measurements.center_of_mass(
+                im, label_im, xrange(1, num+1))
+        slices = ndimage.find_objects(label_im)
+
+        squares = []
+        draw = ImageDraw.Draw(img)
+        for i, region in enumerate(slices):
+            sub_img = np.where(label_im[region] == i + 1, 1, 0)
+            height, width = sub_img.shape
+            ratio = float(height) / float(width)
+            darkness = float(np.sum(sub_img)) / float(sub_img.size)
+            # A marker is a nearly solid, nearly square region over 14 px tall.
+            if darkness > 0.95 and abs(ratio - 1.0) < 0.1 and height > 14:
+                x1, x2 = region[1].start, region[1].stop
+                y1, y2 = region[0].start, region[0].stop
+                draw.rectangle([x1, y1, x2, y2], outline='blue')
+                squares.append(i)
+
+        if len(squares) != 3:
+            message = 'Could not uniquely identify the three page markers.'
+            print(message)
+            raise Exception(message)
+
+        # Sort by the sum of centroid coordinates. NOTE(review): confirm tl/bl/br.
+        squares = zip(squares, map(lambda s: sum(centroids[s]), squares))
+        squares = sorted(squares, key=lambda x: x[1])
+        for s in squares:
+            print('square ' + str(s))
+
+        tl = centroids[squares[0][0]]
+        bl = centroids[squares[1][0]]
+        br = centroids[squares[2][0]]
+        rotation = math.atan2(bl[1] - tl[1], bl[0] - tl[0])
+        print('rotation: ' + str(rotation))
+
 
 class Scantron:
     def __init__(self, filename, spacing=0.3*inch):
@@ -104,6 +163,7 @@ def set_box_sizes(self, box_size, box_spacing):
     def add_sheet(self, data, match=1, position=1):
         # Draw boxes for determining boundaries
         self.draw_box(1*inch, 1.2*inch, size=0.4*inch, filled=True)
+        self.draw_box(1*inch, (10.2-0.4)*inch, size=0.4*inch, filled=True)
         self.draw_box((7.5-0.4)*inch, (10.2-0.4)*inch, size=0.4*inch, 
                 filled=True)
 

diff --git a/testcase.py b/testcase.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python2
+
+from scantron import *
+import PythonMagick
+from pyPdf import PdfFileReader
+from PIL import Image
+
+data = [
+    Field('foo', 'Foo foo foo', int),
+    Field('bar', 'Bar bar bar', int),
+    Field('baz', 'Baz baz baz', int),
+    Field('laber', 'Laber laber', bool),
+]
+
+# Generate PDF
+st = Scantron('test.pdf')
+st.set_box_sizes(box_size=0.2*inch, box_spacing=0.3*inch)
+st.populate(data, matches=1, collate='no')
+st.save()
+
+# Count the pages up front so the PDF file handle is closed again before
+# PythonMagick reads the same file
+with open('test.pdf', 'rb') as pdf_file:
+    num_pages = PdfFileReader(pdf_file).getNumPages()
+
+# Convert PDF to a series of pictures, one PNG per page at density 200
+pages = []
+
+for page in range(num_pages):
+    name = 'test_image_%d.png' % page
+
+    im = PythonMagick.Image()
+    im.density('200')
+    im.read('test.pdf[%d]' % page)
+    im.write(name)
+
+    pages.append(name)
+
+# Create a series of transformations to apply
+transformations = [
+    lambda x: x.rotate(10, expand=False),
+    lambda x: x.rotate(-10, expand=False),
+]
+
+# Take all pictures and modify them in different ways, cycling through the
+# transformations so that consecutive pages get different ones
+for index, page in enumerate(pages):
+    transform = transformations[index % len(transformations)]
+    im = Image.open(page).convert('L')
+    im = transform(im)
+    im.save(page)