From 0abba05efc502fbc2be4df70fdd44c0b1610e6c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mart=C3=ADn=20Gait=C3=A1n?= <gaitan@gmail.com>
Date: Fri, 8 Nov 2013 02:28:32 -0300
Subject: [PATCH 01/28] simplifying setup.py

---
 setup.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 099adf9..a75e21a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,5 @@
 from setuptools import setup, find_packages
-import sys, os
-from pip.req import parse_requirements
+import os
 
 here = os.path.abspath(os.path.dirname(__file__))
 README = open(os.path.join(here, 'README.md')).read()
@@ -9,10 +8,6 @@
 
 version = '0.1'
 
-from_requirements_txt = parse_requirements("requirements.txt")
-install_requires = [    str(ir.req) for ir in from_requirements_txt ]
-
-
 setup(name='pdf-table-extract',
     version=version,
     description="Extract Tables from PDF files",
@@ -28,7 +23,7 @@
     packages=find_packages('src'),
     package_dir = {'': 'src'},include_package_data=True,
     zip_safe=False,
-    install_requires=install_requires,
+    install_requires=['numpy'],
     entry_points={
         'console_scripts':
             ['pdf-table-extract=pdftableextract.scripts:main']

From aa31b88fd7fe9805cb2f93aa6ca914abb05a8ba8 Mon Sep 17 00:00:00 2001
From: Alex Goretoy <alex@goretoy.com>
Date: Thu, 12 Feb 2015 20:33:21 -0800
Subject: [PATCH 02/28] Added Requires to README

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 222c14d..3085e5a 100644
--- a/README.md
+++ b/README.md
@@ -13,4 +13,8 @@ tables in ST Micro’s datasheets. The script requires numpy and poppler
 ###Tags
 [Utilities](http://ashimagroup.net/os/tag/utilities)
 
-
+###Requires
+apt-get install python-dev poppler-utils
+yum install python-devel poppler-utils
+[numpy](http://www.numpy.org/)
+[pandas](http://pandas.pydata.org/)

From aef8b14bb4d9e12e67fbcaf009c69607c214dcb4 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Thu, 14 Jul 2016 17:31:41 +0800
Subject: [PATCH 03/28] Python 3 adaptation.

---
 example/test_to_pandas.py         |   3 +-
 src/pdftableextract/__init__.py   |   2 +-
 src/pdftableextract/core.py       | 152 +++++++++++++++---------------
 src/pdftableextract/extracttab.py |  73 +++++++-------
 src/pdftableextract/pnm.py        |  17 ++--
 5 files changed, 125 insertions(+), 122 deletions(-)

diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index bb31515..d734ea3 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import pandas as pd
 import pdftableextract as pdf
 
@@ -17,4 +18,4 @@
 #data is row '2' through '-1'
 
 data =pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
-print data
+print (data)
diff --git a/src/pdftableextract/__init__.py b/src/pdftableextract/__init__.py
index 6dbe85c..8366135 100644
--- a/src/pdftableextract/__init__.py
+++ b/src/pdftableextract/__init__.py
@@ -1,2 +1,2 @@
 # Example package with a console entry point
-from core import process_page, output, table_to_list
\ No newline at end of file
+from pdftableextract.core import process_page, output, table_to_list
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index d1dce80..b96fba4 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -3,7 +3,7 @@
 from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete
 import subprocess
 from pipes import quote
-from .pnm import readPNM, dumpImage
+from pdftableextract.pnm import readPNM, dumpImage
 import re
 from pipes import quote
 from xml.dom.minidom import getDOMImplementation
@@ -13,7 +13,7 @@
 #-----------------------------------------------------------------------
 def check_for_required_executable(name,command):
     """Checks for an executable called 'name' by running 'command' and supressing
-    output. If the return code is non-zero or an OS error occurs, an Exception is raised""" 
+    output. If the return code is non-zero or an OS error occurs, an Exception is raised"""
     try:
         with open(os.devnull, "w") as fnull:
             result=subprocess.check_call(command,stdout=fnull, stderr=fnull)
@@ -32,12 +32,12 @@ def popen(name,command, *args, **kwargs):
     try:
         result=subprocess.Popen(command,*args, **kwargs)
         return result
-    except OSError, e:
+    except OSError as e:
         message="""Error running {0}. Is it installed correctly?
 Error: {1}""".format(name, e)
         raise OSError(message)
-    except Exception, e:
-        raise 
+    except Exception as e:
+        raise
 
 def colinterp(a,x) :
     """Interpolates colors"""
@@ -53,7 +53,7 @@ def col(x, colmult=1.0) :
     return colinterp(colarr,(colmult * x)% 1.0) / 2
 
 
-def process_page(infile, pgs, 
+def process_page(infile, pgs,
     outfilename=None,
     greyscale_threshold=25,
     page=None,
@@ -64,17 +64,17 @@ def process_page(infile, pgs,
     pad=2,
     white=None,
     black=None,
-    bitmap=False, 
-    checkcrop=False, 
-    checklines=False, 
+    bitmap=False,
+    checkcrop=False,
+    checklines=False,
     checkdivs=False,
     checkcells=False,
     whitespace="normalize",
     boxes=False) :
-    
+
   outfile = open(outfilename,'w') if outfilename else sys.stdout
   page=page or []
-  (pg,frow,lrow) = (map(int,(pgs.split(":")))+[None,None])[0:3]
+  (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3]
   #check that pdftoppdm exists by running a simple command
   check_for_required_executable("pdftoppm",["pdftoppm","-h"])
   #end check
@@ -91,7 +91,7 @@ def process_page(infile, pgs,
   pad = int(pad)
   height+=pad*2
   width+=pad*2
-  
+
 # reimbed image with a white padd.
   bmp = ones( (height,width) , dtype=bool )
   bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*greyscale_threshold/100.0) )
@@ -109,25 +109,25 @@ def process_page(infile, pgs,
     t=t+1
   if t > 0 :
     t=t-1
-  
+
   b=height-1
   while b > t and sum(bmp[b,:]==0) == 0 :
     b=b-1
   if b < height-1:
     b = b+1
-  
+
   l=0
   while l < width and sum(bmp[:,l]==0) == 0 :
     l=l+1
   if l > 0 :
     l=l-1
-  
+
   r=width-1
   while r > l and sum(bmp[:,r]==0) == 0 :
     r=r-1
   if r < width-1 :
     r=r+1
-  
+
 # Mark bounding box.
   bmp[t,:] = 0
   bmp[b,:] = 0
@@ -139,13 +139,13 @@ def boxOfString(x,p) :
     if len(s) < 4 :
       raise ValueError("boxes have format left:top:right:bottom[:page]")
     return ([bitmap_resolution * float(x) + pad for x in s[0:4] ]
-                + [ p if len(s)<5 else int(s[4]) ] ) 
+                + [ p if len(s)<5 else int(s[4]) ] )
 
 
 # translate crop to paint white.
   whites = []
   if crop :
-    (l,t,r,b,p) = boxOfString(crop,pg) 
+    (l,t,r,b,p) = boxOfString(crop,pg)
     whites.extend( [ (0,0,l,height,p), (0,0,width,t,p),
                      (r,0,width,height,p), (0,b,width,height,p) ] )
 
@@ -157,7 +157,7 @@ def boxOfString(x,p) :
     if p == pg :
       bmp[ t:b+1,l:r+1 ] = 1
       img[ t:b+1,l:r+1 ] = [255,255,255]
-  
+
 # paint black ...
   if black :
     for b in black :
@@ -168,17 +168,17 @@ def boxOfString(x,p) :
   if checkcrop :
     dumpImage(outfile,bmp,img, bitmap, pad)
     return True
-    
+
 #-----------------------------------------------------------------------
 # Line finding section.
 #
-# Find all vertical or horizontal lines that are more than rlthresh 
+# Find all vertical or horizontal lines that are more than rlthresh
 # long, these are considered lines on the table grid.
 
   lthresh = int(line_length * bitmap_resolution)
   vs = zeros(width, dtype=int)
   for i in range(width) :
-    dd = diff( where(bmp[:,i])[0] ) 
+    dd = diff( where(bmp[:,i])[0] )
     if len(dd)>0:
       v = max ( dd )
       if v > lthresh :
@@ -213,19 +213,19 @@ def boxOfString(x,p) :
       vd = delete(vd,i)
     else:
       i=i+2
-  
-  j = 0 
+
+  j = 0
   while j < len(hd):
     if hd[j+1]-hd[j] > maxdiv :
       hd = delete(hd,j)
       hd = delete(hd,j)
     else:
       j=j+2
-  
+
   if checklines :
     for i in vd :
       img[:,i] = [255,0,0] # red
-  
+
     for j in hd :
       img[j,:] = [0,0,255] # blue
     dumpImage(outfile,bmp,img)
@@ -233,25 +233,25 @@ def boxOfString(x,p) :
 #-----------------------------------------------------------------------
 # divider checking.
 #
-# at this point vd holds the x coordinate of vertical  and 
-# hd holds the y coordinate of horizontal divider tansitions for each 
+# at this point vd holds the x coordinate of vertical  and
+# hd holds the y coordinate of horizontal divider tansitions for each
 # vertical and horizontal lines in the table grid.
 
   def isDiv(a, l,r,t,b) :
           # if any col or row (in axis) is all zeros ...
-    return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 
+    return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0
 
   if checkdivs :
     img = img / 2
     for j in range(0,len(hd),2):
       for i in range(0,len(vd),2):
         if i>0 :
-          (l,r,t,b) = (vd[i-1], vd[i],   hd[j],   hd[j+1]) 
+          (l,r,t,b) = (vd[i-1], vd[i],   hd[j],   hd[j+1])
           img[ t:b, l:r, 1 ] = 192
           if isDiv(1, l,r,t,b) :
             img[ t:b, l:r, 0 ] = 0
             img[ t:b, l:r, 2 ] = 255
-          
+
         if j>0 :
           (l,r,t,b) = (vd[i],   vd[i+1], hd[j-1], hd[j] )
           img[ t:b, l:r, 1 ] = 128
@@ -265,7 +265,7 @@ def isDiv(a, l,r,t,b) :
 # This algorithum is width hungry, and always generates rectangular
 # boxes.
 
-  cells =[] 
+  cells =[]
   touched = zeros( (len(hd), len(vd)),dtype=bool )
   j = 0
   while j*2+2 < len (hd) :
@@ -290,8 +290,8 @@ def isDiv(a, l,r,t,b) :
         touched[ j:j+v, i:i+u] = True
       i = i+1
     j=j+1
-  
-  
+
+
   if checkcells :
     nc = len(cells)+0.
     img = img / 2
@@ -301,76 +301,77 @@ def isDiv(a, l,r,t,b) :
       img[ t:b, l:r ] += col( k/nc )
     dumpImage(outfile,bmp,img)
     return True
-  
+
 #-----------------------------------------------------------------------
 # fork out to extract text for each cell.
 
-  whitespace = re.compile( r'\s+')
-   
-  def getCell( (i,j,u,v) ):
+  whitespace = re.compile( rb'\s+')
+
+  def getCell( _coordinate):
+    (i,j,u,v) =_coordinate
     (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
-    p = popen("pdftotext", 
+    p = popen("pdftotext",
               "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, quote(infile)),
-              stdout=subprocess.PIPE, 
+              stdout=subprocess.PIPE,
               shell=True )
-    
+
     ret = p.communicate()[0]
     if whitespace != 'raw' :
-      ret = whitespace.sub( "" if whitespace == "none" else " ", ret )
+      ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret )
       if len(ret) > 0 :
-        ret = ret[ (1 if ret[0]==' ' else 0) : 
-                   len(ret) - (1 if ret[-1]==' ' else 0) ]
+        ret = ret[ (1 if ret[0]==b' ' else 0) :
+                   len(ret) - (1 if ret[-1]==b' ' else 0) ]
     return (i,j,u,v,pg,ret)
-      
+
   if boxes :
-    cells = [ x + (pg,"",) for x in cells if 
+    cells = [ x + (pg,b"",) for x in cells if
               ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
   else :
     #check that pdftotext exists by running a simple command
     check_for_required_executable("pdftotext",["pdftotext","-h"])
     #end check
-    cells = [ getCell(x)   for x in cells if 
+    cells = [ getCell(x)   for x in cells if
               ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
   return cells
 
 #-----------------------------------------------------------------------
 #output section.
 
-def output(cells, pgs, 
-                cells_csv_filename=None, 
-                cells_json_filename=None, 
-                cells_xml_filename=None, 
+def output(cells, pgs,
+                cells_csv_filename=None,
+                cells_json_filename=None,
+                cells_xml_filename=None,
                 table_csv_filename=None,
                 table_html_filename=None,
                 table_list_filename=None,
                 infile=None, name=None, output_type=None
                 ):
-                
+
     output_types = [
-             dict(filename=cells_csv_filename, function=o_cells_csv),  
-             dict(filename=cells_json_filename, function=o_cells_json), 
-             dict(filename=cells_xml_filename, function=o_cells_xml), 
+             dict(filename=cells_csv_filename, function=o_cells_csv),
+             dict(filename=cells_json_filename, function=o_cells_json),
+             dict(filename=cells_xml_filename, function=o_cells_xml),
              dict(filename=table_csv_filename, function=o_table_csv),
              dict(filename=table_html_filename, function=o_table_html),
              dict(filename=table_list_filename, function=o_table_list)
              ]
-             
+
     for entry in output_types:
         if entry["filename"]:
             if entry["filename"] != sys.stdout:
                 outfile = open(entry["filename"],'w')
             else:
                 outfile = sys.stdout
-            
-            entry["function"](cells, pgs, 
-                                outfile=outfile, 
-                                name=name, 
-                                infile=infile, 
+
+            entry["function"](cells, pgs,
+                                outfile=outfile,
+                                name=name,
+                                infile=infile,
                                 output_type=output_type)
 
             if entry["filename"] != sys.stdout:
                 outfile.close()
-        
+
 def o_cells_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) :
   outfile = outfile or sys.stdout
   csv.writer( outfile , dialect='excel' ).writerows(cells)
@@ -381,20 +382,22 @@ def o_cells_json(cells,pgs, outfile=None, infile=None, name=None, output_type=No
   #defaults
   infile=infile or ""
   name=name or ""
-  
-  json.dump({ 
+
+  json.dump({
     "src": infile,
     "name": name,
     "colnames": ( "x","y","width","height","page","contents" ),
     "cells":cells
     }, outfile)
 
-def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None) : 
+def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None) :
   """Output XML formatted cell data"""
   outfile = outfile or sys.stdout
   #defaults
   infile=infile or ""
   name=name or ""
+  def _lambda(a):
+      return x.setAttribute(*a)
 
   doc = getDOMImplementation().createDocument(None,"table", None)
   root = doc.documentElement;
@@ -404,19 +407,19 @@ def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None
     root.setAttribute("name",name)
   for cl in cells :
     x = doc.createElement("cell")
-    map(lambda(a): x.setAttribute(*a), zip("xywhp",map(str,cl)))
+    map(_lambda, zip("xywhp",map(str,cl)))
     if cl[5] != "" :
       x.appendChild( doc.createTextNode(cl[5]) )
     root.appendChild(x)
   outfile.write( doc.toprettyxml() )
-  
-def table_to_list(cells,pgs) : 
+
+def table_to_list(cells,pgs) :
   """Output list of lists"""
   l=[0,0,0]
   for (i,j,u,v,pg,value) in cells :
       r=[i,j,pg]
       l = [max(x) for x in zip(l,r)]
-  
+
   tab = [ [ [ "" for x in range(l[0]+1)
             ] for x in range(l[1]+1)
           ] for x in range(l[2]+1)
@@ -432,18 +435,18 @@ def o_table_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=Non
   tab=table_to_list(cells, pgs)
   for t in tab:
     csv.writer( outfile , dialect='excel' ).writerows(t)
-  
+
 
 def o_table_list(cells,pgs, outfile=None, name=None, infile=None, output_type=None) :
   """Output list of lists"""
   outfile = outfile or sys.stdout
   tab = table_to_list(cells, pgs)
   print(tab)
-    
-def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=None) : 
+
+def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=None) :
   """Output HTML formatted table"""
 
-  oj = 0 
+  oj = 0
   opg = 0
   doc = getDOMImplementation().createDocument(None,"table", None)
   root = doc.documentElement;
@@ -458,7 +461,7 @@ def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=No
     if j > oj or pg > opg:
       if pg > opg:
         s = "Name: " + name + ", " if name else ""
-        root.appendChild( doc.createComment( s + 
+        root.appendChild( doc.createComment( s +
           ("Source: %s page %d." % (infile, pg) )));
       if tr :
         root.appendChild(tr)
@@ -478,4 +481,3 @@ def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=No
     tr.appendChild(td)
   root.appendChild(tr)
   outfile.write( doc.toprettyxml() )
-  
diff --git a/src/pdftableextract/extracttab.py b/src/pdftableextract/extracttab.py
index ab6c74d..44ccecd 100644
--- a/src/pdftableextract/extracttab.py
+++ b/src/pdftableextract/extracttab.py
@@ -18,7 +18,7 @@ def process_page(pgs) :
 
   p = subprocess.Popen( ("pdftoppm -gray -r %d -f %d -l %d %s " %
       (args.r,pg,pg,quote(args.infile))),
-      stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True )
+      stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
 
 #-----------------------------------------------------------------------
 # image load secion.
@@ -28,7 +28,7 @@ def process_page(pgs) :
   pad = int(args.pad)
   height+=pad*2
   width+=pad*2
-  
+
 # reimbed image with a white padd.
   bmp = ones( (height,width) , dtype=bool )
   bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*args.g/100.0) )
@@ -47,25 +47,25 @@ def process_page(pgs) :
     t=t+1
   if t > 0 :
     t=t-1
-  
+
   b=height-1
   while b > t and sum(bmp[b,:]==0) == 0 :
     b=b-1
   if b < height-1:
     b = b+1
-  
+
   l=0
   while l < width and sum(bmp[:,l]==0) == 0 :
     l=l+1
   if l > 0 :
     l=l-1
-  
+
   r=width-1
   while r > l and sum(bmp[:,r]==0) == 0 :
     r=r-1
   if r < width-1 :
     r=r+1
-  
+
 # Mark bounding box.
   bmp[t,:] = 0
   bmp[b,:] = 0
@@ -77,13 +77,13 @@ def boxOfString(x,p) :
     if len(s) < 4 :
       raise Exception("boxes have format left:top:right:bottom[:page]")
     return ([args.r * float(x) + args.pad for x in s[0:4] ]
-                + [ p if len(s)<5 else int(s[4]) ] ) 
+                + [ p if len(s)<5 else int(s[4]) ] )
 
 
 # translate crop to paint white.
   whites = []
   if args.crop :
-    (l,t,r,b,p) = boxOfString(args.crop,pg) 
+    (l,t,r,b,p) = boxOfString(args.crop,pg)
     whites.extend( [ (0,0,l,height,p), (0,0,width,t,p),
                      (r,0,width,height,p), (0,b,width,height,p) ] )
 
@@ -95,7 +95,7 @@ def boxOfString(x,p) :
     if p == pg :
       bmp[ t:b+1,l:r+1 ] = 1
       img[ t:b+1,l:r+1 ] = [255,255,255]
-  
+
 # paint black ...
   if args.black :
     for b in args.black :
@@ -106,18 +106,18 @@ def boxOfString(x,p) :
   if args.checkcrop :
     dumpImage(args,bmp,img)
     sys.exit(0)
-    
-  
+
+
 #-----------------------------------------------------------------------
 # Line finding section.
 #
-# Find all verticle or horizontal lines that are more than rlthresh 
+# Find all verticle or horizontal lines that are more than rlthresh
 # long, these are considered lines on the table grid.
 
   lthresh = int(args.l * args.r)
   vs = zeros(width, dtype=int)
   for i in range(width) :
-    dd = diff( where(bmp[:,i])[0] ) 
+    dd = diff( where(bmp[:,i])[0] )
     if len(dd)>0:
       v = max ( dd )
       if v > lthresh :
@@ -153,62 +153,62 @@ def boxOfString(x,p) :
       vd = delete(vd,i)
     else:
       i=i+2
-  
-  j = 0 
+
+  j = 0
   while j < len(hd):
     if hd[j+1]-hd[j] > maxdiv :
       hd = delete(hd,j)
       hd = delete(hd,j)
     else:
       j=j+2
-  
+
   if args.checklines :
     for i in vd :
       img[:,i] = [255,0,0] # red
-  
+
     for j in hd :
       img[j,:] = [0,0,255] # blue
     dumpImage(args,bmp,img)
     sys.exit(0)
-  
+
 #-----------------------------------------------------------------------
 # divider checking.
 #
-# at this point vd holds the x coordinate of vertical  and 
-# hd holds the y coordinate of horizontal divider tansitions for each 
+# at this point vd holds the x coordinate of vertical  and
+# hd holds the y coordinate of horizontal divider tansitions for each
 # vertical and horizontal lines in the table grid.
 
   def isDiv(a, l,r,t,b) :
           # if any col or row (in axis) is all zeros ...
-    return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 
+    return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0
 
   if args.checkdivs :
     img = img / 2
     for j in range(0,len(hd),2):
       for i in range(0,len(vd),2):
         if i>0 :
-          (l,r,t,b) = (vd[i-1], vd[i],   hd[j],   hd[j+1]) 
+          (l,r,t,b) = (vd[i-1], vd[i],   hd[j],   hd[j+1])
           img[ t:b, l:r, 1 ] = 192
           if isDiv(1, l,r,t,b) :
             img[ t:b, l:r, 0 ] = 0
             img[ t:b, l:r, 2 ] = 255
-          
+
         if j>0 :
           (l,r,t,b) = (vd[i],   vd[i+1], hd[j-1], hd[j] )
           img[ t:b, l:r, 1 ] = 128
           if isDiv(0, l,r,t,b) :
             img[ t:b, l:r, 0 ] = 255
             img[ t:b, l:r, 2 ] = 0
-  
+
     dumpImage(args,bmp,img)
     sys.exit(0)
-  
+
 #-----------------------------------------------------------------------
 # Cell finding section.
 # This algorithum is width hungry, and always generates rectangular
 # boxes.
 
-  cells =[] 
+  cells =[]
   touched = zeros( (len(hd), len(vd)),dtype=bool )
   j = 0
   while j*2+2 < len (hd) :
@@ -233,8 +233,8 @@ def isDiv(a, l,r,t,b) :
         touched[ j:j+v, i:i+u] = True
       i = i+1
     j=j+1
-  
-  
+
+
   if args.checkcells :
     nc = len(cells)+0.
     img = img / 2
@@ -244,25 +244,25 @@ def isDiv(a, l,r,t,b) :
       img[ t:b, l:r ] += col( k/nc )
     dumpImage(args,bmp,img)
     sys.exit(0)
-  
-  
+
+
 #-----------------------------------------------------------------------
 # fork out to extract text for each cell.
 
   whitespace = re.compile( r'\s+')
-   
+
   def getCell( (i,j,u,v) ):
     (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
     p = subprocess.Popen(
     ("pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -"
          % (args.r, l-pad, t-pad, r-l, b-t, pg, pg, quote(args.infile) ) ),
         stdout=subprocess.PIPE, shell=True )
-    
+
     ret = p.communicate()[0]
     if args.w != 'raw' :
       ret = whitespace.sub( "" if args.w == "none" else " ", ret )
       if len(ret) > 0 :
-        ret = ret[ (1 if ret[0]==' ' else 0) : 
+        ret = ret[ (1 if ret[0]==' ' else 0) :
                    len(ret) - (1 if ret[-1]==' ' else 0) ]
     return (i,j,u,v,pg,ret)
 
@@ -270,12 +270,12 @@ def getCell( (i,j,u,v) ):
   #  cells = [ x + (pg,"",) for x in cells ]
   #else :
   #  cells = map(getCell, cells)
-  
+
   if args.boxes :
-    cells = [ x + (pg,"",) for x in cells if 
+    cells = [ x + (pg,"",) for x in cells if
               ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
   else :
-    cells = [ getCell(x)   for x in cells if 
+    cells = [ getCell(x)   for x in cells if
               ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
   return cells
 
@@ -294,4 +294,3 @@ def main_script():
       "cells_xml" : o_cells_xml,   "table_csv"  : o_table_csv,
       "table_html": o_table_html,  "table_chtml": o_table_html,
       } [ args.t ](cells,args.page)
-
diff --git a/src/pdftableextract/pnm.py b/src/pdftableextract/pnm.py
index cbb05dd..ae229ea 100644
--- a/src/pdftableextract/pnm.py
+++ b/src/pdftableextract/pnm.py
@@ -1,13 +1,14 @@
+from __future__ import print_function
 from numpy import array, fromstring, uint8, reshape, ones
 #-----------------------------------------------------------------------
 # PNM stuff.
 
 def noncomment(fd):
-  """Read lines from the filehandle until a non-comment line is found. 
+  """Read lines from the filehandle until a non-comment line is found.
   Comments start with #"""
   while True:
-    x = fd.readline() 
-    if x.startswith('#') :
+    x = fd.readline()
+    if x.startswith(b'#') :
       continue
     else:
       return x
@@ -16,7 +17,7 @@ def readPNM(fd):
   """Reads the PNM file from the filehandle"""
   t = noncomment(fd)
   s = noncomment(fd)
-  m = noncomment(fd) if not (t.startswith('P1') or t.startswith('P4')) else '1'
+  m = noncomment(fd) if not (t.startswith(b'P1') or t.startswith(b'P4')) else b'1'
   data = fd.read()
   ls = len(s.split())
   if ls != 2 :
@@ -28,8 +29,8 @@ def readPNM(fd):
   m = int(m)
 
   if m != 255 :
-    print "Just want 8 bit pgms for now!"
-  
+    print ("Just want 8 bit pgms for now!")
+
   d = fromstring(data,dtype=uint8)
   d = reshape(d, (height,width) )
   return (m,width,height, d)
@@ -39,14 +40,14 @@ def writePNM(fd,img):
   s = img.shape
   m = 255
   if img.dtype == bool :
-    img = img + uint8(0) 
+    img = img + uint8(0)
     t = "P5"
     m = 1
   elif len(s) == 2 :
     t = "P5"
   else:
     t = "P6"
-    
+
   fd.write( "%s\n%d %d\n%d\n" % (t, s[1],s[0],m) )
   fd.write( uint8(img).tostring() )
 

From 1fb10b3e3b64a8360d29c511de626d6dd7cffa67 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Thu, 14 Jul 2016 17:50:34 +0800
Subject: [PATCH 04/28] Output result as a unicode string.

---
 src/pdftableextract/core.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index b96fba4..491e67d 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -70,7 +70,8 @@ def process_page(infile, pgs,
     checkdivs=False,
     checkcells=False,
     whitespace="normalize",
-    boxes=False) :
+    boxes=False,
+    encoding="utf8") :
 
   outfile = open(outfilename,'w') if outfilename else sys.stdout
   page=page or []
@@ -321,7 +322,7 @@ def getCell( _coordinate):
       if len(ret) > 0 :
         ret = ret[ (1 if ret[0]==b' ' else 0) :
                    len(ret) - (1 if ret[-1]==b' ' else 0) ]
-    return (i,j,u,v,pg,ret)
+    return (i,j,u,v,pg,ret.decode(encoding))
 
   if boxes :
     cells = [ x + (pg,b"",) for x in cells if

From a59833168e2f2152c69ec6b26bb2032a9524bae8 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Fri, 15 Jul 2016 01:26:15 +0800
Subject: [PATCH 05/28] Corrected imports in script.

---
 src/pdftableextract/scripts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pdftableextract/scripts.py b/src/pdftableextract/scripts.py
index 68a7b2e..6939713 100644
--- a/src/pdftableextract/scripts.py
+++ b/src/pdftableextract/scripts.py
@@ -2,8 +2,8 @@
 import sys
 import logging
 import subprocess
-from .core import process_page, output
-import core
+from pdftableextract.core import process_page, output
+import pdftableextract.core
 
 #-----------------------------------------------------------------------
 

From 058958c906b32789e328ca7f0cd21f91761e4149 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Fri, 15 Jul 2016 01:27:12 +0800
Subject: [PATCH 06/28] Remove unused file.

---
 src/pdftableextract/extracttab.py | 296 ------------------------------
 1 file changed, 296 deletions(-)
 delete mode 100644 src/pdftableextract/extracttab.py

diff --git a/src/pdftableextract/extracttab.py b/src/pdftableextract/extracttab.py
deleted file mode 100644
index 44ccecd..0000000
--- a/src/pdftableextract/extracttab.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Description : PDF Table Extraction Utility
-#      Author : Ian McEwan, Ashima Research.
-#  Maintainer : ijm
-#     Lastmod : 20130402 (ijm)
-#     License : Copyright (C) 2011 Ashima Research. All rights reserved.
-#               Distributed under the MIT Expat License. See LICENSE file.
-#               https://github.com/ashima/pdf-table-extract
-
-import sys, argparse, subprocess, re, csv, json
-from numpy import *
-from pipes import quote
-from xml.dom.minidom import getDOMImplementation
-
-# Proccessing function.
-
-def process_page(pgs) :
-  (pg,frow,lrow) = (map(int,(pgs.split(":")))+[None,None])[0:3]
-
-  p = subprocess.Popen( ("pdftoppm -gray -r %d -f %d -l %d %s " %
-      (args.r,pg,pg,quote(args.infile))),
-      stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
-
-#-----------------------------------------------------------------------
-# image load secion.
-
-  (maxval, width, height, data) = readPNM(p.stdout)
-
-  pad = int(args.pad)
-  height+=pad*2
-  width+=pad*2
-
-# reimbed image with a white padd.
-  bmp = ones( (height,width) , dtype=bool )
-  bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*args.g/100.0) )
-
-# Set up Debuging image.
-  img = zeros( (height,width,3) , dtype=uint8 )
-  img[:,:,0] = bmp*255
-  img[:,:,1] = bmp*255
-  img[:,:,2] = bmp*255
-
-#-----------------------------------------------------------------------
-# Find bounding box.
-
-  t=0
-  while t < height and sum(bmp[t,:]==0) == 0 :
-    t=t+1
-  if t > 0 :
-    t=t-1
-
-  b=height-1
-  while b > t and sum(bmp[b,:]==0) == 0 :
-    b=b-1
-  if b < height-1:
-    b = b+1
-
-  l=0
-  while l < width and sum(bmp[:,l]==0) == 0 :
-    l=l+1
-  if l > 0 :
-    l=l-1
-
-  r=width-1
-  while r > l and sum(bmp[:,r]==0) == 0 :
-    r=r-1
-  if r < width-1 :
-    r=r+1
-
-# Mark bounding box.
-  bmp[t,:] = 0
-  bmp[b,:] = 0
-  bmp[:,l] = 0
-  bmp[:,r] = 0
-
-  def boxOfString(x,p) :
-    s = x.split(":")
-    if len(s) < 4 :
-      raise Exception("boxes have format left:top:right:bottom[:page]")
-    return ([args.r * float(x) + args.pad for x in s[0:4] ]
-                + [ p if len(s)<5 else int(s[4]) ] )
-
-
-# translate crop to paint white.
-  whites = []
-  if args.crop :
-    (l,t,r,b,p) = boxOfString(args.crop,pg)
-    whites.extend( [ (0,0,l,height,p), (0,0,width,t,p),
-                     (r,0,width,height,p), (0,b,width,height,p) ] )
-
-# paint white ...
-  if args.white :
-    whites.extend( [ boxOfString(b, pg) for b in args.white ] )
-
-  for (l,t,r,b,p) in whites :
-    if p == pg :
-      bmp[ t:b+1,l:r+1 ] = 1
-      img[ t:b+1,l:r+1 ] = [255,255,255]
-
-# paint black ...
-  if args.black :
-    for b in args.black :
-      (l,t,r,b) = [args.r * float(x) + args.pad for x in b.split(":") ]
-      bmp[ t:b+1,l:r+1 ] = 0
-      img[ t:b+1,l:r+1 ] = [0,0,0]
-
-  if args.checkcrop :
-    dumpImage(args,bmp,img)
-    sys.exit(0)
-
-
-#-----------------------------------------------------------------------
-# Line finding section.
-#
-# Find all verticle or horizontal lines that are more than rlthresh
-# long, these are considered lines on the table grid.
-
-  lthresh = int(args.l * args.r)
-  vs = zeros(width, dtype=int)
-  for i in range(width) :
-    dd = diff( where(bmp[:,i])[0] )
-    if len(dd)>0:
-      v = max ( dd )
-      if v > lthresh :
-        vs[i] = 1
-    else:
-# it was a solid black line.
-      if bmp[0,i] == 0 :
-        vs[i] = 1
-  vd= ( where(diff(vs[:]))[0] +1 )
-
-  hs = zeros(height, dtype=int)
-  for j in range(height) :
-    dd = diff( where(bmp[j,:]==1)[0] )
-    if len(dd) > 0 :
-      h = max ( dd )
-      if h > lthresh :
-        hs[j] = 1
-    else:
-# it was a solid black line.
-      if bmp[j,0] == 0 :
-        hs[j] = 1
-  hd=(  where(diff(hs[:]==1))[0] +1 )
-
-#-----------------------------------------------------------------------
-# Look for dividors that are too large.
-
-  maxdiv=10
-  i=0
-
-  while i < len(vd) :
-    if vd[i+1]-vd[i] > maxdiv :
-      vd = delete(vd,i)
-      vd = delete(vd,i)
-    else:
-      i=i+2
-
-  j = 0
-  while j < len(hd):
-    if hd[j+1]-hd[j] > maxdiv :
-      hd = delete(hd,j)
-      hd = delete(hd,j)
-    else:
-      j=j+2
-
-  if args.checklines :
-    for i in vd :
-      img[:,i] = [255,0,0] # red
-
-    for j in hd :
-      img[j,:] = [0,0,255] # blue
-    dumpImage(args,bmp,img)
-    sys.exit(0)
-
-#-----------------------------------------------------------------------
-# divider checking.
-#
-# at this point vd holds the x coordinate of vertical  and
-# hd holds the y coordinate of horizontal divider tansitions for each
-# vertical and horizontal lines in the table grid.
-
-  def isDiv(a, l,r,t,b) :
-          # if any col or row (in axis) is all zeros ...
-    return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0
-
-  if args.checkdivs :
-    img = img / 2
-    for j in range(0,len(hd),2):
-      for i in range(0,len(vd),2):
-        if i>0 :
-          (l,r,t,b) = (vd[i-1], vd[i],   hd[j],   hd[j+1])
-          img[ t:b, l:r, 1 ] = 192
-          if isDiv(1, l,r,t,b) :
-            img[ t:b, l:r, 0 ] = 0
-            img[ t:b, l:r, 2 ] = 255
-
-        if j>0 :
-          (l,r,t,b) = (vd[i],   vd[i+1], hd[j-1], hd[j] )
-          img[ t:b, l:r, 1 ] = 128
-          if isDiv(0, l,r,t,b) :
-            img[ t:b, l:r, 0 ] = 255
-            img[ t:b, l:r, 2 ] = 0
-
-    dumpImage(args,bmp,img)
-    sys.exit(0)
-
-#-----------------------------------------------------------------------
-# Cell finding section.
-# This algorithum is width hungry, and always generates rectangular
-# boxes.
-
-  cells =[]
-  touched = zeros( (len(hd), len(vd)),dtype=bool )
-  j = 0
-  while j*2+2 < len (hd) :
-    i = 0
-    while i*2+2 < len(vd) :
-      u = 1
-      v = 1
-      if not touched[j,i] :
-        while 2+(i+u)*2 < len(vd) and \
-            not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1],
-               hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ):
-          u=u+1
-        bot = False
-        while 2+(j+v)*2 < len(hd) and not bot :
-          bot = False
-          for k in range(1,u+1) :
-            bot |= isDiv( 1, vd[ 2*(i+k)-1 ], vd[ 2*(i+k)],
-               hd[ 2*(j+v) ], hd[ 2*(j+v)+1 ] )
-          if not bot :
-            v=v+1
-        cells.append( (i,j,u,v) )
-        touched[ j:j+v, i:i+u] = True
-      i = i+1
-    j=j+1
-
-
-  if args.checkcells :
-    nc = len(cells)+0.
-    img = img / 2
-    for k in range(len(cells)):
-      (i,j,u,v) = cells[k]
-      (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
-      img[ t:b, l:r ] += col( k/nc )
-    dumpImage(args,bmp,img)
-    sys.exit(0)
-
-
-#-----------------------------------------------------------------------
-# fork out to extract text for each cell.
-
-  whitespace = re.compile( r'\s+')
-
-  def getCell( (i,j,u,v) ):
-    (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
-    p = subprocess.Popen(
-    ("pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -"
-         % (args.r, l-pad, t-pad, r-l, b-t, pg, pg, quote(args.infile) ) ),
-        stdout=subprocess.PIPE, shell=True )
-
-    ret = p.communicate()[0]
-    if args.w != 'raw' :
-      ret = whitespace.sub( "" if args.w == "none" else " ", ret )
-      if len(ret) > 0 :
-        ret = ret[ (1 if ret[0]==' ' else 0) :
-                   len(ret) - (1 if ret[-1]==' ' else 0) ]
-    return (i,j,u,v,pg,ret)
-
-  #if args.boxes :
-  #  cells = [ x + (pg,"",) for x in cells ]
-  #else :
-  #  cells = map(getCell, cells)
-
-  if args.boxes :
-    cells = [ x + (pg,"",) for x in cells if
-              ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
-  else :
-    cells = [ getCell(x)   for x in cells if
-              ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
-  return cells
-
-
-#-----------------------------------------------------------------------
-# main
-
-def main_script():
-    args = procargs()
-
-    cells = []
-    for pgs in args.page :
-      cells.extend(process_page(pgs))
-
-    { "cells_csv" : o_cells_csv,   "cells_json" : o_cells_json,
-      "cells_xml" : o_cells_xml,   "table_csv"  : o_table_csv,
-      "table_html": o_table_html,  "table_chtml": o_table_html,
-      } [ args.t ](cells,args.page)

From d05360dd031ea74c80fb7b08c54a9be843b45555 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Fri, 15 Jul 2016 09:15:31 +0800
Subject: [PATCH 07/28] Starting conversion to Gtk's Poppler.

---
 Makefile                    | 25 +++++++++++++++++++++++++
 requirements.txt            |  3 +++
 setup.py                    |  2 +-
 src/pdftableextract/core.py | 28 ++++++++++++++++++++++------
 4 files changed, 51 insertions(+), 7 deletions(-)
 create mode 100644 Makefile
 create mode 100644 requirements.txt

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..63821a7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+.PHONY: develop setup run-tests tests test gdb-test
+
+LPYTHON=python3
+V=$(PWD)/../../$(LPYTHON)
+VB=$(V)/bin
+PYTHON=$(VB)/$(LPYTHON)
+ROOT=$(PWD)
+#INI=icc.linkgrammar
+#LCAT=src/icc/linkgrammar/locale/
+
+develop: setup
+	pip install -r requirements.txt
+
+setup:
+	python setup.py develop
+
+run-tests:
+	nosetests -w src/icc/tests
+
+tests:	run-tests
+
+test:	setup run-tests
+
+gdb-test: setup
+	gdb --args $(PYTHON) $(VB)/nosetests -w src/icc/tests
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..008b217
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+ruamel.venvgtk
+# pandas
diff --git a/setup.py b/setup.py
index 8591c50..516d5ad 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 version = '0.1'
 
-install_requires = [ "numpy" ]
+install_requires = [ "numpy", "ruamel.venvgtk" ]
 
 
 setup(name='pdf-table-extract',
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 491e67d..1ec262c 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -1,19 +1,34 @@
 import sys
 import os
 from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete
-import subprocess
-from pipes import quote
+#import subprocess
+#from pipes import quote
 from pdftableextract.pnm import readPNM, dumpImage
 import re
-from pipes import quote
+#from pipes import quote
 from xml.dom.minidom import getDOMImplementation
 import json
 import csv
+import gi
+gi.require_version('Gtk', '3.0')
+from gi.repository import Poppler
+
+class PopplerProcessor(object):
+    """
+    """
+
+    def __init__(self, **kwargs):
+        """
+        """
+        self.p=Popp
+
+
 
 #-----------------------------------------------------------------------
 def check_for_required_executable(name,command):
     """Checks for an executable called 'name' by running 'command' and supressing
     output. If the return code is non-zero or an OS error occurs, an Exception is raised"""
+    return
     try:
         with open(os.devnull, "w") as fnull:
             result=subprocess.check_call(command,stdout=fnull, stderr=fnull)
@@ -29,6 +44,8 @@ def check_for_required_executable(name,command):
 
 #-----------------------------------------------------------------------
 def popen(name,command, *args, **kwargs):
+    print (name,command, *args, **kwargs)
+    wew
     try:
         result=subprocess.Popen(command,*args, **kwargs)
         return result
@@ -81,8 +98,7 @@ def process_page(infile, pgs,
   #end check
 
   p = popen("pdftoppm", ("pdftoppm -gray -r %d -f %d -l %d %s " %
-      (bitmap_resolution,pg,pg,quote(infile))),
-      stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True )
+      (bitmap_resolution,pg,pg,infile)))
 
 #-----------------------------------------------------------------------
 # image load secion.
@@ -312,7 +328,7 @@ def getCell( _coordinate):
     (i,j,u,v) =_coordinate
     (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
     p = popen("pdftotext",
-              "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, quote(infile)),
+              "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, infile),
               stdout=subprocess.PIPE,
               shell=True )
 

From 5d52e02804d1a2b442487fc9296104339ac95b3b Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Fri, 15 Jul 2016 13:05:26 +0800
Subject: [PATCH 08/28] Formatted versions of files

---
 setup.py                    | 7 +++----
 src/pdftableextract/core.py | 2 --
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index 516d5ad..5d691ea 100644
--- a/setup.py
+++ b/setup.py
@@ -5,10 +5,9 @@
 README = open(os.path.join(here, 'README.md')).read()
 #NEWS = open(os.path.join(here, 'NEWS.txt')).read()
 
-
 version = '0.1'
 
-install_requires = [ "numpy", "ruamel.venvgtk" ]
+install_requires = ["numpy", "ruamel.venvgtk"]
 
 
 setup(name='pdf-table-extract',
@@ -21,7 +20,7 @@
     keywords='PDF, tables',
     author='Ian McEwan',
     author_email='ijm@ashimaresearch.com',
-    url='ashimaresearch.com',
+    url='ashimaresearch.dcom',
     license='MIT-Expat',
     packages=find_packages('src'),
     package_dir = {'': 'src'},include_package_data=True,
@@ -31,4 +30,4 @@
         'console_scripts':
             ['pdf-table-extract=pdftableextract.scripts:main']
     }
-)
+      )
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 1ec262c..4baaa55 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -22,8 +22,6 @@ def __init__(self, **kwargs):
         """
         self.p=Popp
 
-
-
 #-----------------------------------------------------------------------
 def check_for_required_executable(name,command):
     """Checks for an executable called 'name' by running 'command' and supressing

From e93eff5b15d5d11a87e039e49ecb4575306e3f64 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Fri, 15 Jul 2016 22:23:04 +0800
Subject: [PATCH 09/28] Now it works but incorrectly.

---
 example/gtk-test.py         |  37 ++++++++++++
 example/test_to_pandas.py   |   9 +--
 setup.py                    |   2 +-
 src/pdftableextract/core.py | 115 ++++++++++++++++++++++++++----------
 4 files changed, 127 insertions(+), 36 deletions(-)
 create mode 100644 example/gtk-test.py

diff --git a/example/gtk-test.py b/example/gtk-test.py
new file mode 100644
index 0000000..7ceb36a
--- /dev/null
+++ b/example/gtk-test.py
@@ -0,0 +1,37 @@
+# http://stackoverflow.com/a/10031877
+
+import numpy
+import cairo
+import math
+
+from gi.repository import Gtk, Gdk
+
+data = numpy.zeros((200, 200, 4), dtype=numpy.uint8)
+surface = cairo.ImageSurface.create_for_data(data, cairo.FORMAT_ARGB32, 200,
+                                             200)
+cr = cairo.Context(surface)
+
+# fill with solid white
+cr.set_source_rgb(1.0, 1.0, 1.0)
+cr.paint()
+
+# draw red circle
+cr.arc(100, 100, 80, 0, 2 * math.pi)
+cr.set_line_width(3)
+cr.set_source_rgb(1.0, 0.0, 0.0)
+cr.stroke()
+
+#draw directly to the shared buffer
+data[10:30, 10:30, 2] = 128
+
+# write output
+print(data[38:48, 38:48, 0])
+surface.write_to_png("circle.png")
+
+pb = Gdk.pixbuf_get_from_surface(surface, 0, 0, 200, 200)
+im = Gtk.Image.new_from_pixbuf(pb)
+w = Gtk.Window()
+w.connect("delete-event", Gtk.main_quit)
+w.add(im)
+w.show_all()
+Gtk.main()
diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index d734ea3..32cd3c7 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -3,13 +3,14 @@
 import pdftableextract as pdf
 
 pages = ["1"]
-cells = [pdf.process_page("example.pdf",p) for p in pages]
+cells = [pdf.process_page("example.pdf", p) for p in pages]
 
 #flatten the cells structure
-cells = [item for sublist in cells for item in sublist ]
+cells = [item for sublist in cells for item in sublist]
 
 #without any options, process_page picks up a blank table at the top of the page.
 #so choose table '1'
+print(cells)
 li = pdf.table_to_list(cells, pages)[1]
 
 #li is a list of lists, the first line is the header, last is the footer (for this table only!)
@@ -17,5 +18,5 @@
 #row '1' contains column headings
 #data is row '2' through '-1'
 
-data =pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
-print (data)
+data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
+print(data)
diff --git a/setup.py b/setup.py
index 5d691ea..66ef158 100644
--- a/setup.py
+++ b/setup.py
@@ -30,4 +30,4 @@
         'console_scripts':
             ['pdf-table-extract=pdftableextract.scripts:main']
     }
-      )
+)
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 4baaa55..30e2eb7 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -1,6 +1,7 @@
 import sys
 import os
-from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete
+from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape
+import numpy
 #import subprocess
 #from pipes import quote
 from pdftableextract.pnm import readPNM, dumpImage
@@ -11,16 +12,76 @@
 import csv
 import gi
 gi.require_version('Gtk', '3.0')
-from gi.repository import Poppler
+gi.require_version('Poppler', '0.18')
+from gi.repository import Gdk, Poppler
+import cairo
 
 class PopplerProcessor(object):
-    """
+    """Class for processing PDF. That's simple.
+    It does two functions.
+    1. Renders a page as a PNM graphics, and
+    2. Get text in a rectangular bounding box.
     """
 
-    def __init__(self, **kwargs):
-        """
+    def __init__(self, filename, **kwargs):
+        """Opens a document denoted by filename.
         """
-        self.p=Popp
+        self.filename=os.path.abspath(filename)
+        self.document=Poppler.Document.new_from_file("file:"+self.filename, None)
+        self.page_num=self.document.get_n_pages()
+        self.resolution=300
+        self.greyscale_threshold=kwargs.get("greyscale_thresholds",25)
+
+    def get_page(self, index):
+        if index<0 or index>=self.page_num:
+            raise IndexError("page number is out of bounds")
+        return self.document.get_page(index)
+
+    def get_image(self, index):
+        page=self.get_page(index)
+        dpi=self.resolution
+        scale = 1
+        width, height = [int(x) for x in page.get_size ()]
+        d=dpi/72.
+        pxw, pxh=int(width * d), int(height * d)
+        # data=zeros((pxw,pxh,4), dtype=uint8)
+        surface = cairo.ImageSurface (
+            # data,
+            cairo.FORMAT_ARGB32,
+            pxw, pxh)
+
+        context = cairo.Context (surface)
+        context.scale (d, d)
+
+        context.save ()
+        page.render (context)
+        context.restore ()
+
+        pixbuf  = Gdk.pixbuf_get_from_surface (surface, 0, 0, pxw, pxh)
+        surface.write_to_png("page.png")
+        #img=image.set_from_pixbuf (pixbuf)
+        data=frombuffer(pixbuf.get_pixels(), dtype=uint8)
+        R=data[0::4]
+        G=data[1::4]
+        B=data[2::4]
+        A=data[3::4]
+        C=R*34+G*0.56+B*0.1
+        # print (max(A))
+        C=C.astype(uint8)
+        A=A<=self.greyscale_threshold
+        C[A]=255
+        # print (C)
+        return C.reshape((pxw,pxh))
+
+    def get_text(self, index, x,y, w,h):
+        rect=Poppler.Rectangle()
+        rect.x1,rect.y1=x,y
+        rect.x2,rect.y2=x+w,y+h
+        # print (help(rect))
+        pg=self.get_page(index)
+        txt=pg.get_text_for_area(rect)
+        Poppler.Rectangle.free(rect)
+        return txt
 
 #-----------------------------------------------------------------------
 def check_for_required_executable(name,command):
@@ -42,8 +103,7 @@ def check_for_required_executable(name,command):
 
 #-----------------------------------------------------------------------
 def popen(name,command, *args, **kwargs):
-    print (name,command, *args, **kwargs)
-    wew
+    #print (name,command, *args, **kwargs)
     try:
         result=subprocess.Popen(command,*args, **kwargs)
         return result
@@ -89,25 +149,25 @@ def process_page(infile, pgs,
     encoding="utf8") :
 
   outfile = open(outfilename,'w') if outfilename else sys.stdout
+  pdfdoc = PopplerProcessor(infile)
   page=page or []
   (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3]
-  #check that pdftoppdm exists by running a simple command
-  check_for_required_executable("pdftoppm",["pdftoppm","-h"])
-  #end check
+  pdfdoc.resolution=bitmap_resolution
+  pdfdoc.greyscale_threshold=greyscale_threshold
 
-  p = popen("pdftoppm", ("pdftoppm -gray -r %d -f %d -l %d %s " %
-      (bitmap_resolution,pg,pg,infile)))
+  data = pdfdoc.get_image(pg-1)  # Page numbers are 0-based.
 
 #-----------------------------------------------------------------------
-# image load secion.
+# image load section.
 
-  (maxval, width, height, data) = readPNM(p.stdout)
+  #print(data.shape)
+  height, width = data.shape[:2]
 
   pad = int(pad)
   height+=pad*2
   width+=pad*2
 
-# reimbed image with a white padd.
+# reimbed image with a white pad.
   bmp = ones( (height,width) , dtype=bool )
   bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*greyscale_threshold/100.0) )
 
@@ -325,26 +385,19 @@ def isDiv(a, l,r,t,b) :
   def getCell( _coordinate):
     (i,j,u,v) =_coordinate
     (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
-    p = popen("pdftotext",
-              "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, infile),
-              stdout=subprocess.PIPE,
-              shell=True )
-
-    ret = p.communicate()[0]
-    if whitespace != 'raw' :
-      ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret )
-      if len(ret) > 0 :
-        ret = ret[ (1 if ret[0]==b' ' else 0) :
-                   len(ret) - (1 if ret[-1]==b' ' else 0) ]
-    return (i,j,u,v,pg,ret.decode(encoding))
+    ret = pdfdoc.get_text(pg-1, l-pad, t-pad, r-l, b-t)
+    # if whitespace != 'raw' :
+    #   ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret )
+    #   if len(ret) > 0 :
+    #     ret = ret[ (1 if ret[0]==b' ' else 0) :
+    #                len(ret) - (1 if ret[-1]==b' ' else 0) ]
+    return (i,j,u,v,pg,ret)
 
   if boxes :
     cells = [ x + (pg,b"",) for x in cells if
               ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
   else :
-    #check that pdftotext exists by running a simple command
-    check_for_required_executable("pdftotext",["pdftotext","-h"])
-    #end check
+    print (cells)
     cells = [ getCell(x)   for x in cells if
               ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
   return cells

From ba5006ec00f9e9a802f1b9be42d93960b4219462 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 02:25:11 +0800
Subject: [PATCH 10/28] Porting recognition algorithm to RGBA.

---
 src/pdftableextract/core.py | 926 +++++++++++++++++++-----------------
 1 file changed, 499 insertions(+), 427 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 30e2eb7..490356a 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -1,12 +1,12 @@
 import sys
 import os
-from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape
+from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape, all, any
 import numpy
-#import subprocess
-#from pipes import quote
-from pdftableextract.pnm import readPNM, dumpImage
-import re
-#from pipes import quote
+
+import matplotlib
+matplotlib.use('AGG')
+from matplotlib.image import imsave
+
 from xml.dom.minidom import getDOMImplementation
 import json
 import csv
@@ -16,6 +16,7 @@
 from gi.repository import Gdk, Poppler
 import cairo
 
+
 class PopplerProcessor(object):
     """Class for processing PDF. That's simple.
     It does two functions.
@@ -26,71 +27,91 @@ class PopplerProcessor(object):
     def __init__(self, filename, **kwargs):
         """Opens a document denoted by filename.
         """
-        self.filename=os.path.abspath(filename)
-        self.document=Poppler.Document.new_from_file("file:"+self.filename, None)
-        self.page_num=self.document.get_n_pages()
-        self.resolution=300
-        self.greyscale_threshold=kwargs.get("greyscale_thresholds",25)
+        self.filename = os.path.abspath(filename)
+        self.document = Poppler.Document.new_from_file("file:" + self.filename,
+                                                       None)
+        self.page_num = self.document.get_n_pages()
+        self.resolution = 300
+        self.greyscale_threshold = int(kwargs.get("greyscale_thresholds",
+                                                  25)) * 255.0 / 100.0
 
     def get_page(self, index):
-        if index<0 or index>=self.page_num:
+        if index < 0 or index >= self.page_num:
             raise IndexError("page number is out of bounds")
         return self.document.get_page(index)
 
     def get_image(self, index):
-        page=self.get_page(index)
-        dpi=self.resolution
+        page = self.get_page(index)
+        dpi = self.resolution
         scale = 1
-        width, height = [int(x) for x in page.get_size ()]
-        d=dpi/72.
-        pxw, pxh=int(width * d), int(height * d)
+        width, height = [int(x) for x in page.get_size()]
+        d = dpi / 72.
+        pxw, pxh = int(width * d), int(height * d)
         # data=zeros((pxw,pxh,4), dtype=uint8)
-        surface = cairo.ImageSurface (
+        surface = cairo.ImageSurface(
             # data,
             cairo.FORMAT_ARGB32,
-            pxw, pxh)
+            pxw,
+            pxh)
 
-        context = cairo.Context (surface)
-        context.scale (d, d)
+        context = cairo.Context(surface)
+        context.scale(d, d)
 
-        context.save ()
-        page.render (context)
-        context.restore ()
+        context.save()
+        page.render(context)
+        context.restore()
 
-        pixbuf  = Gdk.pixbuf_get_from_surface (surface, 0, 0, pxw, pxh)
+        pixbuf = Gdk.pixbuf_get_from_surface(surface, 0, 0, pxw, pxh)
         surface.write_to_png("page.png")
         #img=image.set_from_pixbuf (pixbuf)
-        data=frombuffer(pixbuf.get_pixels(), dtype=uint8)
-        R=data[0::4]
-        G=data[1::4]
-        B=data[2::4]
-        A=data[3::4]
-        C=R*34+G*0.56+B*0.1
-        # print (max(A))
-        C=C.astype(uint8)
-        A=A<=self.greyscale_threshold
-        C[A]=255
-        # print (C)
-        return C.reshape((pxw,pxh))
-
-    def get_text(self, index, x,y, w,h):
-        rect=Poppler.Rectangle()
-        rect.x1,rect.y1=x,y
-        rect.x2,rect.y2=x+w,y+h
+        data = frombuffer(pixbuf.get_pixels(), dtype=uint8)
+        # R = data[0::4]
+        # G = data[1::4]
+        # B = data[2::4]
+        # A = data[3::4]
+        # C = R * 34 + G * 0.56 + B * 0.1
+        # # print (max(A))
+        # C = C.astype(uint8)
+        # A = A <= self.greyscale_threshold
+        # C[A] = 255
+
+        # C = C.reshape((pxh, pxw))
+        data = data.reshape((pxh, pxw, 4))
+        #d = data[:, :, 3]
+        alpha = data[:, :, 3]
+        new = zeros(data.shape, dtype=uint8)
+        new[:, :, :] = data
+        new = new[:, :, 0:3]
+        print(data)
+        rc = alpha <= self.greyscale_threshold
+
+        new[rc, 0] = 255
+        new[rc, 1] = 255
+        new[rc, 2] = 255
+        #new[:, :, 3] = 255
+        imsave('nomask.png', new)
+        return new, rc, page
+
+    def get_text(self, page, x, y, w, h):
+        rect = Poppler.Rectangle()
+        rect.x1, rect.y1 = x, y
+        rect.x2, rect.y2 = x + w, y + h
         # print (help(rect))
-        pg=self.get_page(index)
-        txt=pg.get_text_for_area(rect)
-        Poppler.Rectangle.free(rect)
+        txt = page.get_text_for_area(rect)
+        #rect.free()
+        #Poppler.Rectangle.free(rect)
+
         return txt
 
+
 #-----------------------------------------------------------------------
-def check_for_required_executable(name,command):
+def check_for_required_executable(name, command):
     """Checks for an executable called 'name' by running 'command' and supressing
     output. If the return code is non-zero or an OS error occurs, an Exception is raised"""
     return
     try:
         with open(os.devnull, "w") as fnull:
-            result=subprocess.check_call(command,stdout=fnull, stderr=fnull)
+            result = subprocess.check_call(command, stdout=fnull, stderr=fnull)
     except OSError as e:
         message = """Error running {0}.
 Command failed: {1}
@@ -101,148 +122,162 @@ def check_for_required_executable(name,command):
     except Exception as e:
         raise
 
+
 #-----------------------------------------------------------------------
-def popen(name,command, *args, **kwargs):
+def popen(name, command, *args, **kwargs):
     #print (name,command, *args, **kwargs)
     try:
-        result=subprocess.Popen(command,*args, **kwargs)
+        result = subprocess.Popen(command, *args, **kwargs)
         return result
     except OSError as e:
-        message="""Error running {0}. Is it installed correctly?
+        message = """Error running {0}. Is it installed correctly?
 Error: {1}""".format(name, e)
         raise OSError(message)
     except Exception as e:
         raise
 
-def colinterp(a,x) :
-    """Interpolates colors"""
-    l = len(a)-1
-    i = min(l, max(0, int (x * l)))
-    (u,v) = a[i:i+2,:]
-    return u - (u-v) * ((x * l) % 1.0)
-
-colarr = array([ [255,0,0],[255,255,0],[0,255,0],[0,255,255],[0,0,255] ])
 
-def col(x, colmult=1.0) :
-    """colors"""
-    return colinterp(colarr,(colmult * x)% 1.0) / 2
-
-
-def process_page(infile, pgs,
-    outfilename=None,
-    greyscale_threshold=25,
-    page=None,
-    crop=None,
-    line_length=0.17,
-    bitmap_resolution=300,
-    name=None,
-    pad=2,
-    white=None,
-    black=None,
-    bitmap=False,
-    checkcrop=False,
-    checklines=False,
-    checkdivs=False,
-    checkcells=False,
-    whitespace="normalize",
-    boxes=False,
-    encoding="utf8") :
-
-  outfile = open(outfilename,'w') if outfilename else sys.stdout
-  pdfdoc = PopplerProcessor(infile)
-  page=page or []
-  (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3]
-  pdfdoc.resolution=bitmap_resolution
-  pdfdoc.greyscale_threshold=greyscale_threshold
-
-  data = pdfdoc.get_image(pg-1)  # Page numbers are 0-based.
-
-#-----------------------------------------------------------------------
-# image load section.
-
-  #print(data.shape)
-  height, width = data.shape[:2]
+def colinterp(a, x):
+    """Interpolates colors"""
+    l = len(a) - 1
+    i = min(l, max(0, int(x * l)))
+    (u, v) = a[i:i + 2, :]
+    return u - (u - v) * ((x * l) % 1.0)
 
-  pad = int(pad)
-  height+=pad*2
-  width+=pad*2
 
-# reimbed image with a white pad.
-  bmp = ones( (height,width) , dtype=bool )
-  bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*greyscale_threshold/100.0) )
+colarr = array(
+    [[255, 0, 0], [255, 255, 0], [0, 255, 0], [0, 255, 255], [0, 0, 255]])
 
-# Set up Debuging image.
-  img = zeros( (height,width,3) , dtype=uint8 )
-  img[:,:,0] = bmp*255
-  img[:,:,1] = bmp*255
-  img[:,:,2] = bmp*255
 
-#-----------------------------------------------------------------------
-# Find bounding box.
-  t=0
-  while t < height and sum(bmp[t,:]==0) == 0 :
-    t=t+1
-  if t > 0 :
-    t=t-1
-
-  b=height-1
-  while b > t and sum(bmp[b,:]==0) == 0 :
-    b=b-1
-  if b < height-1:
-    b = b+1
-
-  l=0
-  while l < width and sum(bmp[:,l]==0) == 0 :
-    l=l+1
-  if l > 0 :
-    l=l-1
-
-  r=width-1
-  while r > l and sum(bmp[:,r]==0) == 0 :
-    r=r-1
-  if r < width-1 :
-    r=r+1
+def col(x, colmult=1.0):
+    """colors"""
+    return colinterp(colarr, (colmult * x) % 1.0) / 2
+
+
+def process_page(infile,
+                 pgs,
+                 outfilename=None,
+                 greyscale_threshold=25,
+                 page=None,
+                 crop=None,
+                 line_length=0.17,
+                 bitmap_resolution=300,
+                 name=None,
+                 pad=2,
+                 white=None,
+                 black=None,
+                 bitmap=False,
+                 checkcrop=False,
+                 checklines=False,
+                 checkdivs=False,
+                 checkcells=False,
+                 whitespace="normalize",
+                 boxes=False,
+                 encoding="utf8"):
+
+    outfile = outfilename if outfilename else sys.stdout
+    pdfdoc = PopplerProcessor(infile)
+    page = page or []
+    (pg, frow, lrow) = (list(map(int, (pgs.split(":")))) + [None, None])[0:3]
+    pdfdoc.resolution = bitmap_resolution
+    pdfdoc.greyscale_threshold = greyscale_threshold
+
+    data, notalpha, page = pdfdoc.get_image(
+        pg - 1)  # Page numbers are 0-based.
+    alpha = notalpha != 1
+
+    #-----------------------------------------------------------------------
+    # image load section.
+
+    #print(data.shape)
+    height, width = data.shape[:2]
+
+    pad = int(pad)
+    height += pad * 2
+    width += pad * 2
+
+    # reimbed image with a white pad.
+    bmp = ones((height, width, 3), dtype=bool)
+
+    thr = int(255.0 * greyscale_threshold / 100.0)
+
+    bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr)
+
+    imsave("foo.png", bmp)
+    # Set up Debuging image.
+    img = zeros((height, width, 3), dtype=uint8)
+    img[:, :, :] = bmp
+    #img[:, :, 0] = bmp * 255
+    #img[:, :, 1] = bmp * 255
+    #img[:, :, 2] = bmp * 255
+
+    #-----------------------------------------------------------------------
+    # Find bounding box.
+    t = 0
+    while t < height and sum(bmp[t, :] == 0) == 0:
+        t = t + 1
+    if t > 0:
+        t = t - 1
+
+    b = height - 1
+    while b > t and sum(bmp[b, :] == 0) == 0:
+        b = b - 1
+    if b < height - 1:
+        b = b + 1
+
+    l = 0
+    while l < width and sum(bmp[:, l] == 0) == 0:
+        l = l + 1
+    if l > 0:
+        l = l - 1
+
+    r = width - 1
+    while r > l and sum(bmp[:, r] == 0) == 0:
+        r = r - 1
+    if r < width - 1:
+        r = r + 1
 
 # Mark bounding box.
-  bmp[t,:] = 0
-  bmp[b,:] = 0
-  bmp[:,l] = 0
-  bmp[:,r] = 0
-
-  def boxOfString(x,p) :
-    s = x.split(":")
-    if len(s) < 4 :
-      raise ValueError("boxes have format left:top:right:bottom[:page]")
-    return ([bitmap_resolution * float(x) + pad for x in s[0:4] ]
-                + [ p if len(s)<5 else int(s[4]) ] )
-
+    bmp[t, :] = 0
+    bmp[b, :] = 0
+    bmp[:, l] = 0
+    bmp[:, r] = 0
+
+    def boxOfString(x, p):
+        s = x.split(":")
+        if len(s) < 4:
+            raise ValueError("boxes have format left:top:right:bottom[:page]")
+        return ([bitmap_resolution * float(x) + pad for x in s[0:4]] +
+                [p if len(s) < 5 else int(s[4])])
 
 # translate crop to paint white.
-  whites = []
-  if crop :
-    (l,t,r,b,p) = boxOfString(crop,pg)
-    whites.extend( [ (0,0,l,height,p), (0,0,width,t,p),
-                     (r,0,width,height,p), (0,b,width,height,p) ] )
+
+    whites = []
+    if crop:
+        (l, t, r, b, p) = boxOfString(crop, pg)
+        whites.extend([(0, 0, l, height, p), (0, 0, width, t, p),
+                       (r, 0, width, height, p), (0, b, width, height, p)])
 
 # paint white ...
-  if white :
-    whites.extend( [ boxOfString(b, pg) for b in white ] )
+    if white:
+        whites.extend([boxOfString(b, pg) for b in white])
 
-  for (l,t,r,b,p) in whites :
-    if p == pg :
-      bmp[ t:b+1,l:r+1 ] = 1
-      img[ t:b+1,l:r+1 ] = [255,255,255]
+    for (l, t, r, b, p) in whites:
+        if p == pg:
+            bmp[t:b + 1, l:r + 1] = 1
+            img[t:b + 1, l:r + 1] = [255, 255, 255]
 
 # paint black ...
-  if black :
-    for b in black :
-      (l,t,r,b) = [bitmap_resolution * float(x) + pad for x in b.split(":") ]
-      bmp[ t:b+1,l:r+1 ] = 0
-      img[ t:b+1,l:r+1 ] = [0,0,0]
+    if black:
+        for b in black:
+            (l, t, r,
+             b) = [bitmap_resolution * float(x) + pad for x in b.split(":")]
+            bmp[t:b + 1, l:r + 1] = 0
+            img[t:b + 1, l:r + 1] = [0, 0, 0]
 
-  if checkcrop :
-    dumpImage(outfile,bmp,img, bitmap, pad)
-    return True
+    if checkcrop:
+        imsave("crop-" + outfile + ".png", img)
+        return True
 
 #-----------------------------------------------------------------------
 # Line finding section.
@@ -250,61 +285,61 @@ def boxOfString(x,p) :
 # Find all vertical or horizontal lines that are more than rlthresh
 # long, these are considered lines on the table grid.
 
-  lthresh = int(line_length * bitmap_resolution)
-  vs = zeros(width, dtype=int)
-  for i in range(width) :
-    dd = diff( where(bmp[:,i])[0] )
-    if len(dd)>0:
-      v = max ( dd )
-      if v > lthresh :
-        vs[i] = 1
-    else:
-# it was a solid black line.
-      if bmp[0,i] == 0 :
-        vs[i] = 1
-  vd= ( where(diff(vs[:]))[0] +1 )
-
-  hs = zeros(height, dtype=int)
-  for j in range(height) :
-    dd = diff( where(bmp[j,:]==1)[0] )
-    if len(dd) > 0 :
-      h = max ( dd )
-      if h > lthresh :
-        hs[j] = 1
-    else:
-# it was a solid black line.
-      if bmp[j,0] == 0 :
-        hs[j] = 1
-  hd=(  where(diff(hs[:]==1))[0] +1 )
-
-#-----------------------------------------------------------------------
-# Look for dividors that are too large.
-  maxdiv=10
-  i=0
-
-  while i < len(vd) :
-    if vd[i+1]-vd[i] > maxdiv :
-      vd = delete(vd,i)
-      vd = delete(vd,i)
-    else:
-      i=i+2
-
-  j = 0
-  while j < len(hd):
-    if hd[j+1]-hd[j] > maxdiv :
-      hd = delete(hd,j)
-      hd = delete(hd,j)
-    else:
-      j=j+2
-
-  if checklines :
-    for i in vd :
-      img[:,i] = [255,0,0] # red
+    lthresh = int(line_length * bitmap_resolution)
+    vs = zeros(width, dtype=int)
+    for i in range(width):
+        dd = diff(where(bmp[:, i])[0])
+        if len(dd) > 0:
+            v = max(dd)
+            if v > lthresh:
+                vs[i] = 1
+        else:
+            # it was a solid black line.
+            if all(bmp[0, i]) == 0:
+                vs[i] = 1
+    vd = (where(diff(vs[:]))[0] + 1)
+
+    hs = zeros(height, dtype=int)
+    for j in range(height):
+        dd = diff(where(bmp[j, :] == 1)[0])
+        if len(dd) > 0:
+            h = max(dd)
+            if h > lthresh:
+                hs[j] = 1
+        else:
+            # it was a solid black line.
+            if all(bmp[j, 0]) == 0:
+                hs[j] = 1
+    hd = (where(diff(hs[:] == 1))[0] + 1)
+
+    #-----------------------------------------------------------------------
+    # Look for dividors that are too large.
+    maxdiv = 10
+    i = 0
 
-    for j in hd :
-      img[j,:] = [0,0,255] # blue
-    dumpImage(outfile,bmp,img)
-    return True
+    while i < len(vd):
+        if vd[i + 1] - vd[i] > maxdiv:
+            vd = delete(vd, i)
+            vd = delete(vd, i)
+        else:
+            i = i + 2
+
+    j = 0
+    while j < len(hd):
+        if hd[j + 1] - hd[j] > maxdiv:
+            hd = delete(hd, j)
+            hd = delete(hd, j)
+        else:
+            j = j + 2
+
+    if checklines:
+        for i in vd:
+            img[:, i] = [255, 0, 0]  # red
+
+        for j in hd:
+            img[j, :] = [0, 0, 255]  # blue
+        imsave("lines-" + outfile + ".png", img)
+        return True
 #-----------------------------------------------------------------------
 # divider checking.
 #
@@ -312,240 +347,277 @@ def boxOfString(x,p) :
 # hd holds the y coordinate of horizontal divider tansitions for each
 # vertical and horizontal lines in the table grid.
 
-  def isDiv(a, l,r,t,b) :
-          # if any col or row (in axis) is all zeros ...
-    return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0
-
-  if checkdivs :
-    img = img / 2
-    for j in range(0,len(hd),2):
-      for i in range(0,len(vd),2):
-        if i>0 :
-          (l,r,t,b) = (vd[i-1], vd[i],   hd[j],   hd[j+1])
-          img[ t:b, l:r, 1 ] = 192
-          if isDiv(1, l,r,t,b) :
-            img[ t:b, l:r, 0 ] = 0
-            img[ t:b, l:r, 2 ] = 255
-
-        if j>0 :
-          (l,r,t,b) = (vd[i],   vd[i+1], hd[j-1], hd[j] )
-          img[ t:b, l:r, 1 ] = 128
-          if isDiv(0, l,r,t,b) :
-            img[ t:b, l:r, 0 ] = 255
-            img[ t:b, l:r, 2 ] = 0
-    dumpImage(outfile,bmp,img)
-    return True
+    def isDiv(a, l, r, t, b):
+        # if any col or row (in axis) is all zeros ...
+        return sum(sum(bmp[t:b, l:r], axis=a) == 0) > 0
+
+    if checkdivs:
+        img = img / 2
+        for j in range(0, len(hd), 2):
+            for i in range(0, len(vd), 2):
+                if i > 0:
+                    (l, r, t, b) = (vd[i - 1], vd[i], hd[j], hd[j + 1])
+                    img[t:b, l:r, 1] = 192
+                    if isDiv(1, l, r, t, b):
+                        img[t:b, l:r, 0] = 0
+                        img[t:b, l:r, 2] = 255
+
+                if j > 0:
+                    (l, r, t, b) = (vd[i], vd[i + 1], hd[j - 1], hd[j])
+                    img[t:b, l:r, 1] = 128
+                    if isDiv(0, l, r, t, b):
+                        img[t:b, l:r, 0] = 255
+                        img[t:b, l:r, 2] = 0
+        imsave("divs-" + outfile + ".png", img)
+        return True
 #-----------------------------------------------------------------------
 # Cell finding section.
 # This algorithum is width hungry, and always generates rectangular
 # boxes.
 
-  cells =[]
-  touched = zeros( (len(hd), len(vd)),dtype=bool )
-  j = 0
-  while j*2+2 < len (hd) :
-    i = 0
-    while i*2+2 < len(vd) :
-      u = 1
-      v = 1
-      if not touched[j,i] :
-        while 2+(i+u)*2 < len(vd) and \
-            not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1],
-               hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ):
-          u=u+1
-        bot = False
-        while 2+(j+v)*2 < len(hd) and not bot :
-          bot = False
-          for k in range(1,u+1) :
-            bot |= isDiv( 1, vd[ 2*(i+k)-1 ], vd[ 2*(i+k)],
-               hd[ 2*(j+v) ], hd[ 2*(j+v)+1 ] )
-          if not bot :
-            v=v+1
-        cells.append( (i,j,u,v) )
-        touched[ j:j+v, i:i+u] = True
-      i = i+1
-    j=j+1
-
-
-  if checkcells :
-    nc = len(cells)+0.
-    img = img / 2
-    for k in range(len(cells)):
-      (i,j,u,v) = cells[k]
-      (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
-      img[ t:b, l:r ] += col( k/nc )
-    dumpImage(outfile,bmp,img)
-    return True
+    cells = []
+    touched = zeros((len(hd), len(vd)), dtype=bool)
+    j = 0
+    while j * 2 + 2 < len(hd):
+        i = 0
+        while i * 2 + 2 < len(vd):
+            u = 1
+            v = 1
+            if not touched[j, i]:
+                while 2+(i+u)*2 < len(vd) and \
+                    not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1],
+                       hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ):
+                    u = u + 1
+                bot = False
+                while 2 + (j + v) * 2 < len(hd) and not bot:
+                    bot = False
+                    for k in range(1, u + 1):
+                        bot |= isDiv(1, vd[2 * (i + k) - 1], vd[2 * (i + k)],
+                                     hd[2 * (j + v)], hd[2 * (j + v) + 1])
+                    if not bot:
+                        v = v + 1
+                cells.append((i, j, u, v))
+                touched[j:j + v, i:i + u] = True
+            i = i + 1
+        j = j + 1
+
+    if checkcells:
+        nc = len(cells) + 0.
+        img = img / 2
+        for k in range(len(cells)):
+            (i, j, u, v) = cells[k]
+            (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1],
+                            hd[2 * (j + v)])
+            img[t:b, l:r] += col(k / nc)
+        imsave("cells-" + outfile + ".png", img)
+        return True
 
 #-----------------------------------------------------------------------
 # fork out to extract text for each cell.
 
-  whitespace = re.compile( rb'\s+')
-
-  def getCell( _coordinate):
-    (i,j,u,v) =_coordinate
-    (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] )
-    ret = pdfdoc.get_text(pg-1, l-pad, t-pad, r-l, b-t)
-    # if whitespace != 'raw' :
-    #   ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret )
-    #   if len(ret) > 0 :
-    #     ret = ret[ (1 if ret[0]==b' ' else 0) :
-    #                len(ret) - (1 if ret[-1]==b' ' else 0) ]
-    return (i,j,u,v,pg,ret)
-
-  if boxes :
-    cells = [ x + (pg,b"",) for x in cells if
-              ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
-  else :
-    print (cells)
-    cells = [ getCell(x)   for x in cells if
-              ( frow == None or (x[1] >= frow and x[1] <= lrow)) ]
-  return cells
+    def getCell(_coordinate):
+        (i, j, u, v) = _coordinate
+        (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1],
+                        hd[2 * (j + v)])
+        ret = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t)
+        return (i, j, u, v, pg, ret)
+
+    if boxes:
+        cells = [x + (pg,
+                      b"", ) for x in cells
+                 if (frow == None or (x[1] >= frow and x[1] <= lrow))]
+    else:
+        print(cells)
+        cells = [getCell(x) for x in cells
+                 if (frow == None or (x[1] >= frow and x[1] <= lrow))]
+    return cells
 
 #-----------------------------------------------------------------------
 #output section.
 
-def output(cells, pgs,
-                cells_csv_filename=None,
-                cells_json_filename=None,
-                cells_xml_filename=None,
-                table_csv_filename=None,
-                table_html_filename=None,
-                table_list_filename=None,
-                infile=None, name=None, output_type=None
-                ):
+
+def output(cells,
+           pgs,
+           cells_csv_filename=None,
+           cells_json_filename=None,
+           cells_xml_filename=None,
+           table_csv_filename=None,
+           table_html_filename=None,
+           table_list_filename=None,
+           infile=None,
+           name=None,
+           output_type=None):
 
     output_types = [
-             dict(filename=cells_csv_filename, function=o_cells_csv),
-             dict(filename=cells_json_filename, function=o_cells_json),
-             dict(filename=cells_xml_filename, function=o_cells_xml),
-             dict(filename=table_csv_filename, function=o_table_csv),
-             dict(filename=table_html_filename, function=o_table_html),
-             dict(filename=table_list_filename, function=o_table_list)
-             ]
+        dict(filename=cells_csv_filename,
+             function=o_cells_csv), dict(filename=cells_json_filename,
+                                         function=o_cells_json),
+        dict(filename=cells_xml_filename,
+             function=o_cells_xml), dict(filename=table_csv_filename,
+                                         function=o_table_csv),
+        dict(filename=table_html_filename,
+             function=o_table_html), dict(filename=table_list_filename,
+                                          function=o_table_list)
+    ]
 
     for entry in output_types:
         if entry["filename"]:
             if entry["filename"] != sys.stdout:
-                outfile = open(entry["filename"],'w')
+                outfile = open(entry["filename"], 'w')
             else:
                 outfile = sys.stdout
 
-            entry["function"](cells, pgs,
-                                outfile=outfile,
-                                name=name,
-                                infile=infile,
-                                output_type=output_type)
+            entry["function"](cells,
+                              pgs,
+                              outfile=outfile,
+                              name=name,
+                              infile=infile,
+                              output_type=output_type)
 
             if entry["filename"] != sys.stdout:
                 outfile.close()
 
-def o_cells_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) :
-  outfile = outfile or sys.stdout
-  csv.writer( outfile , dialect='excel' ).writerows(cells)
-
-def o_cells_json(cells,pgs, outfile=None, infile=None, name=None, output_type=None) :
-  """Output JSON formatted cell data"""
-  outfile = outfile or sys.stdout
-  #defaults
-  infile=infile or ""
-  name=name or ""
-
-  json.dump({
-    "src": infile,
-    "name": name,
-    "colnames": ( "x","y","width","height","page","contents" ),
-    "cells":cells
+
+def o_cells_csv(cells,
+                pgs,
+                outfile=None,
+                name=None,
+                infile=None,
+                output_type=None):
+    outfile = outfile or sys.stdout
+    csv.writer(outfile, dialect='excel').writerows(cells)
+
+
+def o_cells_json(cells,
+                 pgs,
+                 outfile=None,
+                 infile=None,
+                 name=None,
+                 output_type=None):
+    """Output JSON formatted cell data"""
+    outfile = outfile or sys.stdout
+    #defaults
+    infile = infile or ""
+    name = name or ""
+
+    json.dump({
+        "src": infile,
+        "name": name,
+        "colnames": ("x", "y", "width", "height", "page", "contents"),
+        "cells": cells
     }, outfile)
 
-def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None) :
-  """Output XML formatted cell data"""
-  outfile = outfile or sys.stdout
-  #defaults
-  infile=infile or ""
-  name=name or ""
-  def _lambda(a):
-      return x.setAttribute(*a)
-
-  doc = getDOMImplementation().createDocument(None,"table", None)
-  root = doc.documentElement;
-  if infile :
-    root.setAttribute("src",infile)
-  if name :
-    root.setAttribute("name",name)
-  for cl in cells :
-    x = doc.createElement("cell")
-    map(_lambda, zip("xywhp",map(str,cl)))
-    if cl[5] != "" :
-      x.appendChild( doc.createTextNode(cl[5]) )
-    root.appendChild(x)
-  outfile.write( doc.toprettyxml() )
-
-def table_to_list(cells,pgs) :
-  """Output list of lists"""
-  l=[0,0,0]
-  for (i,j,u,v,pg,value) in cells :
-      r=[i,j,pg]
-      l = [max(x) for x in zip(l,r)]
-
-  tab = [ [ [ "" for x in range(l[0]+1)
-            ] for x in range(l[1]+1)
-          ] for x in range(l[2]+1)
-        ]
-  for (i,j,u,v,pg,value) in cells :
-    tab[pg][j][i] = value
-
-  return tab
-
-def o_table_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) :
-  """Output CSV formatted table"""
-  outfile = outfile or sys.stdout
-  tab=table_to_list(cells, pgs)
-  for t in tab:
-    csv.writer( outfile , dialect='excel' ).writerows(t)
-
-
-def o_table_list(cells,pgs, outfile=None, name=None, infile=None, output_type=None) :
-  """Output list of lists"""
-  outfile = outfile or sys.stdout
-  tab = table_to_list(cells, pgs)
-  print(tab)
-
-def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=None) :
-  """Output HTML formatted table"""
-
-  oj = 0
-  opg = 0
-  doc = getDOMImplementation().createDocument(None,"table", None)
-  root = doc.documentElement;
-  if (output_type == "table_chtml" ):
-    root.setAttribute("border","1")
-    root.setAttribute("cellspaceing","0")
-    root.setAttribute("style","border-spacing:0")
-  nc = len(cells)
-  tr = None
-  for k in range(nc):
-    (i,j,u,v,pg,value) = cells[k]
-    if j > oj or pg > opg:
-      if pg > opg:
-        s = "Name: " + name + ", " if name else ""
-        root.appendChild( doc.createComment( s +
-          ("Source: %s page %d." % (infile, pg) )));
-      if tr :
-        root.appendChild(tr)
-      tr = doc.createElement("tr")
-      oj = j
-      opg = pg
-    td = doc.createElement("td")
-    if value != "" :
-      td.appendChild( doc.createTextNode(value) )
-    if u>1 :
-      td.setAttribute("colspan",str(u))
-    if v>1 :
-      td.setAttribute("rowspan",str(v))
-    if output_type == "table_chtml" :
-      td.setAttribute("style", "background-color: #%02x%02x%02x" %
-            tuple(128+col(k/(nc+0.))))
-    tr.appendChild(td)
-  root.appendChild(tr)
-  outfile.write( doc.toprettyxml() )
+
+def o_cells_xml(cells,
+                pgs,
+                outfile=None,
+                infile=None,
+                name=None,
+                output_type=None):
+    """Output XML formatted cell data"""
+    outfile = outfile or sys.stdout
+    #defaults
+    infile = infile or ""
+    name = name or ""
+
+    def _lambda(a):
+        return x.setAttribute(*a)
+
+    doc = getDOMImplementation().createDocument(None, "table", None)
+    root = doc.documentElement
+    if infile:
+        root.setAttribute("src", infile)
+    if name:
+        root.setAttribute("name", name)
+    for cl in cells:
+        x = doc.createElement("cell")
+        map(_lambda, zip("xywhp", map(str, cl)))
+        if cl[5] != "":
+            x.appendChild(doc.createTextNode(cl[5]))
+        root.appendChild(x)
+    outfile.write(doc.toprettyxml())
+
+
+def table_to_list(cells, pgs):
+    """Output list of lists"""
+    l = [0, 0, 0]
+    for (i, j, u, v, pg, value) in cells:
+        r = [i, j, pg]
+        l = [max(x) for x in zip(l, r)]
+
+    tab = [[["" for x in range(l[0] + 1)] for x in range(l[1] + 1)]
+           for x in range(l[2] + 1)]
+    for (i, j, u, v, pg, value) in cells:
+        tab[pg][j][i] = value
+
+    return tab
+
+
+def o_table_csv(cells,
+                pgs,
+                outfile=None,
+                name=None,
+                infile=None,
+                output_type=None):
+    """Output CSV formatted table"""
+    outfile = outfile or sys.stdout
+    tab = table_to_list(cells, pgs)
+    for t in tab:
+        csv.writer(outfile, dialect='excel').writerows(t)
+
+
+def o_table_list(cells,
+                 pgs,
+                 outfile=None,
+                 name=None,
+                 infile=None,
+                 output_type=None):
+    """Output list of lists"""
+    outfile = outfile or sys.stdout
+    tab = table_to_list(cells, pgs)
+    print(tab)
+
+
+def o_table_html(cells,
+                 pgs,
+                 outfile=None,
+                 output_type=None,
+                 name=None,
+                 infile=None):
+    """Output HTML formatted table"""
+
+    oj = 0
+    opg = 0
+    doc = getDOMImplementation().createDocument(None, "table", None)
+    root = doc.documentElement
+    if (output_type == "table_chtml"):
+        root.setAttribute("border", "1")
+        root.setAttribute("cellspaceing", "0")
+        root.setAttribute("style", "border-spacing:0")
+    nc = len(cells)
+    tr = None
+    for k in range(nc):
+        (i, j, u, v, pg, value) = cells[k]
+        if j > oj or pg > opg:
+            if pg > opg:
+                s = "Name: " + name + ", " if name else ""
+                root.appendChild(doc.createComment(s + ("Source: %s page %d." %
+                                                        (infile, pg))))
+            if tr:
+                root.appendChild(tr)
+            tr = doc.createElement("tr")
+            oj = j
+            opg = pg
+        td = doc.createElement("td")
+        if value != "":
+            td.appendChild(doc.createTextNode(value))
+        if u > 1:
+            td.setAttribute("colspan", str(u))
+        if v > 1:
+            td.setAttribute("rowspan", str(v))
+        if output_type == "table_chtml":
+            td.setAttribute("style", "background-color: #%02x%02x%02x" %
+                            tuple(128 + col(k / (nc + 0.))))
+        tr.appendChild(td)
+    root.appendChild(tr)
+    outfile.write(doc.toprettyxml())

From 04838c93651716809a097d10603c041c68b1eed3 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 03:05:52 +0800
Subject: [PATCH 11/28] Debugging. Very hard.

---
 src/pdftableextract/core.py | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 490356a..97414d7 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -82,7 +82,7 @@ def get_image(self, index):
         new = zeros(data.shape, dtype=uint8)
         new[:, :, :] = data
         new = new[:, :, 0:3]
-        print(data)
+        #print(data)
         rc = alpha <= self.greyscale_threshold
 
         new[rc, 0] = 255
@@ -167,15 +167,15 @@ def process_page(infile,
                  white=None,
                  black=None,
                  bitmap=False,
-                 checkcrop=False,
-                 checklines=False,
-                 checkdivs=False,
-                 checkcells=False,
+                 checkcrop=True,
+                 checklines=True,
+                 checkdivs=True,
+                 checkcells=True,
                  whitespace="normalize",
                  boxes=False,
                  encoding="utf8"):
 
-    outfile = outfilename if outfilename else sys.stdout
+    outfile = outfilename if outfilename else "output"
     pdfdoc = PopplerProcessor(infile)
     page = page or []
     (pg, frow, lrow) = (list(map(int, (pgs.split(":")))) + [None, None])[0:3]
@@ -202,11 +202,11 @@ def process_page(infile,
     thr = int(255.0 * greyscale_threshold / 100.0)
 
     bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr)
-
+    bmp = bmp == False
     imsave("foo.png", bmp)
     # Set up Debuging image.
     img = zeros((height, width, 3), dtype=uint8)
-    img[:, :, :] = bmp
+    img[:, :, :] = bmp * 255
     #img[:, :, 0] = bmp * 255
     #img[:, :, 1] = bmp * 255
     #img[:, :, 2] = bmp * 255
@@ -214,34 +214,39 @@ def process_page(infile,
     #-----------------------------------------------------------------------
     # Find bounding box.
     t = 0
-    while t < height and sum(bmp[t, :] == 0) == 0:
+    imsave("bmp-test.png", bmp)
+
+    while t < height and all(bmp[t, :]) == False:
         t = t + 1
     if t > 0:
         t = t - 1
 
+    import pdb
+    pdb.set_trace()
     b = height - 1
-    while b > t and sum(bmp[b, :] == 0) == 0:
+    while b > t and all(bmp[b, :]) == False:
         b = b - 1
     if b < height - 1:
         b = b + 1
 
     l = 0
-    while l < width and sum(bmp[:, l] == 0) == 0:
+    while l < width and all(bmp[:, l]) == False:
         l = l + 1
     if l > 0:
         l = l - 1
 
     r = width - 1
-    while r > l and sum(bmp[:, r] == 0) == 0:
+    while r > l and all(bmp[:, r]) == False:
         r = r - 1
     if r < width - 1:
         r = r + 1
 
 # Mark bounding box.
-    bmp[t, :] = 0
-    bmp[b, :] = 0
-    bmp[:, l] = 0
-    bmp[:, r] = 0
+    bmp[t, :, 0] = True
+    bmp[b, :, 0] = True
+    bmp[:, l, 0] = True
+    bmp[:, r, 0] = True
+    imsave("bmp-bbox.png", bmp)
 
     def boxOfString(x, p):
         s = x.split(":")

From a0a2bfea8c7e2f753973333662a22fac0ef29ef0 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 13:53:12 +0800
Subject: [PATCH 12/28] Trying to understand algorithm.

---
 example/gtk-test.py         | 37 -------------------------------
 example/test_to_pandas.py   |  1 +
 src/pdftableextract/core.py | 44 ++++++++++++++++++-------------------
 3 files changed, 22 insertions(+), 60 deletions(-)
 delete mode 100644 example/gtk-test.py

diff --git a/example/gtk-test.py b/example/gtk-test.py
deleted file mode 100644
index 7ceb36a..0000000
--- a/example/gtk-test.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# http://stackoverflow.com/a/10031877
-
-import numpy
-import cairo
-import math
-
-from gi.repository import Gtk, Gdk
-
-data = numpy.zeros((200, 200, 4), dtype=numpy.uint8)
-surface = cairo.ImageSurface.create_for_data(data, cairo.FORMAT_ARGB32, 200,
-                                             200)
-cr = cairo.Context(surface)
-
-# fill with solid white
-cr.set_source_rgb(1.0, 1.0, 1.0)
-cr.paint()
-
-# draw red circle
-cr.arc(100, 100, 80, 0, 2 * math.pi)
-cr.set_line_width(3)
-cr.set_source_rgb(1.0, 0.0, 0.0)
-cr.stroke()
-
-#draw directly to the shared buffer
-data[10:30, 10:30, 2] = 128
-
-# write output
-print(data[38:48, 38:48, 0])
-surface.write_to_png("circle.png")
-
-pb = Gdk.pixbuf_get_from_surface(surface, 0, 0, 200, 200)
-im = Gtk.Image.new_from_pixbuf(pb)
-w = Gtk.Window()
-w.connect("delete-event", Gtk.main_quit)
-w.add(im)
-w.show_all()
-Gtk.main()
diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index 32cd3c7..c54da05 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -6,6 +6,7 @@
 cells = [pdf.process_page("example.pdf", p) for p in pages]
 
 #flatten the cells structure
+print(cells)
 cells = [item for sublist in cells for item in sublist]
 
 #without any options, process_page picks up a blank table at the top of the page.
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 97414d7..151674f 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -74,7 +74,6 @@ def get_image(self, index):
         # C = C.astype(uint8)
         # A = A <= self.greyscale_threshold
         # C[A] = 255
-
         # C = C.reshape((pxh, pxw))
         data = data.reshape((pxh, pxw, 4))
         #d = data[:, :, 3]
@@ -100,11 +99,10 @@ def get_text(self, page, x, y, w, h):
         txt = page.get_text_for_area(rect)
         #rect.free()
         #Poppler.Rectangle.free(rect)
-
         return txt
 
 
-#-----------------------------------------------------------------------
+    #-----------------------------------------------------------------------
 def check_for_required_executable(name, command):
     """Checks for an executable called 'name' by running 'command' and supressing
     output. If the return code is non-zero or an OS error occurs, an Exception is raised"""
@@ -161,13 +159,13 @@ def process_page(infile,
                  page=None,
                  crop=None,
                  line_length=0.17,
-                 bitmap_resolution=300,
+                 bitmap_resolution=150, #300,
                  name=None,
                  pad=2,
                  white=None,
                  black=None,
                  bitmap=False,
-                 checkcrop=True,
+                 checkcrop=False,
                  checklines=True,
                  checkdivs=True,
                  checkcells=True,
@@ -214,39 +212,36 @@ def process_page(infile,
     #-----------------------------------------------------------------------
     # Find bounding box.
     t = 0
-    imsave("bmp-test.png", bmp)
+    imsave("bmp-start.png", bmp)
 
-    while t < height and all(bmp[t, :]) == False:
+    while t < height and any(bmp[t, :]) == False:
         t = t + 1
     if t > 0:
         t = t - 1
 
-    import pdb
-    pdb.set_trace()
     b = height - 1
-    while b > t and all(bmp[b, :]) == False:
+    while b > t and any(bmp[b, :]) == False:
         b = b - 1
     if b < height - 1:
         b = b + 1
 
     l = 0
-    while l < width and all(bmp[:, l]) == False:
+    while l < width and any(bmp[:, l]) == False:
         l = l + 1
     if l > 0:
         l = l - 1
 
     r = width - 1
-    while r > l and all(bmp[:, r]) == False:
+    while r > l and any(bmp[:, r]) == False:
         r = r - 1
     if r < width - 1:
         r = r + 1
 
 # Mark bounding box.
-    bmp[t, :, 0] = True
-    bmp[b, :, 0] = True
-    bmp[:, l, 0] = True
-    bmp[:, r, 0] = True
-    imsave("bmp-bbox.png", bmp)
+    bmp[t, :] = True
+    bmp[b, :] = True
+    bmp[:, l] = True
+    bmp[:, r] = True
 
     def boxOfString(x, p):
         s = x.split(":")
@@ -287,11 +282,14 @@ def boxOfString(x, p):
 #-----------------------------------------------------------------------
 # Line finding section.
 #
-# Find all vertical or horizontal lines that are more than rlthresh
+# Find all vertical or horizontal lines that are more than lthresh
 # long, these are considered lines on the table grid.
 
     lthresh = int(line_length * bitmap_resolution)
-    vs = zeros(width, dtype=int)
+    vs = zeros(width, dtype=uint8)
+
+    import pdb
+    pdb.set_trace()
     for i in range(width):
         dd = diff(where(bmp[:, i])[0])
         if len(dd) > 0:
@@ -304,9 +302,9 @@ def boxOfString(x, p):
                 vs[i] = 1
     vd = (where(diff(vs[:]))[0] + 1)
 
-    hs = zeros(height, dtype=int)
+    hs = zeros(height, dtype=uint8)
     for j in range(height):
-        dd = diff(where(bmp[j, :] == 1)[0])
+        dd = diff(where(bmp[j, :])[0])
         if len(dd) > 0:
             h = max(dd)
             if h > lthresh:
@@ -315,7 +313,7 @@ def boxOfString(x, p):
             # it was a solid black line.
             if all(bmp[j, 0]) == 0:
                 hs[j] = 1
-    hd = (where(diff(hs[:] == 1))[0] + 1)
+    hd = (where(diff(hs[:]))[0] + 1)
 
     #-----------------------------------------------------------------------
     # Look for dividors that are too large.
@@ -429,7 +427,7 @@ def getCell(_coordinate):
 
     if boxes:
         cells = [x + (pg,
-                      b"", ) for x in cells
+                      "", ) for x in cells
                  if (frow == None or (x[1] >= frow and x[1] <= lrow))]
     else:
         print(cells)

From da69fd5fab37b726947ebc6f235e5eff59a07689 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 14:01:55 +0800
Subject: [PATCH 13/28] Remove now unused popen.

---
 src/pdftableextract/core.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 151674f..5b90fce 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -121,20 +121,6 @@ def check_for_required_executable(name, command):
         raise
 
 
-#-----------------------------------------------------------------------
-def popen(name, command, *args, **kwargs):
-    #print (name,command, *args, **kwargs)
-    try:
-        result = subprocess.Popen(command, *args, **kwargs)
-        return result
-    except OSError as e:
-        message = """Error running {0}. Is it installed correctly?
-Error: {1}""".format(name, e)
-        raise OSError(message)
-    except Exception as e:
-        raise
-
-
 def colinterp(a, x):
     """Interpolates colors"""
     l = len(a) - 1

From 61301a9a80d584bfa36a27a2022552e8f513b3c9 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 14:07:20 +0800
Subject: [PATCH 14/28] Remove executable checker.

---
 src/pdftableextract/core.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 5b90fce..40d16ee 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -102,25 +102,6 @@ def get_text(self, page, x, y, w, h):
         return txt
 
 
-    #-----------------------------------------------------------------------
-def check_for_required_executable(name, command):
-    """Checks for an executable called 'name' by running 'command' and supressing
-    output. If the return code is non-zero or an OS error occurs, an Exception is raised"""
-    return
-    try:
-        with open(os.devnull, "w") as fnull:
-            result = subprocess.check_call(command, stdout=fnull, stderr=fnull)
-    except OSError as e:
-        message = """Error running {0}.
-Command failed: {1}
-{2}""".format(name, " ".join(command), e)
-        raise OSError(message)
-    except subprocess.CalledProcessError as e:
-        raise
-    except Exception as e:
-        raise
-
-
 def colinterp(a, x):
     """Interpolates colors"""
     l = len(a) - 1

From f01042d93b590c07acfb9c84230e835534eb8f09 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 15:14:13 +0800
Subject: [PATCH 15/28] Made image of 8bit as in original.

---
 src/pdftableextract/core.py | 76 +++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 36 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 40d16ee..ec5cc26 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -65,31 +65,38 @@ def get_image(self, index):
         surface.write_to_png("page.png")
         #img=image.set_from_pixbuf (pixbuf)
         data = frombuffer(pixbuf.get_pixels(), dtype=uint8)
-        # R = data[0::4]
-        # G = data[1::4]
-        # B = data[2::4]
-        # A = data[3::4]
-        # C = R * 34 + G * 0.56 + B * 0.1
+        R = data[0::4]
+        G = data[1::4]
+        B = data[2::4]
+        A = data[3::4]
+        C = R * 34 + G * 56 + B * 10 / 100.
         # # print (max(A))
-        # C = C.astype(uint8)
+        C = C.astype(uint8)
+
         # A = A <= self.greyscale_threshold
         # C[A] = 255
         # C = C.reshape((pxh, pxw))
-        data = data.reshape((pxh, pxw, 4))
+        nd = zeros(C.shape, dtype=uint8)
+        print(nd.shape, C.shape)
+        nd[:] = C
+        nd[A <= self.greyscale_threshold] = 255
+
+        #data = data.reshape((pxh, pxw, 4))
         #d = data[:, :, 3]
-        alpha = data[:, :, 3]
-        new = zeros(data.shape, dtype=uint8)
-        new[:, :, :] = data
-        new = new[:, :, 0:3]
+        #alpha = data[:, :, 3]
+        #new = zeros(data.shape, dtype=uint8)
+        #new[:, :, :] = data
+        #new = new[:, :, 0:3]
         #print(data)
-        rc = alpha <= self.greyscale_threshold
+        #rc = alpha <= self.greyscale_threshold
 
-        new[rc, 0] = 255
-        new[rc, 1] = 255
-        new[rc, 2] = 255
+        #new[rc, 0] = 255
+        #new[rc, 1] = 255
+        #new[rc, 2] = 255
         #new[:, :, 3] = 255
-        imsave('nomask.png', new)
-        return new, rc, page
+        nd = nd.reshape((pxh, pxw))
+        imsave('nomask.png', nd)
+        return nd, page
 
     def get_text(self, page, x, y, w, h):
         rect = Poppler.Rectangle()
@@ -126,7 +133,7 @@ def process_page(infile,
                  page=None,
                  crop=None,
                  line_length=0.17,
-                 bitmap_resolution=150, #300,
+                 bitmap_resolution=150, # 300,
                  name=None,
                  pad=2,
                  white=None,
@@ -147,9 +154,7 @@ def process_page(infile,
     pdfdoc.resolution = bitmap_resolution
     pdfdoc.greyscale_threshold = greyscale_threshold
 
-    data, notalpha, page = pdfdoc.get_image(
-        pg - 1)  # Page numbers are 0-based.
-    alpha = notalpha != 1
+    data, page = pdfdoc.get_image(pg - 1)  # Page numbers are 0-based.
 
     #-----------------------------------------------------------------------
     # image load section.
@@ -162,19 +167,19 @@ def process_page(infile,
     width += pad * 2
 
     # reimbed image with a white pad.
-    bmp = ones((height, width, 3), dtype=bool)
+    bmp = ones((height, width), dtype=bool)
 
     thr = int(255.0 * greyscale_threshold / 100.0)
-
+    imsave("white.png", bmp)
     bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr)
-    bmp = bmp == False
+    #bmp = bmp == False
     imsave("foo.png", bmp)
     # Set up Debuging image.
     img = zeros((height, width, 3), dtype=uint8)
-    img[:, :, :] = bmp * 255
-    #img[:, :, 0] = bmp * 255
-    #img[:, :, 1] = bmp * 255
-    #img[:, :, 2] = bmp * 255
+    #img[:, :, :] = bmp * 255
+    img[:, :, 0] = bmp * 255
+    img[:, :, 1] = bmp * 255
+    img[:, :, 2] = bmp * 255
 
     #-----------------------------------------------------------------------
     # Find bounding box.
@@ -205,10 +210,11 @@ def process_page(infile,
         r = r + 1
 
 # Mark bounding box.
-    bmp[t, :] = True
-    bmp[b, :] = True
-    bmp[:, l] = True
-    bmp[:, r] = True
+    bmp[t, :] = False
+    bmp[b, :] = False
+    bmp[:, l] = False
+    bmp[:, r] = False
+    imsave("bbox-start.png", bmp)
 
     def boxOfString(x, p):
         s = x.split(":")
@@ -255,8 +261,6 @@ def boxOfString(x, p):
     lthresh = int(line_length * bitmap_resolution)
     vs = zeros(width, dtype=uint8)
 
-    import pdb
-    pdb.set_trace()
     for i in range(width):
         dd = diff(where(bmp[:, i])[0])
         if len(dd) > 0:
@@ -271,7 +275,7 @@ def boxOfString(x, p):
 
     hs = zeros(height, dtype=uint8)
     for j in range(height):
-        dd = diff(where(bmp[j, :])[0])
+        dd = diff(where(bmp[j, :] == 1)[0])
         if len(dd) > 0:
             h = max(dd)
             if h > lthresh:
@@ -280,7 +284,7 @@ def boxOfString(x, p):
             # it was a solid black line.
             if all(bmp[j, 0]) == 0:
                 hs[j] = 1
-    hd = (where(diff(hs[:]))[0] + 1)
+    hd = (where(diff(hs[:]) == 1)[0] + 1)
 
     #-----------------------------------------------------------------------
     # Look for dividors that are too large.

From 6ece1da3f86397bdac25e2c8049370772d5b1a8f Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 15:16:32 +0800
Subject: [PATCH 16/28] Ignore debugging data.

---
 .gitingore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitingore

diff --git a/.gitingore b/.gitingore
new file mode 100644
index 0000000..6f06927
--- /dev/null
+++ b/.gitingore
@@ -0,0 +1 @@
+059285.pdf

From 829f4ea2e9c2eea16bf6a1c55051c10bcf499f55 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Sat, 16 Jul 2016 15:33:36 +0800
Subject: [PATCH 17/28] More adaptation for Python 3.

---
 example/test_to_pandas.py   |  14 +++--
 src/pdftableextract/core.py |   2 +-
 src/pdftableextract/pnm.py  | 104 +++++++++++++++++++-----------------
 3 files changed, 67 insertions(+), 53 deletions(-)

diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index d734ea3..5db0c32 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -3,10 +3,16 @@
 import pdftableextract as pdf
 
 pages = ["1"]
-cells = [pdf.process_page("example.pdf",p) for p in pages]
+cells = [pdf.process_page("example.pdf",
+                          p,
+                          outfilename="weee.pnm",
+                          checkcrop=False,
+                          checklines=False,
+                          checkdivs=False,
+                          checkcells=False, ) for p in pages]
 
 #flatten the cells structure
-cells = [item for sublist in cells for item in sublist ]
+cells = [item for sublist in cells for item in sublist]
 
 #without any options, process_page picks up a blank table at the top of the page.
 #so choose table '1'
@@ -17,5 +23,5 @@
 #row '1' contains column headings
 #data is row '2' through '-1'
 
-data =pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
-print (data)
+data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
+print(data)
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 491e67d..1765aec 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -73,7 +73,7 @@ def process_page(infile, pgs,
     boxes=False,
     encoding="utf8") :
 
-  outfile = open(outfilename,'w') if outfilename else sys.stdout
+  outfile = open(outfilename,'wb') if outfilename else sys.stdout
   page=page or []
   (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3]
   #check that pdftoppdm exists by running a simple command
diff --git a/src/pdftableextract/pnm.py b/src/pdftableextract/pnm.py
index ae229ea..befce66 100644
--- a/src/pdftableextract/pnm.py
+++ b/src/pdftableextract/pnm.py
@@ -1,60 +1,68 @@
 from __future__ import print_function
 from numpy import array, fromstring, uint8, reshape, ones
+
 #-----------------------------------------------------------------------
 # PNM stuff.
 
+
 def noncomment(fd):
-  """Read lines from the filehandle until a non-comment line is found.
+    """Read lines from the filehandle until a non-comment line is found.
   Comments start with #"""
-  while True:
-    x = fd.readline()
-    if x.startswith(b'#') :
-      continue
-    else:
-      return x
+    while True:
+        x = fd.readline()
+        if x.startswith(b'#'):
+            continue
+        else:
+            return x
+
 
 def readPNM(fd):
-  """Reads the PNM file from the filehandle"""
-  t = noncomment(fd)
-  s = noncomment(fd)
-  m = noncomment(fd) if not (t.startswith(b'P1') or t.startswith(b'P4')) else b'1'
-  data = fd.read()
-  ls = len(s.split())
-  if ls != 2 :
-    name = "<pipe>" if fd.name=="<fdopen>" else "Filename = {0}".format(fd.name)
-    raise IOError("Expected 2 elements from parsing PNM file, got {0}: {1}".format(ls, name))
-  xs, ys = s.split()
-  width = int(xs)
-  height = int(ys)
-  m = int(m)
-
-  if m != 255 :
-    print ("Just want 8 bit pgms for now!")
-
-  d = fromstring(data,dtype=uint8)
-  d = reshape(d, (height,width) )
-  return (m,width,height, d)
-
-def writePNM(fd,img):
-  """Writes a PNM file to a filehandle given the img data as a numpy array"""
-  s = img.shape
-  m = 255
-  if img.dtype == bool :
-    img = img + uint8(0)
-    t = "P5"
-    m = 1
-  elif len(s) == 2 :
-    t = "P5"
-  else:
-    t = "P6"
-
-  fd.write( "%s\n%d %d\n%d\n" % (t, s[1],s[0],m) )
-  fd.write( uint8(img).tostring() )
-
-
-def dumpImage(outfile,bmp,img,bitmap=False, pad=2) :
+    """Reads the PNM file from the filehandle"""
+    t = noncomment(fd)
+    s = noncomment(fd)
+    m = noncomment(fd) if not (t.startswith(b'P1') or
+                               t.startswith(b'P4')) else b'1'
+    data = fd.read()
+    ls = len(s.split())
+    if ls != 2:
+        name = "<pipe>" if fd.name == "<fdopen>" else "Filename = {0}".format(
+            fd.name)
+        raise IOError(
+            "Expected 2 elements from parsing PNM file, got {0}: {1}".format(
+                ls, name))
+    xs, ys = s.split()
+    width = int(xs)
+    height = int(ys)
+    m = int(m)
+
+    if m != 255:
+        print("Just want 8 bit pgms for now!")
+
+    d = fromstring(data, dtype=uint8)
+    d = reshape(d, (height, width))
+    return (m, width, height, d)
+
+
+def writePNM(fd, img):
+    """Writes a PNM file to a filehandle given the img data as a numpy array"""
+    s = img.shape
+    m = 255
+    if img.dtype == bool:
+        img = img + uint8(0)
+        t = b"P5"
+        m = 1
+    elif len(s) == 2:
+        t = b"P5"
+    else:
+        t = b"P6"
+
+    fd.write(b"%s\n%d %d\n%d\n" % (t, s[1], s[0], m))
+    fd.write(img.astype(uint8).tobytes())
+
+
+def dumpImage(outfile, bmp, img, bitmap=False, pad=2):
     """Dumps the numpy array in image into the filename and closes the outfile"""
     oi = bmp if bitmap else img
-    (height,width) = bmp.shape
-    writePNM(outfile, oi[pad:height-pad, pad:width-pad])
+    (height, width) = bmp.shape
+    writePNM(outfile, oi[pad:height - pad, pad:width - pad])
     outfile.close()

From b2b6a70f6b4348472873d4b4686581537731f4cd Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugeneai@irnok.net>
Date: Sat, 16 Jul 2016 20:43:59 +0800
Subject: [PATCH 18/28] Shortened some relations.

---
 src/pdftableextract/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 1765aec..70f4322 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -192,7 +192,7 @@ def boxOfString(x,p) :
 
   hs = zeros(height, dtype=int)
   for j in range(height) :
-    dd = diff( where(bmp[j,:]==1)[0] )
+    dd = diff( where(bmp[j,:])[0] )
     if len(dd) > 0 :
       h = max ( dd )
       if h > lthresh :
@@ -201,7 +201,7 @@ def boxOfString(x,p) :
 # it was a solid black line.
       if bmp[j,0] == 0 :
         hs[j] = 1
-  hd=(  where(diff(hs[:]==1))[0] +1 )
+  hd=(  where(diff(hs[:]))[0] +1 )
 
 #-----------------------------------------------------------------------
 # Look for dividors that are too large.

From c5c3c8053a5fb268f748943c3fbc94877fcd6a2e Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugeneai@irnok.net>
Date: Sat, 16 Jul 2016 20:59:44 +0800
Subject: [PATCH 19/28] Requirements added.

---
 requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a7f3d77
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+vext
+numpy
+matplotlib
+pandas
+

From 67f838758daec423e6a99babdd4ff64de632cea8 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugeneai@irnok.net>
Date: Sat, 16 Jul 2016 22:11:19 +0800
Subject: [PATCH 20/28] Debugging.

---
 example/test_to_pandas.py   |  2 +-
 src/pdftableextract/core.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index b7f3088..69360ac 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -8,7 +8,7 @@
                           p,
                           outfilename="weee.pnm",
                           checkcrop=False,
-                          checklines=False,
+                          checklines=True,
                           checkdivs=False,
                           checkcells=False, ) for p in pages]
 print(cells)
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 889fd17..df7766c 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -77,7 +77,6 @@ def get_image(self, index):
         # C[A] = 255
         # C = C.reshape((pxh, pxw))
         nd = zeros(C.shape, dtype=uint8)
-        print(nd.shape, C.shape)
         nd[:] = C
         nd[A <= self.greyscale_threshold] = 255
 
@@ -228,25 +227,25 @@ def process_page(infile,
     t = 0
     imsave("bmp-start.png", bmp)
 
-    while t < height and any(bmp[t, :]) == False:
+    while t < height and bmp[t, :]:
         t = t + 1
     if t > 0:
         t = t - 1
 
     b = height - 1
-    while b > t and any(bmp[b, :]) == False:
+    while b > t and bmp[b, :]:
         b = b - 1
     if b < height - 1:
         b = b + 1
 
     l = 0
-    while l < width and any(bmp[:, l]) == False:
+    while l < width and bmp[:, l]:
         l = l + 1
     if l > 0:
         l = l - 1
 
     r = width - 1
-    while r > l and any(bmp[:, r]) == False:
+    while r > l and bmp[:, r]:
         r = r - 1
     if r < width - 1:
         r = r + 1
@@ -257,6 +256,7 @@ def process_page(infile,
     bmp[:, l] = False
     bmp[:, r] = False
     imsave("bbox-start.png", bmp)
+    print ("Bbox", l,t,b,r)
 
     def boxOfString(x, p):
         s = x.split(":")
@@ -317,7 +317,7 @@ def boxOfString(x, p):
 
     hs = zeros(height, dtype=uint8)
     for j in range(height):
-        dd = diff(where(bmp[j, :] == 1)[0])
+        dd = diff(where(bmp[j, :])[0])
         if len(dd) > 0:
             h = max(dd)
             if h > lthresh:
@@ -326,7 +326,7 @@ def boxOfString(x, p):
             # it was a solid black line.
             if all(bmp[j, 0]) == 0:
                 hs[j] = 1
-    hd = (where(diff(hs[:]) == 1)[0] + 1)
+    hd = (where(diff(hs[:]))[0] + 1)
 
     #-----------------------------------------------------------------------
     # Look for dividors that are too large.

From c4a48614e30bd76cce55c411cefa2f441696cb76 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugeneai@irnok.net>
Date: Sat, 16 Jul 2016 22:47:36 +0800
Subject: [PATCH 21/28] Starting to work.

---
 src/pdftableextract/core.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index df7766c..805c24b 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -227,25 +227,27 @@ def process_page(infile,
     t = 0
     imsave("bmp-start.png", bmp)
 
-    while t < height and bmp[t, :]:
+    while t < height and all(bmp[t, :]):
+        bbb=bmp[t,:]
+        print(any(bbb),all(bbb))
         t = t + 1
     if t > 0:
         t = t - 1
 
     b = height - 1
-    while b > t and bmp[b, :]:
+    while b > t and all(bmp[b, :]):
         b = b - 1
     if b < height - 1:
         b = b + 1
 
     l = 0
-    while l < width and bmp[:, l]:
+    while l < width and all(bmp[:, l]):
         l = l + 1
     if l > 0:
         l = l - 1
 
     r = width - 1
-    while r > l and bmp[:, r]:
+    while r > l and all(bmp[:, r]):
         r = r - 1
     if r < width - 1:
         r = r + 1
@@ -257,7 +259,7 @@ def process_page(infile,
     bmp[:, r] = False
     imsave("bbox-start.png", bmp)
     print ("Bbox", l,t,b,r)
-
+    
     def boxOfString(x, p):
         s = x.split(":")
         if len(s) < 4:

From 62d9598f8cc763b3d5ab8df78c40bd97130feb51 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugeneai@irnok.net>
Date: Mon, 18 Jul 2016 00:37:16 +0800
Subject: [PATCH 22/28] Algorithm refining. Debugging.

---
 example/test_to_pandas.py   |  11 ++-
 src/pdftableextract/core.py | 140 ++++++++++++++++--------------------
 2 files changed, 67 insertions(+), 84 deletions(-)

diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index 69360ac..c14dfee 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -6,11 +6,8 @@
 
 cells = [pdf.process_page("example.pdf",
                           p,
-                          outfilename="weee.pnm",
-                          checkcrop=False,
-                          checklines=True,
-                          checkdivs=False,
-                          checkcells=False, ) for p in pages]
+                          outfilename="pandas-test",
+                          checkall=True) for p in pages]
 print(cells)
 
 #flatten the cells structure
@@ -26,5 +23,5 @@
 #row '1' contains column headings
 #data is row '2' through '-1'
 
-data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
-print(data)
+#data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
+#print(data)
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 805c24b..40bd7b6 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -45,7 +45,8 @@ def get_image(self, index):
         dpi = self.resolution
         scale = 1
         width, height = [int(x) for x in page.get_size()]
-        d = dpi / 72.
+        d = self.scale = dpi / 72.
+        self.frac_scale=1/d
         pxw, pxh = int(width * d), int(height * d)
         # data=zeros((pxw,pxh,4), dtype=uint8)
         surface = cairo.ImageSurface(
@@ -97,12 +98,33 @@ def get_image(self, index):
         imsave('nomask.png', nd)
         return nd, page
 
+    def print_rect(self, msg, r, page):
+        x1,y1,x2,y2= r.x1, r.y1, r.x2, r.y2
+        x, y, w, h = x1, y1, x2 - x1, y2 - y1
+        print(msg, x, y, w, h, "---", x1,y1,x2,y2)
+        width, height = [int(x) for x in page.get_size()]
+        print(msg, x, height-y, w, h, "---", x1,height-y1,x2,height-y2)
+
     def get_text(self, page, x, y, w, h):
+        #cb = page.get_crop_box()
+        #self.print_rect("Rect crop", cb)
+        width, height = [int(x) for x in page.get_size()]
+        #print("Page_size", width, height)
+        print(x, y, w, h)
+        fc=self.frac_scale
+        print ("FC:",fc)
+        x,y,w,h = (z*fc for z in [x,y,w,h])
         rect = Poppler.Rectangle()
+        print("shifted:",x, y, w, h)
         rect.x1, rect.y1 = x, y
         rect.x2, rect.y2 = x + w, y + h
-        # print (help(rect))
+        self.print_rect ("box:", rect, page)
         txt = page.get_text_for_area(rect)
+        print (txt)
+        attrs=page.get_text_attributes_for_area(rect)
+        print([(a.start_index,a.end_index) for a in attrs])
+        print(help(attrs[0]))
+        wer
         #rect.free()
         #Poppler.Rectangle.free(rect)
         return txt
@@ -110,53 +132,11 @@ def get_text(self, page, x, y, w, h):
 
 def colinterp(a, x):
     """Interpolates colors"""
-    l = len(a)-1
-    i = min(l, max(0, int (x * l)))
-    (u,v) = a[i:i+2,:]
-    return u - (u-v) * ((x * l) % 1.0)
-
-colarr = array([ [255,0,0],[255,255,0],[0,255,0],[0,255,255],[0,0,255] ])
-
-def col(x, colmult=1.0) :
-    """colors"""
-    return colinterp(colarr,(colmult * x)% 1.0) / 2
-
-
-def process_page(infile, pgs,
-    outfilename=None,
-    greyscale_threshold=25,
-    page=None,
-    crop=None,
-    line_length=0.17,
-    bitmap_resolution=300,
-    name=None,
-    pad=2,
-    white=None,
-    black=None,
-    bitmap=False,
-    checkcrop=False,
-    checklines=False,
-    checkdivs=False,
-    checkcells=False,
-    whitespace="normalize",
-    boxes=False,
-    encoding="utf8") :
-
-  outfile = open(outfilename,'wb') if outfilename else sys.stdout
-  page=page or []
-  (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3]
-  #check that pdftoppdm exists by running a simple command
-  check_for_required_executable("pdftoppm",["pdftoppm","-h"])
-  #end check
-
-  p = popen("pdftoppm", ("pdftoppm -gray -r %d -f %d -l %d %s " %
-      (bitmap_resolution,pg,pg,quote(infile))),
-      stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True )
-
-#-----------------------------------------------------------------------
-# image load secion.
+    l = len(a) - 1
+    i = min(l, max(0, int(x * l)))
+    (u, v) = a[i:i + 2, :]
+    return u - (u - v) * ((x * l) % 1.0)
 
-  (maxval, width, height, data) = readPNM(p.stdout)
 
 colarr = array(
     [[255, 0, 0], [255, 255, 0], [0, 255, 0], [0, 255, 255], [0, 0, 255]])
@@ -166,28 +146,34 @@ def col(x, colmult=1.0):
     """colors"""
     return colinterp(colarr, (colmult * x) % 1.0) / 2
 
-
 def process_page(infile,
                  pgs,
                  outfilename=None,
                  greyscale_threshold=25,
                  page=None,
                  crop=None,
-                 line_length=0.17,
-                 bitmap_resolution=150, # 300,
+                 line_length=0.5,
+                 bitmap_resolution=72, # 300,
                  name=None,
                  pad=2,
                  white=None,
                  black=None,
                  bitmap=False,
                  checkcrop=False,
-                 checklines=True,
-                 checkdivs=True,
-                 checkcells=True,
+                 checklines=False,
+                 checkdivs=False,
+                 checkcells=False,
+                 checkall=False,
                  whitespace="normalize",
                  boxes=False,
                  encoding="utf8"):
 
+    if checkall:
+        checkcrop = True
+        checklines = True
+        checkdivs = True
+        checkcells = True
+
     outfile = outfilename if outfilename else "output"
     pdfdoc = PopplerProcessor(infile)
     page = page or []
@@ -222,14 +208,15 @@ def process_page(infile,
     img[:, :, 1] = bmp * 255
     img[:, :, 2] = bmp * 255
 
+    if checkdivs or checkcells:
+        imgfloat = img.astype(float)
+
     #-----------------------------------------------------------------------
     # Find bounding box.
     t = 0
     imsave("bmp-start.png", bmp)
 
     while t < height and all(bmp[t, :]):
-        bbb=bmp[t,:]
-        print(any(bbb),all(bbb))
         t = t + 1
     if t > 0:
         t = t - 1
@@ -258,8 +245,8 @@ def process_page(infile,
     bmp[:, l] = False
     bmp[:, r] = False
     imsave("bbox-start.png", bmp)
-    print ("Bbox", l,t,b,r)
-    
+    print("Bbox", l, t, b, r)
+
     def boxOfString(x, p):
         s = x.split(":")
         if len(s) < 4:
@@ -294,7 +281,6 @@ def boxOfString(x, p):
 
     if checkcrop:
         imsave("crop-" + outfile + ".png", img)
-        return True
 
 #-----------------------------------------------------------------------
 # Line finding section.
@@ -357,20 +343,20 @@ def boxOfString(x, p):
         for j in hd:
             img[j, :] = [0, 0, 255]  # blue
         imsave("lines-" + outfile + ".png", img)
-        return True
-#-----------------------------------------------------------------------
-# divider checking.
-#
-# at this point vd holds the x coordinate of vertical  and
-# hd holds the y coordinate of horizontal divider tansitions for each
-# vertical and horizontal lines in the table grid.
+
+        #-----------------------------------------------------------------------
+        # divider checking.
+        #
+        # at this point vd holds the x coordinate of vertical  and
+        # hd holds the y coordinate of horizontal divider tansitions for each
+        # vertical and horizontal lines in the table grid.
 
     def isDiv(a, l, r, t, b):
         # if any col or row (in axis) is all zeros ...
         return sum(sum(bmp[t:b, l:r], axis=a) == 0) > 0
 
     if checkdivs:
-        img = img / 2
+        img = (imgfloat / 2).astype(uint8)
         for j in range(0, len(hd), 2):
             for i in range(0, len(vd), 2):
                 if i > 0:
@@ -387,11 +373,11 @@ def isDiv(a, l, r, t, b):
                         img[t:b, l:r, 0] = 255
                         img[t:b, l:r, 2] = 0
         imsave("divs-" + outfile + ".png", img)
-        return True
-#-----------------------------------------------------------------------
-# Cell finding section.
-# This algorithum is width hungry, and always generates rectangular
-# boxes.
+
+        #-----------------------------------------------------------------------
+        # Cell finding section.
+        # This algorithum is width hungry, and always generates rectangular
+        # boxes.
 
     cells = []
     touched = zeros((len(hd), len(vd)), dtype=bool)
@@ -421,17 +407,17 @@ def isDiv(a, l, r, t, b):
 
     if checkcells:
         nc = len(cells) + 0.
-        img = img / 2
+        img = (imgfloat / 2.).astype(uint8)
         for k in range(len(cells)):
             (i, j, u, v) = cells[k]
             (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1],
                             hd[2 * (j + v)])
-            img[t:b, l:r] += col(k / nc)
+            img[t:b, l:r] += col(k / nc).astype(uint8)
+
         imsave("cells-" + outfile + ".png", img)
-        return True
 
-#-----------------------------------------------------------------------
-# fork out to extract text for each cell.
+        #-----------------------------------------------------------------------
+        # fork out to extract text for each cell.
 
     def getCell(_coordinate):
         (i, j, u, v) = _coordinate

From 8ad74bc030accc7fc024e8b9cfc24a5eb3999763 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Mon, 18 Jul 2016 20:32:37 +0800
Subject: [PATCH 23/28] That's it. It works.

---
 example/test_to_pandas.py   |   8 +-
 src/pdftableextract/core.py | 141 ++++++++++++++++++++++++++++++------
 2 files changed, 122 insertions(+), 27 deletions(-)

diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index c14dfee..a9c20bd 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -8,14 +8,12 @@
                           p,
                           outfilename="pandas-test",
                           checkall=True) for p in pages]
-print(cells)
 
 #flatten the cells structure
 cells = [item for sublist in cells for item in sublist]
 
 #without any options, process_page picks up a blank table at the top of the page.
 #so choose table '1'
-print(cells)
 li = pdf.table_to_list(cells, pages)[1]
 
 #li is a list of lists, the first line is the header, last is the footer (for this table only!)
@@ -23,5 +21,7 @@
 #row '1' contains column headings
 #data is row '2' through '-1'
 
-#data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
-#print(data)
+print (cells[:])
+
+data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
+print(data)
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 40bd7b6..f5d0d99 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -1,4 +1,5 @@
 import sys
+import random
 import os
 from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape, all, any
 import numpy
@@ -13,10 +14,16 @@
 import gi
 gi.require_version('Gtk', '3.0')
 gi.require_version('Poppler', '0.18')
-from gi.repository import Gdk, Poppler
+gi.require_version('Gdk', '3.0')
+from gi.repository import Gdk, Poppler  #, Glib
 import cairo
 
 
+def interact(locals):
+    import code
+    code.InteractiveConsole(locals=locals).interact()
+
+
 class PopplerProcessor(object):
     """Class for processing PDF. That's simple.
     It does two functions.
@@ -34,11 +41,22 @@ def __init__(self, filename, **kwargs):
         self.resolution = 300
         self.greyscale_threshold = int(kwargs.get("greyscale_thresholds",
                                                   25)) * 255.0 / 100.0
+        self.layout = None
 
     def get_page(self, index):
         if index < 0 or index >= self.page_num:
             raise IndexError("page number is out of bounds")
-        return self.document.get_page(index)
+        page = self.document.get_page(index)
+        if self.layout != None:
+            #Glib.free(self.layout)
+            # Do we need freeing elements of the list # FIXME
+            self.layout = None
+        self.text = page.get_text()
+        self.attributes=page.get_text_attributes()
+        l = page.get_text_layout()
+        if l[0]:
+            self.layout = l[1]
+        return page
 
     def get_image(self, index):
         page = self.get_page(index)
@@ -46,7 +64,7 @@ def get_image(self, index):
         scale = 1
         width, height = [int(x) for x in page.get_size()]
         d = self.scale = dpi / 72.
-        self.frac_scale=1/d
+        self.frac_scale = 1 / d
         pxw, pxh = int(width * d), int(height * d)
         # data=zeros((pxw,pxh,4), dtype=uint8)
         surface = cairo.ImageSurface(
@@ -98,36 +116,92 @@ def get_image(self, index):
         imsave('nomask.png', nd)
         return nd, page
 
-    def print_rect(self, msg, r, page):
-        x1,y1,x2,y2= r.x1, r.y1, r.x2, r.y2
+    def print_rect(self, msg=None, r=None, page=None):
+        if None in [r, page]:
+            raise ValueError("r and page arguments are required")
+        x1, y1, x2, y2 = r.x1, r.y1, r.x2, r.y2
         x, y, w, h = x1, y1, x2 - x1, y2 - y1
-        print(msg, x, y, w, h, "---", x1,y1,x2,y2)
+        print(msg, x, y, w, h, "---", x1, y1, x2, y2)
         width, height = [int(x) for x in page.get_size()]
-        print(msg, x, height-y, w, h, "---", x1,height-y1,x2,height-y2)
+        print(msg, x, height - y, w, h, "---", x1, height - y1, x2,
+              height - y2)
+
+    def within(self, a, b, pad=0):
+        """Is Rectangle b within Rectangle a, i.e. b is in a
+        """
+        if b.x1+pad < a.x1: return False
+        if b.y1+pad < a.y1: return False
+        if b.x2-pad > a.x2: return False
+        if b.y2-pad > a.y2: return False
+        return True
+
+    def rexpand(self, rect, layout, pad=0):
+        """Make rectangle rect include layout
+
+        Arguments:
+        - `rect`: Adjustable Rectangle;
+        - `layout`: Rectangle to be included in rect.
+        """
+
+        r, l = rect, layout
+        if r.x1 > l.x1: r.x1 = l.x1-pad
+        if r.y1 > l.y1: r.y1 = l.y1-pad
+        if r.x2 < l.x2: r.x2 = l.x2+pad
+        if r.y2 < l.y2: r.y2 = l.y2+pad
 
     def get_text(self, page, x, y, w, h):
         #cb = page.get_crop_box()
         #self.print_rect("Rect crop", cb)
         width, height = [int(x) for x in page.get_size()]
         #print("Page_size", width, height)
-        print(x, y, w, h)
-        fc=self.frac_scale
-        print ("FC:",fc)
-        x,y,w,h = (z*fc for z in [x,y,w,h])
+        ##print(x, y, w, h)
+        fc = self.frac_scale
+        ##print("FC:", fc)
+        x, y, w, h = (z * fc for z in [x, y, w, h])
         rect = Poppler.Rectangle()
-        print("shifted:",x, y, w, h)
+        ##print("shifted:", x, y, w, h)
         rect.x1, rect.y1 = x, y
         rect.x2, rect.y2 = x + w, y + h
-        self.print_rect ("box:", rect, page)
+        assert rect.x1<=rect.x2
+        assert rect.y1<=rect.y2
+        #self.print_rect("box:", rect, page)
         txt = page.get_text_for_area(rect)
-        print (txt)
-        attrs=page.get_text_attributes_for_area(rect)
-        print([(a.start_index,a.end_index) for a in attrs])
-        print(help(attrs[0]))
-        wer
+        ##print(txt)
+        attrs = page.get_text_attributes_for_area(rect)
+        ##print([(a.start_index, a.end_index) for a in attrs])
+        ##print(help(attrs[0]))
+        #chars=[]
+        r = Poppler.Rectangle()
+        r.x1 = r.y1 = 1e10
+        r.x2 = r.y2 = -1e10
+        chars=[]
+        for k,l in enumerate(self.layout):
+            if self.within(rect, l, pad=1):
+                self.rexpand(r, l, pad=0.5)
+                chars.append(self.text[k])
+        txt1="".join(chars)
+        #txt1 = page.get_text_for_area(r)
+        print ((r.x1,r.y1,r.x2,r.y2),txt1)
+
+        #interact(locals={"p": page, "d": self.document, "self": self})
         #rect.free()
         #Poppler.Rectangle.free(rect)
-        return txt
+        return txt1, r
+
+    def get_rectangles_for_page(self, page):
+        """Return all rectangles for all letters in the page..
+        Used for debugging
+
+        Arguments:
+        - `page`:
+        """
+        layout=self.layout
+        if layout == None:
+            raise RuntimeError("page is not chosen")
+
+        #interact(locals={"layout":layout, "self":self})
+        answer = [(r.x1,r.y1,r.x2,r.y2) for r in layout]
+        return answer
 
 
 def colinterp(a, x):
@@ -164,6 +238,7 @@ def process_page(infile,
                  checkdivs=False,
                  checkcells=False,
                  checkall=False,
+                 checkletters=False,
                  whitespace="normalize",
                  boxes=False,
                  encoding="utf8"):
@@ -173,6 +248,7 @@ def process_page(infile,
         checklines = True
         checkdivs = True
         checkcells = True
+        checkletters = True
 
     outfile = outfilename if outfilename else "output"
     pdfdoc = PopplerProcessor(infile)
@@ -208,9 +284,19 @@ def process_page(infile,
     img[:, :, 1] = bmp * 255
     img[:, :, 2] = bmp * 255
 
-    if checkdivs or checkcells:
+    if checkdivs or checkcells or checkletters:
         imgfloat = img.astype(float)
 
+    if checkletters:
+        img = (imgfloat/2.).astype(uint8)
+        rectangles=pdfdoc.get_rectangles_for_page(pg)
+        lrn=len(rectangles)
+        for k,r in enumerate(rectangles):
+            x1,y1,x2,y2 = [k+pad+1 for k in r]
+            img[y1:y2, x1:x2] += col(random.random()).astype(uint8)
+        imsave("letters.png", img)
+
+
     #-----------------------------------------------------------------------
     # Find bounding box.
     t = 0
@@ -419,21 +505,30 @@ def isDiv(a, l, r, t, b):
         #-----------------------------------------------------------------------
         # fork out to extract text for each cell.
 
-    def getCell(_coordinate):
+    def getCell(_coordinate, img=None):
         (i, j, u, v) = _coordinate
         (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1],
                         hd[2 * (j + v)])
-        ret = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t)
+        ret, rect = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t)
+        if img != None and checkcells:
+            (x1,y1,x2,y2) = [rrr+pad for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]]
+            img[y1:y2,x1:x2] += col(random.random()).astype(uint8)
+
         return (i, j, u, v, pg, ret)
 
+    if checkcells:
+        img = (imgfloat / 2.).astype(uint8)
     if boxes:
         cells = [x + (pg,
                       "", ) for x in cells
                  if (frow == None or (x[1] >= frow and x[1] <= lrow))]
     else:
         print(cells)
-        cells = [getCell(x) for x in cells
+        cells = [getCell(x, img) for x in cells
                  if (frow == None or (x[1] >= frow and x[1] <= lrow))]
+    if checkcells:
+        imsave("text-locations.png", img)
+
     return cells
 
 #-----------------------------------------------------------------------

From 2484b75c461bfa32e7495e2a00df011064624eb4 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Mon, 18 Jul 2016 20:36:54 +0800
Subject: [PATCH 24/28] Removed debugging statements.

---
 example/test_to_pandas.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index a9c20bd..95e68d0 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -21,7 +21,5 @@
 #row '1' contains column headings
 #data is row '2' through '-1'
 
-print (cells[:])
-
 data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]])
 print(data)

From 536b889b698d07a4ad364ed0b9b0c1134de4d22a Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Mon, 18 Jul 2016 21:39:57 +0800
Subject: [PATCH 25/28] Removed some debugging statements.

---
 src/pdftableextract/core.py | 91 ++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 58 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index f5d0d99..81f3b55 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -66,7 +66,6 @@ def get_image(self, index):
         d = self.scale = dpi / 72.
         self.frac_scale = 1 / d
         pxw, pxh = int(width * d), int(height * d)
-        # data=zeros((pxw,pxh,4), dtype=uint8)
         surface = cairo.ImageSurface(
             # data,
             cairo.FORMAT_ARGB32,
@@ -82,41 +81,25 @@ def get_image(self, index):
 
         pixbuf = Gdk.pixbuf_get_from_surface(surface, 0, 0, pxw, pxh)
         surface.write_to_png("page.png")
-        #img=image.set_from_pixbuf (pixbuf)
         data = frombuffer(pixbuf.get_pixels(), dtype=uint8)
         R = data[0::4]
         G = data[1::4]
         B = data[2::4]
         A = data[3::4]
-        C = R * 34 + G * 56 + B * 10 / 100.
-        # # print (max(A))
+        C = (R * 34 + G * 56 + B * 10) / 100. # Convert to gray
+
         C = C.astype(uint8)
 
-        # A = A <= self.greyscale_threshold
-        # C[A] = 255
-        # C = C.reshape((pxh, pxw))
         nd = zeros(C.shape, dtype=uint8)
         nd[:] = C
         nd[A <= self.greyscale_threshold] = 255
-
-        #data = data.reshape((pxh, pxw, 4))
-        #d = data[:, :, 3]
-        #alpha = data[:, :, 3]
-        #new = zeros(data.shape, dtype=uint8)
-        #new[:, :, :] = data
-        #new = new[:, :, 0:3]
-        #print(data)
-        #rc = alpha <= self.greyscale_threshold
-
-        #new[rc, 0] = 255
-        #new[rc, 1] = 255
-        #new[rc, 2] = 255
-        #new[:, :, 3] = 255
         nd = nd.reshape((pxh, pxw))
-        imsave('nomask.png', nd)
+        # imsave('nomask.png', nd)
         return nd, page
 
     def print_rect(self, msg=None, r=None, page=None):
+        """Used for debugging.
+        """
         if None in [r, page]:
             raise ValueError("r and page arguments are required")
         x1, y1, x2, y2 = r.x1, r.y1, r.x2, r.y2
@@ -127,7 +110,11 @@ def print_rect(self, msg=None, r=None, page=None):
               height - y2)
 
     def within(self, a, b, pad=0):
-        """Is Rectangle b within Rectangle a, i.e. b is in a
+        """Is Rectangle b within Rectangle a, i.e. b is in a.
+
+        Arguments:
+        - `a`, `b` : The rectangles;
+        - `pad` : Additional space.
         """
         if b.x1+pad < a.x1: return False
         if b.y1+pad < a.y1: return False
@@ -150,27 +137,19 @@ def rexpand(self, rect, layout, pad=0):
         if r.y2 < l.y2: r.y2 = l.y2+pad
 
     def get_text(self, page, x, y, w, h):
-        #cb = page.get_crop_box()
-        #self.print_rect("Rect crop", cb)
         width, height = [int(x) for x in page.get_size()]
-        #print("Page_size", width, height)
-        ##print(x, y, w, h)
         fc = self.frac_scale
-        ##print("FC:", fc)
         x, y, w, h = (z * fc for z in [x, y, w, h])
         rect = Poppler.Rectangle()
-        ##print("shifted:", x, y, w, h)
         rect.x1, rect.y1 = x, y
         rect.x2, rect.y2 = x + w, y + h
         assert rect.x1<=rect.x2
         assert rect.y1<=rect.y2
-        #self.print_rect("box:", rect, page)
-        txt = page.get_text_for_area(rect)
-        ##print(txt)
-        attrs = page.get_text_attributes_for_area(rect)
-        ##print([(a.start_index, a.end_index) for a in attrs])
-        ##print(help(attrs[0]))
-        #chars=[]
+
+        # Could not make it work correctly # FIXME
+        # txt = page.get_text_for_area(rect)
+        # attrs = page.get_text_attributes_for_area(rect)
+
         r = Poppler.Rectangle()
         r.x1 = r.y1 = 1e10
         r.x2 = r.y2 = -1e10
@@ -179,27 +158,23 @@ def get_text(self, page, x, y, w, h):
             if self.within(rect, l, pad=1):
                 self.rexpand(r, l, pad=0.5)
                 chars.append(self.text[k])
-        txt1="".join(chars)
-        #txt1 = page.get_text_for_area(r)
-        print ((r.x1,r.y1,r.x2,r.y2),txt1)
+        txt="".join(chars)
+
+        # txt = page.get_text_for_area(r) # FIXME
 
-        #interact(locals={"p": page, "d": self.document, "self": self})
-        #rect.free()
-        #Poppler.Rectangle.free(rect)
-        return txt1, r
+        return txt, r
 
     def get_rectangles_for_page(self, page):
         """Return all rectangles for all letters in the page..
-        Used for debugging
+        Used for debugging.
 
         Arguments:
-        - `page`:
+        - `page`: reference to the page
         """
         layout=self.layout
         if layout == None:
             raise RuntimeError("page is not chosen")
 
-        #interact(locals={"layout":layout, "self":self})
         answer = [(r.x1,r.y1,r.x2,r.y2) for r in layout]
         return answer
 
@@ -262,8 +237,7 @@ def process_page(infile,
     #-----------------------------------------------------------------------
     # image load section.
 
-    #print(data.shape)
-    height, width = data.shape[:2]
+    height, width = data.shape[:2]  # If not to reduce to gray, the shape will be (,,3) or (,,4).
 
     pad = int(pad)
     height += pad * 2
@@ -273,13 +247,15 @@ def process_page(infile,
     bmp = ones((height, width), dtype=bool)
 
     thr = int(255.0 * greyscale_threshold / 100.0)
-    imsave("white.png", bmp)
+
     bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr)
-    #bmp = bmp == False
-    imsave("foo.png", bmp)
+
+
     # Set up Debuging image.
     img = zeros((height, width, 3), dtype=uint8)
-    #img[:, :, :] = bmp * 255
+
+    # img[:, :, :] = bmp * 255 # In case of colored input image
+
     img[:, :, 0] = bmp * 255
     img[:, :, 1] = bmp * 255
     img[:, :, 2] = bmp * 255
@@ -287,7 +263,7 @@ def process_page(infile,
     if checkdivs or checkcells or checkletters:
         imgfloat = img.astype(float)
 
-    if checkletters:
+    if checkletters:  # Show bounding boxes for each text object.
         img = (imgfloat/2.).astype(uint8)
         rectangles=pdfdoc.get_rectangles_for_page(pg)
         lrn=len(rectangles)
@@ -300,7 +276,6 @@ def process_page(infile,
     #-----------------------------------------------------------------------
     # Find bounding box.
     t = 0
-    imsave("bmp-start.png", bmp)
 
     while t < height and all(bmp[t, :]):
         t = t + 1
@@ -330,8 +305,6 @@ def process_page(infile,
     bmp[b, :] = False
     bmp[:, l] = False
     bmp[:, r] = False
-    imsave("bbox-start.png", bmp)
-    print("Bbox", l, t, b, r)
 
     def boxOfString(x, p):
         s = x.split(":")
@@ -510,14 +483,16 @@ def getCell(_coordinate, img=None):
         (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1],
                         hd[2 * (j + v)])
         ret, rect = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t)
+
         if img != None and checkcells:
             (x1,y1,x2,y2) = [rrr+pad for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]]
             img[y1:y2,x1:x2] += col(random.random()).astype(uint8)
 
         return (i, j, u, v, pg, ret)
 
-    if checkcells:
+    if checkletters:
         img = (imgfloat / 2.).astype(uint8)
+
     if boxes:
         cells = [x + (pg,
                       "", ) for x in cells
@@ -526,7 +501,7 @@ def getCell(_coordinate, img=None):
         print(cells)
         cells = [getCell(x, img) for x in cells
                  if (frow == None or (x[1] >= frow and x[1] <= lrow))]
-    if checkcells:
+    if checkletters:
         imsave("text-locations.png", img)
 
     return cells

From 9fc9279e434f9f2e158df86da90c626ba72ea868 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Mon, 18 Jul 2016 22:36:37 +0800
Subject: [PATCH 26/28] Possibly ready to go.

---
 example/test_to_pandas.py      |  3 ++-
 src/pdftableextract/core.py    | 26 +++++++++++++++-----------
 src/pdftableextract/scripts.py | 25 +++++++++----------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py
index 95e68d0..3b30c80 100644
--- a/example/test_to_pandas.py
+++ b/example/test_to_pandas.py
@@ -7,7 +7,8 @@
 cells = [pdf.process_page("example.pdf",
                           p,
                           outfilename="pandas-test",
-                          checkall=True) for p in pages]
+                          bitmap_resolution=100,
+                          checkall=False) for p in pages]
 
 #flatten the cells structure
 cells = [item for sublist in cells for item in sublist]
diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index 81f3b55..c5c3161 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -1,12 +1,17 @@
 import sys
-import random
 import os
+
+DEBUG = False
+
+if DEBUG:
+    import random
 from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape, all, any
 import numpy
 
-import matplotlib
-matplotlib.use('AGG')
-from matplotlib.image import imsave
+if DEBUG:
+    import matplotlib
+    matplotlib.use('AGG')
+    from matplotlib.image import imsave
 
 from xml.dom.minidom import getDOMImplementation
 import json
@@ -80,13 +85,13 @@ def get_image(self, index):
         context.restore()
 
         pixbuf = Gdk.pixbuf_get_from_surface(surface, 0, 0, pxw, pxh)
-        surface.write_to_png("page.png")
+        # surface.write_to_png("page.png")
         data = frombuffer(pixbuf.get_pixels(), dtype=uint8)
         R = data[0::4]
         G = data[1::4]
         B = data[2::4]
         A = data[3::4]
-        C = (R * 34 + G * 56 + B * 10) / 100. # Convert to gray
+        C = (R * 34. + G * 56. + B * 10.) / 100. # Convert to gray
 
         C = C.astype(uint8)
 
@@ -202,7 +207,7 @@ def process_page(infile,
                  page=None,
                  crop=None,
                  line_length=0.5,
-                 bitmap_resolution=72, # 300,
+                 bitmap_resolution=300,
                  name=None,
                  pad=2,
                  white=None,
@@ -268,7 +273,7 @@ def process_page(infile,
         rectangles=pdfdoc.get_rectangles_for_page(pg)
         lrn=len(rectangles)
         for k,r in enumerate(rectangles):
-            x1,y1,x2,y2 = [k+pad+1 for k in r]
+            x1,y1,x2,y2 = [int(bitmap_resolution* float(k)/72.)+pad for k in r]
             img[y1:y2, x1:x2] += col(random.random()).astype(uint8)
         imsave("letters.png", img)
 
@@ -484,8 +489,8 @@ def getCell(_coordinate, img=None):
                         hd[2 * (j + v)])
         ret, rect = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t)
 
-        if img != None and checkcells:
-            (x1,y1,x2,y2) = [rrr+pad for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]]
+        if type(img)!=type(None) and checkletters:
+            (x1,y1,x2,y2) = [int(bitmap_resolution * float(rrr)/72+pad) for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]]
             img[y1:y2,x1:x2] += col(random.random()).astype(uint8)
 
         return (i, j, u, v, pg, ret)
@@ -498,7 +503,6 @@ def getCell(_coordinate, img=None):
                       "", ) for x in cells
                  if (frow == None or (x[1] >= frow and x[1] <= lrow))]
     else:
-        print(cells)
         cells = [getCell(x, img) for x in cells
                  if (frow == None or (x[1] >= frow and x[1] <= lrow))]
     if checkletters:
diff --git a/src/pdftableextract/scripts.py b/src/pdftableextract/scripts.py
index 6939713..97a2ee2 100644
--- a/src/pdftableextract/scripts.py
+++ b/src/pdftableextract/scripts.py
@@ -25,10 +25,10 @@ def procargs() :
   p.add_argument("-name", help="name to add to XML tag, or HTML comments")
   p.add_argument("-pad", help="imitial image pading (pixels)", type=int,
      default=2 )
-  p.add_argument("-white",action="append", 
+  p.add_argument("-white",action="append",
     help="paint white to the bitmap as left:top:right:bottom in length units."
          "Done before painting black" )
-  p.add_argument("-black",action="append", 
+  p.add_argument("-black",action="append",
     help="paint black to the bitmap as left:top:right:bottom in length units."
          "Done after poainting white" )
   p.add_argument("-bitmap", action="store_true",
@@ -67,14 +67,10 @@ def main():
         raise
     sys.exit("I/O Error running pdf-table-extract: {0}".format(e))
   except OSError as e:
-    print("An OS Error occurred running pdf-table-extract: Is `pdftoppm` installed and available?")
+    print("An OS Error occurred running pdf-table-extract")
     if args.traceback:
         raise
     sys.exit("OS Error: {0}".format(e))
-  except subprocess.CalledProcessError as e:
-    if args.traceback:
-        raise
-    sys.exit("Error while checking a subprocess call: {0}".format(e))
   except Exception as e:
     if args.traceback:
         raise
@@ -85,9 +81,9 @@ def imain(args):
     if args.checkcrop or args.checklines or args.checkdivs or args.checkcells:
         for pgs in args.page :
             success = process_page(args.infile, pgs,
-                bitmap=args.bitmap, 
-                checkcrop=args.checkcrop, 
-                checklines=args.checklines, 
+                bitmap=args.bitmap,
+                checkcrop=args.checkcrop,
+                checklines=args.checklines,
                 checkdivs=args.checkdivs,
                 checkcells=args.checkcells,
                 whitespace=args.whitespace,
@@ -105,9 +101,9 @@ def imain(args):
     else:
         for pgs in args.page :
             cells.extend(process_page(args.infile, pgs,
-                bitmap=args.bitmap, 
-                checkcrop=args.checkcrop, 
-                checklines=args.checklines, 
+                bitmap=args.bitmap,
+                checkcrop=args.checkcrop,
+                checklines=args.checklines,
                 checkdivs=args.checkdivs,
                 checkcells=args.checkcells,
                 whitespace=args.whitespace,
@@ -127,6 +123,3 @@ def imain(args):
                 args.outfile = sys.stdout
             filenames["{0}_filename".format(args.t)] = args.outfile
             output(cells, args.page, name=args.name, infile=args.infile, output_type=args.t, **filenames)
-
-
-

From 82fd20206735fd82f2fad864bf14603930081849 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Tue, 19 Jul 2016 01:04:05 +0800
Subject: [PATCH 27/28] Experimenting with page recognition.

---
 src/pdftableextract/core.py | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index c5c3161..c1acd76 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -1,7 +1,7 @@
 import sys
 import os
 
-DEBUG = False
+DEBUG = True
 
 if DEBUG:
     import random
@@ -121,11 +121,18 @@ def within(self, a, b, pad=0):
         - `a`, `b` : The rectangles;
         - `pad` : Additional space.
         """
-        if b.x1+pad < a.x1: return False
-        if b.y1+pad < a.y1: return False
-        if b.x2-pad > a.x2: return False
-        if b.y2-pad > a.y2: return False
-        return True
+        if b.x1>=a.x1 and b.y1>=a.y1 and b.x2<=a.x2 and b.y2<=a.y2: # The obvious case.
+            return True
+        def w(x,y):
+            if x>=a.x1+pad and x<=a.x2-pad and y>=a.y1+pad and y<=a.y2-pad:
+                return True
+            else:
+                return False
+        for x,y in [(b.x1,b.y1), (b.x2,b.y2), (b.x1,b.y2), (b.x2,b.y1)]:
+            if w(x,y):
+                return True
+        # FIXME if b is bigger a and intersects it...
+        return False
 
     def rexpand(self, rect, layout, pad=0):
         """Make rectangle rect include layout
@@ -160,8 +167,8 @@ def get_text(self, page, x, y, w, h):
         r.x2 = r.y2 = -1e10
         chars=[]
         for k,l in enumerate(self.layout):
-            if self.within(rect, l, pad=1):
-                self.rexpand(r, l, pad=0.5)
+            if self.within(rect, l, pad=0):
+                self.rexpand(r, l, pad=0)
                 chars.append(self.text[k])
         txt="".join(chars)
 
@@ -275,7 +282,7 @@ def process_page(infile,
         for k,r in enumerate(rectangles):
             x1,y1,x2,y2 = [int(bitmap_resolution* float(k)/72.)+pad for k in r]
             img[y1:y2, x1:x2] += col(random.random()).astype(uint8)
-        imsave("letters.png", img)
+        imsave(outfile+"-letters.png", img)
 
 
     #-----------------------------------------------------------------------
@@ -344,7 +351,7 @@ def boxOfString(x, p):
             img[t:b + 1, l:r + 1] = [0, 0, 0]
 
     if checkcrop:
-        imsave("crop-" + outfile + ".png", img)
+        imsave(outfile+"-crop.png", img)
 
 #-----------------------------------------------------------------------
 # Line finding section.
@@ -406,7 +413,7 @@ def boxOfString(x, p):
 
         for j in hd:
             img[j, :] = [0, 0, 255]  # blue
-        imsave("lines-" + outfile + ".png", img)
+        imsave(outfile+"-lines.png", img)
 
         #-----------------------------------------------------------------------
         # divider checking.
@@ -436,7 +443,7 @@ def isDiv(a, l, r, t, b):
                     if isDiv(0, l, r, t, b):
                         img[t:b, l:r, 0] = 255
                         img[t:b, l:r, 2] = 0
-        imsave("divs-" + outfile + ".png", img)
+        imsave(outfile+"-divs.png", img)
 
         #-----------------------------------------------------------------------
         # Cell finding section.
@@ -476,9 +483,9 @@ def isDiv(a, l, r, t, b):
             (i, j, u, v) = cells[k]
             (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1],
                             hd[2 * (j + v)])
-            img[t:b, l:r] += col(k / nc).astype(uint8)
+            img[t:b, l:r] += col(k*0.9 / nc + 0.1*random.random()).astype(uint8)
 
-        imsave("cells-" + outfile + ".png", img)
+        imsave(outfile+"-cells.png", img)
 
         #-----------------------------------------------------------------------
         # fork out to extract text for each cell.
@@ -506,7 +513,7 @@ def getCell(_coordinate, img=None):
         cells = [getCell(x, img) for x in cells
                  if (frow == None or (x[1] >= frow and x[1] <= lrow))]
     if checkletters:
-        imsave("text-locations.png", img)
+        imsave(outfile+"-text-locations.png", img)
 
     return cells
 

From 2ef0dad951b4827ae8034e2801bebd6b925e4378 Mon Sep 17 00:00:00 2001
From: Evgeny Cherkashin <eugene@irnok.net>
Date: Tue, 19 Jul 2016 10:18:27 +0800
Subject: [PATCH 28/28] Better overlapping condition. No debugging.

---
 src/pdftableextract/core.py | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py
index c1acd76..7af488a 100644
--- a/src/pdftableextract/core.py
+++ b/src/pdftableextract/core.py
@@ -1,7 +1,7 @@
 import sys
 import os
 
-DEBUG = True
+DEBUG = False
 
 if DEBUG:
     import random
@@ -114,25 +114,14 @@ def print_rect(self, msg=None, r=None, page=None):
         print(msg, x, height - y, w, h, "---", x1, height - y1, x2,
               height - y2)
 
-    def within(self, a, b, pad=0):
-        """Is Rectangle b within Rectangle a, i.e. b is in a.
+    def overlap(self, a, b, pad=0):
+        """Check if Rectangle a and Rectangle b overlap.
 
         Arguments:
         - `a`, `b` : The rectangles;
-        - `pad` : Additional space.
+        - `pad` : Additional space. (IGNORED)
         """
-        if b.x1>=a.x1 and b.y1>=a.y1 and b.x2<=a.x2 and b.y2<=a.y2: # The obvious case.
-            return True
-        def w(x,y):
-            if x>=a.x1+pad and x<=a.x2-pad and y>=a.y1+pad and y<=a.y2-pad:
-                return True
-            else:
-                return False
-        for x,y in [(b.x1,b.y1), (b.x2,b.y2), (b.x1,b.y2), (b.x2,b.y1)]:
-            if w(x,y):
-                return True
-        # FIXME if b is bigger a and intersects it...
-        return False
+        return a.x1 < b.x2 and a.x2 > b.x1 and a.y1 < b.y2 and a.y2 > b.y1
 
     def rexpand(self, rect, layout, pad=0):
         """Make rectangle rect include layout
@@ -167,7 +156,7 @@ def get_text(self, page, x, y, w, h):
         r.x2 = r.y2 = -1e10
         chars=[]
         for k,l in enumerate(self.layout):
-            if self.within(rect, l, pad=0):
+            if self.overlap(rect, l, pad=0):
                 self.rexpand(r, l, pad=0)
                 chars.append(self.text[k])
         txt="".join(chars)