From 0abba05efc502fbc2be4df70fdd44c0b1610e6c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Gait=C3=A1n?= Date: Fri, 8 Nov 2013 02:28:32 -0300 Subject: [PATCH 01/28] simplifying setup.py --- setup.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 099adf9..a75e21a 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ from setuptools import setup, find_packages -import sys, os -from pip.req import parse_requirements +import os here = os.path.abspath(os.path.dirname(__file__)) README = open(os.path.join(here, 'README.md')).read() @@ -9,10 +8,6 @@ version = '0.1' -from_requirements_txt = parse_requirements("requirements.txt") -install_requires = [ str(ir.req) for ir in from_requirements_txt ] - - setup(name='pdf-table-extract', version=version, description="Extract Tables from PDF files", @@ -28,7 +23,7 @@ packages=find_packages('src'), package_dir = {'': 'src'},include_package_data=True, zip_safe=False, - install_requires=install_requires, + install_requires=['numpy'], entry_points={ 'console_scripts': ['pdf-table-extract=pdftableextract.scripts:main'] From aa31b88fd7fe9805cb2f93aa6ca914abb05a8ba8 Mon Sep 17 00:00:00 2001 From: Alex Goretoy Date: Thu, 12 Feb 2015 20:33:21 -0800 Subject: [PATCH 02/28] Added Requires to README --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 222c14d..3085e5a 100644 --- a/README.md +++ b/README.md @@ -13,4 +13,8 @@ tables in ST Micro’s datasheets. The script requires numpy and poppler ###Tags [Utilities](http://ashimagroup.net/os/tag/utilities) - +###Requires +apt-get install python-dev poppler-utils +yum install python-devel poppler-utils +[numpy](http://www.numpy.org/) +[pandas](http://pandas.pydata.org/) From aef8b14bb4d9e12e67fbcaf009c69607c214dcb4 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Thu, 14 Jul 2016 17:31:41 +0800 Subject: [PATCH 03/28] Python 3 adaptation. --- example/test_to_pandas.py | 3 +- src/pdftableextract/__init__.py | 2 +- src/pdftableextract/core.py | 152 +++++++++++++++--------------- src/pdftableextract/extracttab.py | 73 +++++++------- src/pdftableextract/pnm.py | 17 ++-- 5 files changed, 125 insertions(+), 122 deletions(-) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index bb31515..d734ea3 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -1,3 +1,4 @@ +from __future__ import print_function import pandas as pd import pdftableextract as pdf @@ -17,4 +18,4 @@ #data is row '2' through '-1' data =pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) -print data +print (data) diff --git a/src/pdftableextract/__init__.py b/src/pdftableextract/__init__.py index 6dbe85c..8366135 100644 --- a/src/pdftableextract/__init__.py +++ b/src/pdftableextract/__init__.py @@ -1,2 +1,2 @@ # Example package with a console entry point -from core import process_page, output, table_to_list \ No newline at end of file +from pdftableextract.core import process_page, output, table_to_list diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index d1dce80..b96fba4 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -3,7 +3,7 @@ from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete import subprocess from pipes import quote -from .pnm import readPNM, dumpImage +from pdftableextract.pnm import readPNM, dumpImage import re from pipes import quote from xml.dom.minidom import getDOMImplementation @@ -13,7 +13,7 @@ #----------------------------------------------------------------------- def check_for_required_executable(name,command): """Checks for an executable called 'name' by running 'command' and supressing - output. If the return code is non-zero or an OS error occurs, an Exception is raised""" + output. If the return code is non-zero or an OS error occurs, an Exception is raised""" try: with open(os.devnull, "w") as fnull: result=subprocess.check_call(command,stdout=fnull, stderr=fnull) @@ -32,12 +32,12 @@ def popen(name,command, *args, **kwargs): try: result=subprocess.Popen(command,*args, **kwargs) return result - except OSError, e: + except OSError as e: message="""Error running {0}. Is it installed correctly? Error: {1}""".format(name, e) raise OSError(message) - except Exception, e: - raise + except Exception as e: + raise def colinterp(a,x) : """Interpolates colors""" @@ -53,7 +53,7 @@ def col(x, colmult=1.0) : return colinterp(colarr,(colmult * x)% 1.0) / 2 -def process_page(infile, pgs, +def process_page(infile, pgs, outfilename=None, greyscale_threshold=25, page=None, @@ -64,17 +64,17 @@ def process_page(infile, pgs, pad=2, white=None, black=None, - bitmap=False, - checkcrop=False, - checklines=False, + bitmap=False, + checkcrop=False, + checklines=False, checkdivs=False, checkcells=False, whitespace="normalize", boxes=False) : - + outfile = open(outfilename,'w') if outfilename else sys.stdout page=page or [] - (pg,frow,lrow) = (map(int,(pgs.split(":")))+[None,None])[0:3] + (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3] #check that pdftoppdm exists by running a simple command check_for_required_executable("pdftoppm",["pdftoppm","-h"]) #end check @@ -91,7 +91,7 @@ def process_page(infile, pgs, pad = int(pad) height+=pad*2 width+=pad*2 - + # reimbed image with a white padd. bmp = ones( (height,width) , dtype=bool ) bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*greyscale_threshold/100.0) ) @@ -109,25 +109,25 @@ def process_page(infile, pgs, t=t+1 if t > 0 : t=t-1 - + b=height-1 while b > t and sum(bmp[b,:]==0) == 0 : b=b-1 if b < height-1: b = b+1 - + l=0 while l < width and sum(bmp[:,l]==0) == 0 : l=l+1 if l > 0 : l=l-1 - + r=width-1 while r > l and sum(bmp[:,r]==0) == 0 : r=r-1 if r < width-1 : r=r+1 - + # Mark bounding box. bmp[t,:] = 0 bmp[b,:] = 0 @@ -139,13 +139,13 @@ def boxOfString(x,p) : if len(s) < 4 : raise ValueError("boxes have format left:top:right:bottom[:page]") return ([bitmap_resolution * float(x) + pad for x in s[0:4] ] - + [ p if len(s)<5 else int(s[4]) ] ) + + [ p if len(s)<5 else int(s[4]) ] ) # translate crop to paint white. whites = [] if crop : - (l,t,r,b,p) = boxOfString(crop,pg) + (l,t,r,b,p) = boxOfString(crop,pg) whites.extend( [ (0,0,l,height,p), (0,0,width,t,p), (r,0,width,height,p), (0,b,width,height,p) ] ) @@ -157,7 +157,7 @@ def boxOfString(x,p) : if p == pg : bmp[ t:b+1,l:r+1 ] = 1 img[ t:b+1,l:r+1 ] = [255,255,255] - + # paint black ... if black : for b in black : @@ -168,17 +168,17 @@ def boxOfString(x,p) : if checkcrop : dumpImage(outfile,bmp,img, bitmap, pad) return True - + #----------------------------------------------------------------------- # Line finding section. # -# Find all vertical or horizontal lines that are more than rlthresh +# Find all vertical or horizontal lines that are more than rlthresh # long, these are considered lines on the table grid. lthresh = int(line_length * bitmap_resolution) vs = zeros(width, dtype=int) for i in range(width) : - dd = diff( where(bmp[:,i])[0] ) + dd = diff( where(bmp[:,i])[0] ) if len(dd)>0: v = max ( dd ) if v > lthresh : @@ -213,19 +213,19 @@ def boxOfString(x,p) : vd = delete(vd,i) else: i=i+2 - - j = 0 + + j = 0 while j < len(hd): if hd[j+1]-hd[j] > maxdiv : hd = delete(hd,j) hd = delete(hd,j) else: j=j+2 - + if checklines : for i in vd : img[:,i] = [255,0,0] # red - + for j in hd : img[j,:] = [0,0,255] # blue dumpImage(outfile,bmp,img) @@ -233,25 +233,25 @@ def boxOfString(x,p) : #----------------------------------------------------------------------- # divider checking. # -# at this point vd holds the x coordinate of vertical and -# hd holds the y coordinate of horizontal divider tansitions for each +# at this point vd holds the x coordinate of vertical and +# hd holds the y coordinate of horizontal divider tansitions for each # vertical and horizontal lines in the table grid. def isDiv(a, l,r,t,b) : # if any col or row (in axis) is all zeros ... - return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 + return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 if checkdivs : img = img / 2 for j in range(0,len(hd),2): for i in range(0,len(vd),2): if i>0 : - (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) + (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) img[ t:b, l:r, 1 ] = 192 if isDiv(1, l,r,t,b) : img[ t:b, l:r, 0 ] = 0 img[ t:b, l:r, 2 ] = 255 - + if j>0 : (l,r,t,b) = (vd[i], vd[i+1], hd[j-1], hd[j] ) img[ t:b, l:r, 1 ] = 128 @@ -265,7 +265,7 @@ def isDiv(a, l,r,t,b) : # This algorithum is width hungry, and always generates rectangular # boxes. - cells =[] + cells =[] touched = zeros( (len(hd), len(vd)),dtype=bool ) j = 0 while j*2+2 < len (hd) : @@ -290,8 +290,8 @@ def isDiv(a, l,r,t,b) : touched[ j:j+v, i:i+u] = True i = i+1 j=j+1 - - + + if checkcells : nc = len(cells)+0. img = img / 2 @@ -301,76 +301,77 @@ def isDiv(a, l,r,t,b) : img[ t:b, l:r ] += col( k/nc ) dumpImage(outfile,bmp,img) return True - + #----------------------------------------------------------------------- # fork out to extract text for each cell. - whitespace = re.compile( r'\s+') - - def getCell( (i,j,u,v) ): + whitespace = re.compile( rb'\s+') + + def getCell( _coordinate): + (i,j,u,v) =_coordinate (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - p = popen("pdftotext", + p = popen("pdftotext", "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, quote(infile)), - stdout=subprocess.PIPE, + stdout=subprocess.PIPE, shell=True ) - + ret = p.communicate()[0] if whitespace != 'raw' : - ret = whitespace.sub( "" if whitespace == "none" else " ", ret ) + ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret ) if len(ret) > 0 : - ret = ret[ (1 if ret[0]==' ' else 0) : - len(ret) - (1 if ret[-1]==' ' else 0) ] + ret = ret[ (1 if ret[0]==b' ' else 0) : + len(ret) - (1 if ret[-1]==b' ' else 0) ] return (i,j,u,v,pg,ret) - + if boxes : - cells = [ x + (pg,"",) for x in cells if + cells = [ x + (pg,b"",) for x in cells if ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] else : #check that pdftotext exists by running a simple command check_for_required_executable("pdftotext",["pdftotext","-h"]) #end check - cells = [ getCell(x) for x in cells if + cells = [ getCell(x) for x in cells if ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] return cells #----------------------------------------------------------------------- #output section. -def output(cells, pgs, - cells_csv_filename=None, - cells_json_filename=None, - cells_xml_filename=None, +def output(cells, pgs, + cells_csv_filename=None, + cells_json_filename=None, + cells_xml_filename=None, table_csv_filename=None, table_html_filename=None, table_list_filename=None, infile=None, name=None, output_type=None ): - + output_types = [ - dict(filename=cells_csv_filename, function=o_cells_csv), - dict(filename=cells_json_filename, function=o_cells_json), - dict(filename=cells_xml_filename, function=o_cells_xml), + dict(filename=cells_csv_filename, function=o_cells_csv), + dict(filename=cells_json_filename, function=o_cells_json), + dict(filename=cells_xml_filename, function=o_cells_xml), dict(filename=table_csv_filename, function=o_table_csv), dict(filename=table_html_filename, function=o_table_html), dict(filename=table_list_filename, function=o_table_list) ] - + for entry in output_types: if entry["filename"]: if entry["filename"] != sys.stdout: outfile = open(entry["filename"],'w') else: outfile = sys.stdout - - entry["function"](cells, pgs, - outfile=outfile, - name=name, - infile=infile, + + entry["function"](cells, pgs, + outfile=outfile, + name=name, + infile=infile, output_type=output_type) if entry["filename"] != sys.stdout: outfile.close() - + def o_cells_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : outfile = outfile or sys.stdout csv.writer( outfile , dialect='excel' ).writerows(cells) @@ -381,20 +382,22 @@ def o_cells_json(cells,pgs, outfile=None, infile=None, name=None, output_type=No #defaults infile=infile or "" name=name or "" - - json.dump({ + + json.dump({ "src": infile, "name": name, "colnames": ( "x","y","width","height","page","contents" ), "cells":cells }, outfile) -def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None) : +def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None) : """Output XML formatted cell data""" outfile = outfile or sys.stdout #defaults infile=infile or "" name=name or "" + def _lambda(a): + return x.setAttribute(*a) doc = getDOMImplementation().createDocument(None,"table", None) root = doc.documentElement; @@ -404,19 +407,19 @@ def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None root.setAttribute("name",name) for cl in cells : x = doc.createElement("cell") - map(lambda(a): x.setAttribute(*a), zip("xywhp",map(str,cl))) + map(_lambda, zip("xywhp",map(str,cl))) if cl[5] != "" : x.appendChild( doc.createTextNode(cl[5]) ) root.appendChild(x) outfile.write( doc.toprettyxml() ) - -def table_to_list(cells,pgs) : + +def table_to_list(cells,pgs) : """Output list of lists""" l=[0,0,0] for (i,j,u,v,pg,value) in cells : r=[i,j,pg] l = [max(x) for x in zip(l,r)] - + tab = [ [ [ "" for x in range(l[0]+1) ] for x in range(l[1]+1) ] for x in range(l[2]+1) @@ -432,18 +435,18 @@ def o_table_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=Non tab=table_to_list(cells, pgs) for t in tab: csv.writer( outfile , dialect='excel' ).writerows(t) - + def o_table_list(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : """Output list of lists""" outfile = outfile or sys.stdout tab = table_to_list(cells, pgs) print(tab) - -def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=None) : + +def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=None) : """Output HTML formatted table""" - oj = 0 + oj = 0 opg = 0 doc = getDOMImplementation().createDocument(None,"table", None) root = doc.documentElement; @@ -458,7 +461,7 @@ def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=No if j > oj or pg > opg: if pg > opg: s = "Name: " + name + ", " if name else "" - root.appendChild( doc.createComment( s + + root.appendChild( doc.createComment( s + ("Source: %s page %d." % (infile, pg) ))); if tr : root.appendChild(tr) @@ -478,4 +481,3 @@ def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=No tr.appendChild(td) root.appendChild(tr) outfile.write( doc.toprettyxml() ) - diff --git a/src/pdftableextract/extracttab.py b/src/pdftableextract/extracttab.py index ab6c74d..44ccecd 100644 --- a/src/pdftableextract/extracttab.py +++ b/src/pdftableextract/extracttab.py @@ -18,7 +18,7 @@ def process_page(pgs) : p = subprocess.Popen( ("pdftoppm -gray -r %d -f %d -l %d %s " % (args.r,pg,pg,quote(args.infile))), - stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True ) + stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) #----------------------------------------------------------------------- # image load secion. @@ -28,7 +28,7 @@ def process_page(pgs) : pad = int(args.pad) height+=pad*2 width+=pad*2 - + # reimbed image with a white padd. bmp = ones( (height,width) , dtype=bool ) bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*args.g/100.0) ) @@ -47,25 +47,25 @@ def process_page(pgs) : t=t+1 if t > 0 : t=t-1 - + b=height-1 while b > t and sum(bmp[b,:]==0) == 0 : b=b-1 if b < height-1: b = b+1 - + l=0 while l < width and sum(bmp[:,l]==0) == 0 : l=l+1 if l > 0 : l=l-1 - + r=width-1 while r > l and sum(bmp[:,r]==0) == 0 : r=r-1 if r < width-1 : r=r+1 - + # Mark bounding box. bmp[t,:] = 0 bmp[b,:] = 0 @@ -77,13 +77,13 @@ def boxOfString(x,p) : if len(s) < 4 : raise Exception("boxes have format left:top:right:bottom[:page]") return ([args.r * float(x) + args.pad for x in s[0:4] ] - + [ p if len(s)<5 else int(s[4]) ] ) + + [ p if len(s)<5 else int(s[4]) ] ) # translate crop to paint white. whites = [] if args.crop : - (l,t,r,b,p) = boxOfString(args.crop,pg) + (l,t,r,b,p) = boxOfString(args.crop,pg) whites.extend( [ (0,0,l,height,p), (0,0,width,t,p), (r,0,width,height,p), (0,b,width,height,p) ] ) @@ -95,7 +95,7 @@ def boxOfString(x,p) : if p == pg : bmp[ t:b+1,l:r+1 ] = 1 img[ t:b+1,l:r+1 ] = [255,255,255] - + # paint black ... if args.black : for b in args.black : @@ -106,18 +106,18 @@ def boxOfString(x,p) : if args.checkcrop : dumpImage(args,bmp,img) sys.exit(0) - - + + #----------------------------------------------------------------------- # Line finding section. # -# Find all verticle or horizontal lines that are more than rlthresh +# Find all verticle or horizontal lines that are more than rlthresh # long, these are considered lines on the table grid. lthresh = int(args.l * args.r) vs = zeros(width, dtype=int) for i in range(width) : - dd = diff( where(bmp[:,i])[0] ) + dd = diff( where(bmp[:,i])[0] ) if len(dd)>0: v = max ( dd ) if v > lthresh : @@ -153,62 +153,62 @@ def boxOfString(x,p) : vd = delete(vd,i) else: i=i+2 - - j = 0 + + j = 0 while j < len(hd): if hd[j+1]-hd[j] > maxdiv : hd = delete(hd,j) hd = delete(hd,j) else: j=j+2 - + if args.checklines : for i in vd : img[:,i] = [255,0,0] # red - + for j in hd : img[j,:] = [0,0,255] # blue dumpImage(args,bmp,img) sys.exit(0) - + #----------------------------------------------------------------------- # divider checking. # -# at this point vd holds the x coordinate of vertical and -# hd holds the y coordinate of horizontal divider tansitions for each +# at this point vd holds the x coordinate of vertical and +# hd holds the y coordinate of horizontal divider tansitions for each # vertical and horizontal lines in the table grid. def isDiv(a, l,r,t,b) : # if any col or row (in axis) is all zeros ... - return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 + return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 if args.checkdivs : img = img / 2 for j in range(0,len(hd),2): for i in range(0,len(vd),2): if i>0 : - (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) + (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) img[ t:b, l:r, 1 ] = 192 if isDiv(1, l,r,t,b) : img[ t:b, l:r, 0 ] = 0 img[ t:b, l:r, 2 ] = 255 - + if j>0 : (l,r,t,b) = (vd[i], vd[i+1], hd[j-1], hd[j] ) img[ t:b, l:r, 1 ] = 128 if isDiv(0, l,r,t,b) : img[ t:b, l:r, 0 ] = 255 img[ t:b, l:r, 2 ] = 0 - + dumpImage(args,bmp,img) sys.exit(0) - + #----------------------------------------------------------------------- # Cell finding section. # This algorithum is width hungry, and always generates rectangular # boxes. - cells =[] + cells =[] touched = zeros( (len(hd), len(vd)),dtype=bool ) j = 0 while j*2+2 < len (hd) : @@ -233,8 +233,8 @@ def isDiv(a, l,r,t,b) : touched[ j:j+v, i:i+u] = True i = i+1 j=j+1 - - + + if args.checkcells : nc = len(cells)+0. img = img / 2 @@ -244,25 +244,25 @@ def isDiv(a, l,r,t,b) : img[ t:b, l:r ] += col( k/nc ) dumpImage(args,bmp,img) sys.exit(0) - - + + #----------------------------------------------------------------------- # fork out to extract text for each cell. whitespace = re.compile( r'\s+') - + def getCell( (i,j,u,v) ): (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) p = subprocess.Popen( ("pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (args.r, l-pad, t-pad, r-l, b-t, pg, pg, quote(args.infile) ) ), stdout=subprocess.PIPE, shell=True ) - + ret = p.communicate()[0] if args.w != 'raw' : ret = whitespace.sub( "" if args.w == "none" else " ", ret ) if len(ret) > 0 : - ret = ret[ (1 if ret[0]==' ' else 0) : + ret = ret[ (1 if ret[0]==' ' else 0) : len(ret) - (1 if ret[-1]==' ' else 0) ] return (i,j,u,v,pg,ret) @@ -270,12 +270,12 @@ def getCell( (i,j,u,v) ): # cells = [ x + (pg,"",) for x in cells ] #else : # cells = map(getCell, cells) - + if args.boxes : - cells = [ x + (pg,"",) for x in cells if + cells = [ x + (pg,"",) for x in cells if ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] else : - cells = [ getCell(x) for x in cells if + cells = [ getCell(x) for x in cells if ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] return cells @@ -294,4 +294,3 @@ def main_script(): "cells_xml" : o_cells_xml, "table_csv" : o_table_csv, "table_html": o_table_html, "table_chtml": o_table_html, } [ args.t ](cells,args.page) - diff --git a/src/pdftableextract/pnm.py b/src/pdftableextract/pnm.py index cbb05dd..ae229ea 100644 --- a/src/pdftableextract/pnm.py +++ b/src/pdftableextract/pnm.py @@ -1,13 +1,14 @@ +from __future__ import print_function from numpy import array, fromstring, uint8, reshape, ones #----------------------------------------------------------------------- # PNM stuff. def noncomment(fd): - """Read lines from the filehandle until a non-comment line is found. + """Read lines from the filehandle until a non-comment line is found. Comments start with #""" while True: - x = fd.readline() - if x.startswith('#') : + x = fd.readline() + if x.startswith(b'#') : continue else: return x @@ -16,7 +17,7 @@ def readPNM(fd): """Reads the PNM file from the filehandle""" t = noncomment(fd) s = noncomment(fd) - m = noncomment(fd) if not (t.startswith('P1') or t.startswith('P4')) else '1' + m = noncomment(fd) if not (t.startswith(b'P1') or t.startswith(b'P4')) else b'1' data = fd.read() ls = len(s.split()) if ls != 2 : @@ -28,8 +29,8 @@ def readPNM(fd): m = int(m) if m != 255 : - print "Just want 8 bit pgms for now!" - + print ("Just want 8 bit pgms for now!") + d = fromstring(data,dtype=uint8) d = reshape(d, (height,width) ) return (m,width,height, d) @@ -39,14 +40,14 @@ def writePNM(fd,img): s = img.shape m = 255 if img.dtype == bool : - img = img + uint8(0) + img = img + uint8(0) t = "P5" m = 1 elif len(s) == 2 : t = "P5" else: t = "P6" - + fd.write( "%s\n%d %d\n%d\n" % (t, s[1],s[0],m) ) fd.write( uint8(img).tostring() ) From 1fb10b3e3b64a8360d29c511de626d6dd7cffa67 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Thu, 14 Jul 2016 17:50:34 +0800 Subject: [PATCH 04/28] Out result as unicode string. --- src/pdftableextract/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index b96fba4..491e67d 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -70,7 +70,8 @@ def process_page(infile, pgs, checkdivs=False, checkcells=False, whitespace="normalize", - boxes=False) : + boxes=False, + encoding="utf8") : outfile = open(outfilename,'w') if outfilename else sys.stdout page=page or [] @@ -321,7 +322,7 @@ def getCell( _coordinate): if len(ret) > 0 : ret = ret[ (1 if ret[0]==b' ' else 0) : len(ret) - (1 if ret[-1]==b' ' else 0) ] - return (i,j,u,v,pg,ret) + return (i,j,u,v,pg,ret.decode(encoding)) if boxes : cells = [ x + (pg,b"",) for x in cells if From a59833168e2f2152c69ec6b26bb2032a9524bae8 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Fri, 15 Jul 2016 01:26:15 +0800 Subject: [PATCH 05/28] Corrected imports in script. --- src/pdftableextract/scripts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pdftableextract/scripts.py b/src/pdftableextract/scripts.py index 68a7b2e..6939713 100644 --- a/src/pdftableextract/scripts.py +++ b/src/pdftableextract/scripts.py @@ -2,8 +2,8 @@ import sys import logging import subprocess -from .core import process_page, output -import core +from pdftableextract.core import process_page, output +import pdftableextract.core #----------------------------------------------------------------------- From 058958c906b32789e328ca7f0cd21f91761e4149 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Fri, 15 Jul 2016 01:27:12 +0800 Subject: [PATCH 06/28] Remove unused file. --- src/pdftableextract/extracttab.py | 296 ------------------------------ 1 file changed, 296 deletions(-) delete mode 100644 src/pdftableextract/extracttab.py diff --git a/src/pdftableextract/extracttab.py b/src/pdftableextract/extracttab.py deleted file mode 100644 index 44ccecd..0000000 --- a/src/pdftableextract/extracttab.py +++ /dev/null @@ -1,296 +0,0 @@ -# Description : PDF Table Extraction Utility -# Author : Ian McEwan, Ashima Research. -# Maintainer : ijm -# Lastmod : 20130402 (ijm) -# License : Copyright (C) 2011 Ashima Research. All rights reserved. -# Distributed under the MIT Expat License. See LICENSE file. -# https://github.com/ashima/pdf-table-extract - -import sys, argparse, subprocess, re, csv, json -from numpy import * -from pipes import quote -from xml.dom.minidom import getDOMImplementation - -# Proccessing function. - -def process_page(pgs) : - (pg,frow,lrow) = (map(int,(pgs.split(":")))+[None,None])[0:3] - - p = subprocess.Popen( ("pdftoppm -gray -r %d -f %d -l %d %s " % - (args.r,pg,pg,quote(args.infile))), - stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True) - -#----------------------------------------------------------------------- -# image load secion. - - (maxval, width, height, data) = readPNM(p.stdout) - - pad = int(args.pad) - height+=pad*2 - width+=pad*2 - -# reimbed image with a white padd. - bmp = ones( (height,width) , dtype=bool ) - bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*args.g/100.0) ) - -# Set up Debuging image. - img = zeros( (height,width,3) , dtype=uint8 ) - img[:,:,0] = bmp*255 - img[:,:,1] = bmp*255 - img[:,:,2] = bmp*255 - -#----------------------------------------------------------------------- -# Find bounding box. - - t=0 - while t < height and sum(bmp[t,:]==0) == 0 : - t=t+1 - if t > 0 : - t=t-1 - - b=height-1 - while b > t and sum(bmp[b,:]==0) == 0 : - b=b-1 - if b < height-1: - b = b+1 - - l=0 - while l < width and sum(bmp[:,l]==0) == 0 : - l=l+1 - if l > 0 : - l=l-1 - - r=width-1 - while r > l and sum(bmp[:,r]==0) == 0 : - r=r-1 - if r < width-1 : - r=r+1 - -# Mark bounding box. - bmp[t,:] = 0 - bmp[b,:] = 0 - bmp[:,l] = 0 - bmp[:,r] = 0 - - def boxOfString(x,p) : - s = x.split(":") - if len(s) < 4 : - raise Exception("boxes have format left:top:right:bottom[:page]") - return ([args.r * float(x) + args.pad for x in s[0:4] ] - + [ p if len(s)<5 else int(s[4]) ] ) - - -# translate crop to paint white. - whites = [] - if args.crop : - (l,t,r,b,p) = boxOfString(args.crop,pg) - whites.extend( [ (0,0,l,height,p), (0,0,width,t,p), - (r,0,width,height,p), (0,b,width,height,p) ] ) - -# paint white ... - if args.white : - whites.extend( [ boxOfString(b, pg) for b in args.white ] ) - - for (l,t,r,b,p) in whites : - if p == pg : - bmp[ t:b+1,l:r+1 ] = 1 - img[ t:b+1,l:r+1 ] = [255,255,255] - -# paint black ... - if args.black : - for b in args.black : - (l,t,r,b) = [args.r * float(x) + args.pad for x in b.split(":") ] - bmp[ t:b+1,l:r+1 ] = 0 - img[ t:b+1,l:r+1 ] = [0,0,0] - - if args.checkcrop : - dumpImage(args,bmp,img) - sys.exit(0) - - -#----------------------------------------------------------------------- -# Line finding section. -# -# Find all verticle or horizontal lines that are more than rlthresh -# long, these are considered lines on the table grid. - - lthresh = int(args.l * args.r) - vs = zeros(width, dtype=int) - for i in range(width) : - dd = diff( where(bmp[:,i])[0] ) - if len(dd)>0: - v = max ( dd ) - if v > lthresh : - vs[i] = 1 - else: -# it was a solid black line. - if bmp[0,i] == 0 : - vs[i] = 1 - vd= ( where(diff(vs[:]))[0] +1 ) - - hs = zeros(height, dtype=int) - for j in range(height) : - dd = diff( where(bmp[j,:]==1)[0] ) - if len(dd) > 0 : - h = max ( dd ) - if h > lthresh : - hs[j] = 1 - else: -# it was a solid black line. - if bmp[j,0] == 0 : - hs[j] = 1 - hd=( where(diff(hs[:]==1))[0] +1 ) - -#----------------------------------------------------------------------- -# Look for dividors that are too large. - - maxdiv=10 - i=0 - - while i < len(vd) : - if vd[i+1]-vd[i] > maxdiv : - vd = delete(vd,i) - vd = delete(vd,i) - else: - i=i+2 - - j = 0 - while j < len(hd): - if hd[j+1]-hd[j] > maxdiv : - hd = delete(hd,j) - hd = delete(hd,j) - else: - j=j+2 - - if args.checklines : - for i in vd : - img[:,i] = [255,0,0] # red - - for j in hd : - img[j,:] = [0,0,255] # blue - dumpImage(args,bmp,img) - sys.exit(0) - -#----------------------------------------------------------------------- -# divider checking. -# -# at this point vd holds the x coordinate of vertical and -# hd holds the y coordinate of horizontal divider tansitions for each -# vertical and horizontal lines in the table grid. - - def isDiv(a, l,r,t,b) : - # if any col or row (in axis) is all zeros ... - return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 - - if args.checkdivs : - img = img / 2 - for j in range(0,len(hd),2): - for i in range(0,len(vd),2): - if i>0 : - (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) - img[ t:b, l:r, 1 ] = 192 - if isDiv(1, l,r,t,b) : - img[ t:b, l:r, 0 ] = 0 - img[ t:b, l:r, 2 ] = 255 - - if j>0 : - (l,r,t,b) = (vd[i], vd[i+1], hd[j-1], hd[j] ) - img[ t:b, l:r, 1 ] = 128 - if isDiv(0, l,r,t,b) : - img[ t:b, l:r, 0 ] = 255 - img[ t:b, l:r, 2 ] = 0 - - dumpImage(args,bmp,img) - sys.exit(0) - -#----------------------------------------------------------------------- -# Cell finding section. -# This algorithum is width hungry, and always generates rectangular -# boxes. - - cells =[] - touched = zeros( (len(hd), len(vd)),dtype=bool ) - j = 0 - while j*2+2 < len (hd) : - i = 0 - while i*2+2 < len(vd) : - u = 1 - v = 1 - if not touched[j,i] : - while 2+(i+u)*2 < len(vd) and \ - not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1], - hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ): - u=u+1 - bot = False - while 2+(j+v)*2 < len(hd) and not bot : - bot = False - for k in range(1,u+1) : - bot |= isDiv( 1, vd[ 2*(i+k)-1 ], vd[ 2*(i+k)], - hd[ 2*(j+v) ], hd[ 2*(j+v)+1 ] ) - if not bot : - v=v+1 - cells.append( (i,j,u,v) ) - touched[ j:j+v, i:i+u] = True - i = i+1 - j=j+1 - - - if args.checkcells : - nc = len(cells)+0. - img = img / 2 - for k in range(len(cells)): - (i,j,u,v) = cells[k] - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - img[ t:b, l:r ] += col( k/nc ) - dumpImage(args,bmp,img) - sys.exit(0) - - -#----------------------------------------------------------------------- -# fork out to extract text for each cell. - - whitespace = re.compile( r'\s+') - - def getCell( (i,j,u,v) ): - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - p = subprocess.Popen( - ("pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" - % (args.r, l-pad, t-pad, r-l, b-t, pg, pg, quote(args.infile) ) ), - stdout=subprocess.PIPE, shell=True ) - - ret = p.communicate()[0] - if args.w != 'raw' : - ret = whitespace.sub( "" if args.w == "none" else " ", ret ) - if len(ret) > 0 : - ret = ret[ (1 if ret[0]==' ' else 0) : - len(ret) - (1 if ret[-1]==' ' else 0) ] - return (i,j,u,v,pg,ret) - - #if args.boxes : - # cells = [ x + (pg,"",) for x in cells ] - #else : - # cells = map(getCell, cells) - - if args.boxes : - cells = [ x + (pg,"",) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - else : - cells = [ getCell(x) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - return cells - - -#----------------------------------------------------------------------- -# main - -def main_script(): - args = procargs() - - cells = [] - for pgs in args.page : - cells.extend(process_page(pgs)) - - { "cells_csv" : o_cells_csv, "cells_json" : o_cells_json, - "cells_xml" : o_cells_xml, "table_csv" : o_table_csv, - "table_html": o_table_html, "table_chtml": o_table_html, - } [ args.t ](cells,args.page) From d05360dd031ea74c80fb7b08c54a9be843b45555 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Fri, 15 Jul 2016 09:15:31 +0800 Subject: [PATCH 07/28] Starting conversion to Gtk's Poppler. --- Makefile | 25 +++++++++++++++++++++++++ requirements.txt | 3 +++ setup.py | 2 +- src/pdftableextract/core.py | 28 ++++++++++++++++++++++------ 4 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 Makefile create mode 100644 requirements.txt diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..63821a7 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +.PHONY: develop setup run-tests tests test gdb-test + +LPYTHON=python3 +V=$(PWD)/../../$(LPYTHON) +VB=$(V)/bin +PYTHON=$(VB)/$(LPYTHON) +ROOT=$(PWD) +#INI=icc.linkgrammar +#LCAT=src/icc/linkgrammar/locale/ + +develop: setup + pip install -r requirements.txt + +setup: + python setup.py develop + +run-tests: + nosetests -w src/icc/tests + +tests: run-tests + +test: setup run-tests + +gdb-test: setup + gdb --args $(PYTHON) $(VB)/nosetests -w src/icc/tests diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..008b217 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +numpy +ruamel.venvgtk +# pandas diff --git a/setup.py b/setup.py index 8591c50..516d5ad 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ version = '0.1' -install_requires = [ "numpy" ] +install_requires = [ "numpy", "ruamel.venvgtk" ] setup(name='pdf-table-extract', diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 491e67d..1ec262c 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,19 +1,34 @@ import sys import os from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete -import subprocess -from pipes import quote +#import subprocess +#from pipes import quote from pdftableextract.pnm import readPNM, dumpImage import re -from pipes import quote +#from pipes import quote from xml.dom.minidom import getDOMImplementation import json import csv +import gi +gi.require_version('Gtk', '3.0') +from gi.repository import Poppler + +class PopplerProcessor(object): + """ + """ + + def __init__(self, **kwargs): + """ + """ + self.p=Popp + + #----------------------------------------------------------------------- def check_for_required_executable(name,command): """Checks for an executable called 'name' by running 'command' and supressing output. If the return code is non-zero or an OS error occurs, an Exception is raised""" + return try: with open(os.devnull, "w") as fnull: result=subprocess.check_call(command,stdout=fnull, stderr=fnull) @@ -29,6 +44,8 @@ def check_for_required_executable(name,command): #----------------------------------------------------------------------- def popen(name,command, *args, **kwargs): + print (name,command, *args, **kwargs) + wew try: result=subprocess.Popen(command,*args, **kwargs) return result @@ -81,8 +98,7 @@ def process_page(infile, pgs, #end check p = popen("pdftoppm", ("pdftoppm -gray -r %d -f %d -l %d %s " % - (bitmap_resolution,pg,pg,quote(infile))), - stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True ) + (bitmap_resolution,pg,pg,infile))) #----------------------------------------------------------------------- # image load secion. @@ -312,7 +328,7 @@ def getCell( _coordinate): (i,j,u,v) =_coordinate (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) p = popen("pdftotext", - "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, quote(infile)), + "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, infile), stdout=subprocess.PIPE, shell=True ) From 5d52e02804d1a2b442487fc9296104339ac95b3b Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Fri, 15 Jul 2016 13:05:26 +0800 Subject: [PATCH 08/28] Formatted versions of files --- setup.py | 7 +++---- src/pdftableextract/core.py | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 516d5ad..5d691ea 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,9 @@ README = open(os.path.join(here, 'README.md')).read() #NEWS = open(os.path.join(here, 'NEWS.txt')).read() - version = '0.1' -install_requires = [ "numpy", "ruamel.venvgtk" ] +install_requires = ["numpy", "ruamel.venvgtk"] setup(name='pdf-table-extract', @@ -21,7 +20,7 @@ keywords='PDF, tables', author='Ian McEwan', author_email='ijm@ashimaresearch.com', - url='ashimaresearch.com', + url='ashimaresearch.dcom', license='MIT-Expat', packages=find_packages('src'), package_dir = {'': 'src'},include_package_data=True, @@ -31,4 +30,4 @@ 'console_scripts': ['pdf-table-extract=pdftableextract.scripts:main'] } -) + ) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 1ec262c..4baaa55 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -22,8 +22,6 @@ def __init__(self, **kwargs): """ self.p=Popp - - #----------------------------------------------------------------------- def check_for_required_executable(name,command): """Checks for an executable called 'name' by running 'command' and supressing From e93eff5b15d5d11a87e039e49ecb4575306e3f64 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Fri, 15 Jul 2016 22:23:04 +0800 Subject: [PATCH 09/28] Now it works but incorrectly. --- example/gtk-test.py | 37 ++++++++++++ example/test_to_pandas.py | 9 +-- setup.py | 2 +- src/pdftableextract/core.py | 115 ++++++++++++++++++++++++++---------- 4 files changed, 127 insertions(+), 36 deletions(-) create mode 100644 example/gtk-test.py diff --git a/example/gtk-test.py b/example/gtk-test.py new file mode 100644 index 0000000..7ceb36a --- /dev/null +++ b/example/gtk-test.py @@ -0,0 +1,37 @@ +# http://stackoverflow.com/a/10031877 + +import numpy +import cairo +import math + +from gi.repository import Gtk, Gdk + +data = numpy.zeros((200, 200, 4), dtype=numpy.uint8) +surface = cairo.ImageSurface.create_for_data(data, cairo.FORMAT_ARGB32, 200, + 200) +cr = cairo.Context(surface) + +# fill with solid white +cr.set_source_rgb(1.0, 1.0, 1.0) +cr.paint() + +# draw red circle +cr.arc(100, 100, 80, 0, 2 * math.pi) +cr.set_line_width(3) +cr.set_source_rgb(1.0, 0.0, 0.0) +cr.stroke() + +#draw directly to the shared buffer +data[10:30, 10:30, 2] = 128 + +# write output +print(data[38:48, 38:48, 0]) +surface.write_to_png("circle.png") + +pb = Gdk.pixbuf_get_from_surface(surface, 0, 0, 200, 200) +im = Gtk.Image.new_from_pixbuf(pb) +w = Gtk.Window() +w.connect("delete-event", Gtk.main_quit) +w.add(im) +w.show_all() +Gtk.main() diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index d734ea3..32cd3c7 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -3,13 +3,14 @@ import pdftableextract as pdf pages = ["1"] -cells = [pdf.process_page("example.pdf",p) for p in pages] +cells = [pdf.process_page("example.pdf", p) for p in pages] #flatten the cells structure -cells = [item for sublist in cells for item in sublist ] +cells = [item for sublist in cells for item in sublist] #without any options, process_page picks up a blank table at the top of the page. #so choose table '1' +print(cells) li = pdf.table_to_list(cells, pages)[1] #li is a list of lists, the first line is the header, last is the footer (for this table only!) @@ -17,5 +18,5 @@ #row '1' contains column headings #data is row '2' through '-1' -data =pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) -print (data) +data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) +print(data) diff --git a/setup.py b/setup.py index 5d691ea..66ef158 100644 --- a/setup.py +++ b/setup.py @@ -30,4 +30,4 @@ 'console_scripts': ['pdf-table-extract=pdftableextract.scripts:main'] } - ) +) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 4baaa55..30e2eb7 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,6 +1,7 @@ import sys import os -from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete +from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape +import numpy #import subprocess #from pipes import quote from pdftableextract.pnm import readPNM, dumpImage @@ -11,16 +12,76 @@ import csv import gi gi.require_version('Gtk', '3.0') -from gi.repository import Poppler +gi.require_version('Poppler', '0.18') +from gi.repository import Gdk, Poppler +import cairo class PopplerProcessor(object): - """ + """Class for processing PDF. That's simple. + It does two functions. + 1. Renders a page as a PNM graphics, and + 2. Get text in a rectangular bounding box. """ - def __init__(self, **kwargs): - """ + def __init__(self, filename, **kwargs): + """Opens a document denoted by filename. """ - self.p=Popp + self.filename=os.path.abspath(filename) + self.document=Poppler.Document.new_from_file("file:"+self.filename, None) + self.page_num=self.document.get_n_pages() + self.resolution=300 + self.greyscale_threshold=kwargs.get("greyscale_thresholds",25) + + def get_page(self, index): + if index<0 or index>=self.page_num: + raise IndexError("page number is out of bounds") + return self.document.get_page(index) + + def get_image(self, index): + page=self.get_page(index) + dpi=self.resolution + scale = 1 + width, height = [int(x) for x in page.get_size ()] + d=dpi/72. + pxw, pxh=int(width * d), int(height * d) + # data=zeros((pxw,pxh,4), dtype=uint8) + surface = cairo.ImageSurface ( + # data, + cairo.FORMAT_ARGB32, + pxw, pxh) + + context = cairo.Context (surface) + context.scale (d, d) + + context.save () + page.render (context) + context.restore () + + pixbuf = Gdk.pixbuf_get_from_surface (surface, 0, 0, pxw, pxh) + surface.write_to_png("page.png") + #img=image.set_from_pixbuf (pixbuf) + data=frombuffer(pixbuf.get_pixels(), dtype=uint8) + R=data[0::4] + G=data[1::4] + B=data[2::4] + A=data[3::4] + C=R*34+G*0.56+B*0.1 + # print (max(A)) + C=C.astype(uint8) + A=A<=self.greyscale_threshold + C[A]=255 + # print (C) + return C.reshape((pxw,pxh)) + + def get_text(self, index, x,y, w,h): + rect=Poppler.Rectangle() + rect.x1,rect.y1=x,y + rect.x2,rect.y2=x+w,y+h + # print (help(rect)) + pg=self.get_page(index) + txt=pg.get_text_for_area(rect) + Poppler.Rectangle.free(rect) + return txt #----------------------------------------------------------------------- def check_for_required_executable(name,command): @@ -42,8 +103,7 @@ def check_for_required_executable(name,command): #----------------------------------------------------------------------- def popen(name,command, *args, **kwargs): - print (name,command, *args, **kwargs) - wew + #print (name,command, *args, **kwargs) try: result=subprocess.Popen(command,*args, **kwargs) return result @@ -89,25 +149,25 @@ def process_page(infile, pgs, encoding="utf8") : outfile = open(outfilename,'w') if outfilename else sys.stdout + pdfdoc = PopplerProcessor(infile) page=page or [] (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3] - #check that pdftoppdm exists by running a simple command - check_for_required_executable("pdftoppm",["pdftoppm","-h"]) - #end check + pdfdoc.resolution=bitmap_resolution + pdfdoc.greyscale_threshold=greyscale_threshold - p = popen("pdftoppm", ("pdftoppm -gray -r %d -f %d -l %d %s " % - (bitmap_resolution,pg,pg,infile))) + data = pdfdoc.get_image(pg-1) # Page numbers are 0-based. #----------------------------------------------------------------------- -# image load secion. +# image load section. - (maxval, width, height, data) = readPNM(p.stdout) + #print(data.shape) + height, width = data.shape[:2] pad = int(pad) height+=pad*2 width+=pad*2 -# reimbed image with a white padd. +# reimbed image with a white pad. bmp = ones( (height,width) , dtype=bool ) bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*greyscale_threshold/100.0) ) @@ -325,26 +385,19 @@ def isDiv(a, l,r,t,b) : def getCell( _coordinate): (i,j,u,v) =_coordinate (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - p = popen("pdftotext", - "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, infile), - stdout=subprocess.PIPE, - shell=True ) - - ret = p.communicate()[0] - if whitespace != 'raw' : - ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret ) - if len(ret) > 0 : - ret = ret[ (1 if ret[0]==b' ' else 0) : - len(ret) - (1 if ret[-1]==b' ' else 0) ] - return (i,j,u,v,pg,ret.decode(encoding)) + ret = pdfdoc.get_text(pg-1, l-pad, t-pad, r-l, b-t) + # if whitespace != 'raw' : + # ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret ) + # if len(ret) > 0 : + # ret = ret[ (1 if ret[0]==b' ' else 0) : + # len(ret) - (1 if ret[-1]==b' ' else 0) ] + return (i,j,u,v,pg,ret) if boxes : cells = [ x + (pg,b"",) for x in cells if ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] else : - #check that pdftotext exists by running a simple command - check_for_required_executable("pdftotext",["pdftotext","-h"]) - #end check + print (cells) cells = [ getCell(x) for x in cells if ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] return cells From ba5006ec00f9e9a802f1b9be42d93960b4219462 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 02:25:11 +0800 Subject: [PATCH 10/28] Porting recognition algorithm to RGBA. --- src/pdftableextract/core.py | 926 +++++++++++++++++++----------------- 1 file changed, 499 insertions(+), 427 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 30e2eb7..490356a 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,12 +1,12 @@ import sys import os -from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape +from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape, all, any import numpy -#import subprocess -#from pipes import quote -from pdftableextract.pnm import readPNM, dumpImage -import re -#from pipes import quote + +import matplotlib +matplotlib.use('AGG') +from matplotlib.image import imsave + from xml.dom.minidom import getDOMImplementation import json import csv @@ -16,6 +16,7 @@ from gi.repository import Gdk, Poppler import cairo + class PopplerProcessor(object): """Class for processing PDF. That's simple. It does two functions. @@ -26,71 +27,91 @@ class PopplerProcessor(object): def __init__(self, filename, **kwargs): """Opens a document denoted by filename. """ - self.filename=os.path.abspath(filename) - self.document=Poppler.Document.new_from_file("file:"+self.filename, None) - self.page_num=self.document.get_n_pages() - self.resolution=300 - self.greyscale_threshold=kwargs.get("greyscale_thresholds",25) + self.filename = os.path.abspath(filename) + self.document = Poppler.Document.new_from_file("file:" + self.filename, + None) + self.page_num = self.document.get_n_pages() + self.resolution = 300 + self.greyscale_threshold = int(kwargs.get("greyscale_thresholds", + 25)) * 255.0 / 100.0 def get_page(self, index): - if index<0 or index>=self.page_num: + if index < 0 or index >= self.page_num: raise IndexError("page number is out of bounds") return self.document.get_page(index) def get_image(self, index): - page=self.get_page(index) - dpi=self.resolution + page = self.get_page(index) + dpi = self.resolution scale = 1 - width, height = [int(x) for x in page.get_size ()] - d=dpi/72. - pxw, pxh=int(width * d), int(height * d) + width, height = [int(x) for x in page.get_size()] + d = dpi / 72. + pxw, pxh = int(width * d), int(height * d) # data=zeros((pxw,pxh,4), dtype=uint8) - surface = cairo.ImageSurface ( + surface = cairo.ImageSurface( # data, cairo.FORMAT_ARGB32, - pxw, pxh) + pxw, + pxh) - context = cairo.Context (surface) - context.scale (d, d) + context = cairo.Context(surface) + context.scale(d, d) - context.save () - page.render (context) - context.restore () + context.save() + page.render(context) + context.restore() - pixbuf = Gdk.pixbuf_get_from_surface (surface, 0, 0, pxw, pxh) + pixbuf = Gdk.pixbuf_get_from_surface(surface, 0, 0, pxw, pxh) surface.write_to_png("page.png") #img=image.set_from_pixbuf (pixbuf) - data=frombuffer(pixbuf.get_pixels(), dtype=uint8) - R=data[0::4] - G=data[1::4] - B=data[2::4] - A=data[3::4] - C=R*34+G*0.56+B*0.1 - # print (max(A)) - C=C.astype(uint8) - A=A<=self.greyscale_threshold - C[A]=255 - # print (C) - return C.reshape((pxw,pxh)) - - def get_text(self, index, x,y, w,h): - rect=Poppler.Rectangle() - rect.x1,rect.y1=x,y - rect.x2,rect.y2=x+w,y+h + data = frombuffer(pixbuf.get_pixels(), dtype=uint8) + # R = data[0::4] + # G = data[1::4] + # B = data[2::4] + # A = data[3::4] + # C = R * 34 + G * 0.56 + B * 0.1 + # # print (max(A)) + # C = C.astype(uint8) + # A = A <= self.greyscale_threshold + # C[A] = 255 + + # C = C.reshape((pxh, pxw)) + data = data.reshape((pxh, pxw, 4)) + #d = data[:, :, 3] + alpha = data[:, :, 3] + new = zeros(data.shape, dtype=uint8) + new[:, :, :] = data + new = new[:, :, 0:3] + print(data) + rc = alpha <= self.greyscale_threshold + + new[rc, 0] = 255 + new[rc, 1] = 255 + new[rc, 2] = 255 + #new[:, :, 3] = 255 + imsave('nomask.png', new) + return new, rc, page + + def get_text(self, page, x, y, w, h): + rect = Poppler.Rectangle() + rect.x1, rect.y1 = x, y + rect.x2, rect.y2 = x + w, y + h # print (help(rect)) - pg=self.get_page(index) - txt=pg.get_text_for_area(rect) - Poppler.Rectangle.free(rect) + txt = page.get_text_for_area(rect) + #rect.free() + #Poppler.Rectangle.free(rect) + return txt + #----------------------------------------------------------------------- -def check_for_required_executable(name,command): +def check_for_required_executable(name, command): """Checks for an executable called 'name' by running 'command' and supressing output. If the return code is non-zero or an OS error occurs, an Exception is raised""" return try: with open(os.devnull, "w") as fnull: - result=subprocess.check_call(command,stdout=fnull, stderr=fnull) + result = subprocess.check_call(command, stdout=fnull, stderr=fnull) except OSError as e: message = """Error running {0}. Command failed: {1} @@ -101,148 +122,162 @@ def check_for_required_executable(name,command): except Exception as e: raise + #----------------------------------------------------------------------- -def popen(name,command, *args, **kwargs): +def popen(name, command, *args, **kwargs): #print (name,command, *args, **kwargs) try: - result=subprocess.Popen(command,*args, **kwargs) + result = subprocess.Popen(command, *args, **kwargs) return result except OSError as e: - message="""Error running {0}. Is it installed correctly? + message = """Error running {0}. Is it installed correctly? Error: {1}""".format(name, e) raise OSError(message) except Exception as e: raise -def colinterp(a,x) : - """Interpolates colors""" - l = len(a)-1 - i = min(l, max(0, int (x * l))) - (u,v) = a[i:i+2,:] - return u - (u-v) * ((x * l) % 1.0) - -colarr = array([ [255,0,0],[255,255,0],[0,255,0],[0,255,255],[0,0,255] ]) -def col(x, colmult=1.0) : - """colors""" - return colinterp(colarr,(colmult * x)% 1.0) / 2 - - -def process_page(infile, pgs, - outfilename=None, - greyscale_threshold=25, - page=None, - crop=None, - line_length=0.17, - bitmap_resolution=300, - name=None, - pad=2, - white=None, - black=None, - bitmap=False, - checkcrop=False, - checklines=False, - checkdivs=False, - checkcells=False, - whitespace="normalize", - boxes=False, - encoding="utf8") : - - outfile = open(outfilename,'w') if outfilename else sys.stdout - pdfdoc = PopplerProcessor(infile) - page=page or [] - (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3] - pdfdoc.resolution=bitmap_resolution - pdfdoc.greyscale_threshold=greyscale_threshold - - data = pdfdoc.get_image(pg-1) # Page numbers are 0-based. - -#----------------------------------------------------------------------- -# image load section. - - #print(data.shape) - height, width = data.shape[:2] +def colinterp(a, x): + """Interpolates colors""" + l = len(a) - 1 + i = min(l, max(0, int(x * l))) + (u, v) = a[i:i + 2, :] + return u - (u - v) * ((x * l) % 1.0) - pad = int(pad) - height+=pad*2 - width+=pad*2 -# reimbed image with a white pad. - bmp = ones( (height,width) , dtype=bool ) - bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*greyscale_threshold/100.0) ) +colarr = array( + [[255, 0, 0], [255, 255, 0], [0, 255, 0], [0, 255, 255], [0, 0, 255]]) -# Set up Debuging image. - img = zeros( (height,width,3) , dtype=uint8 ) - img[:,:,0] = bmp*255 - img[:,:,1] = bmp*255 - img[:,:,2] = bmp*255 -#----------------------------------------------------------------------- -# Find bounding box. - t=0 - while t < height and sum(bmp[t,:]==0) == 0 : - t=t+1 - if t > 0 : - t=t-1 - - b=height-1 - while b > t and sum(bmp[b,:]==0) == 0 : - b=b-1 - if b < height-1: - b = b+1 - - l=0 - while l < width and sum(bmp[:,l]==0) == 0 : - l=l+1 - if l > 0 : - l=l-1 - - r=width-1 - while r > l and sum(bmp[:,r]==0) == 0 : - r=r-1 - if r < width-1 : - r=r+1 +def col(x, colmult=1.0): + """colors""" + return colinterp(colarr, (colmult * x) % 1.0) / 2 + + +def process_page(infile, + pgs, + outfilename=None, + greyscale_threshold=25, + page=None, + crop=None, + line_length=0.17, + bitmap_resolution=300, + name=None, + pad=2, + white=None, + black=None, + bitmap=False, + checkcrop=False, + checklines=False, + checkdivs=False, + checkcells=False, + whitespace="normalize", + boxes=False, + encoding="utf8"): + + outfile = outfilename if outfilename else sys.stdout + pdfdoc = PopplerProcessor(infile) + page = page or [] + (pg, frow, lrow) = (list(map(int, (pgs.split(":")))) + [None, None])[0:3] + pdfdoc.resolution = bitmap_resolution + pdfdoc.greyscale_threshold = greyscale_threshold + + data, notalpha, page = pdfdoc.get_image( + pg - 1) # Page numbers are 0-based. + alpha = notalpha != 1 + + #----------------------------------------------------------------------- + # image load section. + + #print(data.shape) + height, width = data.shape[:2] + + pad = int(pad) + height += pad * 2 + width += pad * 2 + + # reimbed image with a white pad. + bmp = ones((height, width, 3), dtype=bool) + + thr = int(255.0 * greyscale_threshold / 100.0) + + bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr) + + imsave("foo.png", bmp) + # Set up Debuging image. + img = zeros((height, width, 3), dtype=uint8) + img[:, :, :] = bmp + #img[:, :, 0] = bmp * 255 + #img[:, :, 1] = bmp * 255 + #img[:, :, 2] = bmp * 255 + + #----------------------------------------------------------------------- + # Find bounding box. + t = 0 + while t < height and sum(bmp[t, :] == 0) == 0: + t = t + 1 + if t > 0: + t = t - 1 + + b = height - 1 + while b > t and sum(bmp[b, :] == 0) == 0: + b = b - 1 + if b < height - 1: + b = b + 1 + + l = 0 + while l < width and sum(bmp[:, l] == 0) == 0: + l = l + 1 + if l > 0: + l = l - 1 + + r = width - 1 + while r > l and sum(bmp[:, r] == 0) == 0: + r = r - 1 + if r < width - 1: + r = r + 1 # Mark bounding box. - bmp[t,:] = 0 - bmp[b,:] = 0 - bmp[:,l] = 0 - bmp[:,r] = 0 - - def boxOfString(x,p) : - s = x.split(":") - if len(s) < 4 : - raise ValueError("boxes have format left:top:right:bottom[:page]") - return ([bitmap_resolution * float(x) + pad for x in s[0:4] ] - + [ p if len(s)<5 else int(s[4]) ] ) - + bmp[t, :] = 0 + bmp[b, :] = 0 + bmp[:, l] = 0 + bmp[:, r] = 0 + + def boxOfString(x, p): + s = x.split(":") + if len(s) < 4: + raise ValueError("boxes have format left:top:right:bottom[:page]") + return ([bitmap_resolution * float(x) + pad for x in s[0:4]] + + [p if len(s) < 5 else int(s[4])]) # translate crop to paint white. - whites = [] - if crop : - (l,t,r,b,p) = boxOfString(crop,pg) - whites.extend( [ (0,0,l,height,p), (0,0,width,t,p), - (r,0,width,height,p), (0,b,width,height,p) ] ) + + whites = [] + if crop: + (l, t, r, b, p) = boxOfString(crop, pg) + whites.extend([(0, 0, l, height, p), (0, 0, width, t, p), + (r, 0, width, height, p), (0, b, width, height, p)]) # paint white ... - if white : - whites.extend( [ boxOfString(b, pg) for b in white ] ) + if white: + whites.extend([boxOfString(b, pg) for b in white]) - for (l,t,r,b,p) in whites : - if p == pg : - bmp[ t:b+1,l:r+1 ] = 1 - img[ t:b+1,l:r+1 ] = [255,255,255] + for (l, t, r, b, p) in whites: + if p == pg: + bmp[t:b + 1, l:r + 1] = 1 + img[t:b + 1, l:r + 1] = [255, 255, 255] # paint black ... - if black : - for b in black : - (l,t,r,b) = [bitmap_resolution * float(x) + pad for x in b.split(":") ] - bmp[ t:b+1,l:r+1 ] = 0 - img[ t:b+1,l:r+1 ] = [0,0,0] + if black: + for b in black: + (l, t, r, + b) = [bitmap_resolution * float(x) + pad for x in b.split(":")] + bmp[t:b + 1, l:r + 1] = 0 + img[t:b + 1, l:r + 1] = [0, 0, 0] - if checkcrop : - dumpImage(outfile,bmp,img, bitmap, pad) - return True + if checkcrop: + imsave("crop-" + outfile + ".png", img) + return True #----------------------------------------------------------------------- # Line finding section. @@ -250,61 +285,61 @@ def boxOfString(x,p) : # Find all vertical or horizontal lines that are more than rlthresh # long, these are considered lines on the table grid. - lthresh = int(line_length * bitmap_resolution) - vs = zeros(width, dtype=int) - for i in range(width) : - dd = diff( where(bmp[:,i])[0] ) - if len(dd)>0: - v = max ( dd ) - if v > lthresh : - vs[i] = 1 - else: -# it was a solid black line. - if bmp[0,i] == 0 : - vs[i] = 1 - vd= ( where(diff(vs[:]))[0] +1 ) - - hs = zeros(height, dtype=int) - for j in range(height) : - dd = diff( where(bmp[j,:]==1)[0] ) - if len(dd) > 0 : - h = max ( dd ) - if h > lthresh : - hs[j] = 1 - else: -# it was a solid black line. - if bmp[j,0] == 0 : - hs[j] = 1 - hd=( where(diff(hs[:]==1))[0] +1 ) - -#----------------------------------------------------------------------- -# Look for dividors that are too large. - maxdiv=10 - i=0 - - while i < len(vd) : - if vd[i+1]-vd[i] > maxdiv : - vd = delete(vd,i) - vd = delete(vd,i) - else: - i=i+2 - - j = 0 - while j < len(hd): - if hd[j+1]-hd[j] > maxdiv : - hd = delete(hd,j) - hd = delete(hd,j) - else: - j=j+2 - - if checklines : - for i in vd : - img[:,i] = [255,0,0] # red + lthresh = int(line_length * bitmap_resolution) + vs = zeros(width, dtype=int) + for i in range(width): + dd = diff(where(bmp[:, i])[0]) + if len(dd) > 0: + v = max(dd) + if v > lthresh: + vs[i] = 1 + else: + # it was a solid black line. + if all(bmp[0, i]) == 0: + vs[i] = 1 + vd = (where(diff(vs[:]))[0] + 1) + + hs = zeros(height, dtype=int) + for j in range(height): + dd = diff(where(bmp[j, :] == 1)[0]) + if len(dd) > 0: + h = max(dd) + if h > lthresh: + hs[j] = 1 + else: + # it was a solid black line. + if all(bmp[j, 0]) == 0: + hs[j] = 1 + hd = (where(diff(hs[:] == 1))[0] + 1) + + #----------------------------------------------------------------------- + # Look for dividors that are too large. + maxdiv = 10 + i = 0 - for j in hd : - img[j,:] = [0,0,255] # blue - dumpImage(outfile,bmp,img) - return True + while i < len(vd): + if vd[i + 1] - vd[i] > maxdiv: + vd = delete(vd, i) + vd = delete(vd, i) + else: + i = i + 2 + + j = 0 + while j < len(hd): + if hd[j + 1] - hd[j] > maxdiv: + hd = delete(hd, j) + hd = delete(hd, j) + else: + j = j + 2 + + if checklines: + for i in vd: + img[:, i] = [255, 0, 0] # red + + for j in hd: + img[j, :] = [0, 0, 255] # blue + imsave("lines-" + outfile + ".png", img) + return True #----------------------------------------------------------------------- # divider checking. # @@ -312,240 +347,277 @@ def boxOfString(x,p) : # hd holds the y coordinate of horizontal divider tansitions for each # vertical and horizontal lines in the table grid. - def isDiv(a, l,r,t,b) : - # if any col or row (in axis) is all zeros ... - return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 - - if checkdivs : - img = img / 2 - for j in range(0,len(hd),2): - for i in range(0,len(vd),2): - if i>0 : - (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) - img[ t:b, l:r, 1 ] = 192 - if isDiv(1, l,r,t,b) : - img[ t:b, l:r, 0 ] = 0 - img[ t:b, l:r, 2 ] = 255 - - if j>0 : - (l,r,t,b) = (vd[i], vd[i+1], hd[j-1], hd[j] ) - img[ t:b, l:r, 1 ] = 128 - if isDiv(0, l,r,t,b) : - img[ t:b, l:r, 0 ] = 255 - img[ t:b, l:r, 2 ] = 0 - dumpImage(outfile,bmp,img) - return True + def isDiv(a, l, r, t, b): + # if any col or row (in axis) is all zeros ... + return sum(sum(bmp[t:b, l:r], axis=a) == 0) > 0 + + if checkdivs: + img = img / 2 + for j in range(0, len(hd), 2): + for i in range(0, len(vd), 2): + if i > 0: + (l, r, t, b) = (vd[i - 1], vd[i], hd[j], hd[j + 1]) + img[t:b, l:r, 1] = 192 + if isDiv(1, l, r, t, b): + img[t:b, l:r, 0] = 0 + img[t:b, l:r, 2] = 255 + + if j > 0: + (l, r, t, b) = (vd[i], vd[i + 1], hd[j - 1], hd[j]) + img[t:b, l:r, 1] = 128 + if isDiv(0, l, r, t, b): + img[t:b, l:r, 0] = 255 + img[t:b, l:r, 2] = 0 + imsave("divs-" + outfile + ".png", img) + return True #----------------------------------------------------------------------- # Cell finding section. # This algorithum is width hungry, and always generates rectangular # boxes. - cells =[] - touched = zeros( (len(hd), len(vd)),dtype=bool ) - j = 0 - while j*2+2 < len (hd) : - i = 0 - while i*2+2 < len(vd) : - u = 1 - v = 1 - if not touched[j,i] : - while 2+(i+u)*2 < len(vd) and \ - not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1], - hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ): - u=u+1 - bot = False - while 2+(j+v)*2 < len(hd) and not bot : - bot = False - for k in range(1,u+1) : - bot |= isDiv( 1, vd[ 2*(i+k)-1 ], vd[ 2*(i+k)], - hd[ 2*(j+v) ], hd[ 2*(j+v)+1 ] ) - if not bot : - v=v+1 - cells.append( (i,j,u,v) ) - touched[ j:j+v, i:i+u] = True - i = i+1 - j=j+1 - - - if checkcells : - nc = len(cells)+0. - img = img / 2 - for k in range(len(cells)): - (i,j,u,v) = cells[k] - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - img[ t:b, l:r ] += col( k/nc ) - dumpImage(outfile,bmp,img) - return True + cells = [] + touched = zeros((len(hd), len(vd)), dtype=bool) + j = 0 + while j * 2 + 2 < len(hd): + i = 0 + while i * 2 + 2 < len(vd): + u = 1 + v = 1 + if not touched[j, i]: + while 2+(i+u)*2 < len(vd) and \ + not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1], + hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ): + u = u + 1 + bot = False + while 2 + (j + v) * 2 < len(hd) and not bot: + bot = False + for k in range(1, u + 1): + bot |= isDiv(1, vd[2 * (i + k) - 1], vd[2 * (i + k)], + hd[2 * (j + v)], hd[2 * (j + v) + 1]) + if not bot: + v = v + 1 + cells.append((i, j, u, v)) + touched[j:j + v, i:i + u] = True + i = i + 1 + j = j + 1 + + if checkcells: + nc = len(cells) + 0. + img = img / 2 + for k in range(len(cells)): + (i, j, u, v) = cells[k] + (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], + hd[2 * (j + v)]) + img[t:b, l:r] += col(k / nc) + imsave("cells-" + outfile + ".png", img) + return True #----------------------------------------------------------------------- # fork out to extract text for each cell. - whitespace = re.compile( rb'\s+') - - def getCell( _coordinate): - (i,j,u,v) =_coordinate - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - ret = pdfdoc.get_text(pg-1, l-pad, t-pad, r-l, b-t) - # if whitespace != 'raw' : - # ret = whitespace.sub( b"" if whitespace == "none" else b" ", ret ) - # if len(ret) > 0 : - # ret = ret[ (1 if ret[0]==b' ' else 0) : - # len(ret) - (1 if ret[-1]==b' ' else 0) ] - return (i,j,u,v,pg,ret) - - if boxes : - cells = [ x + (pg,b"",) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - else : - print (cells) - cells = [ getCell(x) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - return cells + def getCell(_coordinate): + (i, j, u, v) = _coordinate + (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], + hd[2 * (j + v)]) + ret = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t) + return (i, j, u, v, pg, ret) + + if boxes: + cells = [x + (pg, + b"", ) for x in cells + if (frow == None or (x[1] >= frow and x[1] <= lrow))] + else: + print(cells) + cells = [getCell(x) for x in cells + if (frow == None or (x[1] >= frow and x[1] <= lrow))] + return cells #----------------------------------------------------------------------- #output section. -def output(cells, pgs, - cells_csv_filename=None, - cells_json_filename=None, - cells_xml_filename=None, - table_csv_filename=None, - table_html_filename=None, - table_list_filename=None, - infile=None, name=None, output_type=None - ): + +def output(cells, + pgs, + cells_csv_filename=None, + cells_json_filename=None, + cells_xml_filename=None, + table_csv_filename=None, + table_html_filename=None, + table_list_filename=None, + infile=None, + name=None, + output_type=None): output_types = [ - dict(filename=cells_csv_filename, function=o_cells_csv), - dict(filename=cells_json_filename, function=o_cells_json), - dict(filename=cells_xml_filename, function=o_cells_xml), - dict(filename=table_csv_filename, function=o_table_csv), - dict(filename=table_html_filename, function=o_table_html), - dict(filename=table_list_filename, function=o_table_list) - ] + dict(filename=cells_csv_filename, + function=o_cells_csv), dict(filename=cells_json_filename, + function=o_cells_json), + dict(filename=cells_xml_filename, + function=o_cells_xml), dict(filename=table_csv_filename, + function=o_table_csv), + dict(filename=table_html_filename, + function=o_table_html), dict(filename=table_list_filename, + function=o_table_list) + ] for entry in output_types: if entry["filename"]: if entry["filename"] != sys.stdout: - outfile = open(entry["filename"],'w') + outfile = open(entry["filename"], 'w') else: outfile = sys.stdout - entry["function"](cells, pgs, - outfile=outfile, - name=name, - infile=infile, - output_type=output_type) + entry["function"](cells, + pgs, + outfile=outfile, + name=name, + infile=infile, + output_type=output_type) if entry["filename"] != sys.stdout: outfile.close() -def o_cells_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : - outfile = outfile or sys.stdout - csv.writer( outfile , dialect='excel' ).writerows(cells) - -def o_cells_json(cells,pgs, outfile=None, infile=None, name=None, output_type=None) : - """Output JSON formatted cell data""" - outfile = outfile or sys.stdout - #defaults - infile=infile or "" - name=name or "" - - json.dump({ - "src": infile, - "name": name, - "colnames": ( "x","y","width","height","page","contents" ), - "cells":cells + +def o_cells_csv(cells, + pgs, + outfile=None, + name=None, + infile=None, + output_type=None): + outfile = outfile or sys.stdout + csv.writer(outfile, dialect='excel').writerows(cells) + + +def o_cells_json(cells, + pgs, + outfile=None, + infile=None, + name=None, + output_type=None): + """Output JSON formatted cell data""" + outfile = outfile or sys.stdout + #defaults + infile = infile or "" + name = name or "" + + json.dump({ + "src": infile, + "name": name, + "colnames": ("x", "y", "width", "height", "page", "contents"), + "cells": cells }, outfile) -def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None) : - """Output XML formatted cell data""" - outfile = outfile or sys.stdout - #defaults - infile=infile or "" - name=name or "" - def _lambda(a): - return x.setAttribute(*a) - - doc = getDOMImplementation().createDocument(None,"table", None) - root = doc.documentElement; - if infile : - root.setAttribute("src",infile) - if name : - root.setAttribute("name",name) - for cl in cells : - x = doc.createElement("cell") - map(_lambda, zip("xywhp",map(str,cl))) - if cl[5] != "" : - x.appendChild( doc.createTextNode(cl[5]) ) - root.appendChild(x) - outfile.write( doc.toprettyxml() ) - -def table_to_list(cells,pgs) : - """Output list of lists""" - l=[0,0,0] - for (i,j,u,v,pg,value) in cells : - r=[i,j,pg] - l = [max(x) for x in zip(l,r)] - - tab = [ [ [ "" for x in range(l[0]+1) - ] for x in range(l[1]+1) - ] for x in range(l[2]+1) - ] - for (i,j,u,v,pg,value) in cells : - tab[pg][j][i] = value - - return tab - -def o_table_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : - """Output CSV formatted table""" - outfile = outfile or sys.stdout - tab=table_to_list(cells, pgs) - for t in tab: - csv.writer( outfile , dialect='excel' ).writerows(t) - - -def o_table_list(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : - """Output list of lists""" - outfile = outfile or sys.stdout - tab = table_to_list(cells, pgs) - print(tab) - -def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=None) : - """Output HTML formatted table""" - - oj = 0 - opg = 0 - doc = getDOMImplementation().createDocument(None,"table", None) - root = doc.documentElement; - if (output_type == "table_chtml" ): - root.setAttribute("border","1") - root.setAttribute("cellspaceing","0") - root.setAttribute("style","border-spacing:0") - nc = len(cells) - tr = None - for k in range(nc): - (i,j,u,v,pg,value) = cells[k] - if j > oj or pg > opg: - if pg > opg: - s = "Name: " + name + ", " if name else "" - root.appendChild( doc.createComment( s + - ("Source: %s page %d." % (infile, pg) ))); - if tr : - root.appendChild(tr) - tr = doc.createElement("tr") - oj = j - opg = pg - td = doc.createElement("td") - if value != "" : - td.appendChild( doc.createTextNode(value) ) - if u>1 : - td.setAttribute("colspan",str(u)) - if v>1 : - td.setAttribute("rowspan",str(v)) - if output_type == "table_chtml" : - td.setAttribute("style", "background-color: #%02x%02x%02x" % - tuple(128+col(k/(nc+0.)))) - tr.appendChild(td) - root.appendChild(tr) - outfile.write( doc.toprettyxml() ) + +def o_cells_xml(cells, + pgs, + outfile=None, + infile=None, + name=None, + output_type=None): + """Output XML formatted cell data""" + outfile = outfile or sys.stdout + #defaults + infile = infile or "" + name = name or "" + + def _lambda(a): + return x.setAttribute(*a) + + doc = getDOMImplementation().createDocument(None, "table", None) + root = doc.documentElement + if infile: + root.setAttribute("src", infile) + if name: + root.setAttribute("name", name) + for cl in cells: + x = doc.createElement("cell") + map(_lambda, zip("xywhp", map(str, cl))) + if cl[5] != "": + x.appendChild(doc.createTextNode(cl[5])) + root.appendChild(x) + outfile.write(doc.toprettyxml()) + + +def table_to_list(cells, pgs): + """Output list of lists""" + l = [0, 0, 0] + for (i, j, u, v, pg, value) in cells: + r = [i, j, pg] + l = [max(x) for x in zip(l, r)] + + tab = [[["" for x in range(l[0] + 1)] for x in range(l[1] + 1)] + for x in range(l[2] + 1)] + for (i, j, u, v, pg, value) in cells: + tab[pg][j][i] = value + + return tab + + +def o_table_csv(cells, + pgs, + outfile=None, + name=None, + infile=None, + output_type=None): + """Output CSV formatted table""" + outfile = outfile or sys.stdout + tab = table_to_list(cells, pgs) + for t in tab: + csv.writer(outfile, dialect='excel').writerows(t) + + +def o_table_list(cells, + pgs, + outfile=None, + name=None, + infile=None, + output_type=None): + """Output list of lists""" + outfile = outfile or sys.stdout + tab = table_to_list(cells, pgs) + print(tab) + + +def o_table_html(cells, + pgs, + outfile=None, + output_type=None, + name=None, + infile=None): + """Output HTML formatted table""" + + oj = 0 + opg = 0 + doc = getDOMImplementation().createDocument(None, "table", None) + root = doc.documentElement + if (output_type == "table_chtml"): + root.setAttribute("border", "1") + root.setAttribute("cellspaceing", "0") + root.setAttribute("style", "border-spacing:0") + nc = len(cells) + tr = None + for k in range(nc): + (i, j, u, v, pg, value) = cells[k] + if j > oj or pg > opg: + if pg > opg: + s = "Name: " + name + ", " if name else "" + root.appendChild(doc.createComment(s + ("Source: %s page %d." % + (infile, pg)))) + if tr: + root.appendChild(tr) + tr = doc.createElement("tr") + oj = j + opg = pg + td = doc.createElement("td") + if value != "": + td.appendChild(doc.createTextNode(value)) + if u > 1: + td.setAttribute("colspan", str(u)) + if v > 1: + td.setAttribute("rowspan", str(v)) + if output_type == "table_chtml": + td.setAttribute("style", "background-color: #%02x%02x%02x" % + tuple(128 + col(k / (nc + 0.)))) + tr.appendChild(td) + root.appendChild(tr) + outfile.write(doc.toprettyxml()) From 04838c93651716809a097d10603c041c68b1eed3 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 03:05:52 +0800 Subject: [PATCH 11/28] Debugging. Very hard. --- src/pdftableextract/core.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 490356a..97414d7 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -82,7 +82,7 @@ def get_image(self, index): new = zeros(data.shape, dtype=uint8) new[:, :, :] = data new = new[:, :, 0:3] - print(data) + #print(data) rc = alpha <= self.greyscale_threshold new[rc, 0] = 255 @@ -167,15 +167,15 @@ def process_page(infile, white=None, black=None, bitmap=False, - checkcrop=False, - checklines=False, - checkdivs=False, - checkcells=False, + checkcrop=True, + checklines=True, + checkdivs=True, + checkcells=True, whitespace="normalize", boxes=False, encoding="utf8"): - outfile = outfilename if outfilename else sys.stdout + outfile = outfilename if outfilename else "output" pdfdoc = PopplerProcessor(infile) page = page or [] (pg, frow, lrow) = (list(map(int, (pgs.split(":")))) + [None, None])[0:3] @@ -202,11 +202,11 @@ def process_page(infile, thr = int(255.0 * greyscale_threshold / 100.0) bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr) - + bmp = bmp == False imsave("foo.png", bmp) # Set up Debuging image. img = zeros((height, width, 3), dtype=uint8) - img[:, :, :] = bmp + img[:, :, :] = bmp * 255 #img[:, :, 0] = bmp * 255 #img[:, :, 1] = bmp * 255 #img[:, :, 2] = bmp * 255 @@ -214,34 +214,39 @@ def process_page(infile, #----------------------------------------------------------------------- # Find bounding box. t = 0 - while t < height and sum(bmp[t, :] == 0) == 0: + imsave("bmp-test.png", bmp) + + while t < height and all(bmp[t, :]) == False: t = t + 1 if t > 0: t = t - 1 + import pdb + pdb.set_trace() b = height - 1 - while b > t and sum(bmp[b, :] == 0) == 0: + while b > t and all(bmp[b, :]) == False: b = b - 1 if b < height - 1: b = b + 1 l = 0 - while l < width and sum(bmp[:, l] == 0) == 0: + while l < width and all(bmp[:, l]) == False: l = l + 1 if l > 0: l = l - 1 r = width - 1 - while r > l and sum(bmp[:, r] == 0) == 0: + while r > l and all(bmp[:, r]) == False: r = r - 1 if r < width - 1: r = r + 1 # Mark bounding box. - bmp[t, :] = 0 - bmp[b, :] = 0 - bmp[:, l] = 0 - bmp[:, r] = 0 + bmp[t, :, 0] = True + bmp[b, :, 0] = True + bmp[:, l, 0] = True + bmp[:, r, 0] = True + imsave("bmp-bbox.png", bmp) def boxOfString(x, p): s = x.split(":") From a0a2bfea8c7e2f753973333662a22fac0ef29ef0 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 13:53:12 +0800 Subject: [PATCH 12/28] Trying to understand algorithm. --- example/gtk-test.py | 37 ------------------------------- example/test_to_pandas.py | 1 + src/pdftableextract/core.py | 44 ++++++++++++++++++------------------- 3 files changed, 22 insertions(+), 60 deletions(-) delete mode 100644 example/gtk-test.py diff --git a/example/gtk-test.py b/example/gtk-test.py deleted file mode 100644 index 7ceb36a..0000000 --- a/example/gtk-test.py +++ /dev/null @@ -1,37 +0,0 @@ -# http://stackoverflow.com/a/10031877 - -import numpy -import cairo -import math - -from gi.repository import Gtk, Gdk - -data = numpy.zeros((200, 200, 4), dtype=numpy.uint8) -surface = cairo.ImageSurface.create_for_data(data, cairo.FORMAT_ARGB32, 200, - 200) -cr = cairo.Context(surface) - -# fill with solid white -cr.set_source_rgb(1.0, 1.0, 1.0) -cr.paint() - -# draw red circle -cr.arc(100, 100, 80, 0, 2 * math.pi) -cr.set_line_width(3) -cr.set_source_rgb(1.0, 0.0, 0.0) -cr.stroke() - -#draw directly to the shared buffer -data[10:30, 10:30, 2] = 128 - -# write output -print(data[38:48, 38:48, 0]) -surface.write_to_png("circle.png") - -pb = Gdk.pixbuf_get_from_surface(surface, 0, 0, 200, 200) -im = Gtk.Image.new_from_pixbuf(pb) -w = Gtk.Window() -w.connect("delete-event", Gtk.main_quit) -w.add(im) -w.show_all() -Gtk.main() diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index 32cd3c7..c54da05 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -6,6 +6,7 @@ cells = [pdf.process_page("example.pdf", p) for p in pages] #flatten the cells structure +print(cells) cells = [item for sublist in cells for item in sublist] #without any options, process_page picks up a blank table at the top of the page. diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 97414d7..151674f 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -74,7 +74,6 @@ def get_image(self, index): # C = C.astype(uint8) # A = A <= self.greyscale_threshold # C[A] = 255 - # C = C.reshape((pxh, pxw)) data = data.reshape((pxh, pxw, 4)) #d = data[:, :, 3] @@ -100,11 +99,10 @@ def get_text(self, page, x, y, w, h): txt = page.get_text_for_area(rect) #rect.free() #Poppler.Rectangle.free(rect) - return txt -#----------------------------------------------------------------------- + #----------------------------------------------------------------------- def check_for_required_executable(name, command): """Checks for an executable called 'name' by running 'command' and supressing output. If the return code is non-zero or an OS error occurs, an Exception is raised""" @@ -161,13 +159,13 @@ def process_page(infile, page=None, crop=None, line_length=0.17, - bitmap_resolution=300, + bitmap_resolution=150, #300, name=None, pad=2, white=None, black=None, bitmap=False, - checkcrop=True, + checkcrop=False, checklines=True, checkdivs=True, checkcells=True, @@ -214,39 +212,36 @@ def process_page(infile, #----------------------------------------------------------------------- # Find bounding box. t = 0 - imsave("bmp-test.png", bmp) + imsave("bmp-start.png", bmp) - while t < height and all(bmp[t, :]) == False: + while t < height and any(bmp[t, :]) == False: t = t + 1 if t > 0: t = t - 1 - import pdb - pdb.set_trace() b = height - 1 - while b > t and all(bmp[b, :]) == False: + while b > t and any(bmp[b, :]) == False: b = b - 1 if b < height - 1: b = b + 1 l = 0 - while l < width and all(bmp[:, l]) == False: + while l < width and any(bmp[:, l]) == False: l = l + 1 if l > 0: l = l - 1 r = width - 1 - while r > l and all(bmp[:, r]) == False: + while r > l and any(bmp[:, r]) == False: r = r - 1 if r < width - 1: r = r + 1 # Mark bounding box. - bmp[t, :, 0] = True - bmp[b, :, 0] = True - bmp[:, l, 0] = True - bmp[:, r, 0] = True - imsave("bmp-bbox.png", bmp) + bmp[t, :] = True + bmp[b, :] = True + bmp[:, l] = True + bmp[:, r] = True def boxOfString(x, p): s = x.split(":") @@ -287,11 +282,14 @@ def boxOfString(x, p): #----------------------------------------------------------------------- # Line finding section. # -# Find all vertical or horizontal lines that are more than rlthresh +# Find all vertical or horizontal lines that are more than lthresh # long, these are considered lines on the table grid. lthresh = int(line_length * bitmap_resolution) - vs = zeros(width, dtype=int) + vs = zeros(width, dtype=uint8) + + import pdb + pdb.set_trace() for i in range(width): dd = diff(where(bmp[:, i])[0]) if len(dd) > 0: @@ -304,9 +302,9 @@ def boxOfString(x, p): vs[i] = 1 vd = (where(diff(vs[:]))[0] + 1) - hs = zeros(height, dtype=int) + hs = zeros(height, dtype=uint8) for j in range(height): - dd = diff(where(bmp[j, :] == 1)[0]) + dd = diff(where(bmp[j, :])[0]) if len(dd) > 0: h = max(dd) if h > lthresh: @@ -315,7 +313,7 @@ def boxOfString(x, p): # it was a solid black line. if all(bmp[j, 0]) == 0: hs[j] = 1 - hd = (where(diff(hs[:] == 1))[0] + 1) + hd = (where(diff(hs[:]))[0] + 1) #----------------------------------------------------------------------- # Look for dividors that are too large. @@ -429,7 +427,7 @@ def getCell(_coordinate): if boxes: cells = [x + (pg, - b"", ) for x in cells + "", ) for x in cells if (frow == None or (x[1] >= frow and x[1] <= lrow))] else: print(cells) From da69fd5fab37b726947ebc6f235e5eff59a07689 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 14:01:55 +0800 Subject: [PATCH 13/28] Remove now unused popen. --- src/pdftableextract/core.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 151674f..5b90fce 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -121,20 +121,6 @@ def check_for_required_executable(name, command): raise -#----------------------------------------------------------------------- -def popen(name, command, *args, **kwargs): - #print (name,command, *args, **kwargs) - try: - result = subprocess.Popen(command, *args, **kwargs) - return result - except OSError as e: - message = """Error running {0}. Is it installed correctly? -Error: {1}""".format(name, e) - raise OSError(message) - except Exception as e: - raise - - def colinterp(a, x): """Interpolates colors""" l = len(a) - 1 From 61301a9a80d584bfa36a27a2022552e8f513b3c9 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 14:07:20 +0800 Subject: [PATCH 14/28] Remove executable checker. --- src/pdftableextract/core.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 5b90fce..40d16ee 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -102,25 +102,6 @@ def get_text(self, page, x, y, w, h): return txt - #----------------------------------------------------------------------- -def check_for_required_executable(name, command): - """Checks for an executable called 'name' by running 'command' and supressing - output. If the return code is non-zero or an OS error occurs, an Exception is raised""" - return - try: - with open(os.devnull, "w") as fnull: - result = subprocess.check_call(command, stdout=fnull, stderr=fnull) - except OSError as e: - message = """Error running {0}. -Command failed: {1} -{2}""".format(name, " ".join(command), e) - raise OSError(message) - except subprocess.CalledProcessError as e: - raise - except Exception as e: - raise - - def colinterp(a, x): """Interpolates colors""" l = len(a) - 1 From f01042d93b590c07acfb9c84230e835534eb8f09 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 15:14:13 +0800 Subject: [PATCH 15/28] Made image of 8bit as in original. --- src/pdftableextract/core.py | 76 +++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 40d16ee..ec5cc26 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -65,31 +65,38 @@ def get_image(self, index): surface.write_to_png("page.png") #img=image.set_from_pixbuf (pixbuf) data = frombuffer(pixbuf.get_pixels(), dtype=uint8) - # R = data[0::4] - # G = data[1::4] - # B = data[2::4] - # A = data[3::4] - # C = R * 34 + G * 0.56 + B * 0.1 + R = data[0::4] + G = data[1::4] + B = data[2::4] + A = data[3::4] + C = R * 34 + G * 56 + B * 10 / 100. # # print (max(A)) - # C = C.astype(uint8) + C = C.astype(uint8) + # A = A <= self.greyscale_threshold # C[A] = 255 # C = C.reshape((pxh, pxw)) - data = data.reshape((pxh, pxw, 4)) + nd = zeros(C.shape, dtype=uint8) + print(nd.shape, C.shape) + nd[:] = C + nd[A <= self.greyscale_threshold] = 255 + + #data = data.reshape((pxh, pxw, 4)) #d = data[:, :, 3] - alpha = data[:, :, 3] - new = zeros(data.shape, dtype=uint8) - new[:, :, :] = data - new = new[:, :, 0:3] + #alpha = data[:, :, 3] + #new = zeros(data.shape, dtype=uint8) + #new[:, :, :] = data + #new = new[:, :, 0:3] #print(data) - rc = alpha <= self.greyscale_threshold + #rc = alpha <= self.greyscale_threshold - new[rc, 0] = 255 - new[rc, 1] = 255 - new[rc, 2] = 255 + #new[rc, 0] = 255 + #new[rc, 1] = 255 + #new[rc, 2] = 255 #new[:, :, 3] = 255 - imsave('nomask.png', new) - return new, rc, page + nd = nd.reshape((pxh, pxw)) + imsave('nomask.png', nd) + return nd, page def get_text(self, page, x, y, w, h): rect = Poppler.Rectangle() @@ -126,7 +133,7 @@ def process_page(infile, page=None, crop=None, line_length=0.17, - bitmap_resolution=150, #300, + bitmap_resolution=150, # 300, name=None, pad=2, white=None, @@ -147,9 +154,7 @@ def process_page(infile, pdfdoc.resolution = bitmap_resolution pdfdoc.greyscale_threshold = greyscale_threshold - data, notalpha, page = pdfdoc.get_image( - pg - 1) # Page numbers are 0-based. - alpha = notalpha != 1 + data, page = pdfdoc.get_image(pg - 1) # Page numbers are 0-based. #----------------------------------------------------------------------- # image load section. @@ -162,19 +167,19 @@ def process_page(infile, width += pad * 2 # reimbed image with a white pad. - bmp = ones((height, width, 3), dtype=bool) + bmp = ones((height, width), dtype=bool) thr = int(255.0 * greyscale_threshold / 100.0) - + imsave("white.png", bmp) bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr) - bmp = bmp == False + #bmp = bmp == False imsave("foo.png", bmp) # Set up Debuging image. img = zeros((height, width, 3), dtype=uint8) - img[:, :, :] = bmp * 255 - #img[:, :, 0] = bmp * 255 - #img[:, :, 1] = bmp * 255 - #img[:, :, 2] = bmp * 255 + #img[:, :, :] = bmp * 255 + img[:, :, 0] = bmp * 255 + img[:, :, 1] = bmp * 255 + img[:, :, 2] = bmp * 255 #----------------------------------------------------------------------- # Find bounding box. @@ -205,10 +210,11 @@ def process_page(infile, r = r + 1 # Mark bounding box. - bmp[t, :] = True - bmp[b, :] = True - bmp[:, l] = True - bmp[:, r] = True + bmp[t, :] = False + bmp[b, :] = False + bmp[:, l] = False + bmp[:, r] = False + imsave("bbox-start.png", bmp) def boxOfString(x, p): s = x.split(":") @@ -255,8 +261,6 @@ def boxOfString(x, p): lthresh = int(line_length * bitmap_resolution) vs = zeros(width, dtype=uint8) - import pdb - pdb.set_trace() for i in range(width): dd = diff(where(bmp[:, i])[0]) if len(dd) > 0: @@ -271,7 +275,7 @@ def boxOfString(x, p): hs = zeros(height, dtype=uint8) for j in range(height): - dd = diff(where(bmp[j, :])[0]) + dd = diff(where(bmp[j, :] == 1)[0]) if len(dd) > 0: h = max(dd) if h > lthresh: @@ -280,7 +284,7 @@ def boxOfString(x, p): # it was a solid black line. if all(bmp[j, 0]) == 0: hs[j] = 1 - hd = (where(diff(hs[:]))[0] + 1) + hd = (where(diff(hs[:]) == 1)[0] + 1) #----------------------------------------------------------------------- # Look for dividors that are too large. From 6ece1da3f86397bdac25e2c8049370772d5b1a8f Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 15:16:32 +0800 Subject: [PATCH 16/28] Ignore debugging data. --- .gitingore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitingore diff --git a/.gitingore b/.gitingore new file mode 100644 index 0000000..6f06927 --- /dev/null +++ b/.gitingore @@ -0,0 +1 @@ +059285.pdf From 829f4ea2e9c2eea16bf6a1c55051c10bcf499f55 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 15:33:36 +0800 Subject: [PATCH 17/28] More adaptation for Python 3. --- example/test_to_pandas.py | 14 +++-- src/pdftableextract/core.py | 2 +- src/pdftableextract/pnm.py | 104 +++++++++++++++++++----------------- 3 files changed, 67 insertions(+), 53 deletions(-) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index d734ea3..5db0c32 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -3,10 +3,16 @@ import pdftableextract as pdf pages = ["1"] -cells = [pdf.process_page("example.pdf",p) for p in pages] +cells = [pdf.process_page("example.pdf", + p, + outfilename="weee.pnm", + checkcrop=False, + checklines=False, + checkdivs=False, + checkcells=False, ) for p in pages] #flatten the cells structure -cells = [item for sublist in cells for item in sublist ] +cells = [item for sublist in cells for item in sublist] #without any options, process_page picks up a blank table at the top of the page. #so choose table '1' @@ -17,5 +23,5 @@ #row '1' contains column headings #data is row '2' through '-1' -data =pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) -print (data) +data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) +print(data) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 491e67d..1765aec 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -73,7 +73,7 @@ def process_page(infile, pgs, boxes=False, encoding="utf8") : - outfile = open(outfilename,'w') if outfilename else sys.stdout + outfile = open(outfilename,'wb') if outfilename else sys.stdout page=page or [] (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3] #check that pdftoppdm exists by running a simple command diff --git a/src/pdftableextract/pnm.py b/src/pdftableextract/pnm.py index ae229ea..befce66 100644 --- a/src/pdftableextract/pnm.py +++ b/src/pdftableextract/pnm.py @@ -1,60 +1,68 @@ from __future__ import print_function from numpy import array, fromstring, uint8, reshape, ones + #----------------------------------------------------------------------- # PNM stuff. + def noncomment(fd): - """Read lines from the filehandle until a non-comment line is found. + """Read lines from the filehandle until a non-comment line is found. Comments start with #""" - while True: - x = fd.readline() - if x.startswith(b'#') : - continue - else: - return x + while True: + x = fd.readline() + if x.startswith(b'#'): + continue + else: + return x + def readPNM(fd): - """Reads the PNM file from the filehandle""" - t = noncomment(fd) - s = noncomment(fd) - m = noncomment(fd) if not (t.startswith(b'P1') or t.startswith(b'P4')) else b'1' - data = fd.read() - ls = len(s.split()) - if ls != 2 : - name = "" if fd.name=="" else "Filename = {0}".format(fd.name) - raise IOError("Expected 2 elements from parsing PNM file, got {0}: {1}".format(ls, name)) - xs, ys = s.split() - width = int(xs) - height = int(ys) - m = int(m) - - if m != 255 : - print ("Just want 8 bit pgms for now!") - - d = fromstring(data,dtype=uint8) - d = reshape(d, (height,width) ) - return (m,width,height, d) - -def writePNM(fd,img): - """Writes a PNM file to a filehandle given the img data as a numpy array""" - s = img.shape - m = 255 - if img.dtype == bool : - img = img + uint8(0) - t = "P5" - m = 1 - elif len(s) == 2 : - t = "P5" - else: - t = "P6" - - fd.write( "%s\n%d %d\n%d\n" % (t, s[1],s[0],m) ) - fd.write( uint8(img).tostring() ) - - -def dumpImage(outfile,bmp,img,bitmap=False, pad=2) : + """Reads the PNM file from the filehandle""" + t = noncomment(fd) + s = noncomment(fd) + m = noncomment(fd) if not (t.startswith(b'P1') or + t.startswith(b'P4')) else b'1' + data = fd.read() + ls = len(s.split()) + if ls != 2: + name = "" if fd.name == "" else "Filename = {0}".format( + fd.name) + raise IOError( + "Expected 2 elements from parsing PNM file, got {0}: {1}".format( + ls, name)) + xs, ys = s.split() + width = int(xs) + height = int(ys) + m = int(m) + + if m != 255: + print("Just want 8 bit pgms for now!") + + d = fromstring(data, dtype=uint8) + d = reshape(d, (height, width)) + return (m, width, height, d) + + +def writePNM(fd, img): + """Writes a PNM file to a filehandle given the img data as a numpy array""" + s = img.shape + m = 255 + if img.dtype == bool: + img = img + uint8(0) + t = b"P5" + m = 1 + elif len(s) == 2: + t = b"P5" + else: + t = b"P6" + + fd.write(b"%s\n%d %d\n%d\n" % (t, s[1], s[0], m)) + fd.write(img.astype(uint8).tobytes()) + + +def dumpImage(outfile, bmp, img, bitmap=False, pad=2): """Dumps the numpy array in image into the filename and closes the outfile""" oi = bmp if bitmap else img - (height,width) = bmp.shape - writePNM(outfile, oi[pad:height-pad, pad:width-pad]) + (height, width) = bmp.shape + writePNM(outfile, oi[pad:height - pad, pad:width - pad]) outfile.close() From b2b6a70f6b4348472873d4b4686581537731f4cd Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 20:43:59 +0800 Subject: [PATCH 18/28] Shortened some relations. --- src/pdftableextract/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 1765aec..70f4322 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -192,7 +192,7 @@ def boxOfString(x,p) : hs = zeros(height, dtype=int) for j in range(height) : - dd = diff( where(bmp[j,:]==1)[0] ) + dd = diff( where(bmp[j,:])[0] ) if len(dd) > 0 : h = max ( dd ) if h > lthresh : @@ -201,7 +201,7 @@ def boxOfString(x,p) : # it was a solid black line. if bmp[j,0] == 0 : hs[j] = 1 - hd=( where(diff(hs[:]==1))[0] +1 ) + hd=( where(diff(hs[:]))[0] +1 ) #----------------------------------------------------------------------- # Look for dividors that are too large. From c5c3c8053a5fb268f748943c3fbc94877fcd6a2e Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 20:59:44 +0800 Subject: [PATCH 19/28] Requirements added. --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a7f3d77 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +vext +numpy +matplotlib +pandas + From 67f838758daec423e6a99babdd4ff64de632cea8 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 22:11:19 +0800 Subject: [PATCH 20/28] Debugging. --- example/test_to_pandas.py | 2 +- src/pdftableextract/core.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index b7f3088..69360ac 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -8,7 +8,7 @@ p, outfilename="weee.pnm", checkcrop=False, - checklines=False, + checklines=True, checkdivs=False, checkcells=False, ) for p in pages] print(cells) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 889fd17..df7766c 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -77,7 +77,6 @@ def get_image(self, index): # C[A] = 255 # C = C.reshape((pxh, pxw)) nd = zeros(C.shape, dtype=uint8) - print(nd.shape, C.shape) nd[:] = C nd[A <= self.greyscale_threshold] = 255 @@ -228,25 +227,25 @@ def process_page(infile, t = 0 imsave("bmp-start.png", bmp) - while t < height and any(bmp[t, :]) == False: + while t < height and bmp[t, :]: t = t + 1 if t > 0: t = t - 1 b = height - 1 - while b > t and any(bmp[b, :]) == False: + while b > t and bmp[b, :]: b = b - 1 if b < height - 1: b = b + 1 l = 0 - while l < width and any(bmp[:, l]) == False: + while l < width and bmp[:, l]: l = l + 1 if l > 0: l = l - 1 r = width - 1 - while r > l and any(bmp[:, r]) == False: + while r > l and bmp[:, r]: r = r - 1 if r < width - 1: r = r + 1 @@ -257,6 +256,7 @@ def process_page(infile, bmp[:, l] = False bmp[:, r] = False imsave("bbox-start.png", bmp) + print ("Bbox", l,t,b,r) def boxOfString(x, p): s = x.split(":") @@ -317,7 +317,7 @@ def boxOfString(x, p): hs = zeros(height, dtype=uint8) for j in range(height): - dd = diff(where(bmp[j, :] == 1)[0]) + dd = diff(where(bmp[j, :])[0]) if len(dd) > 0: h = max(dd) if h > lthresh: @@ -326,7 +326,7 @@ def boxOfString(x, p): # it was a solid black line. if all(bmp[j, 0]) == 0: hs[j] = 1 - hd = (where(diff(hs[:]) == 1)[0] + 1) + hd = (where(diff(hs[:]))[0] + 1) #----------------------------------------------------------------------- # Look for dividors that are too large. From c4a48614e30bd76cce55c411cefa2f441696cb76 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Sat, 16 Jul 2016 22:47:36 +0800 Subject: [PATCH 21/28] Starting to work. --- src/pdftableextract/core.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index df7766c..805c24b 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -227,25 +227,27 @@ def process_page(infile, t = 0 imsave("bmp-start.png", bmp) - while t < height and bmp[t, :]: + while t < height and all(bmp[t, :]): + bbb=bmp[t,:] + print(any(bbb),all(bbb)) t = t + 1 if t > 0: t = t - 1 b = height - 1 - while b > t and bmp[b, :]: + while b > t and all(bmp[b, :]): b = b - 1 if b < height - 1: b = b + 1 l = 0 - while l < width and bmp[:, l]: + while l < width and all(bmp[:, l]): l = l + 1 if l > 0: l = l - 1 r = width - 1 - while r > l and bmp[:, r]: + while r > l and all(bmp[:, r]): r = r - 1 if r < width - 1: r = r + 1 @@ -257,7 +259,7 @@ def process_page(infile, bmp[:, r] = False imsave("bbox-start.png", bmp) print ("Bbox", l,t,b,r) - + def boxOfString(x, p): s = x.split(":") if len(s) < 4: From 62d9598f8cc763b3d5ab8df78c40bd97130feb51 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Mon, 18 Jul 2016 00:37:16 +0800 Subject: [PATCH 22/28] Algorithm refining. Debugging. --- example/test_to_pandas.py | 11 ++- src/pdftableextract/core.py | 140 ++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 84 deletions(-) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index 69360ac..c14dfee 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -6,11 +6,8 @@ cells = [pdf.process_page("example.pdf", p, - outfilename="weee.pnm", - checkcrop=False, - checklines=True, - checkdivs=False, - checkcells=False, ) for p in pages] + outfilename="pandas-test", + checkall=True) for p in pages] print(cells) #flatten the cells structure @@ -26,5 +23,5 @@ #row '1' contains column headings #data is row '2' through '-1' -data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) -print(data) +#data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) +#print(data) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 805c24b..40bd7b6 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -45,7 +45,8 @@ def get_image(self, index): dpi = self.resolution scale = 1 width, height = [int(x) for x in page.get_size()] - d = dpi / 72. + d = self.scale = dpi / 72. + self.frac_scale=1/d pxw, pxh = int(width * d), int(height * d) # data=zeros((pxw,pxh,4), dtype=uint8) surface = cairo.ImageSurface( @@ -97,12 +98,33 @@ def get_image(self, index): imsave('nomask.png', nd) return nd, page + def print_rect(self, msg, r, page): + x1,y1,x2,y2= r.x1, r.y1, r.x2, r.y2 + x, y, w, h = x1, y1, x2 - x1, y2 - y1 + print(msg, x, y, w, h, "---", x1,y1,x2,y2) + width, height = [int(x) for x in page.get_size()] + print(msg, x, height-y, w, h, "---", x1,height-y1,x2,height-y2) + def get_text(self, page, x, y, w, h): + #cb = page.get_crop_box() + #self.print_rect("Rect crop", cb) + width, height = [int(x) for x in page.get_size()] + #print("Page_size", width, height) + print(x, y, w, h) + fc=self.frac_scale + print ("FC:",fc) + x,y,w,h = (z*fc for z in [x,y,w,h]) rect = Poppler.Rectangle() + print("shifted:",x, y, w, h) rect.x1, rect.y1 = x, y rect.x2, rect.y2 = x + w, y + h - # print (help(rect)) + self.print_rect ("box:", rect, page) txt = page.get_text_for_area(rect) + print (txt) + attrs=page.get_text_attributes_for_area(rect) + print([(a.start_index,a.end_index) for a in attrs]) + print(help(attrs[0])) + wer #rect.free() #Poppler.Rectangle.free(rect) return txt @@ -110,53 +132,11 @@ def get_text(self, page, x, y, w, h): def colinterp(a, x): """Interpolates colors""" - l = len(a)-1 - i = min(l, max(0, int (x * l))) - (u,v) = a[i:i+2,:] - return u - (u-v) * ((x * l) % 1.0) - -colarr = array([ [255,0,0],[255,255,0],[0,255,0],[0,255,255],[0,0,255] ]) - -def col(x, colmult=1.0) : - """colors""" - return colinterp(colarr,(colmult * x)% 1.0) / 2 - - -def process_page(infile, pgs, - outfilename=None, - greyscale_threshold=25, - page=None, - crop=None, - line_length=0.17, - bitmap_resolution=300, - name=None, - pad=2, - white=None, - black=None, - bitmap=False, - checkcrop=False, - checklines=False, - checkdivs=False, - checkcells=False, - whitespace="normalize", - boxes=False, - encoding="utf8") : - - outfile = open(outfilename,'wb') if outfilename else sys.stdout - page=page or [] - (pg,frow,lrow) = (list(map(int,(pgs.split(":"))))+[None,None])[0:3] - #check that pdftoppdm exists by running a simple command - check_for_required_executable("pdftoppm",["pdftoppm","-h"]) - #end check - - p = popen("pdftoppm", ("pdftoppm -gray -r %d -f %d -l %d %s " % - (bitmap_resolution,pg,pg,quote(infile))), - stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True ) - -#----------------------------------------------------------------------- -# image load secion. + l = len(a) - 1 + i = min(l, max(0, int(x * l))) + (u, v) = a[i:i + 2, :] + return u - (u - v) * ((x * l) % 1.0) - (maxval, width, height, data) = readPNM(p.stdout) colarr = array( [[255, 0, 0], [255, 255, 0], [0, 255, 0], [0, 255, 255], [0, 0, 255]]) @@ -166,28 +146,34 @@ def col(x, colmult=1.0): """colors""" return colinterp(colarr, (colmult * x) % 1.0) / 2 - def process_page(infile, pgs, outfilename=None, greyscale_threshold=25, page=None, crop=None, - line_length=0.17, - bitmap_resolution=150, # 300, + line_length=0.5, + bitmap_resolution=72, # 300, name=None, pad=2, white=None, black=None, bitmap=False, checkcrop=False, - checklines=True, - checkdivs=True, - checkcells=True, + checklines=False, + checkdivs=False, + checkcells=False, + checkall=False, whitespace="normalize", boxes=False, encoding="utf8"): + if checkall: + checkcrop = True + checklines = True + checkdivs = True + checkcells = True + outfile = outfilename if outfilename else "output" pdfdoc = PopplerProcessor(infile) page = page or [] @@ -222,14 +208,15 @@ def process_page(infile, img[:, :, 1] = bmp * 255 img[:, :, 2] = bmp * 255 + if checkdivs or checkcells: + imgfloat = img.astype(float) + #----------------------------------------------------------------------- # Find bounding box. t = 0 imsave("bmp-start.png", bmp) while t < height and all(bmp[t, :]): - bbb=bmp[t,:] - print(any(bbb),all(bbb)) t = t + 1 if t > 0: t = t - 1 @@ -258,8 +245,8 @@ def process_page(infile, bmp[:, l] = False bmp[:, r] = False imsave("bbox-start.png", bmp) - print ("Bbox", l,t,b,r) - + print("Bbox", l, t, b, r) + def boxOfString(x, p): s = x.split(":") if len(s) < 4: @@ -294,7 +281,6 @@ def boxOfString(x, p): if checkcrop: imsave("crop-" + outfile + ".png", img) - return True #----------------------------------------------------------------------- # Line finding section. @@ -357,20 +343,20 @@ def boxOfString(x, p): for j in hd: img[j, :] = [0, 0, 255] # blue imsave("lines-" + outfile + ".png", img) - return True -#----------------------------------------------------------------------- -# divider checking. -# -# at this point vd holds the x coordinate of vertical and -# hd holds the y coordinate of horizontal divider tansitions for each -# vertical and horizontal lines in the table grid. + + #----------------------------------------------------------------------- + # divider checking. + # + # at this point vd holds the x coordinate of vertical and + # hd holds the y coordinate of horizontal divider tansitions for each + # vertical and horizontal lines in the table grid. def isDiv(a, l, r, t, b): # if any col or row (in axis) is all zeros ... return sum(sum(bmp[t:b, l:r], axis=a) == 0) > 0 if checkdivs: - img = img / 2 + img = (imgfloat / 2).astype(uint8) for j in range(0, len(hd), 2): for i in range(0, len(vd), 2): if i > 0: @@ -387,11 +373,11 @@ def isDiv(a, l, r, t, b): img[t:b, l:r, 0] = 255 img[t:b, l:r, 2] = 0 imsave("divs-" + outfile + ".png", img) - return True -#----------------------------------------------------------------------- -# Cell finding section. -# This algorithum is width hungry, and always generates rectangular -# boxes. + + #----------------------------------------------------------------------- + # Cell finding section. + # This algorithum is width hungry, and always generates rectangular + # boxes. cells = [] touched = zeros((len(hd), len(vd)), dtype=bool) @@ -421,17 +407,17 @@ def isDiv(a, l, r, t, b): if checkcells: nc = len(cells) + 0. - img = img / 2 + img = (imgfloat / 2.).astype(uint8) for k in range(len(cells)): (i, j, u, v) = cells[k] (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], hd[2 * (j + v)]) - img[t:b, l:r] += col(k / nc) + img[t:b, l:r] += col(k / nc).astype(uint8) + imsave("cells-" + outfile + ".png", img) - return True -#----------------------------------------------------------------------- -# fork out to extract text for each cell. + #----------------------------------------------------------------------- + # fork out to extract text for each cell. def getCell(_coordinate): (i, j, u, v) = _coordinate From 8ad74bc030accc7fc024e8b9cfc24a5eb3999763 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Mon, 18 Jul 2016 20:32:37 +0800 Subject: [PATCH 23/28] That's it. It works. --- example/test_to_pandas.py | 8 +- src/pdftableextract/core.py | 141 ++++++++++++++++++++++++++++++------ 2 files changed, 122 insertions(+), 27 deletions(-) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index c14dfee..a9c20bd 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -8,14 +8,12 @@ p, outfilename="pandas-test", checkall=True) for p in pages] -print(cells) #flatten the cells structure cells = [item for sublist in cells for item in sublist] #without any options, process_page picks up a blank table at the top of the page. #so choose table '1' -print(cells) li = pdf.table_to_list(cells, pages)[1] #li is a list of lists, the first line is the header, last is the footer (for this table only!) @@ -23,5 +21,7 @@ #row '1' contains column headings #data is row '2' through '-1' -#data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) -#print(data) +print (cells[:]) + +data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) +print(data) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 40bd7b6..f5d0d99 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,4 +1,5 @@ import sys +import random import os from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape, all, any import numpy @@ -13,10 +14,16 @@ import gi gi.require_version('Gtk', '3.0') gi.require_version('Poppler', '0.18') -from gi.repository import Gdk, Poppler +gi.require_version('Gdk', '3.0') +from gi.repository import Gdk, Poppler #, Glib import cairo +def interact(locals): + import code + code.InteractiveConsole(locals=locals).interact() + + class PopplerProcessor(object): """Class for processing PDF. That's simple. It does two functions. @@ -34,11 +41,22 @@ def __init__(self, filename, **kwargs): self.resolution = 300 self.greyscale_threshold = int(kwargs.get("greyscale_thresholds", 25)) * 255.0 / 100.0 + self.layout = None def get_page(self, index): if index < 0 or index >= self.page_num: raise IndexError("page number is out of bounds") - return self.document.get_page(index) + page = self.document.get_page(index) + if self.layout != None: + #Glib.free(self.layout) + # Do we need freeing elements of the list # FIXME + self.layout = None + self.text = page.get_text() + self.attributes=page.get_text_attributes() + l = page.get_text_layout() + if l[0]: + self.layout = l[1] + return page def get_image(self, index): page = self.get_page(index) @@ -46,7 +64,7 @@ def get_image(self, index): scale = 1 width, height = [int(x) for x in page.get_size()] d = self.scale = dpi / 72. - self.frac_scale=1/d + self.frac_scale = 1 / d pxw, pxh = int(width * d), int(height * d) # data=zeros((pxw,pxh,4), dtype=uint8) surface = cairo.ImageSurface( @@ -98,36 +116,92 @@ def get_image(self, index): imsave('nomask.png', nd) return nd, page - def print_rect(self, msg, r, page): - x1,y1,x2,y2= r.x1, r.y1, r.x2, r.y2 + def print_rect(self, msg=None, r=None, page=None): + if None in [r, page]: + raise ValueError("r and page arguments are required") + x1, y1, x2, y2 = r.x1, r.y1, r.x2, r.y2 x, y, w, h = x1, y1, x2 - x1, y2 - y1 - print(msg, x, y, w, h, "---", x1,y1,x2,y2) + print(msg, x, y, w, h, "---", x1, y1, x2, y2) width, height = [int(x) for x in page.get_size()] - print(msg, x, height-y, w, h, "---", x1,height-y1,x2,height-y2) + print(msg, x, height - y, w, h, "---", x1, height - y1, x2, + height - y2) + + def within(self, a, b, pad=0): + """Is Rectangle b within Rectangle a, i.e. b is in a + """ + if b.x1+pad < a.x1: return False + if b.y1+pad < a.y1: return False + if b.x2-pad > a.x2: return False + if b.y2-pad > a.y2: return False + return True + + def rexpand(self, rect, layout, pad=0): + """Make rectangle rect include layout + + Arguments: + - `rect`: Adjustable Rectangle; + - `layout`: Rectangle to be included in rect. + """ + + r, l = rect, layout + if r.x1 > l.x1: r.x1 = l.x1-pad + if r.y1 > l.y1: r.y1 = l.y1-pad + if r.x2 < l.x2: r.x2 = l.x2+pad + if r.y2 < l.y2: r.y2 = l.y2+pad def get_text(self, page, x, y, w, h): #cb = page.get_crop_box() #self.print_rect("Rect crop", cb) width, height = [int(x) for x in page.get_size()] #print("Page_size", width, height) - print(x, y, w, h) - fc=self.frac_scale - print ("FC:",fc) - x,y,w,h = (z*fc for z in [x,y,w,h]) + ##print(x, y, w, h) + fc = self.frac_scale + ##print("FC:", fc) + x, y, w, h = (z * fc for z in [x, y, w, h]) rect = Poppler.Rectangle() - print("shifted:",x, y, w, h) + ##print("shifted:", x, y, w, h) rect.x1, rect.y1 = x, y rect.x2, rect.y2 = x + w, y + h - self.print_rect ("box:", rect, page) + assert rect.x1<=rect.x2 + assert rect.y1<=rect.y2 + #self.print_rect("box:", rect, page) txt = page.get_text_for_area(rect) - print (txt) - attrs=page.get_text_attributes_for_area(rect) - print([(a.start_index,a.end_index) for a in attrs]) - print(help(attrs[0])) - wer + ##print(txt) + attrs = page.get_text_attributes_for_area(rect) + ##print([(a.start_index, a.end_index) for a in attrs]) + ##print(help(attrs[0])) + #chars=[] + r = Poppler.Rectangle() + r.x1 = r.y1 = 1e10 + r.x2 = r.y2 = -1e10 + chars=[] + for k,l in enumerate(self.layout): + if self.within(rect, l, pad=1): + self.rexpand(r, l, pad=0.5) + chars.append(self.text[k]) + txt1="".join(chars) + #txt1 = page.get_text_for_area(r) + print ((r.x1,r.y1,r.x2,r.y2),txt1) + + #interact(locals={"p": page, "d": self.document, "self": self}) #rect.free() #Poppler.Rectangle.free(rect) - return txt + return txt1, r + + def get_rectangles_for_page(self, page): + """Return all rectangles for all letters in the page.. + Used for debugging + + Arguments: + - `page`: + """ + layout=self.layout + if layout == None: + raise RuntimeError("page is not chosen") + + #interact(locals={"layout":layout, "self":self}) + answer = [(r.x1,r.y1,r.x2,r.y2) for r in layout] + return answer def colinterp(a, x): @@ -164,6 +238,7 @@ def process_page(infile, checkdivs=False, checkcells=False, checkall=False, + checkletters=False, whitespace="normalize", boxes=False, encoding="utf8"): @@ -173,6 +248,7 @@ def process_page(infile, checklines = True checkdivs = True checkcells = True + checkletters = True outfile = outfilename if outfilename else "output" pdfdoc = PopplerProcessor(infile) @@ -208,9 +284,19 @@ def process_page(infile, img[:, :, 1] = bmp * 255 img[:, :, 2] = bmp * 255 - if checkdivs or checkcells: + if checkdivs or checkcells or checkletters: imgfloat = img.astype(float) + if checkletters: + img = (imgfloat/2.).astype(uint8) + rectangles=pdfdoc.get_rectangles_for_page(pg) + lrn=len(rectangles) + for k,r in enumerate(rectangles): + x1,y1,x2,y2 = [k+pad+1 for k in r] + img[y1:y2, x1:x2] += col(random.random()).astype(uint8) + imsave("letters.png", img) + + #----------------------------------------------------------------------- # Find bounding box. t = 0 @@ -419,21 +505,30 @@ def isDiv(a, l, r, t, b): #----------------------------------------------------------------------- # fork out to extract text for each cell. - def getCell(_coordinate): + def getCell(_coordinate, img=None): (i, j, u, v) = _coordinate (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], hd[2 * (j + v)]) - ret = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t) + ret, rect = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t) + if img != None and checkcells: + (x1,y1,x2,y2) = [rrr+pad for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]] + img[y1:y2,x1:x2] += col(random.random()).astype(uint8) + return (i, j, u, v, pg, ret) + if checkcells: + img = (imgfloat / 2.).astype(uint8) if boxes: cells = [x + (pg, "", ) for x in cells if (frow == None or (x[1] >= frow and x[1] <= lrow))] else: print(cells) - cells = [getCell(x) for x in cells + cells = [getCell(x, img) for x in cells if (frow == None or (x[1] >= frow and x[1] <= lrow))] + if checkcells: + imsave("text-locations.png", img) + return cells #----------------------------------------------------------------------- From 2484b75c461bfa32e7495e2a00df011064624eb4 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Mon, 18 Jul 2016 20:36:54 +0800 Subject: [PATCH 24/28] Removed debugging statements. --- example/test_to_pandas.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index a9c20bd..95e68d0 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -21,7 +21,5 @@ #row '1' contains column headings #data is row '2' through '-1' -print (cells[:]) - data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) print(data) From 536b889b698d07a4ad364ed0b9b0c1134de4d22a Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Mon, 18 Jul 2016 21:39:57 +0800 Subject: [PATCH 25/28] Removed some debugging statements. --- src/pdftableextract/core.py | 91 ++++++++++++++----------------------- 1 file changed, 33 insertions(+), 58 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index f5d0d99..81f3b55 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -66,7 +66,6 @@ def get_image(self, index): d = self.scale = dpi / 72. self.frac_scale = 1 / d pxw, pxh = int(width * d), int(height * d) - # data=zeros((pxw,pxh,4), dtype=uint8) surface = cairo.ImageSurface( # data, cairo.FORMAT_ARGB32, @@ -82,41 +81,25 @@ def get_image(self, index): pixbuf = Gdk.pixbuf_get_from_surface(surface, 0, 0, pxw, pxh) surface.write_to_png("page.png") - #img=image.set_from_pixbuf (pixbuf) data = frombuffer(pixbuf.get_pixels(), dtype=uint8) R = data[0::4] G = data[1::4] B = data[2::4] A = data[3::4] - C = R * 34 + G * 56 + B * 10 / 100. - # # print (max(A)) + C = (R * 34 + G * 56 + B * 10) / 100. # Convert to gray + C = C.astype(uint8) - # A = A <= self.greyscale_threshold - # C[A] = 255 - # C = C.reshape((pxh, pxw)) nd = zeros(C.shape, dtype=uint8) nd[:] = C nd[A <= self.greyscale_threshold] = 255 - - #data = data.reshape((pxh, pxw, 4)) - #d = data[:, :, 3] - #alpha = data[:, :, 3] - #new = zeros(data.shape, dtype=uint8) - #new[:, :, :] = data - #new = new[:, :, 0:3] - #print(data) - #rc = alpha <= self.greyscale_threshold - - #new[rc, 0] = 255 - #new[rc, 1] = 255 - #new[rc, 2] = 255 - #new[:, :, 3] = 255 nd = nd.reshape((pxh, pxw)) - imsave('nomask.png', nd) + # imsave('nomask.png', nd) return nd, page def print_rect(self, msg=None, r=None, page=None): + """Used for debugging. + """ if None in [r, page]: raise ValueError("r and page arguments are required") x1, y1, x2, y2 = r.x1, r.y1, r.x2, r.y2 @@ -127,7 +110,11 @@ def print_rect(self, msg=None, r=None, page=None): height - y2) def within(self, a, b, pad=0): - """Is Rectangle b within Rectangle a, i.e. b is in a + """Is Rectangle b within Rectangle a, i.e. b is in a. + + Arguments: + - `a`, `b` : The rectangles; + - `pad` : Additional space. """ if b.x1+pad < a.x1: return False if b.y1+pad < a.y1: return False @@ -150,27 +137,19 @@ def rexpand(self, rect, layout, pad=0): if r.y2 < l.y2: r.y2 = l.y2+pad def get_text(self, page, x, y, w, h): - #cb = page.get_crop_box() - #self.print_rect("Rect crop", cb) width, height = [int(x) for x in page.get_size()] - #print("Page_size", width, height) - ##print(x, y, w, h) fc = self.frac_scale - ##print("FC:", fc) x, y, w, h = (z * fc for z in [x, y, w, h]) rect = Poppler.Rectangle() - ##print("shifted:", x, y, w, h) rect.x1, rect.y1 = x, y rect.x2, rect.y2 = x + w, y + h assert rect.x1<=rect.x2 assert rect.y1<=rect.y2 - #self.print_rect("box:", rect, page) - txt = page.get_text_for_area(rect) - ##print(txt) - attrs = page.get_text_attributes_for_area(rect) - ##print([(a.start_index, a.end_index) for a in attrs]) - ##print(help(attrs[0])) - #chars=[] + + # Could not make it work correctly # FIXME + # txt = page.get_text_for_area(rect) + # attrs = page.get_text_attributes_for_area(rect) + r = Poppler.Rectangle() r.x1 = r.y1 = 1e10 r.x2 = r.y2 = -1e10 @@ -179,27 +158,23 @@ def get_text(self, page, x, y, w, h): if self.within(rect, l, pad=1): self.rexpand(r, l, pad=0.5) chars.append(self.text[k]) - txt1="".join(chars) - #txt1 = page.get_text_for_area(r) - print ((r.x1,r.y1,r.x2,r.y2),txt1) + txt="".join(chars) + + # txt = page.get_text_for_area(r) # FIXME - #interact(locals={"p": page, "d": self.document, "self": self}) - #rect.free() - #Poppler.Rectangle.free(rect) - return txt1, r + return txt, r def get_rectangles_for_page(self, page): """Return all rectangles for all letters in the page.. - Used for debugging + Used for debugging. Arguments: - - `page`: + - `page`: referece to page """ layout=self.layout if layout == None: raise RuntimeError("page is not chosen") - #interact(locals={"layout":layout, "self":self}) answer = [(r.x1,r.y1,r.x2,r.y2) for r in layout] return answer @@ -262,8 +237,7 @@ def process_page(infile, #----------------------------------------------------------------------- # image load section. - #print(data.shape) - height, width = data.shape[:2] + height, width = data.shape[:2] # If not to reduce to gray, the shape will be (,,3) or (,,4). pad = int(pad) height += pad * 2 @@ -273,13 +247,15 @@ def process_page(infile, bmp = ones((height, width), dtype=bool) thr = int(255.0 * greyscale_threshold / 100.0) - imsave("white.png", bmp) + bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr) - #bmp = bmp == False - imsave("foo.png", bmp) + + # Set up Debuging image. img = zeros((height, width, 3), dtype=uint8) - #img[:, :, :] = bmp * 255 + + # img[:, :, :] = bmp * 255 # In case of colored input image + img[:, :, 0] = bmp * 255 img[:, :, 1] = bmp * 255 img[:, :, 2] = bmp * 255 @@ -287,7 +263,7 @@ def process_page(infile, if checkdivs or checkcells or checkletters: imgfloat = img.astype(float) - if checkletters: + if checkletters: # Show bounding boxes for each text object. img = (imgfloat/2.).astype(uint8) rectangles=pdfdoc.get_rectangles_for_page(pg) lrn=len(rectangles) @@ -300,7 +276,6 @@ def process_page(infile, #----------------------------------------------------------------------- # Find bounding box. t = 0 - imsave("bmp-start.png", bmp) while t < height and all(bmp[t, :]): t = t + 1 @@ -330,8 +305,6 @@ def process_page(infile, bmp[b, :] = False bmp[:, l] = False bmp[:, r] = False - imsave("bbox-start.png", bmp) - print("Bbox", l, t, b, r) def boxOfString(x, p): s = x.split(":") @@ -510,14 +483,16 @@ def getCell(_coordinate, img=None): (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], hd[2 * (j + v)]) ret, rect = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t) + if img != None and checkcells: (x1,y1,x2,y2) = [rrr+pad for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]] img[y1:y2,x1:x2] += col(random.random()).astype(uint8) return (i, j, u, v, pg, ret) - if checkcells: + if checkletters: img = (imgfloat / 2.).astype(uint8) + if boxes: cells = [x + (pg, "", ) for x in cells @@ -526,7 +501,7 @@ def getCell(_coordinate, img=None): print(cells) cells = [getCell(x, img) for x in cells if (frow == None or (x[1] >= frow and x[1] <= lrow))] - if checkcells: + if checkletters: imsave("text-locations.png", img) return cells From 9fc9279e434f9f2e158df86da90c626ba72ea868 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Mon, 18 Jul 2016 22:36:37 +0800 Subject: [PATCH 26/28] Possibly ready to go. --- example/test_to_pandas.py | 3 ++- src/pdftableextract/core.py | 26 +++++++++++++++----------- src/pdftableextract/scripts.py | 25 +++++++++---------------- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index 95e68d0..3b30c80 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -7,7 +7,8 @@ cells = [pdf.process_page("example.pdf", p, outfilename="pandas-test", - checkall=True) for p in pages] + bitmap_resolution=100, + checkall=False) for p in pages] #flatten the cells structure cells = [item for sublist in cells for item in sublist] diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index 81f3b55..c5c3161 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,12 +1,17 @@ import sys -import random import os + +DEBUG = False + +if DEBUG: + import random from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape, all, any import numpy -import matplotlib -matplotlib.use('AGG') -from matplotlib.image import imsave +if DEBUG: + import matplotlib + matplotlib.use('AGG') + from matplotlib.image import imsave from xml.dom.minidom import getDOMImplementation import json @@ -80,13 +85,13 @@ def get_image(self, index): context.restore() pixbuf = Gdk.pixbuf_get_from_surface(surface, 0, 0, pxw, pxh) - surface.write_to_png("page.png") + # surface.write_to_png("page.png") data = frombuffer(pixbuf.get_pixels(), dtype=uint8) R = data[0::4] G = data[1::4] B = data[2::4] A = data[3::4] - C = (R * 34 + G * 56 + B * 10) / 100. # Convert to gray + C = (R * 34. + G * 56. + B * 10.) / 100. # Convert to gray C = C.astype(uint8) @@ -202,7 +207,7 @@ def process_page(infile, page=None, crop=None, line_length=0.5, - bitmap_resolution=72, # 300, + bitmap_resolution=300, name=None, pad=2, white=None, @@ -268,7 +273,7 @@ def process_page(infile, rectangles=pdfdoc.get_rectangles_for_page(pg) lrn=len(rectangles) for k,r in enumerate(rectangles): - x1,y1,x2,y2 = [k+pad+1 for k in r] + x1,y1,x2,y2 = [int(bitmap_resolution* float(k)/72.)+pad for k in r] img[y1:y2, x1:x2] += col(random.random()).astype(uint8) imsave("letters.png", img) @@ -484,8 +489,8 @@ def getCell(_coordinate, img=None): hd[2 * (j + v)]) ret, rect = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t) - if img != None and checkcells: - (x1,y1,x2,y2) = [rrr+pad for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]] + if type(img)!=type(None) and checkletters: + (x1,y1,x2,y2) = [int(bitmap_resolution * float(rrr)/72+pad) for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]] img[y1:y2,x1:x2] += col(random.random()).astype(uint8) return (i, j, u, v, pg, ret) @@ -498,7 +503,6 @@ def getCell(_coordinate, img=None): "", ) for x in cells if (frow == None or (x[1] >= frow and x[1] <= lrow))] else: - print(cells) cells = [getCell(x, img) for x in cells if (frow == None or (x[1] >= frow and x[1] <= lrow))] if checkletters: diff --git a/src/pdftableextract/scripts.py b/src/pdftableextract/scripts.py index 6939713..97a2ee2 100644 --- a/src/pdftableextract/scripts.py +++ b/src/pdftableextract/scripts.py @@ -25,10 +25,10 @@ def procargs() : p.add_argument("-name", help="name to add to XML tag, or HTML comments") p.add_argument("-pad", help="imitial image pading (pixels)", type=int, default=2 ) - p.add_argument("-white",action="append", + p.add_argument("-white",action="append", help="paint white to the bitmap as left:top:right:bottom in length units." "Done before painting black" ) - p.add_argument("-black",action="append", + p.add_argument("-black",action="append", help="paint black to the bitmap as left:top:right:bottom in length units." "Done after poainting white" ) p.add_argument("-bitmap", action="store_true", @@ -67,14 +67,10 @@ def main(): raise sys.exit("I/O Error running pdf-table-extract: {0}".format(e)) except OSError as e: - print("An OS Error occurred running pdf-table-extract: Is `pdftoppm` installed and available?") + print("An OS Error occurred running pdf-table-extract") if args.traceback: raise sys.exit("OS Error: {0}".format(e)) - except subprocess.CalledProcessError as e: - if args.traceback: - raise - sys.exit("Error while checking a subprocess call: {0}".format(e)) except Exception as e: if args.traceback: raise @@ -85,9 +81,9 @@ def imain(args): if args.checkcrop or args.checklines or args.checkdivs or args.checkcells: for pgs in args.page : success = process_page(args.infile, pgs, - bitmap=args.bitmap, - checkcrop=args.checkcrop, - checklines=args.checklines, + bitmap=args.bitmap, + checkcrop=args.checkcrop, + checklines=args.checklines, checkdivs=args.checkdivs, checkcells=args.checkcells, whitespace=args.whitespace, @@ -105,9 +101,9 @@ def imain(args): else: for pgs in args.page : cells.extend(process_page(args.infile, pgs, - bitmap=args.bitmap, - checkcrop=args.checkcrop, - checklines=args.checklines, + bitmap=args.bitmap, + checkcrop=args.checkcrop, + checklines=args.checklines, checkdivs=args.checkdivs, checkcells=args.checkcells, whitespace=args.whitespace, @@ -127,6 +123,3 @@ def imain(args): args.outfile = sys.stdout filenames["{0}_filename".format(args.t)] = args.outfile output(cells, args.page, name=args.name, infile=args.infile, output_type=args.t, **filenames) - - - From 82fd20206735fd82f2fad864bf14603930081849 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Tue, 19 Jul 2016 01:04:05 +0800 Subject: [PATCH 27/28] Experimenting with page recognition. --- src/pdftableextract/core.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index c5c3161..c1acd76 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,7 +1,7 @@ import sys import os -DEBUG = False +DEBUG = True if DEBUG: import random @@ -121,11 +121,18 @@ def within(self, a, b, pad=0): - `a`, `b` : The rectangles; - `pad` : Additional space. """ - if b.x1+pad < a.x1: return False - if b.y1+pad < a.y1: return False - if b.x2-pad > a.x2: return False - if b.y2-pad > a.y2: return False - return True + if b.x1>=a.x1 and b.y1>=a.y1 and b.x2<=a.x2 and b.y2<=a.y2: # The obvious case. + return True + def w(x,y): + if x>=a.x1+pad and x<=a.x2-pad and y>=a.y1+pad and y<=a.y2-pad: + return True + else: + return False + for x,y in [(b.x1,b.y1), (b.x2,b.y2), (b.x1,b.y2), (b.x2,b.y1)]: + if w(x,y): + return True + # FIXME if b is bigger a and intersects it... + return False def rexpand(self, rect, layout, pad=0): """Make rectangle rect include layout @@ -160,8 +167,8 @@ def get_text(self, page, x, y, w, h): r.x2 = r.y2 = -1e10 chars=[] for k,l in enumerate(self.layout): - if self.within(rect, l, pad=1): - self.rexpand(r, l, pad=0.5) + if self.within(rect, l, pad=0): + self.rexpand(r, l, pad=0) chars.append(self.text[k]) txt="".join(chars) @@ -275,7 +282,7 @@ def process_page(infile, for k,r in enumerate(rectangles): x1,y1,x2,y2 = [int(bitmap_resolution* float(k)/72.)+pad for k in r] img[y1:y2, x1:x2] += col(random.random()).astype(uint8) - imsave("letters.png", img) + imsave(outfile+"-letters.png", img) #----------------------------------------------------------------------- @@ -344,7 +351,7 @@ def boxOfString(x, p): img[t:b + 1, l:r + 1] = [0, 0, 0] if checkcrop: - imsave("crop-" + outfile + ".png", img) + imsave(outfile+"-crop.png", img) #----------------------------------------------------------------------- # Line finding section. @@ -406,7 +413,7 @@ def boxOfString(x, p): for j in hd: img[j, :] = [0, 0, 255] # blue - imsave("lines-" + outfile + ".png", img) + imsave(outfile+"-lines.png", img) #----------------------------------------------------------------------- # divider checking. @@ -436,7 +443,7 @@ def isDiv(a, l, r, t, b): if isDiv(0, l, r, t, b): img[t:b, l:r, 0] = 255 img[t:b, l:r, 2] = 0 - imsave("divs-" + outfile + ".png", img) + imsave(outfile+"-divs.png", img) #----------------------------------------------------------------------- # Cell finding section. @@ -476,9 +483,9 @@ def isDiv(a, l, r, t, b): (i, j, u, v) = cells[k] (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], hd[2 * (j + v)]) - img[t:b, l:r] += col(k / nc).astype(uint8) + img[t:b, l:r] += col(k*0.9 / nc + 0.1*random.random()).astype(uint8) - imsave("cells-" + outfile + ".png", img) + imsave(outfile+"-cells.png", img) #----------------------------------------------------------------------- # fork out to extract text for each cell. @@ -506,7 +513,7 @@ def getCell(_coordinate, img=None): cells = [getCell(x, img) for x in cells if (frow == None or (x[1] >= frow and x[1] <= lrow))] if checkletters: - imsave("text-locations.png", img) + imsave(outfile+"-text-locations.png", img) return cells From 2ef0dad951b4827ae8034e2801bebd6b925e4378 Mon Sep 17 00:00:00 2001 From: Evgeny Cherkashin Date: Tue, 19 Jul 2016 10:18:27 +0800 Subject: [PATCH 28/28] Better overlapping condition. No debugging. --- src/pdftableextract/core.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index c1acd76..7af488a 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,7 +1,7 @@ import sys import os -DEBUG = True +DEBUG = False if DEBUG: import random @@ -114,25 +114,14 @@ def print_rect(self, msg=None, r=None, page=None): print(msg, x, height - y, w, h, "---", x1, height - y1, x2, height - y2) - def within(self, a, b, pad=0): - """Is Rectangle b within Rectangle a, i.e. b is in a. + def overlap(self, a, b, pad=0): + """Check if Rectangle b and Rectangle overlaps. Arguments: - `a`, `b` : The rectangles; - - `pad` : Additional space. + - `pad` : Additional space. (IGNORED) """ - if b.x1>=a.x1 and b.y1>=a.y1 and b.x2<=a.x2 and b.y2<=a.y2: # The obvious case. - return True - def w(x,y): - if x>=a.x1+pad and x<=a.x2-pad and y>=a.y1+pad and y<=a.y2-pad: - return True - else: - return False - for x,y in [(b.x1,b.y1), (b.x2,b.y2), (b.x1,b.y2), (b.x2,b.y1)]: - if w(x,y): - return True - # FIXME if b is bigger a and intersects it... - return False + return a.x1 < b.x2 and a.x2 > b.x1 and a.y1 < b.y2 and a.y2 > b.y1 def rexpand(self, rect, layout, pad=0): """Make rectangle rect include layout @@ -167,7 +156,7 @@ def get_text(self, page, x, y, w, h): r.x2 = r.y2 = -1e10 chars=[] for k,l in enumerate(self.layout): - if self.within(rect, l, pad=0): + if self.overlap(rect, l, pad=0): self.rexpand(r, l, pad=0) chars.append(self.text[k]) txt="".join(chars)