diff --git a/.gitingore b/.gitingore new file mode 100644 index 0000000..6f06927 --- /dev/null +++ b/.gitingore @@ -0,0 +1 @@ +059285.pdf diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..63821a7 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +.PHONY: develop setup run-tests tests test gdb-test + +LPYTHON=python3 +V=$(PWD)/../../$(LPYTHON) +VB=$(V)/bin +PYTHON=$(VB)/$(LPYTHON) +ROOT=$(PWD) +#INI=icc.linkgrammar +#LCAT=src/icc/linkgrammar/locale/ + +develop: setup + pip install -r requirements.txt + +setup: + python setup.py develop + +run-tests: + nosetests -w src/icc/tests + +tests: run-tests + +test: setup run-tests + +gdb-test: setup + gdb --args $(PYTHON) $(VB)/nosetests -w src/icc/tests diff --git a/README.md b/README.md index 222c14d..3085e5a 100644 --- a/README.md +++ b/README.md @@ -13,4 +13,8 @@ tables in ST Micro’s datasheets. The script requires numpy and poppler ###Tags [Utilities](http://ashimagroup.net/os/tag/utilities) - +###Requires +apt-get install python-dev poppler-utils +yum install python-devel poppler-utils +[numpy](http://www.numpy.org/) +[pandas](http://pandas.pydata.org/) diff --git a/example/test_to_pandas.py b/example/test_to_pandas.py index bb31515..3b30c80 100644 --- a/example/test_to_pandas.py +++ b/example/test_to_pandas.py @@ -1,11 +1,17 @@ +from __future__ import print_function import pandas as pd import pdftableextract as pdf pages = ["1"] -cells = [pdf.process_page("example.pdf",p) for p in pages] + +cells = [pdf.process_page("example.pdf", + p, + outfilename="pandas-test", + bitmap_resolution=100, + checkall=False) for p in pages] #flatten the cells structure -cells = [item for sublist in cells for item in sublist ] +cells = [item for sublist in cells for item in sublist] #without any options, process_page picks up a blank table at the top of the page. #so choose table '1' @@ -16,5 +22,5 @@ #row '1' contains column headings #data is row '2' through '-1' -data =pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) -print data +data = pd.DataFrame(li[2:-1], columns=li[1], index=[l[0] for l in li[2:-1]]) +print(data) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e917532 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +ruamel.venvgtk +numpy +matplotlib +pandas diff --git a/setup.py b/setup.py index 8591c50..66ef158 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,9 @@ README = open(os.path.join(here, 'README.md')).read() #NEWS = open(os.path.join(here, 'NEWS.txt')).read() - version = '0.1' -install_requires = [ "numpy" ] +install_requires = ["numpy", "ruamel.venvgtk"] setup(name='pdf-table-extract', @@ -21,7 +20,7 @@ keywords='PDF, tables', author='Ian McEwan', author_email='ijm@ashimaresearch.com', - url='ashimaresearch.com', + url='ashimaresearch.dcom', license='MIT-Expat', packages=find_packages('src'), package_dir = {'': 'src'},include_package_data=True, diff --git a/src/pdftableextract/__init__.py b/src/pdftableextract/__init__.py index 6dbe85c..8366135 100644 --- a/src/pdftableextract/__init__.py +++ b/src/pdftableextract/__init__.py @@ -1,2 +1,2 @@ # Example package with a console entry point -from core import process_page, output, table_to_list \ No newline at end of file +from pdftableextract.core import process_page, output, table_to_list diff --git a/src/pdftableextract/core.py b/src/pdftableextract/core.py index d1dce80..7af488a 100644 --- a/src/pdftableextract/core.py +++ b/src/pdftableextract/core.py @@ -1,481 +1,697 @@ import sys import os -from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete -import subprocess -from pipes import quote -from .pnm import readPNM, dumpImage -import re -from pipes import quote + +DEBUG = False + +if DEBUG: + import random +from numpy import array, fromstring, ones, zeros, uint8, diff, where, sum, delete, frombuffer, reshape, all, any +import numpy + +if DEBUG: + import matplotlib + matplotlib.use('AGG') + from matplotlib.image import imsave + from xml.dom.minidom import getDOMImplementation import json import csv +import gi +gi.require_version('Gtk', '3.0') +gi.require_version('Poppler', '0.18') +gi.require_version('Gdk', '3.0') +from gi.repository import Gdk, Poppler #, Glib +import cairo + + +def interact(locals): + import code + code.InteractiveConsole(locals=locals).interact() + + +class PopplerProcessor(object): + """Class for processing PDF. That's simple. + It does two functions. + 1. Renders a page as a PNM graphics, and + 2. Get text in a rectangular bounding box. + """ + + def __init__(self, filename, **kwargs): + """Opens a document denoted by filename. + """ + self.filename = os.path.abspath(filename) + self.document = Poppler.Document.new_from_file("file:" + self.filename, + None) + self.page_num = self.document.get_n_pages() + self.resolution = 300 + self.greyscale_threshold = int(kwargs.get("greyscale_thresholds", + 25)) * 255.0 / 100.0 + self.layout = None + + def get_page(self, index): + if index < 0 or index >= self.page_num: + raise IndexError("page number is out of bounds") + page = self.document.get_page(index) + if self.layout != None: + #Glib.free(self.layout) + # Do we need freeing elements of the list # FIXME + self.layout = None + self.text = page.get_text() + self.attributes=page.get_text_attributes() + l = page.get_text_layout() + if l[0]: + self.layout = l[1] + return page + + def get_image(self, index): + page = self.get_page(index) + dpi = self.resolution + scale = 1 + width, height = [int(x) for x in page.get_size()] + d = self.scale = dpi / 72. + self.frac_scale = 1 / d + pxw, pxh = int(width * d), int(height * d) + surface = cairo.ImageSurface( + # data, + cairo.FORMAT_ARGB32, + pxw, + pxh) + + context = cairo.Context(surface) + context.scale(d, d) + + context.save() + page.render(context) + context.restore() + + pixbuf = Gdk.pixbuf_get_from_surface(surface, 0, 0, pxw, pxh) + # surface.write_to_png("page.png") + data = frombuffer(pixbuf.get_pixels(), dtype=uint8) + R = data[0::4] + G = data[1::4] + B = data[2::4] + A = data[3::4] + C = (R * 34. + G * 56. + B * 10.) / 100. # Convert to gray + + C = C.astype(uint8) + + nd = zeros(C.shape, dtype=uint8) + nd[:] = C + nd[A <= self.greyscale_threshold] = 255 + nd = nd.reshape((pxh, pxw)) + # imsave('nomask.png', nd) + return nd, page + + def print_rect(self, msg=None, r=None, page=None): + """Used for debugging. + """ + if None in [r, page]: + raise ValueError("r and page arguments are required") + x1, y1, x2, y2 = r.x1, r.y1, r.x2, r.y2 + x, y, w, h = x1, y1, x2 - x1, y2 - y1 + print(msg, x, y, w, h, "---", x1, y1, x2, y2) + width, height = [int(x) for x in page.get_size()] + print(msg, x, height - y, w, h, "---", x1, height - y1, x2, + height - y2) + + def overlap(self, a, b, pad=0): + """Check if Rectangle b and Rectangle overlaps. + + Arguments: + - `a`, `b` : The rectangles; + - `pad` : Additional space. (IGNORED) + """ + return a.x1 < b.x2 and a.x2 > b.x1 and a.y1 < b.y2 and a.y2 > b.y1 + + def rexpand(self, rect, layout, pad=0): + """Make rectangle rect include layout + + Arguments: + - `rect`: Adjustable Rectangle; + - `layout`: Rectangle to be included in rect. + """ + + r, l = rect, layout + if r.x1 > l.x1: r.x1 = l.x1-pad + if r.y1 > l.y1: r.y1 = l.y1-pad + if r.x2 < l.x2: r.x2 = l.x2+pad + if r.y2 < l.y2: r.y2 = l.y2+pad + + def get_text(self, page, x, y, w, h): + width, height = [int(x) for x in page.get_size()] + fc = self.frac_scale + x, y, w, h = (z * fc for z in [x, y, w, h]) + rect = Poppler.Rectangle() + rect.x1, rect.y1 = x, y + rect.x2, rect.y2 = x + w, y + h + assert rect.x1<=rect.x2 + assert rect.y1<=rect.y2 + + # Could not make it work correctly # FIXME + # txt = page.get_text_for_area(rect) + # attrs = page.get_text_attributes_for_area(rect) + + r = Poppler.Rectangle() + r.x1 = r.y1 = 1e10 + r.x2 = r.y2 = -1e10 + chars=[] + for k,l in enumerate(self.layout): + if self.overlap(rect, l, pad=0): + self.rexpand(r, l, pad=0) + chars.append(self.text[k]) + txt="".join(chars) + + # txt = page.get_text_for_area(r) # FIXME + + return txt, r + + def get_rectangles_for_page(self, page): + """Return all rectangles for all letters in the page.. + Used for debugging. + + Arguments: + - `page`: referece to page + """ + layout=self.layout + if layout == None: + raise RuntimeError("page is not chosen") + + answer = [(r.x1,r.y1,r.x2,r.y2) for r in layout] + return answer + + +def colinterp(a, x): + """Interpolates colors""" + l = len(a) - 1 + i = min(l, max(0, int(x * l))) + (u, v) = a[i:i + 2, :] + return u - (u - v) * ((x * l) % 1.0) -#----------------------------------------------------------------------- -def check_for_required_executable(name,command): - """Checks for an executable called 'name' by running 'command' and supressing - output. If the return code is non-zero or an OS error occurs, an Exception is raised""" - try: - with open(os.devnull, "w") as fnull: - result=subprocess.check_call(command,stdout=fnull, stderr=fnull) - except OSError as e: - message = """Error running {0}. -Command failed: {1} -{2}""".format(name, " ".join(command), e) - raise OSError(message) - except subprocess.CalledProcessError as e: - raise - except Exception as e: - raise -#----------------------------------------------------------------------- -def popen(name,command, *args, **kwargs): - try: - result=subprocess.Popen(command,*args, **kwargs) - return result - except OSError, e: - message="""Error running {0}. Is it installed correctly? -Error: {1}""".format(name, e) - raise OSError(message) - except Exception, e: - raise - -def colinterp(a,x) : - """Interpolates colors""" - l = len(a)-1 - i = min(l, max(0, int (x * l))) - (u,v) = a[i:i+2,:] - return u - (u-v) * ((x * l) % 1.0) +colarr = array( + [[255, 0, 0], [255, 255, 0], [0, 255, 0], [0, 255, 255], [0, 0, 255]]) -colarr = array([ [255,0,0],[255,255,0],[0,255,0],[0,255,255],[0,0,255] ]) -def col(x, colmult=1.0) : +def col(x, colmult=1.0): """colors""" - return colinterp(colarr,(colmult * x)% 1.0) / 2 - - -def process_page(infile, pgs, - outfilename=None, - greyscale_threshold=25, - page=None, - crop=None, - line_length=0.17, - bitmap_resolution=300, - name=None, - pad=2, - white=None, - black=None, - bitmap=False, - checkcrop=False, - checklines=False, - checkdivs=False, - checkcells=False, - whitespace="normalize", - boxes=False) : - - outfile = open(outfilename,'w') if outfilename else sys.stdout - page=page or [] - (pg,frow,lrow) = (map(int,(pgs.split(":")))+[None,None])[0:3] - #check that pdftoppdm exists by running a simple command - check_for_required_executable("pdftoppm",["pdftoppm","-h"]) - #end check - - p = popen("pdftoppm", ("pdftoppm -gray -r %d -f %d -l %d %s " % - (bitmap_resolution,pg,pg,quote(infile))), - stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True ) - -#----------------------------------------------------------------------- -# image load secion. + return colinterp(colarr, (colmult * x) % 1.0) / 2 + +def process_page(infile, + pgs, + outfilename=None, + greyscale_threshold=25, + page=None, + crop=None, + line_length=0.5, + bitmap_resolution=300, + name=None, + pad=2, + white=None, + black=None, + bitmap=False, + checkcrop=False, + checklines=False, + checkdivs=False, + checkcells=False, + checkall=False, + checkletters=False, + whitespace="normalize", + boxes=False, + encoding="utf8"): + + if checkall: + checkcrop = True + checklines = True + checkdivs = True + checkcells = True + checkletters = True + + outfile = outfilename if outfilename else "output" + pdfdoc = PopplerProcessor(infile) + page = page or [] + (pg, frow, lrow) = (list(map(int, (pgs.split(":")))) + [None, None])[0:3] + pdfdoc.resolution = bitmap_resolution + pdfdoc.greyscale_threshold = greyscale_threshold + + data, page = pdfdoc.get_image(pg - 1) # Page numbers are 0-based. + + #----------------------------------------------------------------------- + # image load section. + + height, width = data.shape[:2] # If not to reduce to gray, the shape will be (,,3) or (,,4). + + pad = int(pad) + height += pad * 2 + width += pad * 2 + + # reimbed image with a white pad. + bmp = ones((height, width), dtype=bool) + + thr = int(255.0 * greyscale_threshold / 100.0) + + bmp[pad:height - pad, pad:width - pad] = (data[:, :] > thr) + + + # Set up Debuging image. + img = zeros((height, width, 3), dtype=uint8) + + # img[:, :, :] = bmp * 255 # In case of colored input image + + img[:, :, 0] = bmp * 255 + img[:, :, 1] = bmp * 255 + img[:, :, 2] = bmp * 255 + + if checkdivs or checkcells or checkletters: + imgfloat = img.astype(float) + + if checkletters: # Show bounding boxes for each text object. + img = (imgfloat/2.).astype(uint8) + rectangles=pdfdoc.get_rectangles_for_page(pg) + lrn=len(rectangles) + for k,r in enumerate(rectangles): + x1,y1,x2,y2 = [int(bitmap_resolution* float(k)/72.)+pad for k in r] + img[y1:y2, x1:x2] += col(random.random()).astype(uint8) + imsave(outfile+"-letters.png", img) + + + #----------------------------------------------------------------------- + # Find bounding box. + t = 0 + + while t < height and all(bmp[t, :]): + t = t + 1 + if t > 0: + t = t - 1 + + b = height - 1 + while b > t and all(bmp[b, :]): + b = b - 1 + if b < height - 1: + b = b + 1 + + l = 0 + while l < width and all(bmp[:, l]): + l = l + 1 + if l > 0: + l = l - 1 + + r = width - 1 + while r > l and all(bmp[:, r]): + r = r - 1 + if r < width - 1: + r = r + 1 - (maxval, width, height, data) = readPNM(p.stdout) +# Mark bounding box. + bmp[t, :] = False + bmp[b, :] = False + bmp[:, l] = False + bmp[:, r] = False + + def boxOfString(x, p): + s = x.split(":") + if len(s) < 4: + raise ValueError("boxes have format left:top:right:bottom[:page]") + return ([bitmap_resolution * float(x) + pad for x in s[0:4]] + + [p if len(s) < 5 else int(s[4])]) - pad = int(pad) - height+=pad*2 - width+=pad*2 - -# reimbed image with a white padd. - bmp = ones( (height,width) , dtype=bool ) - bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*greyscale_threshold/100.0) ) +# translate crop to paint white. -# Set up Debuging image. - img = zeros( (height,width,3) , dtype=uint8 ) - img[:,:,0] = bmp*255 - img[:,:,1] = bmp*255 - img[:,:,2] = bmp*255 + whites = [] + if crop: + (l, t, r, b, p) = boxOfString(crop, pg) + whites.extend([(0, 0, l, height, p), (0, 0, width, t, p), + (r, 0, width, height, p), (0, b, width, height, p)]) -#----------------------------------------------------------------------- -# Find bounding box. - t=0 - while t < height and sum(bmp[t,:]==0) == 0 : - t=t+1 - if t > 0 : - t=t-1 - - b=height-1 - while b > t and sum(bmp[b,:]==0) == 0 : - b=b-1 - if b < height-1: - b = b+1 - - l=0 - while l < width and sum(bmp[:,l]==0) == 0 : - l=l+1 - if l > 0 : - l=l-1 - - r=width-1 - while r > l and sum(bmp[:,r]==0) == 0 : - r=r-1 - if r < width-1 : - r=r+1 - -# Mark bounding box. - bmp[t,:] = 0 - bmp[b,:] = 0 - bmp[:,l] = 0 - bmp[:,r] = 0 +# paint white ... + if white: + whites.extend([boxOfString(b, pg) for b in white]) - def boxOfString(x,p) : - s = x.split(":") - if len(s) < 4 : - raise ValueError("boxes have format left:top:right:bottom[:page]") - return ([bitmap_resolution * float(x) + pad for x in s[0:4] ] - + [ p if len(s)<5 else int(s[4]) ] ) + for (l, t, r, b, p) in whites: + if p == pg: + bmp[t:b + 1, l:r + 1] = 1 + img[t:b + 1, l:r + 1] = [255, 255, 255] +# paint black ... + if black: + for b in black: + (l, t, r, + b) = [bitmap_resolution * float(x) + pad for x in b.split(":")] + bmp[t:b + 1, l:r + 1] = 0 + img[t:b + 1, l:r + 1] = [0, 0, 0] -# translate crop to paint white. - whites = [] - if crop : - (l,t,r,b,p) = boxOfString(crop,pg) - whites.extend( [ (0,0,l,height,p), (0,0,width,t,p), - (r,0,width,height,p), (0,b,width,height,p) ] ) + if checkcrop: + imsave(outfile+"-crop.png", img) -# paint white ... - if white : - whites.extend( [ boxOfString(b, pg) for b in white ] ) - - for (l,t,r,b,p) in whites : - if p == pg : - bmp[ t:b+1,l:r+1 ] = 1 - img[ t:b+1,l:r+1 ] = [255,255,255] - -# paint black ... - if black : - for b in black : - (l,t,r,b) = [bitmap_resolution * float(x) + pad for x in b.split(":") ] - bmp[ t:b+1,l:r+1 ] = 0 - img[ t:b+1,l:r+1 ] = [0,0,0] - - if checkcrop : - dumpImage(outfile,bmp,img, bitmap, pad) - return True - #----------------------------------------------------------------------- # Line finding section. # -# Find all vertical or horizontal lines that are more than rlthresh +# Find all vertical or horizontal lines that are more than lthresh # long, these are considered lines on the table grid. - lthresh = int(line_length * bitmap_resolution) - vs = zeros(width, dtype=int) - for i in range(width) : - dd = diff( where(bmp[:,i])[0] ) - if len(dd)>0: - v = max ( dd ) - if v > lthresh : - vs[i] = 1 - else: -# it was a solid black line. - if bmp[0,i] == 0 : - vs[i] = 1 - vd= ( where(diff(vs[:]))[0] +1 ) - - hs = zeros(height, dtype=int) - for j in range(height) : - dd = diff( where(bmp[j,:]==1)[0] ) - if len(dd) > 0 : - h = max ( dd ) - if h > lthresh : - hs[j] = 1 - else: -# it was a solid black line. - if bmp[j,0] == 0 : - hs[j] = 1 - hd=( where(diff(hs[:]==1))[0] +1 ) + lthresh = int(line_length * bitmap_resolution) + vs = zeros(width, dtype=uint8) + + for i in range(width): + dd = diff(where(bmp[:, i])[0]) + if len(dd) > 0: + v = max(dd) + if v > lthresh: + vs[i] = 1 + else: + # it was a solid black line. + if all(bmp[0, i]) == 0: + vs[i] = 1 + vd = (where(diff(vs[:]))[0] + 1) + + hs = zeros(height, dtype=uint8) + for j in range(height): + dd = diff(where(bmp[j, :])[0]) + if len(dd) > 0: + h = max(dd) + if h > lthresh: + hs[j] = 1 + else: + # it was a solid black line. + if all(bmp[j, 0]) == 0: + hs[j] = 1 + hd = (where(diff(hs[:]))[0] + 1) + + #----------------------------------------------------------------------- + # Look for dividors that are too large. + maxdiv = 10 + i = 0 -#----------------------------------------------------------------------- -# Look for dividors that are too large. - maxdiv=10 - i=0 - - while i < len(vd) : - if vd[i+1]-vd[i] > maxdiv : - vd = delete(vd,i) - vd = delete(vd,i) - else: - i=i+2 - - j = 0 - while j < len(hd): - if hd[j+1]-hd[j] > maxdiv : - hd = delete(hd,j) - hd = delete(hd,j) + while i < len(vd): + if vd[i + 1] - vd[i] > maxdiv: + vd = delete(vd, i) + vd = delete(vd, i) + else: + i = i + 2 + + j = 0 + while j < len(hd): + if hd[j + 1] - hd[j] > maxdiv: + hd = delete(hd, j) + hd = delete(hd, j) + else: + j = j + 2 + + if checklines: + for i in vd: + img[:, i] = [255, 0, 0] # red + + for j in hd: + img[j, :] = [0, 0, 255] # blue + imsave(outfile+"-lines.png", img) + + #----------------------------------------------------------------------- + # divider checking. + # + # at this point vd holds the x coordinate of vertical and + # hd holds the y coordinate of horizontal divider tansitions for each + # vertical and horizontal lines in the table grid. + + def isDiv(a, l, r, t, b): + # if any col or row (in axis) is all zeros ... + return sum(sum(bmp[t:b, l:r], axis=a) == 0) > 0 + + if checkdivs: + img = (imgfloat / 2).astype(uint8) + for j in range(0, len(hd), 2): + for i in range(0, len(vd), 2): + if i > 0: + (l, r, t, b) = (vd[i - 1], vd[i], hd[j], hd[j + 1]) + img[t:b, l:r, 1] = 192 + if isDiv(1, l, r, t, b): + img[t:b, l:r, 0] = 0 + img[t:b, l:r, 2] = 255 + + if j > 0: + (l, r, t, b) = (vd[i], vd[i + 1], hd[j - 1], hd[j]) + img[t:b, l:r, 1] = 128 + if isDiv(0, l, r, t, b): + img[t:b, l:r, 0] = 255 + img[t:b, l:r, 2] = 0 + imsave(outfile+"-divs.png", img) + + #----------------------------------------------------------------------- + # Cell finding section. + # This algorithum is width hungry, and always generates rectangular + # boxes. + + cells = [] + touched = zeros((len(hd), len(vd)), dtype=bool) + j = 0 + while j * 2 + 2 < len(hd): + i = 0 + while i * 2 + 2 < len(vd): + u = 1 + v = 1 + if not touched[j, i]: + while 2+(i+u)*2 < len(vd) and \ + not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1], + hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ): + u = u + 1 + bot = False + while 2 + (j + v) * 2 < len(hd) and not bot: + bot = False + for k in range(1, u + 1): + bot |= isDiv(1, vd[2 * (i + k) - 1], vd[2 * (i + k)], + hd[2 * (j + v)], hd[2 * (j + v) + 1]) + if not bot: + v = v + 1 + cells.append((i, j, u, v)) + touched[j:j + v, i:i + u] = True + i = i + 1 + j = j + 1 + + if checkcells: + nc = len(cells) + 0. + img = (imgfloat / 2.).astype(uint8) + for k in range(len(cells)): + (i, j, u, v) = cells[k] + (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], + hd[2 * (j + v)]) + img[t:b, l:r] += col(k*0.9 / nc + 0.1*random.random()).astype(uint8) + + imsave(outfile+"-cells.png", img) + + #----------------------------------------------------------------------- + # fork out to extract text for each cell. + + def getCell(_coordinate, img=None): + (i, j, u, v) = _coordinate + (l, r, t, b) = (vd[2 * i + 1], vd[2 * (i + u)], hd[2 * j + 1], + hd[2 * (j + v)]) + ret, rect = pdfdoc.get_text(page, l - pad, t - pad, r - l, b - t) + + if type(img)!=type(None) and checkletters: + (x1,y1,x2,y2) = [int(bitmap_resolution * float(rrr)/72+pad) for rrr in [rect.x1,rect.y1,rect.x2,rect.y2]] + img[y1:y2,x1:x2] += col(random.random()).astype(uint8) + + return (i, j, u, v, pg, ret) + + if checkletters: + img = (imgfloat / 2.).astype(uint8) + + if boxes: + cells = [x + (pg, + "", ) for x in cells + if (frow == None or (x[1] >= frow and x[1] <= lrow))] else: - j=j+2 - - if checklines : - for i in vd : - img[:,i] = [255,0,0] # red - - for j in hd : - img[j,:] = [0,0,255] # blue - dumpImage(outfile,bmp,img) - return True -#----------------------------------------------------------------------- -# divider checking. -# -# at this point vd holds the x coordinate of vertical and -# hd holds the y coordinate of horizontal divider tansitions for each -# vertical and horizontal lines in the table grid. - - def isDiv(a, l,r,t,b) : - # if any col or row (in axis) is all zeros ... - return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 - - if checkdivs : - img = img / 2 - for j in range(0,len(hd),2): - for i in range(0,len(vd),2): - if i>0 : - (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) - img[ t:b, l:r, 1 ] = 192 - if isDiv(1, l,r,t,b) : - img[ t:b, l:r, 0 ] = 0 - img[ t:b, l:r, 2 ] = 255 - - if j>0 : - (l,r,t,b) = (vd[i], vd[i+1], hd[j-1], hd[j] ) - img[ t:b, l:r, 1 ] = 128 - if isDiv(0, l,r,t,b) : - img[ t:b, l:r, 0 ] = 255 - img[ t:b, l:r, 2 ] = 0 - dumpImage(outfile,bmp,img) - return True -#----------------------------------------------------------------------- -# Cell finding section. -# This algorithum is width hungry, and always generates rectangular -# boxes. - - cells =[] - touched = zeros( (len(hd), len(vd)),dtype=bool ) - j = 0 - while j*2+2 < len (hd) : - i = 0 - while i*2+2 < len(vd) : - u = 1 - v = 1 - if not touched[j,i] : - while 2+(i+u)*2 < len(vd) and \ - not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1], - hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ): - u=u+1 - bot = False - while 2+(j+v)*2 < len(hd) and not bot : - bot = False - for k in range(1,u+1) : - bot |= isDiv( 1, vd[ 2*(i+k)-1 ], vd[ 2*(i+k)], - hd[ 2*(j+v) ], hd[ 2*(j+v)+1 ] ) - if not bot : - v=v+1 - cells.append( (i,j,u,v) ) - touched[ j:j+v, i:i+u] = True - i = i+1 - j=j+1 - - - if checkcells : - nc = len(cells)+0. - img = img / 2 - for k in range(len(cells)): - (i,j,u,v) = cells[k] - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - img[ t:b, l:r ] += col( k/nc ) - dumpImage(outfile,bmp,img) - return True - -#----------------------------------------------------------------------- -# fork out to extract text for each cell. - - whitespace = re.compile( r'\s+') - - def getCell( (i,j,u,v) ): - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - p = popen("pdftotext", - "pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" % (bitmap_resolution, l-pad, t-pad, r-l, b-t, pg, pg, quote(infile)), - stdout=subprocess.PIPE, - shell=True ) - - ret = p.communicate()[0] - if whitespace != 'raw' : - ret = whitespace.sub( "" if whitespace == "none" else " ", ret ) - if len(ret) > 0 : - ret = ret[ (1 if ret[0]==' ' else 0) : - len(ret) - (1 if ret[-1]==' ' else 0) ] - return (i,j,u,v,pg,ret) - - if boxes : - cells = [ x + (pg,"",) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - else : - #check that pdftotext exists by running a simple command - check_for_required_executable("pdftotext",["pdftotext","-h"]) - #end check - cells = [ getCell(x) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - return cells + cells = [getCell(x, img) for x in cells + if (frow == None or (x[1] >= frow and x[1] <= lrow))] + if checkletters: + imsave(outfile+"-text-locations.png", img) + + return cells #----------------------------------------------------------------------- #output section. -def output(cells, pgs, - cells_csv_filename=None, - cells_json_filename=None, - cells_xml_filename=None, - table_csv_filename=None, - table_html_filename=None, - table_list_filename=None, - infile=None, name=None, output_type=None - ): - + +def output(cells, + pgs, + cells_csv_filename=None, + cells_json_filename=None, + cells_xml_filename=None, + table_csv_filename=None, + table_html_filename=None, + table_list_filename=None, + infile=None, + name=None, + output_type=None): + output_types = [ - dict(filename=cells_csv_filename, function=o_cells_csv), - dict(filename=cells_json_filename, function=o_cells_json), - dict(filename=cells_xml_filename, function=o_cells_xml), - dict(filename=table_csv_filename, function=o_table_csv), - dict(filename=table_html_filename, function=o_table_html), - dict(filename=table_list_filename, function=o_table_list) - ] - + dict(filename=cells_csv_filename, + function=o_cells_csv), dict(filename=cells_json_filename, + function=o_cells_json), + dict(filename=cells_xml_filename, + function=o_cells_xml), dict(filename=table_csv_filename, + function=o_table_csv), + dict(filename=table_html_filename, + function=o_table_html), dict(filename=table_list_filename, + function=o_table_list) + ] + for entry in output_types: if entry["filename"]: if entry["filename"] != sys.stdout: - outfile = open(entry["filename"],'w') + outfile = open(entry["filename"], 'w') else: outfile = sys.stdout - - entry["function"](cells, pgs, - outfile=outfile, - name=name, - infile=infile, - output_type=output_type) + + entry["function"](cells, + pgs, + outfile=outfile, + name=name, + infile=infile, + output_type=output_type) if entry["filename"] != sys.stdout: outfile.close() - -def o_cells_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : - outfile = outfile or sys.stdout - csv.writer( outfile , dialect='excel' ).writerows(cells) - -def o_cells_json(cells,pgs, outfile=None, infile=None, name=None, output_type=None) : - """Output JSON formatted cell data""" - outfile = outfile or sys.stdout - #defaults - infile=infile or "" - name=name or "" - - json.dump({ - "src": infile, - "name": name, - "colnames": ( "x","y","width","height","page","contents" ), - "cells":cells + + +def o_cells_csv(cells, + pgs, + outfile=None, + name=None, + infile=None, + output_type=None): + outfile = outfile or sys.stdout + csv.writer(outfile, dialect='excel').writerows(cells) + + +def o_cells_json(cells, + pgs, + outfile=None, + infile=None, + name=None, + output_type=None): + """Output JSON formatted cell data""" + outfile = outfile or sys.stdout + #defaults + infile = infile or "" + name = name or "" + + json.dump({ + "src": infile, + "name": name, + "colnames": ("x", "y", "width", "height", "page", "contents"), + "cells": cells }, outfile) -def o_cells_xml(cells,pgs, outfile=None,infile=None, name=None, output_type=None) : - """Output XML formatted cell data""" - outfile = outfile or sys.stdout - #defaults - infile=infile or "" - name=name or "" - - doc = getDOMImplementation().createDocument(None,"table", None) - root = doc.documentElement; - if infile : - root.setAttribute("src",infile) - if name : - root.setAttribute("name",name) - for cl in cells : - x = doc.createElement("cell") - map(lambda(a): x.setAttribute(*a), zip("xywhp",map(str,cl))) - if cl[5] != "" : - x.appendChild( doc.createTextNode(cl[5]) ) - root.appendChild(x) - outfile.write( doc.toprettyxml() ) - -def table_to_list(cells,pgs) : - """Output list of lists""" - l=[0,0,0] - for (i,j,u,v,pg,value) in cells : - r=[i,j,pg] - l = [max(x) for x in zip(l,r)] - - tab = [ [ [ "" for x in range(l[0]+1) - ] for x in range(l[1]+1) - ] for x in range(l[2]+1) - ] - for (i,j,u,v,pg,value) in cells : - tab[pg][j][i] = value - - return tab - -def o_table_csv(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : - """Output CSV formatted table""" - outfile = outfile or sys.stdout - tab=table_to_list(cells, pgs) - for t in tab: - csv.writer( outfile , dialect='excel' ).writerows(t) - - -def o_table_list(cells,pgs, outfile=None, name=None, infile=None, output_type=None) : - """Output list of lists""" - outfile = outfile or sys.stdout - tab = table_to_list(cells, pgs) - print(tab) - -def o_table_html(cells,pgs, outfile=None, output_type=None, name=None, infile=None) : - """Output HTML formatted table""" - - oj = 0 - opg = 0 - doc = getDOMImplementation().createDocument(None,"table", None) - root = doc.documentElement; - if (output_type == "table_chtml" ): - root.setAttribute("border","1") - root.setAttribute("cellspaceing","0") - root.setAttribute("style","border-spacing:0") - nc = len(cells) - tr = None - for k in range(nc): - (i,j,u,v,pg,value) = cells[k] - if j > oj or pg > opg: - if pg > opg: - s = "Name: " + name + ", " if name else "" - root.appendChild( doc.createComment( s + - ("Source: %s page %d." % (infile, pg) ))); - if tr : - root.appendChild(tr) - tr = doc.createElement("tr") - oj = j - opg = pg - td = doc.createElement("td") - if value != "" : - td.appendChild( doc.createTextNode(value) ) - if u>1 : - td.setAttribute("colspan",str(u)) - if v>1 : - td.setAttribute("rowspan",str(v)) - if output_type == "table_chtml" : - td.setAttribute("style", "background-color: #%02x%02x%02x" % - tuple(128+col(k/(nc+0.)))) - tr.appendChild(td) - root.appendChild(tr) - outfile.write( doc.toprettyxml() ) - + +def o_cells_xml(cells, + pgs, + outfile=None, + infile=None, + name=None, + output_type=None): + """Output XML formatted cell data""" + outfile = outfile or sys.stdout + #defaults + infile = infile or "" + name = name or "" + + def _lambda(a): + return x.setAttribute(*a) + + doc = getDOMImplementation().createDocument(None, "table", None) + root = doc.documentElement + if infile: + root.setAttribute("src", infile) + if name: + root.setAttribute("name", name) + for cl in cells: + x = doc.createElement("cell") + map(_lambda, zip("xywhp", map(str, cl))) + if cl[5] != "": + x.appendChild(doc.createTextNode(cl[5])) + root.appendChild(x) + outfile.write(doc.toprettyxml()) + + +def table_to_list(cells, pgs): + """Output list of lists""" + l = [0, 0, 0] + for (i, j, u, v, pg, value) in cells: + r = [i, j, pg] + l = [max(x) for x in zip(l, r)] + + tab = [[["" for x in range(l[0] + 1)] for x in range(l[1] + 1)] + for x in range(l[2] + 1)] + for (i, j, u, v, pg, value) in cells: + tab[pg][j][i] = value + + return tab + + +def o_table_csv(cells, + pgs, + outfile=None, + name=None, + infile=None, + output_type=None): + """Output CSV formatted table""" + outfile = outfile or sys.stdout + tab = table_to_list(cells, pgs) + for t in tab: + csv.writer(outfile, dialect='excel').writerows(t) + + +def o_table_list(cells, + pgs, + outfile=None, + name=None, + infile=None, + output_type=None): + """Output list of lists""" + outfile = outfile or sys.stdout + tab = table_to_list(cells, pgs) + print(tab) + + +def o_table_html(cells, + pgs, + outfile=None, + output_type=None, + name=None, + infile=None): + """Output HTML formatted table""" + + oj = 0 + opg = 0 + doc = getDOMImplementation().createDocument(None, "table", None) + root = doc.documentElement + if (output_type == "table_chtml"): + root.setAttribute("border", "1") + root.setAttribute("cellspaceing", "0") + root.setAttribute("style", "border-spacing:0") + nc = len(cells) + tr = None + for k in range(nc): + (i, j, u, v, pg, value) = cells[k] + if j > oj or pg > opg: + if pg > opg: + s = "Name: " + name + ", " if name else "" + root.appendChild(doc.createComment(s + ("Source: %s page %d." % + (infile, pg)))) + if tr: + root.appendChild(tr) + tr = doc.createElement("tr") + oj = j + opg = pg + td = doc.createElement("td") + if value != "": + td.appendChild(doc.createTextNode(value)) + if u > 1: + td.setAttribute("colspan", str(u)) + if v > 1: + td.setAttribute("rowspan", str(v)) + if output_type == "table_chtml": + td.setAttribute("style", "background-color: #%02x%02x%02x" % + tuple(128 + col(k / (nc + 0.)))) + tr.appendChild(td) + root.appendChild(tr) + outfile.write(doc.toprettyxml()) diff --git a/src/pdftableextract/extracttab.py b/src/pdftableextract/extracttab.py deleted file mode 100644 index ab6c74d..0000000 --- a/src/pdftableextract/extracttab.py +++ /dev/null @@ -1,297 +0,0 @@ -# Description : PDF Table Extraction Utility -# Author : Ian McEwan, Ashima Research. -# Maintainer : ijm -# Lastmod : 20130402 (ijm) -# License : Copyright (C) 2011 Ashima Research. All rights reserved. -# Distributed under the MIT Expat License. See LICENSE file. -# https://github.com/ashima/pdf-table-extract - -import sys, argparse, subprocess, re, csv, json -from numpy import * -from pipes import quote -from xml.dom.minidom import getDOMImplementation - -# Proccessing function. - -def process_page(pgs) : - (pg,frow,lrow) = (map(int,(pgs.split(":")))+[None,None])[0:3] - - p = subprocess.Popen( ("pdftoppm -gray -r %d -f %d -l %d %s " % - (args.r,pg,pg,quote(args.infile))), - stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True ) - -#----------------------------------------------------------------------- -# image load secion. - - (maxval, width, height, data) = readPNM(p.stdout) - - pad = int(args.pad) - height+=pad*2 - width+=pad*2 - -# reimbed image with a white padd. - bmp = ones( (height,width) , dtype=bool ) - bmp[pad:height-pad,pad:width-pad] = ( data[:,:] > int(255.0*args.g/100.0) ) - -# Set up Debuging image. - img = zeros( (height,width,3) , dtype=uint8 ) - img[:,:,0] = bmp*255 - img[:,:,1] = bmp*255 - img[:,:,2] = bmp*255 - -#----------------------------------------------------------------------- -# Find bounding box. - - t=0 - while t < height and sum(bmp[t,:]==0) == 0 : - t=t+1 - if t > 0 : - t=t-1 - - b=height-1 - while b > t and sum(bmp[b,:]==0) == 0 : - b=b-1 - if b < height-1: - b = b+1 - - l=0 - while l < width and sum(bmp[:,l]==0) == 0 : - l=l+1 - if l > 0 : - l=l-1 - - r=width-1 - while r > l and sum(bmp[:,r]==0) == 0 : - r=r-1 - if r < width-1 : - r=r+1 - -# Mark bounding box. - bmp[t,:] = 0 - bmp[b,:] = 0 - bmp[:,l] = 0 - bmp[:,r] = 0 - - def boxOfString(x,p) : - s = x.split(":") - if len(s) < 4 : - raise Exception("boxes have format left:top:right:bottom[:page]") - return ([args.r * float(x) + args.pad for x in s[0:4] ] - + [ p if len(s)<5 else int(s[4]) ] ) - - -# translate crop to paint white. - whites = [] - if args.crop : - (l,t,r,b,p) = boxOfString(args.crop,pg) - whites.extend( [ (0,0,l,height,p), (0,0,width,t,p), - (r,0,width,height,p), (0,b,width,height,p) ] ) - -# paint white ... - if args.white : - whites.extend( [ boxOfString(b, pg) for b in args.white ] ) - - for (l,t,r,b,p) in whites : - if p == pg : - bmp[ t:b+1,l:r+1 ] = 1 - img[ t:b+1,l:r+1 ] = [255,255,255] - -# paint black ... - if args.black : - for b in args.black : - (l,t,r,b) = [args.r * float(x) + args.pad for x in b.split(":") ] - bmp[ t:b+1,l:r+1 ] = 0 - img[ t:b+1,l:r+1 ] = [0,0,0] - - if args.checkcrop : - dumpImage(args,bmp,img) - sys.exit(0) - - -#----------------------------------------------------------------------- -# Line finding section. -# -# Find all verticle or horizontal lines that are more than rlthresh -# long, these are considered lines on the table grid. - - lthresh = int(args.l * args.r) - vs = zeros(width, dtype=int) - for i in range(width) : - dd = diff( where(bmp[:,i])[0] ) - if len(dd)>0: - v = max ( dd ) - if v > lthresh : - vs[i] = 1 - else: -# it was a solid black line. - if bmp[0,i] == 0 : - vs[i] = 1 - vd= ( where(diff(vs[:]))[0] +1 ) - - hs = zeros(height, dtype=int) - for j in range(height) : - dd = diff( where(bmp[j,:]==1)[0] ) - if len(dd) > 0 : - h = max ( dd ) - if h > lthresh : - hs[j] = 1 - else: -# it was a solid black line. - if bmp[j,0] == 0 : - hs[j] = 1 - hd=( where(diff(hs[:]==1))[0] +1 ) - -#----------------------------------------------------------------------- -# Look for dividors that are too large. - - maxdiv=10 - i=0 - - while i < len(vd) : - if vd[i+1]-vd[i] > maxdiv : - vd = delete(vd,i) - vd = delete(vd,i) - else: - i=i+2 - - j = 0 - while j < len(hd): - if hd[j+1]-hd[j] > maxdiv : - hd = delete(hd,j) - hd = delete(hd,j) - else: - j=j+2 - - if args.checklines : - for i in vd : - img[:,i] = [255,0,0] # red - - for j in hd : - img[j,:] = [0,0,255] # blue - dumpImage(args,bmp,img) - sys.exit(0) - -#----------------------------------------------------------------------- -# divider checking. -# -# at this point vd holds the x coordinate of vertical and -# hd holds the y coordinate of horizontal divider tansitions for each -# vertical and horizontal lines in the table grid. - - def isDiv(a, l,r,t,b) : - # if any col or row (in axis) is all zeros ... - return sum( sum(bmp[t:b, l:r], axis=a)==0 ) >0 - - if args.checkdivs : - img = img / 2 - for j in range(0,len(hd),2): - for i in range(0,len(vd),2): - if i>0 : - (l,r,t,b) = (vd[i-1], vd[i], hd[j], hd[j+1]) - img[ t:b, l:r, 1 ] = 192 - if isDiv(1, l,r,t,b) : - img[ t:b, l:r, 0 ] = 0 - img[ t:b, l:r, 2 ] = 255 - - if j>0 : - (l,r,t,b) = (vd[i], vd[i+1], hd[j-1], hd[j] ) - img[ t:b, l:r, 1 ] = 128 - if isDiv(0, l,r,t,b) : - img[ t:b, l:r, 0 ] = 255 - img[ t:b, l:r, 2 ] = 0 - - dumpImage(args,bmp,img) - sys.exit(0) - -#----------------------------------------------------------------------- -# Cell finding section. -# This algorithum is width hungry, and always generates rectangular -# boxes. - - cells =[] - touched = zeros( (len(hd), len(vd)),dtype=bool ) - j = 0 - while j*2+2 < len (hd) : - i = 0 - while i*2+2 < len(vd) : - u = 1 - v = 1 - if not touched[j,i] : - while 2+(i+u)*2 < len(vd) and \ - not isDiv( 0, vd[ 2*(i+u) ], vd[ 2*(i+u)+1], - hd[ 2*(j+v)-1 ], hd[ 2*(j+v) ] ): - u=u+1 - bot = False - while 2+(j+v)*2 < len(hd) and not bot : - bot = False - for k in range(1,u+1) : - bot |= isDiv( 1, vd[ 2*(i+k)-1 ], vd[ 2*(i+k)], - hd[ 2*(j+v) ], hd[ 2*(j+v)+1 ] ) - if not bot : - v=v+1 - cells.append( (i,j,u,v) ) - touched[ j:j+v, i:i+u] = True - i = i+1 - j=j+1 - - - if args.checkcells : - nc = len(cells)+0. - img = img / 2 - for k in range(len(cells)): - (i,j,u,v) = cells[k] - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - img[ t:b, l:r ] += col( k/nc ) - dumpImage(args,bmp,img) - sys.exit(0) - - -#----------------------------------------------------------------------- -# fork out to extract text for each cell. - - whitespace = re.compile( r'\s+') - - def getCell( (i,j,u,v) ): - (l,r,t,b) = ( vd[2*i+1] , vd[ 2*(i+u) ], hd[2*j+1], hd[2*(j+v)] ) - p = subprocess.Popen( - ("pdftotext -r %d -x %d -y %d -W %d -H %d -layout -nopgbrk -f %d -l %d %s -" - % (args.r, l-pad, t-pad, r-l, b-t, pg, pg, quote(args.infile) ) ), - stdout=subprocess.PIPE, shell=True ) - - ret = p.communicate()[0] - if args.w != 'raw' : - ret = whitespace.sub( "" if args.w == "none" else " ", ret ) - if len(ret) > 0 : - ret = ret[ (1 if ret[0]==' ' else 0) : - len(ret) - (1 if ret[-1]==' ' else 0) ] - return (i,j,u,v,pg,ret) - - #if args.boxes : - # cells = [ x + (pg,"",) for x in cells ] - #else : - # cells = map(getCell, cells) - - if args.boxes : - cells = [ x + (pg,"",) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - else : - cells = [ getCell(x) for x in cells if - ( frow == None or (x[1] >= frow and x[1] <= lrow)) ] - return cells - - -#----------------------------------------------------------------------- -# main - -def main_script(): - args = procargs() - - cells = [] - for pgs in args.page : - cells.extend(process_page(pgs)) - - { "cells_csv" : o_cells_csv, "cells_json" : o_cells_json, - "cells_xml" : o_cells_xml, "table_csv" : o_table_csv, - "table_html": o_table_html, "table_chtml": o_table_html, - } [ args.t ](cells,args.page) - diff --git a/src/pdftableextract/pnm.py b/src/pdftableextract/pnm.py index cbb05dd..befce66 100644 --- a/src/pdftableextract/pnm.py +++ b/src/pdftableextract/pnm.py @@ -1,59 +1,68 @@ +from __future__ import print_function from numpy import array, fromstring, uint8, reshape, ones + #----------------------------------------------------------------------- # PNM stuff. + def noncomment(fd): - """Read lines from the filehandle until a non-comment line is found. + """Read lines from the filehandle until a non-comment line is found. Comments start with #""" - while True: - x = fd.readline() - if x.startswith('#') : - continue - else: - return x + while True: + x = fd.readline() + if x.startswith(b'#'): + continue + else: + return x + def readPNM(fd): - """Reads the PNM file from the filehandle""" - t = noncomment(fd) - s = noncomment(fd) - m = noncomment(fd) if not (t.startswith('P1') or t.startswith('P4')) else '1' - data = fd.read() - ls = len(s.split()) - if ls != 2 : - name = "" if fd.name=="" else "Filename = {0}".format(fd.name) - raise IOError("Expected 2 elements from parsing PNM file, got {0}: {1}".format(ls, name)) - xs, ys = s.split() - width = int(xs) - height = int(ys) - m = int(m) - - if m != 255 : - print "Just want 8 bit pgms for now!" - - d = fromstring(data,dtype=uint8) - d = reshape(d, (height,width) ) - return (m,width,height, d) - -def writePNM(fd,img): - """Writes a PNM file to a filehandle given the img data as a numpy array""" - s = img.shape - m = 255 - if img.dtype == bool : - img = img + uint8(0) - t = "P5" - m = 1 - elif len(s) == 2 : - t = "P5" - else: - t = "P6" - - fd.write( "%s\n%d %d\n%d\n" % (t, s[1],s[0],m) ) - fd.write( uint8(img).tostring() ) - - -def dumpImage(outfile,bmp,img,bitmap=False, pad=2) : + """Reads the PNM file from the filehandle""" + t = noncomment(fd) + s = noncomment(fd) + m = noncomment(fd) if not (t.startswith(b'P1') or + t.startswith(b'P4')) else b'1' + data = fd.read() + ls = len(s.split()) + if ls != 2: + name = "" if fd.name == "" else "Filename = {0}".format( + fd.name) + raise IOError( + "Expected 2 elements from parsing PNM file, got {0}: {1}".format( + ls, name)) + xs, ys = s.split() + width = int(xs) + height = int(ys) + m = int(m) + + if m != 255: + print("Just want 8 bit pgms for now!") + + d = fromstring(data, dtype=uint8) + d = reshape(d, (height, width)) + return (m, width, height, d) + + +def writePNM(fd, img): + """Writes a PNM file to a filehandle given the img data as a numpy array""" + s = img.shape + m = 255 + if img.dtype == bool: + img = img + uint8(0) + t = b"P5" + m = 1 + elif len(s) == 2: + t = b"P5" + else: + t = b"P6" + + fd.write(b"%s\n%d %d\n%d\n" % (t, s[1], s[0], m)) + fd.write(img.astype(uint8).tobytes()) + + +def dumpImage(outfile, bmp, img, bitmap=False, pad=2): """Dumps the numpy array in image into the filename and closes the outfile""" oi = bmp if bitmap else img - (height,width) = bmp.shape - writePNM(outfile, oi[pad:height-pad, pad:width-pad]) + (height, width) = bmp.shape + writePNM(outfile, oi[pad:height - pad, pad:width - pad]) outfile.close() diff --git a/src/pdftableextract/scripts.py b/src/pdftableextract/scripts.py index 68a7b2e..97a2ee2 100644 --- a/src/pdftableextract/scripts.py +++ b/src/pdftableextract/scripts.py @@ -2,8 +2,8 @@ import sys import logging import subprocess -from .core import process_page, output -import core +from pdftableextract.core import process_page, output +import pdftableextract.core #----------------------------------------------------------------------- @@ -25,10 +25,10 @@ def procargs() : p.add_argument("-name", help="name to add to XML tag, or HTML comments") p.add_argument("-pad", help="imitial image pading (pixels)", type=int, default=2 ) - p.add_argument("-white",action="append", + p.add_argument("-white",action="append", help="paint white to the bitmap as left:top:right:bottom in length units." "Done before painting black" ) - p.add_argument("-black",action="append", + p.add_argument("-black",action="append", help="paint black to the bitmap as left:top:right:bottom in length units." "Done after poainting white" ) p.add_argument("-bitmap", action="store_true", @@ -67,14 +67,10 @@ def main(): raise sys.exit("I/O Error running pdf-table-extract: {0}".format(e)) except OSError as e: - print("An OS Error occurred running pdf-table-extract: Is `pdftoppm` installed and available?") + print("An OS Error occurred running pdf-table-extract") if args.traceback: raise sys.exit("OS Error: {0}".format(e)) - except subprocess.CalledProcessError as e: - if args.traceback: - raise - sys.exit("Error while checking a subprocess call: {0}".format(e)) except Exception as e: if args.traceback: raise @@ -85,9 +81,9 @@ def imain(args): if args.checkcrop or args.checklines or args.checkdivs or args.checkcells: for pgs in args.page : success = process_page(args.infile, pgs, - bitmap=args.bitmap, - checkcrop=args.checkcrop, - checklines=args.checklines, + bitmap=args.bitmap, + checkcrop=args.checkcrop, + checklines=args.checklines, checkdivs=args.checkdivs, checkcells=args.checkcells, whitespace=args.whitespace, @@ -105,9 +101,9 @@ def imain(args): else: for pgs in args.page : cells.extend(process_page(args.infile, pgs, - bitmap=args.bitmap, - checkcrop=args.checkcrop, - checklines=args.checklines, + bitmap=args.bitmap, + checkcrop=args.checkcrop, + checklines=args.checklines, checkdivs=args.checkdivs, checkcells=args.checkcells, whitespace=args.whitespace, @@ -127,6 +123,3 @@ def imain(args): args.outfile = sys.stdout filenames["{0}_filename".format(args.t)] = args.outfile output(cells, args.page, name=args.name, infile=args.infile, output_type=args.t, **filenames) - - -