Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
language: python
python:
- "2.7"
- "3.5"
- "3.6"
before_install:
- sudo apt-get -qq update
- sudo apt-get install -y tesseract-ocr ghostscript imagemagick
install:
- "pip install -r requirements.txt --use-mirrors"
- "pip install pytest mock --use-mirrors"
- "pip install -r requirements.txt"
- "pip install pytest mock"
- "pip install ."
script:
- "python setup.py test"
- "pytest test"
61 changes: 40 additions & 21 deletions pypdfocr/pypdfocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,37 @@
import itertools
from functools import wraps

from version import __version__
from .version import __version__
from PIL import Image
import yaml

import multiprocessing
# Replace the Popen routine to allow win32 pyinstaller to build
from multiprocessing import forking
from pypdfocr_multiprocessing import _Popen

""" Special work-around to support multiprocessing and pyinstaller --onefile on Windows systems

https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing
"""
try:
# Python 3.4+
if sys.platform.startswith('win'):
import multiprocessing.popen_spawn_win32 as forking
else:
import multiprocessing.popen_fork as forking
except ImportError:
import multiprocessing.forking as forking

from .pypdfocr_multiprocessing import _Popen
forking.Popen = _Popen

from pypdfocr_pdf import PyPdf
from pypdfocr_tesseract import PyTesseract
from pypdfocr_gs import PyGs
from pypdfocr_watcher import PyPdfWatcher
from pypdfocr_pdffiler import PyPdfFiler
from pypdfocr_filer_dirs import PyFilerDirs
from pypdfocr_filer_evernote import PyFilerEvernote
from pypdfocr_preprocess import PyPreprocess
from .pypdfocr_pdf import PyPdf
from .pypdfocr_tesseract import PyTesseract
from .pypdfocr_gs import PyGs
from .pypdfocr_watcher import PyPdfWatcher
from .pypdfocr_pdffiler import PyPdfFiler
from .pypdfocr_filer_dirs import PyFilerDirs
from .pypdfocr_filer_evernote import ENABLED as evernote_enabled
from .pypdfocr_filer_evernote import PyFilerEvernote
from .pypdfocr_preprocess import PyPreprocess

def error(text):
print("ERROR: %s" % text)
Expand All @@ -49,12 +62,14 @@ def retry(count=5, exc_type = Exception):
def decorator(func):
@wraps(func)
def result(*args, **kwargs):
err = None
for _ in range(count):
try:
return func(*args, **kwargs)
except exc_type:
pass
raise
except exc_type as e:
err = e
else:
raise err
return result
return decorator

Expand Down Expand Up @@ -161,11 +176,11 @@ def get_options(self, argv):
filing_group = p.add_argument_group(title="Filing optinos")
filing_group.add_argument('-f', '--file', action='store_true',
default=False, dest='enable_filing', help='Enable filing of converted PDFs')
#filing_group.add_argument('-c', '--config', type = argparse.FileType('r'),
# filing_group.add_argument('-c', '--config', type = argparse.FileType('r'),
filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x),
dest='configfile', help='Configuration file for defaults and PDF filing')
filing_group.add_argument('-e', '--evernote', action='store_true',
default=False, dest='enable_evernote', help='Enable filing to Evernote')
default=False, dest='enable_evernote', help='Enable filing to Evernote.')
filing_group.add_argument('-n', action='store_true',
default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')

Expand Down Expand Up @@ -204,7 +219,11 @@ def get_options(self, argv):
logging.debug("Read in configuration file")
logging.debug(self.config)

if args.enable_evernote:
# Evernote filing does not work in py3
if args.enable_evernote and not evernote_enabled:
print("Warning: Evernote filing disabled, could not find evernote API. Evernote not available in py3.")
self.enable_evernote = False
elif args.enable_evernote:
self.enable_evernote = True
else:
self.enable_evernote = False
Expand Down Expand Up @@ -367,11 +386,11 @@ def run_conversion(self, pdf_filename):
time.sleep(1)
if not self.debug:
# Need to clean up the original image files before preprocessing
if locals().has_key("fns"): # Have to check if this was set before exception raised
if "fns" in locals(): # Have to check if this was set before exception raised
logging.info("Cleaning up %s" % fns)
self._clean_up_files(fns)

if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised
if "preprocess_imagefilenames" in locals(): # Have to check if this was set before exception raised
logging.info("Cleaning up %s" % preprocess_imagefilenames)
self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs
for ext in [".hocr", ".html", ".txt"]:
Expand Down Expand Up @@ -467,7 +486,7 @@ def go(self, argv):
except KeyboardInterrupt:
break
except Exception as e:
print traceback.print_exc(e)
print(traceback.print_exc(e))
py_watcher.stop()

else:
Expand Down
2 changes: 1 addition & 1 deletion pypdfocr/pypdfocr_filer_dirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import shutil

from pypdfocr_filer import PyFiler
from .pypdfocr_filer import PyFiler

"""
Implementation of a filer class
Expand Down
20 changes: 12 additions & 8 deletions pypdfocr/pypdfocr_filer_evernote.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,21 @@
import time
import sys

from pypdfocr_filer import PyFiler
from .pypdfocr_filer import PyFiler

import functools

from evernote.api.client import EvernoteClient
import evernote.edam.type.ttypes as Types
import evernote.edam.userstore.constants as UserStoreConstants
from evernote.edam.error.ttypes import EDAMUserException
from evernote.edam.error.ttypes import EDAMSystemException
from evernote.edam.error.ttypes import EDAMNotFoundException
from evernote.edam.error.ttypes import EDAMErrorCode
try:
from evernote.api.client import EvernoteClient
import evernote.edam.type.ttypes as Types
import evernote.edam.userstore.constants as UserStoreConstants
from evernote.edam.error.ttypes import EDAMUserException
from evernote.edam.error.ttypes import EDAMSystemException
from evernote.edam.error.ttypes import EDAMNotFoundException
from evernote.edam.error.ttypes import EDAMErrorCode
ENABLED = True
except ImportError:
ENABLED = False


"""
Expand Down
26 changes: 13 additions & 13 deletions pypdfocr/pypdfocr_gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,21 +92,21 @@ def _find_windows_gs(self):
listing = os.listdir('.')

# Find all possible gs* sub-directories
listing = [x for x in listing if x.startswith('gs')]
listing = [x for x in listing if x.startswith('gs')]

# TODO: Make this a natural sort
listing.sort(reverse=True)
for bindir in listing:
binpath = os.path.join(bindir,'bin')
if not os.path.exists(binpath): continue
os.chdir(binpath)
for bindir in listing:
binpath = os.path.join(bindir,'bin')
if not os.path.exists(binpath): continue
os.chdir(binpath)
# Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version)
gswin = glob.glob('gswin*c.exe')
if len(gswin) == 0:
continue
gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?)
os.chdir(cwd)
return gs
gswin = glob.glob('gswin*c.exe')
if len(gswin) == 0:
continue
gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?)
os.chdir(cwd)
return gs

if not gs:
error(self.msgs['GS_MISSING_BINARY'])
Expand Down Expand Up @@ -171,10 +171,10 @@ def _run_gs(self, options, output_filename, pdf_filename):
try:
cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
logging.info(cmd)
out = subprocess.check_output(cmd, shell=True)
out = subprocess.check_output(cmd, shell=True, universal_newlines=True)

except subprocess.CalledProcessError as e:
print e.output
print(e.output)
if "undefined in .getdeviceparams" in e.output:
error(self.msgs['GS_OUTDATED'])
else:
Expand Down
16 changes: 11 additions & 5 deletions pypdfocr/pypdfocr_multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import sys, os, multiprocessing.forking
import logging
import os
import sys

""" Special work-around to support multiprocessing and pyinstaller --onefile on Windows systems

https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing
"""
try:
# Python 3.4+
if sys.platform.startswith('win'):
import multiprocessing.popen_spawn_win32 as forking
else:
import multiprocessing.popen_fork as forking
except ImportError:
import multiprocessing.forking as forking

import multiprocessing.forking as forking
import os
import sys

class _Popen(multiprocessing.forking.Popen):
class _Popen(forking.Popen):
def __init__(self, *args, **kw):
if hasattr(sys, 'frozen'):
# We have to set original _MEIPASS2 value from sys._MEIPASS
Expand Down
8 changes: 4 additions & 4 deletions pypdfocr/pypdfocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import tempfile
import glob

import cStringIO
import base64
import zlib
import math
Expand All @@ -52,7 +51,7 @@
from reportlab.lib.enums import TA_LEFT
from reportlab.platypus.paragraph import Paragraph

from pypdfocr_util import Retry
from .pypdfocr_util import Retry
from functools import partial

class RotatedPara(Paragraph):
Expand Down Expand Up @@ -152,10 +151,11 @@ def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename))
merger = PdfFileMerger()
for text_pdf_filename in text_pdf_filenames:
merger.append(PdfFileReader(file(text_pdf_filename, 'rb')))
with open(text_pdf_filename, 'rb') as f:
merger.append(PdfFileReader(f))
merger.write(all_text_filename)
merger.close()
del merger
del merger


writer = PdfFileWriter()
Expand Down
17 changes: 8 additions & 9 deletions pypdfocr/pypdfocr_pdffiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,14 @@
on keywords
"""

from sets import Set
import sys, os
import re
import logging
import shutil

from PyPDF2 import PdfFileReader
from pypdfocr_filer import PyFiler
from pypdfocr_filer_dirs import PyFilerDirs
from .pypdfocr_filer import PyFiler
from .pypdfocr_filer_dirs import PyFilerDirs

class PyPdfFiler(object):
def __init__(self, filer):
Expand All @@ -36,15 +35,15 @@ def __init__(self, filer):

# Whether to fall back on filename for matching keywords against
# if there is no match in the text
self.file_using_filename = False
self.file_using_filename = False

def iter_pdf_page_text(self, filename):
self.filename = filename
reader = PdfFileReader(filename)
logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
for pgnum in range(reader.getNumPages()):
text = reader.getPage(pgnum).extractText()
text = text.encode('ascii', 'ignore')
# text = text.encode('ascii', 'ignore')
text = text.replace('\n', ' ')
yield text

Expand All @@ -56,10 +55,10 @@ def _get_matching_folder(self, pdfText):
if s in searchText:
logging.info("Matched keyword '%s'" % s)
return folder
# No match found, so return
# No match found, so return
return None

def file_original (self, original_filename):
def file_original(self, original_filename):
return self.filer.file_original(original_filename)

def move_to_matching_folder(self, filename):
Expand All @@ -72,9 +71,9 @@ def move_to_matching_folder(self, filename):

tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
return tgt_file

if __name__ == '__main__':
p = PyPdfFiler(PyFilerDirs())
for page_text in p.iter_pdf_page_text("scan_ocr.pdf"):
print (page_text)
print(page_text)

8 changes: 4 additions & 4 deletions pypdfocr/pypdfocr_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import signal

from multiprocessing import Pool
from pypdfocr_interrupts import init_worker
from .pypdfocr_interrupts import init_worker

# Ugly hack to pass in object method to the multiprocessing library
# From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
Expand Down Expand Up @@ -58,7 +58,7 @@ def cmd(self, cmd_list):
logging.debug(out)
return out
except subprocess.CalledProcessError as e:
print e.output
print(e.output)
self._warn("Could not run command %s" % cmd_list)


Expand Down Expand Up @@ -102,14 +102,14 @@ def preprocess(self, in_filenames):
logging.info("Starting preprocessing parallel execution")
preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns))
pool.close()
except KeyboardInterrupt or Exception:
except (KeyboardInterrupt, Exception):
print("Caught keyboard interrupt... terminating")
pool.terminate()
#sys,exit(-1)
raise
finally:
pool.join()
logging.info ("Completed preprocessing")
logging.info("Completed preprocessing")

return preprocessed_filenames

Expand Down
Loading