File tree Expand file tree Collapse file tree 4 files changed +54
-3
lines changed Expand file tree Collapse file tree 4 files changed +54
-3
lines changed Original file line number Diff line number Diff line change 22import contextlib
33from tempfile import mkstemp
44import os
5- from .utils import antiword , antixml
5+ from .utils import antiword , antixml , pdftotext
66from cached_property import cached_property
77import io , requests
88import logging
@@ -89,6 +89,17 @@ def antiword_xml(self):
8989 def antiword_text (self ):
9090 return antixml (self .antiword_xml )
9191
@cached_property
def pdf_text(self):
    """Extract the document's text by running pdftotext on ``self.file_name``.

    The bytes returned by ``pdftotext`` are decoded as UTF-8.  Pages are
    separated by a 0x0c (form feed) character.
    """
    raw_output = pdftotext(self.file_name)
    # FIXME: remove explicit bidi characters?
    return raw_output.decode('utf-8')
102+
92103 def _close (self ):
93104 [func () for func in self ._cleanup ]
94105
Original file line number Diff line number Diff line change @@ -50,7 +50,13 @@ def text(self):
5050 if self ._file_type == 'text' :
5151 return self ._file_data
5252 else :
53- text = decode (self .antiword_text , 'utf-8' )
53+ extension = self .file_extension .lower ()
54+ if self .file_extension == 'doc' :
55+ text = decode (self .antiword_text , 'utf-8' )
56+ elif self .file_extension == 'pdf' :
57+ text = decode (self .pdf_text , 'utf-8' )
58+ else :
59+ text = ''
5460 tmp = text .split ('OMNITECH' )
5561 if len (tmp )== 2 and len (tmp [0 ]) < 40 :
5662 text = tmp [1 ]
Original file line number Diff line number Diff line change @@ -8,3 +8,16 @@ def __str__(self):
88 return "antiword processing failed, probably because antiword is not installed, try 'sudo apt-get install antiword'"
99 else :
1010 return "antiword processing failed: {output}" .format (output = self .output .split ("\n " )[0 ])
11+
12+
class PdftotextNotInstalledException(Exception):
    """Signals that the ``pdftotext`` binary appears to be missing."""

    def __str__(self):
        # Built from two adjacent literals purely to keep the line short.
        hint = (
            "pdftotext binary does not seem to be installed. "
            "Try installing it using e.g. 'sudo apt-get install poppler-utils'"
        )
        return hint
16+
17+
class PdftotextException(CalledProcessError):
    """Error for a ``pdftotext`` run that exited with a failure status.

    Carries the usual ``CalledProcessError`` fields (``returncode``, ``cmd``,
    ``output``); ``str()`` reports only the first line of the captured output.
    """

    def __str__(self):
        output = self.output
        if not output:
            return "pdftotext processing silently failed."
        # On Python 3 subprocess hands back bytes; splitting bytes on a text
        # newline raises TypeError, so decode first.  The second isinstance
        # check keeps Python 2 byte strings (where bytes is str) untouched.
        if isinstance(output, bytes) and not isinstance(output, str):
            output = output.decode('utf-8', 'replace')
        first_line = output.split("\n", 1)[0]
        return "pdftotext processing failed: {output}".format(output=first_line)
Original file line number Diff line number Diff line change 22import logging
33import subprocess
44import os
5+ import subprocess
56import xml .etree .ElementTree as ET
6- from .exceptions import AntiwordException
7+ from .exceptions import AntiwordException , PdftotextException , \
8+ PdftotextNotInstalledException
79import six
810
911# solve issues with unicode for python3/2
@@ -44,6 +46,25 @@ def antiword(filename):
4446 return xmldata
4547
4648
def pdftotext(filename):
    """Return the text of the PDF file at ``filename`` as raw bytes.

    Runs the ``pdftotext`` command-line tool (package poppler-utils on
    Debian) and captures what it writes to standard output.

    Raises:
        IOError: ``filename`` does not exist.
        PdftotextNotInstalledException: the ``pdftotext`` binary could not
            be found.
        PdftotextException: ``pdftotext`` exited with a non-zero status.
    """
    import errno  # local import: only needed to classify the OSError below

    if not os.path.exists(filename):
        raise IOError('File not found: %s' % filename)
    try:
        # stderr is folded into stdout so that diagnostics are available in
        # the exception's ``output`` when the command fails.
        return subprocess.check_output(['pdftotext', filename, '-'],
                                       stderr=subprocess.STDOUT)
    except OSError as e:
        # Python 2 has no FileNotFoundError; a missing executable shows up
        # as OSError with ENOENT on both Python 2 and 3.
        if e.errno == errno.ENOENT:
            raise PdftotextNotInstalledException()
        raise
    except subprocess.CalledProcessError as e:
        # Report the failure to the caller instead of terminating the
        # whole process.
        raise PdftotextException(e.returncode, e.cmd, e.output)
66+
67+
4768def fix_hyphens (text ):
4869 return text .replace (u"\n \n –\n \n " ,u" – " )
4970
You can’t perform that action at this time.
0 commit comments