Skip to content

Commit 3942004

Browse files
author
Tzafrir Cohen
committed
Extract text from PDF files using pdftotext
* Use pdftotext (program from poppler-utils) to extract text from PDF files.
* Don't try to extract text from files that don't have a '.doc' extension.
1 parent 2f0d890 commit 3942004

File tree

4 files changed

+54
-3
lines changed

4 files changed

+54
-3
lines changed

knesset_data/protocols/base.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import contextlib
33
from tempfile import mkstemp
44
import os
5-
from .utils import antiword, antixml
5+
from .utils import antiword, antixml, pdftotext
66
from cached_property import cached_property
77
import io, requests
88
import logging
@@ -89,6 +89,17 @@ def antiword_xml(self):
8989
def antiword_text(self):
    """Return the result of passing ``self.antiword_xml`` through ``antixml``."""
    xml_data = self.antiword_xml
    return antixml(xml_data)
9191

92+
@cached_property
def pdf_text(self):
    """Return the text that pdftotext extracts from ``self.file_name``.

    The raw output of ``pdftotext`` is decoded as UTF-8.
    Pages are separated by a 0x0c (form feed) character.
    """
    raw_output = pdftotext(self.file_name)
    # FIXME: remove explicit bidi characters?
    return raw_output.decode('utf-8')
102+
92103
def _close(self):
93104
[func() for func in self._cleanup]
94105

knesset_data/protocols/committee.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,13 @@ def text(self):
5050
if self._file_type == 'text':
5151
return self._file_data
5252
else:
53-
text = decode(self.antiword_text, 'utf-8')
53+
extension = self.file_extension.lower()
54+
if self.file_extension == 'doc':
55+
text = decode(self.antiword_text, 'utf-8')
56+
elif self.file_extension == 'pdf':
57+
text = decode(self.pdf_text, 'utf-8')
58+
else:
59+
text = ''
5460
tmp = text.split('OMNITECH')
5561
if len(tmp)==2 and len(tmp[0]) < 40:
5662
text = tmp[1]

knesset_data/protocols/exceptions.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,16 @@ def __str__(self):
88
return "antiword processing failed, probably because antiword is not installed, try 'sudo apt-get install antiword'"
99
else:
1010
return "antiword processing failed: {output}".format(output=self.output.split("\n")[0])
11+
12+
13+
class PdftotextNotInstalledException(Exception):
    """Signals that the pdftotext binary does not appear to be installed."""

    # Fixed, user-facing message; it carries an installation hint.
    _MESSAGE = (
        "pdftotext binary does not seem to be installed. "
        "Try installing it using e.g. 'sudo apt-get install poppler-utils'"
    )

    def __str__(self):
        return self._MESSAGE
16+
17+
18+
class PdftotextException(CalledProcessError):
    """Error describing a failed pdftotext run.

    ``str()`` of the exception reports the first line of the captured
    output, or a generic message when no output was captured.
    """

    def __str__(self):
        output = self.output
        if not output:
            return "pdftotext processing silently failed."
        # subprocess returns bytes on Python 3, and splitting bytes on a text
        # newline raises TypeError, so convert to text first. On Python 2
        # ``bytes`` is ``str`` and this branch is never taken.
        if isinstance(output, bytes) and not isinstance(output, str):
            output = output.decode('utf-8', 'replace')
        first_line = output.split("\n")[0]
        return "pdftotext processing failed: {output}".format(output=first_line)

knesset_data/protocols/utils.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import logging
33
import subprocess
44
import os
5+
import subprocess
56
import xml.etree.ElementTree as ET
6-
from .exceptions import AntiwordException
7+
from .exceptions import AntiwordException, PdftotextException, \
8+
PdftotextNotInstalledException
79
import six
810

911
# solve issues with unicode for python3/2
@@ -44,6 +46,25 @@ def antiword(filename):
4446
return xmldata
4547

4648

49+
def pdftotext(filename):
    """Return the text of the PDF file at the given path.

    Runs ``pdftotext <filename> -`` (pdftotext comes from package
    poppler-utils on Debian) and returns what the command writes.

    :param filename: path of the PDF file to convert.
    :return: the raw output of the command; it is not decoded here.
    :raises IOError: if ``filename`` does not exist.
    :raises PdftotextNotInstalledException: if the pdftotext executable
        cannot be found.
    :raises PdftotextException: if pdftotext exits with a non-zero status.
    """
    import errno  # local import: only needed to recognise a missing binary

    if not os.path.exists(filename):
        raise IOError('File not found: %s' % filename)
    command = ['pdftotext', filename, '-']
    try:
        # NOTE(review): stderr is merged into stdout, so any diagnostics
        # pdftotext prints on a successful run end up in the returned text.
        return subprocess.check_output(command, stderr=subprocess.STDOUT)
    except OSError as e:
        # A missing executable surfaces as OSError/ENOENT on Python 2 and as
        # FileNotFoundError (an OSError subclass) on Python 3.
        if e.errno == errno.ENOENT:
            raise PdftotextNotInstalledException()
        raise
    except subprocess.CalledProcessError as e:
        raise PdftotextException(e.returncode, e.cmd, e.output)
66+
67+
4768
def fix_hyphens(text):
    """Replace each run of four consecutive newlines in ``text`` with an
    en dash surrounded by single spaces, and return the result."""
    quad_newline = u"\n\n\n\n"
    spaced_dash = u" – "
    return text.replace(quad_newline, spaced_dash)
4970

0 commit comments

Comments
 (0)