Extract text from PDF files using pdftotext

Tzafrir Cohen · Tzafrir Cohen · commit 3942004c6771 · 2018-02-23T19:21:50.000+02:00
* Use pdftotext (program from poppler-utils) to extract text from PDF
  files.
* Only run antiword on files that have a 'doc' extension; files that
  are neither 'doc' nor 'pdf' yield empty text.
diff --git a/knesset_data/protocols/base.py b/knesset_data/protocols/base.py
@@ -2,7 +2,7 @@
 import contextlib
 from tempfile import mkstemp
 import os
-from .utils import antiword, antixml
+from .utils import antiword, antixml, pdftotext
 from cached_property import cached_property
 import io, requests
 import logging
@@ -89,6 +89,17 @@ def antiword_xml(self):
     def antiword_text(self):
         return antixml(self.antiword_xml)
 
+    @cached_property
+    def pdf_text(self):
+        """ Text of the PDF at self.file_name, as extracted by pdftotext.
+
+        The pdftotext output is decoded as UTF-8. Pages are separated by a
+        0x0c (form feed) character.
+        """
+        raw_output = pdftotext(self.file_name)
+        # FIXME: remove explicit bidi characters?
+        return raw_output.decode('utf-8')
+
     def _close(self):
         [func() for func in self._cleanup]
 
diff --git a/knesset_data/protocols/committee.py b/knesset_data/protocols/committee.py
@@ -50,7 +50,13 @@ def text(self):
         if self._file_type == 'text':
             return self._file_data
         else:
-            text = decode(self.antiword_text, 'utf-8')
+            extension = self.file_extension.lower()  # compare case-insensitively
+            if extension == 'doc':
+                text = decode(self.antiword_text, 'utf-8')
+            elif extension == 'pdf':
+                text = decode(self.pdf_text, 'utf-8')
+            else:  # neither 'doc' nor 'pdf': no extractor is run
+                text = ''
             tmp = text.split('OMNITECH')
             if len(tmp)==2 and len(tmp[0]) < 40:
                 text = tmp[1]
diff --git a/knesset_data/protocols/exceptions.py b/knesset_data/protocols/exceptions.py
@@ -8,3 +8,16 @@ def __str__(self):
             return "antiword processing failed, probably because antiword is not installed, try 'sudo apt-get install antiword'"
         else:
             return "antiword processing failed: {output}".format(output=self.output.split("\n")[0])
+
+
+class PdftotextNotInstalledException(Exception):
+    """ Signals that the pdftotext executable could not be found. """
+
+    def __str__(self):
+        message = "pdftotext binary does not seem to be installed. Try installing it using e.g. 'sudo apt-get install poppler-utils'"
+        return message
+
+
+class PdftotextException(CalledProcessError):
+    """ Wraps a CalledProcessError coming from a failed pdftotext run.
+
+    Takes the same arguments as CalledProcessError: returncode, cmd, output.
+    """
+
+    def __str__(self):
+        """ Returns a one-line message built from the first line of output. """
+        if not self.output:
+            return "pdftotext processing silently failed."
+        output = self.output
+        if isinstance(output, bytes) and not isinstance(output, str):
+            # On Python 3 subprocess hands back bytes, and bytes.split("\n")
+            # raises TypeError; decode first. On Python 2 bytes is str, so
+            # this branch is skipped and the value is used as-is.
+            output = output.decode('utf-8', 'replace')
+        return "pdftotext processing failed: {output}".format(output=output.split("\n")[0])
diff --git a/knesset_data/protocols/utils.py b/knesset_data/protocols/utils.py
@@ -2,8 +2,10 @@
 import logging
 import subprocess
 import os
+import subprocess
 import xml.etree.ElementTree as ET
-from .exceptions import AntiwordException
+from .exceptions import AntiwordException, PdftotextException, \
+                        PdftotextNotInstalledException
 import six
 
 # solve issues with unicode for python3/2
@@ -44,6 +46,25 @@ def antiword(filename):
     return xmldata
 
 
+def pdftotext(filename):
+    """ Returns the output of pdftotext for the PDF file at filename.
+
+    Uses pdftotext (poppler-utils on Debian); stderr is merged into the
+    result. Raises IOError if filename does not exist,
+    PdftotextNotInstalledException if the binary is missing and
+    PdftotextException if pdftotext exits with a non-zero status.
+    """
+    import errno
+    if not os.path.exists(filename):
+        raise IOError('File not found: %s'%filename)
+    try:
+        return subprocess.check_output(['pdftotext', filename, '-'],
+                                       stderr=subprocess.STDOUT)
+    except OSError as e:  # FileNotFoundError is Python 3 only; use errno
+        if e.errno != errno.ENOENT: raise  # not a missing binary
+        raise PdftotextNotInstalledException()
+    except subprocess.CalledProcessError as e:
+        raise PdftotextException(e.returncode, e.cmd, e.output)
+
+
 def fix_hyphens(text):
     return text.replace(u"\n\n–\n\n",u" – ")