hasadna · tzafrir0hasadna01 · Feb 23, 2018 · Feb 23, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -6,6 +6,7 @@ addons:
   apt:
     packages:
     - antiword
+    - poppler-utils
     - language-pack-he
 install:
 - ".travis/install.sh"

diff --git a/knesset_data/protocols/base.py b/knesset_data/protocols/base.py
@@ -2,7 +2,7 @@
 import contextlib
 from tempfile import mkstemp
 import os
-from .utils import antiword, antixml
+from .utils import antiword, antixml, pdftotext
 from cached_property import cached_property
 import io, requests
 import logging
@@ -15,8 +15,9 @@ class BaseProtocolFile(object):
 
     temp_file_suffix = "temp_knesset_data_protocols_"
 
-    def __init__(self, file, proxies=None):
+    def __init__(self, file, proxies=None, extension=None):
         self._file_type, self._file_data = file
+        self.extension = extension
         self._cleanup = []
         self._proxies = proxies if proxies else {}
 
@@ -55,6 +56,8 @@ def file_extension(self):
         if self._file_type in ("filename", "url") and self._file_data:
             filename, file_extension = os.path.splitext(self._file_data)
             return file_extension[1:]
+        if self.extension is not None:
+            return self.extension
         else:
             return None
 
@@ -86,13 +89,24 @@ def antiword_xml(self):
     def antiword_text(self):
         return antixml(self.antiword_xml)
 
+    @cached_property
+    def pdf_text(self):
+        """ Text of the PDF at self.file_name, extracted with pdftotext.
+
+        The pdftotext output is decoded as UTF-8.
+        Pages are separated by a 0x0c (form feed) character.
+        """
+        raw_output = pdftotext(self.file_name)
+        # FIXME: remove explicit bidi characters?
+        return raw_output.decode('utf-8')
+
     def _close(self):
         [func() for func in self._cleanup]
 
     @classmethod
     @contextlib.contextmanager
-    def _get_from(cls, file_type, file_data, proxies=None):
-        obj = cls((file_type, file_data), proxies=proxies)
+    def _get_from(cls, file_type, file_data, proxies=None, extension=None):
+        obj = cls((file_type, file_data), proxies=proxies, extension=extension)
         try:
             yield obj
         finally:
@@ -110,8 +124,8 @@ def get_from_url(cls, url, proxies=None):
 
     @classmethod
     @contextlib.contextmanager
-    def get_from_file(cls, file):
-        with cls._get_from('file', file) as p: yield p
+    def get_from_file(cls, file, extension=None):
+        with cls._get_from('file', file, extension=extension) as p: yield p
 
     @classmethod
     @contextlib.contextmanager

diff --git a/knesset_data/protocols/committee.py b/knesset_data/protocols/committee.py
@@ -50,7 +50,13 @@ def text(self):
         if self._file_type == 'text':
             return self._file_data
         else:
-            text = decode(self.antiword_text, 'utf-8')
+            extension = (self.file_extension or 'doc').lower()  # no extension: treat as .doc, as before PDF support
+            if extension == 'doc':
+                text = decode(self.antiword_text, 'utf-8')
+            elif extension == 'pdf':
+                text = decode(self.pdf_text, 'utf-8')
+            else:
+                text = ''
             tmp = text.split('OMNITECH')
             if len(tmp)==2 and len(tmp[0]) < 40:
                 text = tmp[1]

diff --git a/knesset_data/protocols/exceptions.py b/knesset_data/protocols/exceptions.py
@@ -8,3 +8,23 @@ def __str__(self):
             return "antiword processing failed, probably because antiword is not installed, try 'sudo apt-get install antiword'"
         else:
             return "antiword processing failed: {output}".format(output=self.output.split("\n")[0])
+
+
+class PdftotextNotInstalledException(Exception):
+    """Signals that the pdftotext binary could not be found."""
+
+    def __str__(self):
+        return "pdftotext binary does not seem to be installed. Try installing it using e.g. 'sudo apt-get install poppler-utils'"
+
+
+class PdftotextException(CalledProcessError):
+    """Signals that a pdftotext run failed; `output` holds what it printed."""
+
+    def __str__(self):
+        output = self.output
+        if not output:
+            return "pdftotext processing silently failed."
+        if isinstance(output, bytes) and not isinstance(output, str):
+            # Python 3: captured output is bytes, which cannot be split on a str
+            output = output.decode('utf-8', 'replace')
+        return "pdftotext processing failed: {output}".format(output=output.split("\n")[0])
diff --git a/knesset_data/protocols/utils.py b/knesset_data/protocols/utils.py
@@ -2,8 +2,10 @@
 import logging
 import subprocess
 import os
+import errno
 import xml.etree.ElementTree as ET
-from .exceptions import AntiwordException
+from .exceptions import AntiwordException, PdftotextException, \
+                        PdftotextNotInstalledException
 import six
 
 # solve issues with unicode for python3/2
@@ -44,6 +46,25 @@ def antiword(filename):
     return xmldata
 
 
+def pdftotext(filename):
+    """ Return the combined stdout/stderr of `pdftotext <filename> -` (poppler-utils).
+
+    Raises IOError (missing file), PdftotextNotInstalledException (no pdftotext
+    binary) or PdftotextException (pdftotext exited with a non-zero status)."""
+    if not os.path.exists(filename):
+        raise IOError('File not found: %s'%filename)
+    try:
+        text = subprocess.check_output(['pdftotext', filename, '-'],
+                                       stderr=subprocess.STDOUT)
+    except OSError as e:  # FileNotFoundError is Python 3 only; check errno instead
+        if e.errno != errno.ENOENT:
+            raise
+        raise PdftotextNotInstalledException()
+    except subprocess.CalledProcessError as e:
+        raise PdftotextException(e.returncode, e.cmd, e.output)
+    return text
+
+
 def fix_hyphens(text):
     return text.replace(u"\n\n–\n\n",u" – ")