Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ addons:
apt:
packages:
- antiword
- poppler-utils
- language-pack-he
install:
- ".travis/install.sh"
Expand Down
26 changes: 20 additions & 6 deletions knesset_data/protocols/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import contextlib
from tempfile import mkstemp
import os
from .utils import antiword, antixml
from .utils import antiword, antixml, pdftotext
from cached_property import cached_property
import io, requests
import logging
Expand All @@ -15,8 +15,9 @@ class BaseProtocolFile(object):

temp_file_suffix = "temp_knesset_data_protocols_"

def __init__(self, file, proxies=None):
def __init__(self, file, proxies=None, extension=None):
self._file_type, self._file_data = file
self.extension = extension
self._cleanup = []
self._proxies = proxies if proxies else {}

Expand Down Expand Up @@ -55,6 +56,8 @@ def file_extension(self):
if self._file_type in ("filename", "url") and self._file_data:
filename, file_extension = os.path.splitext(self._file_data)
return file_extension[1:]
if self.extension is not None:
return self.extension
else:
return None

Expand Down Expand Up @@ -86,13 +89,24 @@ def antiword_xml(self):
def antiword_text(self):
    """Return the result of running ``antixml`` over ``self.antiword_xml``."""
    xml_data = self.antiword_xml
    return antixml(xml_data)

@cached_property
def pdf_text(self):
    """Text of the PDF file at ``self.file_name``, extracted with pdftotext.

    The output of ``pdftotext`` is decoded as UTF-8 before being returned.
    Pages are separated by a 0x0c (form feed) character.
    """
    raw_output = pdftotext(self.file_name)
    # FIXME: remove explicit bidi characters?
    return raw_output.decode('utf-8')

def _close(self):
    """Call every registered cleanup callable, in registration order."""
    for cleanup_func in self._cleanup:
        cleanup_func()

@classmethod
@contextlib.contextmanager
def _get_from(cls, file_type, file_data, proxies=None):
obj = cls((file_type, file_data), proxies=proxies)
def _get_from(cls, file_type, file_data, proxies=None, extension=None):
obj = cls((file_type, file_data), proxies=proxies, extension=extension)
try:
yield obj
finally:
Expand All @@ -110,8 +124,8 @@ def get_from_url(cls, url, proxies=None):

@classmethod
@contextlib.contextmanager
def get_from_file(cls, file):
with cls._get_from('file', file) as p: yield p
def get_from_file(cls, file, extension=None):
with cls._get_from('file', file, extension=extension) as p: yield p

@classmethod
@contextlib.contextmanager
Expand Down
8 changes: 7 additions & 1 deletion knesset_data/protocols/committee.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ def text(self):
if self._file_type == 'text':
return self._file_data
else:
text = decode(self.antiword_text, 'utf-8')
extension = self.file_extension.lower()
if self.file_extension == 'doc':
text = decode(self.antiword_text, 'utf-8')
elif self.file_extension == 'pdf':
text = decode(self.pdf_text, 'utf-8')
else:
text = ''
tmp = text.split('OMNITECH')
if len(tmp)==2 and len(tmp[0]) < 40:
text = tmp[1]
Expand Down
13 changes: 13 additions & 0 deletions knesset_data/protocols/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,16 @@ def __str__(self):
return "antiword processing failed, probably because antiword is not installed, try 'sudo apt-get install antiword'"
else:
return "antiword processing failed: {output}".format(output=self.output.split("\n")[0])


class PdftotextNotInstalledException(Exception):
    """Signals that the ``pdftotext`` executable could not be found."""

    def __str__(self):
        # Fixed message: point the user at the package providing the binary.
        message = (
            "pdftotext binary does not seem to be installed. "
            "Try installing it using e.g. 'sudo apt-get install poppler-utils'"
        )
        return message


class PdftotextException(CalledProcessError):
    """Raised when the ``pdftotext`` process exits with a non-zero status.

    Inherits ``returncode``, ``cmd`` and ``output`` from
    ``CalledProcessError``. ``str()`` of the exception reports the first
    line of the captured output, or a generic message when there is none.
    """

    def __str__(self):
        output = self.output
        if not output:
            return "pdftotext processing silently failed."
        if isinstance(output, bytes) and not isinstance(output, str):
            # On Python 3 check_output() captures bytes; splitting bytes on
            # a text "\n" raises TypeError, so decode first. On Python 2
            # bytes is str and this branch is skipped.
            output = output.decode('utf-8', 'replace')
        return "pdftotext processing failed: {output}".format(output=output.split("\n")[0])
23 changes: 22 additions & 1 deletion knesset_data/protocols/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import logging
import subprocess
import os
import subprocess
import xml.etree.ElementTree as ET
from .exceptions import AntiwordException
from .exceptions import AntiwordException, PdftotextException, \
PdftotextNotInstalledException
import six

# solve issues with unicode for python3/2
Expand Down Expand Up @@ -44,6 +46,25 @@ def antiword(filename):
return xmldata


def pdftotext(filename):
    """Return the text of the PDF file at ``filename``.

    Uses pdftotext from package poppler-utils on Debian.

    The value returned is the undecoded output of the ``pdftotext``
    process; stderr is merged into it because the process is run with
    ``stderr=subprocess.STDOUT``.

    Raises:
        IOError: if ``filename`` does not exist.
        PdftotextNotInstalledException: if the ``pdftotext`` executable
            cannot be found.
        PdftotextException: if ``pdftotext`` exits with a non-zero status.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    import errno

    if not os.path.exists(filename):
        raise IOError('File not found: %s'%filename)
    try:
        return subprocess.check_output(['pdftotext', filename, '-'],
                                       stderr=subprocess.STDOUT)
    except OSError as e:
        # A missing executable surfaces as OSError(ENOENT) on both
        # Python 2 and 3 (FileNotFoundError exists only on Python 3).
        if e.errno == errno.ENOENT:
            raise PdftotextNotInstalledException()
        raise
    except subprocess.CalledProcessError as e:
        # Report the failure to the caller instead of terminating the
        # whole interpreter.
        raise PdftotextException(e.returncode, e.cmd, e.output)


def fix_hyphens(text):
    """Pull a dash that is set apart by double newlines back into its line.

    Each dash surrounded by two newlines on either side is replaced by
    the same dash surrounded by single spaces.
    """
    detached_dash = u"\n\n–\n\n"
    inline_dash = u" – "
    return text.replace(detached_dash, inline_dash)

Expand Down