-
-
Notifications
You must be signed in to change notification settings - Fork 310
/
utils.py
59 lines (50 loc) · 2.06 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Copyright 2015-2021 Akretion France
# @author: Alexis de Lattre <[email protected]>
# Copyright 2022 Camptocamp SA
# @author: Simone Orsi <[email protected]>
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).
import logging
import mimetypes
from io import BytesIO
from lxml import etree
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# The pypdf import is guarded: when the package is missing, the failure is
# only logged at debug level so that importing this module still succeeds.
# In that case the name ``pypdf`` is left unbound, so code that uses it
# fails only when it is actually called.
try:
    import pypdf
except ImportError:
    _logger.debug("Cannot import pypdf")
class PDFParser:
    """Extract XML attachments embedded in a PDF document.

    The PDF is supplied as raw bytes. Attachments whose file name is guessed
    to be XML are parsed with lxml and returned keyed by file name.
    """

    # MIME types, as reported by ``mimetypes.guess_type``, treated as XML.
    _XML_MIMETYPES = ("application/xml", "text/xml")

    def __init__(self, pdf_file):
        """Store the PDF to parse.

        :param pdf_file: binary PDF file content
        """
        self.pdf_file = pdf_file

    def get_xml_files(self):
        """Parse the PDF given to the constructor to extract XML content.

        :returns: a dict like {$filename: $parsed_xml_file_obj}; the dict is
            empty when no valid XML attachment is found.
        """
        with BytesIO(self.pdf_file) as fd:
            res = self._extract_xml_files(fd)
        if res:
            _logger.debug("Valid XML files found in PDF: %s", list(res.keys()))
        return res

    @classmethod
    def _is_xml_filename(cls, filename):
        """Tell whether a file name is guessed to be XML.

        :param filename: attachment file name
        :returns: True when ``mimetypes.guess_type`` reports one of the MIME
            types listed in ``_XML_MIMETYPES``, False otherwise (including
            when no type can be guessed).
        """
        # guess_type always returns a (type, encoding) pair; only the type
        # matters here, and it is None when nothing could be guessed.
        mimetype, _encoding = mimetypes.guess_type(filename)
        return mimetype in cls._XML_MIMETYPES

    def _extract_xml_files(self, fd):
        """Collect the parseable, non-empty XML attachments of a PDF.

        Attachments that are not XML, cannot be parsed, or whose root element
        has no children are skipped (the last two cases with a warning).

        :param fd: binary file-like object holding the PDF content
        :returns: a dict mapping attachment file name to its parsed XML root
        """
        reader = pypdf.PdfReader(fd)
        # attachment parsing via pypdf doesn't support /Kids
        # cf. bug report https://github.com/py-pdf/pypdf/issues/2087
        xmlfiles = {}
        for filename, content_list in reader.attachments.items():
            _logger.debug("Attachment %s found in PDF", filename)
            if not self._is_xml_filename(filename):
                continue
            try:
                _logger.debug("Trying to parse XML attachment %s", filename)
                # Only the first content entry of the attachment is parsed.
                xml_root = etree.fromstring(content_list[0])
            except Exception as err:
                # Deliberately broad: extraction is best-effort, so one bad
                # attachment must not prevent the others from being returned.
                _logger.warning(
                    "Failed to parse XML file %s. Error: %s", filename, err
                )
                continue
            # len() of an lxml element is its number of child elements; a
            # root without children is considered an empty document.
            if len(xml_root) > 0:
                _logger.info("Valid XML file %s found in attachments", filename)
                xmlfiles[filename] = xml_root
            else:
                _logger.warning("XML file %s is empty", filename)
        return xmlfiles