From 9e040e99fc7501a7280e56e3e3a150c7d2527cf4 Mon Sep 17 00:00:00 2001 From: Tendo33 Date: Tue, 24 Dec 2024 17:52:50 +0800 Subject: [PATCH] fix(mspowerpoint): handle invalid images in PowerPoint slides - Add error handling for images that cannot be loaded by Pillow - Improve resilience when encountering corrupted or unsupported image formats - Maintain processing of other slide elements even if an image fails to load --- docling/backend/mspowerpoint_backend.py | 26 ++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index f595e4bd..995969d4 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -16,7 +16,7 @@ TableCell, TableData, ) -from PIL import Image +from PIL import Image, UnidentifiedImageError from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER @@ -120,6 +120,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc): bullet_type = "None" list_text = "" list_label = GroupLabel.LIST + doc_label = DocItemLabel.LIST_ITEM prov = self.generate_prov(shape, slide_ind, shape.text.strip()) # Identify if shape contains lists @@ -276,16 +277,19 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc): im_dpi, _ = image.dpi # Open it with PIL - pil_image = Image.open(BytesIO(image_bytes)) - - # shape has picture - prov = self.generate_prov(shape, slide_ind, "") - doc.add_picture( - parent=parent_slide, - image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), - caption=None, - prov=prov, - ) + try: + pil_image = Image.open(BytesIO(image_bytes)) + + # shape has picture + prov = self.generate_prov(shape, slide_ind, "") + doc.add_picture( + parent=parent_slide, + image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), + caption=None, + prov=prov, + ) + except (UnidentifiedImageError, OSError) as e: + _log.warning(f"Warning: image cannot be loaded by Pillow: {e}") return def handle_tables(self, shape, parent_slide, slide_ind, doc):