Skip to content

Commit 9683245

Browse files
committed
fix(mspowerpoint): handle invalid images in PowerPoint slides
- Add error handling for images that cannot be loaded by Pillow
- Improve resilience when encountering corrupted or unsupported image formats
- Maintain processing of other slide elements even if an image fails to load

Signed-off-by: Tendo33 <[email protected]>
1 parent 2b591f9 commit 9683245

File tree

1 file changed

+15
-11
lines changed

1 file changed

+15
-11
lines changed

docling/backend/mspowerpoint_backend.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
TableCell,
1717
TableData,
1818
)
19-
from PIL import Image
19+
from PIL import Image, UnidentifiedImageError
2020
from pptx import Presentation
2121
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
2222

@@ -120,6 +120,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
120120
bullet_type = "None"
121121
list_text = ""
122122
list_label = GroupLabel.LIST
123+
doc_label = DocItemLabel.LIST_ITEM
123124
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
124125

125126
# Identify if shape contains lists
@@ -276,16 +277,19 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc):
276277
im_dpi, _ = image.dpi
277278

278279
# Open it with PIL
279-
pil_image = Image.open(BytesIO(image_bytes))
280-
281-
# shape has picture
282-
prov = self.generate_prov(shape, slide_ind, "")
283-
doc.add_picture(
284-
parent=parent_slide,
285-
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
286-
caption=None,
287-
prov=prov,
288-
)
280+
try:
281+
pil_image = Image.open(BytesIO(image_bytes))
282+
283+
# shape has picture
284+
prov = self.generate_prov(shape, slide_ind, "")
285+
doc.add_picture(
286+
parent=parent_slide,
287+
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
288+
caption=None,
289+
prov=prov,
290+
)
291+
except (UnidentifiedImageError, OSError) as e:
292+
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
289293
return
290294

291295
def handle_tables(self, shape, parent_slide, slide_ind, doc):

0 commit comments

Comments
 (0)