Skip to content

Commit

Permalink
fix(mspowerpoint): handle invalid images in PowerPoint slides
Browse files: browse the repository at this point in the history
- Add error handling for images that cannot be loaded by Pillow
- Improve resilience when encountering corrupted or unsupported image formats
- Continue processing the remaining slide elements even if an image fails to load
  • Loading branch information
Tendo33 committed Dec 24, 2024
1 parent 2b591f9 commit 9e040e9
Showing 1 changed file with 15 additions and 11 deletions.
26 changes: 15 additions & 11 deletions docling/backend/mspowerpoint_backend.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,7 +16,7 @@
TableCell,
TableData,
)
from PIL import Image
from PIL import Image, UnidentifiedImageError
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER

Expand Down Expand Up @@ -120,6 +120,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
bullet_type = "None"
list_text = ""
list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip())

# Identify if shape contains lists
Expand Down Expand Up @@ -276,16 +277,19 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc):
im_dpi, _ = image.dpi

# Open it with PIL
pil_image = Image.open(BytesIO(image_bytes))

# shape has picture
prov = self.generate_prov(shape, slide_ind, "")
doc.add_picture(
parent=parent_slide,
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
caption=None,
prov=prov,
)
try:
pil_image = Image.open(BytesIO(image_bytes))

# shape has picture
prov = self.generate_prov(shape, slide_ind, "")
doc.add_picture(
parent=parent_slide,
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
caption=None,
prov=prov,
)
except (UnidentifiedImageError, OSError) as e:
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
return

def handle_tables(self, shape, parent_slide, slide_ind, doc):
Expand Down

0 comments on commit 9e040e9

Please sign in to comment.