diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 4cdfa1b9..9cce9497 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -436,37 +436,219 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: xObject = page["/Resources"]["/XObject"].get_object() images = [] - for obj in xObject: + for obj_name in xObject: # Renamed obj to obj_name np_image: Any = None - if xObject[obj]["/Subtype"] == "/Image": - img_filter = ( - xObject[obj]["/Filter"][1:] - if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject - else xObject[obj]["/Filter"][0][1:] - ) - if img_filter in _PDF_FILTER_WITHOUT_LOSS: - height, width = xObject[obj]["/Height"], xObject[obj]["/Width"] - - np_image = np.frombuffer( - xObject[obj].get_data(), dtype=np.uint8 - ).reshape(height, width, -1) - elif img_filter in _PDF_FILTER_WITH_LOSS: - np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data()))) - - else: - logger.warning("Unknown PDF Filter!") - if np_image is not None: - image_bytes = io.BytesIO() + image_data_processed = False # Flag to indicate if np_image is set + try: + if xObject[obj_name]["/Subtype"] == "/Image": + raw_data = xObject[obj_name].get_data() + img_filter_obj = xObject[obj_name]["/Filter"] + + # Handle cases where Filter might be a list or a single NameObject + if isinstance(img_filter_obj, pypdf.generic.ArrayObject): + if not img_filter_obj: # Empty filter array + logger.warning(f"PyPDFParser: Skipping image ({obj_name}) due to empty /Filter array.") + continue + # Get the first filter name from the array + first_filter_in_array = img_filter_obj[0] + if isinstance(first_filter_in_array, pypdf.generic.NameObject): + filter_val = str(first_filter_in_array) + img_filter = filter_val[1:] if filter_val.startswith("/") else filter_val + else: + logger.warning(f"PyPDFParser: Skipping image ({obj_name}) as first element in /Filter array is not a NameObject: {type(first_filter_in_array)}.") + continue + elif isinstance(img_filter_obj, pypdf.generic.NameObject): + filter_val = str(img_filter_obj) + img_filter = filter_val[1:] if filter_val.startswith("/") else filter_val + else: + logger.warning(f"PyPDFParser: Skipping image ({obj_name}) due to unknown /Filter type: {type(img_filter_obj)}.") + continue - if image_bytes.getbuffer().nbytes == 0: + if not img_filter: + logger.warning(f"PyPDFParser: Skipping image ({obj_name}) due to missing or empty /Filter name.") continue - Image.fromarray(np_image).save(image_bytes, format="PNG") - blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png") - image_text = next(self.images_parser.lazy_parse(blob)).page_content - images.append( - _format_inner_image(blob, image_text, self.images_inner_format) + # Define a new list for filters best handled by Pillow + # This includes lossy filters, 1-bit fax/jbig2, and common generic + # compression filters that might wrap actual image file formats (e.g., PNG). + _FILTERS_HANDLED_BY_PILLOW = ( + _PDF_FILTER_WITH_LOSS + + [ + "CCITTFaxDecode", "CCF", + "JBIG2Decode", + "FlateDecode", "Fl", # Added FlateDecode + "LZWDecode", "LZW", # Added LZWDecode + "RunLengthDecode", "RL", # Added RunLengthDecode + ] ) + # ASCII85Decode and ASCIIHexDecode are typically encoding layers for binary data, + # not image formats themselves. The data *after* these decodes would then + # be subject to further filtering (e.g. FlateDecode then DCTDecode). + # pypdf's get_data() should handle these base encodings. + # So, _PDF_FILTER_WITHOUT_LOSS will now be much smaller, primarily for cases + # where data might truly be raw pixels after ASCII/Hex decode, if any. + + if img_filter in _FILTERS_HANDLED_BY_PILLOW: + try: + pil_img_from_bytes = Image.open(io.BytesIO(raw_data)) + # Convert mode if necessary to ensure consistency for np.array + # and channel detection. + # Common modes after open: 1, L, P (palette), RGB, RGBA, CMYK, YCbCr + if pil_img_from_bytes.mode == "P": # Palette + # Convert P to RGBA or RGB to handle transparency and simplify + pil_img_from_bytes = pil_img_from_bytes.convert("RGBA" if "transparency" in pil_img_from_bytes.info else "RGB") + elif pil_img_from_bytes.mode not in ("1", "L", "RGB", "RGBA"): + # For other modes like CMYK, YCbCr, etc., convert to RGB + pil_img_from_bytes = pil_img_from_bytes.convert("RGB") + + np_image = np.array(pil_img_from_bytes) + image_data_processed = True + + # Determine channels from the (potentially converted) PIL image mode + if pil_img_from_bytes.mode == "1" or pil_img_from_bytes.mode == "L": + pil_channels = 1 + elif pil_img_from_bytes.mode == "RGBA": + pil_channels = 4 + elif pil_img_from_bytes.mode == "RGB": # Covers P converted to RGB, CMYK to RGB etc. + pil_channels = 3 + else: + # This case should ideally not be reached if conversions above are comprehensive + logger.warning(f"PyPDFParser: Image ({obj_name}, filter {img_filter}) has an unexpected mode '{pil_img_from_bytes.mode}' after Pillow processing. Attempting 3 channels.") + pil_channels = 3 # Default assumption + + except Exception as e: + logger.warning( + f"PyPDFParser: Could not open/process image ({obj_name}) " + f"with filter {img_filter} using Pillow: {e}" + ) + continue + elif img_filter in _PDF_FILTER_WITHOUT_LOSS: + # This path is now for filters NOT in _FILTERS_HANDLED_BY_PILLOW. + # Given the expansion of _FILTERS_HANDLED_BY_PILLOW, this branch + # will be hit less often. It might apply to ASCIIHex/ASCII85 if they + # were the *only* filter and the result was raw pixel data, or other + # very specific uncompressed formats not typically wrapped by Flate etc. + # The safeguard below is now more general. + if img_filter in _FILTERS_HANDLED_BY_PILLOW: # Safeguard + logger.warning( + f"PyPDFParser: Filter {img_filter} for image ({obj_name}) " + "was unexpectedly routed to reshape path (safeguard). Skipping." + ) + continue + + height = int(xObject[obj_name]["/Height"]) + width = int(xObject[obj_name]["/Width"]) + actual_size = len(raw_data) + + if actual_size == 0 or height == 0 or width == 0: + logger.warning( + f"PyPDFParser: Skipping image ({obj_name}, filter: {img_filter}) " + f"due to zero size, height, or width. HxW: {height}x{width}, Size: {actual_size}" + ) + continue + + if actual_size % (height * width) != 0: + logger.warning( + f"PyPDFParser: Skipping image ({obj_name}, filter: {img_filter}). " + f"Actual data size ({actual_size}) is not divisible by H*W " + f"({height*width})." + ) + continue + + inferred_channels = actual_size // (height * width) + if inferred_channels not in (1, 3, 4): + logger.warning( + f"PyPDFParser: Skipping image ({obj_name}, filter: {img_filter}). " + f"Inferred channels ({inferred_channels}) is not 1, 3, or 4. " + f"Dimensions: {width}x{height}, Actual Size: {actual_size}." + ) + continue + + np_image = np.frombuffer(raw_data, dtype=np.uint8).reshape( + height, width, inferred_channels + ) + image_data_processed = True + # Store inferred_channels for PIL processing + pil_channels = inferred_channels + + elif img_filter in _PDF_FILTER_WITH_LOSS: # e.g. DCTDecode (JPEG), JPXDecode (JPEG2000) + try: + pil_img_from_bytes = Image.open(io.BytesIO(raw_data)) + # Convert to RGB if it's not, common for JPEGs that might be CMYK etc. + if pil_img_from_bytes.mode not in ("L", "RGB", "RGBA"): + pil_img_from_bytes = pil_img_from_bytes.convert("RGB") + np_image = np.array(pil_img_from_bytes) + image_data_processed = True + # Determine channels from PIL image mode + if pil_img_from_bytes.mode == "L": + pil_channels = 1 + elif pil_img_from_bytes.mode == "RGBA": + pil_channels = 4 + else: # Primarily RGB + pil_channels = 3 + except Exception as e: + logger.warning( + f"PyPDFParser: Could not open image ({obj_name}) " + f"with filter {img_filter} using Pillow: {e}" + ) + continue + else: + logger.warning(f"PyPDFParser: Unknown PDF Filter for image ({obj_name}): {img_filter}") + continue # Skip if filter is unknown + + if image_data_processed and np_image is not None: + # Ensure np_image is a numpy array (Image.open might return PIL image) + if not isinstance(np_image, np.ndarray): + np_image = np.array(np_image) + + # Check for empty arrays after conversion (e.g. if PIL image was empty) + if np_image.size == 0: + logger.warning(f"PyPDFParser: Skipping image ({obj_name}) as it resulted in an empty numpy array.") + continue + + pil_image_to_save = Image.fromarray(np_image) + + # Adjust PIL image mode based on (inferred) channels for correct saving + if pil_channels == 1: + pil_image_to_save = pil_image_to_save.convert("L") + elif pil_channels == 4: + # Check if alpha channel is all opaque, then convert to RGB + if np_image.ndim == 3 and np_image.shape[2] == 4: + if (np_image[:, :, 3] == 255).all(): + pil_image_to_save = pil_image_to_save.convert("RGB") + # For 3 channels, it's often already RGB. convert("RGB") handles other modes. + elif pil_image_to_save.mode != "RGB": # Ensure RGB for 3 channels if not already + pil_image_to_save = pil_image_to_save.convert("RGB") + + + image_bytes_io = io.BytesIO() + save_format = "PNG" # PNG is a good lossless choice for varied channel data + pil_image_to_save.save(image_bytes_io, format=save_format) + + if image_bytes_io.getbuffer().nbytes == 0: + logger.warning( + f"PyPDFParser: Skipping image ({obj_name}) as it resulted in " + f"zero bytes after PIL saving." + ) + continue + + blob = Blob.from_data( + image_bytes_io.getvalue(), mime_type=f"image/{save_format.lower()}" + ) + image_text = next(self.images_parser.lazy_parse(blob)).page_content + images.append( + _format_inner_image(blob, image_text, self.images_inner_format) + ) + except ValueError as e: # Catch reshape errors specifically + logger.warning( + f"PyPDFParser: Failed to reshape image ({obj_name}). Error: {e}" + ) + continue # Skip to next image object + except Exception as e: # Catch other errors during processing an image object + logger.warning( + f"PyPDFParser: Error processing XObject ({obj_name}). Error: {e}" + ) + continue # Skip to next image object return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) ) @@ -1104,26 +1286,137 @@ def _extract_images_from_page( img_list = page.get_images() images = [] - for img in img_list: + for img_info in img_list: # Renamed img to img_info to avoid conflict with PIL.Image if self.images_parser: - xref = img[0] - pix = pymupdf.Pixmap(doc, xref) - image = np.frombuffer(pix.samples, dtype=np.uint8).reshape( - pix.height, pix.width, -1 - ) - image_bytes = io.BytesIO() - if image_bytes.getbuffer().nbytes == 0: - continue + xref = img_info[0] + try: + pix = pymupdf.Pixmap(doc, xref) + height = pix.height + width = pix.width + channels = pix.n + samples = pix.samples + actual_size = len(samples) + expected_size = height * width * channels + + if actual_size == 0 or height == 0 or width == 0: + logger.warning( + f"PyMuPDFParser: Skipping image (xref: {xref}) due to zero " + f"size, height, or width. HxW: {height}x{width}, Size: {actual_size}" + ) + continue - numpy.save(image_bytes, image) - blob = Blob.from_data( - image_bytes.getvalue(), mime_type="application/x-npy" - ) - image_text = next(self.images_parser.lazy_parse(blob)).page_content + image_array = None + if channels not in (1, 3, 4): # Grayscale, RGB, RGBA + # Attempt to handle if pix.n is 0 but samples are valid for common channels + if channels == 0 and actual_size % (height * width) == 0: + inferred_ch = actual_size // (height * width) + if inferred_ch in (1,3,4): + logger.warning( + f"PyMuPDFParser: Image (xref: {xref}) has pix.n=0. " + f"Attempting reshape with inferred channels: {inferred_ch}." + ) + channels = inferred_ch # Update channels for subsequent use + expected_size = height * width * channels # Update expected_size + else: + logger.warning( + f"PyMuPDFParser: Skipping image (xref: {xref}) with pix.n=0 and " + f"inferred_channels ({inferred_ch}) not in (1,3,4)." + ) + continue + else: + logger.warning( + f"PyMuPDFParser: Skipping image (xref: {xref}) with " + f"unusual number of channels: {channels}. Expected 1, 3, or 4. " + f"Dimensions: {width}x{height}." + ) + continue - images.append( - _format_inner_image(blob, image_text, self.images_inner_format) - ) + if actual_size == expected_size: + image_array = np.frombuffer(samples, dtype=np.uint8).reshape( + height, width, channels + ) + elif actual_size % (height * width) == 0: + inferred_channels_fallback = actual_size // (height * width) + if inferred_channels_fallback in (1, 3, 4): + logger.warning( + f"PyMuPDFParser: Image (xref: {xref}) data size ({actual_size}) " + f"does not match expected size ({expected_size}) based on " + f"pix.n ({pix.n}). Attempting reshape with inferred " + f"channels: {inferred_channels_fallback}." + ) + image_array = np.frombuffer(samples, dtype=np.uint8).reshape( + height, width, inferred_channels_fallback + ) + channels = inferred_channels_fallback # Update channels for PIL + else: + logger.warning( + f"PyMuPDFParser: Skipping image (xref: {xref}). Actual data size " + f"({actual_size}) is a multiple of H*W ({height*width}), " + f"but fallback inferred channels ({inferred_channels_fallback}) " + f"is not 1, 3, or 4." + ) + continue + else: + logger.warning( + f"PyMuPDFParser: Skipping image (xref: {xref}) due to data size " + f"mismatch. Dimensions: {width}x{height}, " + f"Channels (from pix.n): {pix.n}, Actual Size: {actual_size}, " + f"Expected Size: {expected_size}. Actual size is not a " + f"multiple of H*W." + ) + continue + + if image_array is None: + logger.warning( + f"PyMuPDFParser: Image array for (xref: {xref}) was not created. Skipping." + ) + continue + + # Convert numpy array to PIL Image and save to bytes + pil_image = Image.fromarray(image_array) + + # Adjust PIL image mode based on channels for correct saving + if channels == 1: + pil_image = pil_image.convert("L") + elif channels == 4: + # Check if alpha channel is all opaque, then convert to RGB + if image_array.shape[2] == 4: + if (image_array[:, :, 3] == 255).all(): + pil_image = pil_image.convert("RGB") + channels = 3 # Update channels as it's now effectively RGB + # For 3 channels, it's already RGB or handled by convert("RGB") if needed + + image_bytes_io = io.BytesIO() + save_format = "PNG" # PNG supports L, RGB, RGBA + pil_image.save(image_bytes_io, format=save_format) + + if image_bytes_io.getbuffer().nbytes == 0: + logger.warning( + f"PyMuPDFParser: Skipping image (xref: {xref}) as it resulted " + f"in zero bytes after PIL saving." + ) + continue + + blob = Blob.from_data( + image_bytes_io.getvalue(), mime_type=f"image/{save_format.lower()}" + ) + image_text = next(self.images_parser.lazy_parse(blob)).page_content + images.append( + _format_inner_image(blob, image_text, self.images_inner_format) + ) + except ValueError as e: # Catch reshape errors specifically + logger.warning( + f"PyMuPDFParser: Failed to reshape image (xref: {xref}). " + f"Original pix.n: {pix.n if 'pix' in locals() else 'N/A'}, " + f"HxW: {height if 'height' in locals() else 'N/A'}x" + f"{width if 'width' in locals() else 'N/A'}. Error: {e}" + ) + continue # Skip to next image + except Exception as e: # Catch other errors during processing + logger.warning( + f"PyMuPDFParser: Error processing image (xref: {xref}). Error: {e}" + ) + continue # Skip to next image return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) )