codeflash-ai · codeflash-ai · Jan 24, 2026
diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py
@@ -381,50 +381,57 @@ def convert_to_csv(elements: Iterable[Element]) -> str:
 
 @requires_dependencies(["pandas"])
 def get_default_pandas_dtypes() -> dict[str, Any]:
-    return {
-        "text": pd.StringDtype(),  # type: ignore
-        "type": pd.StringDtype(),  # type: ignore
-        "element_id": pd.StringDtype(),  # type: ignore
-        "filename": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "filetype": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "file_directory": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "last_modified": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "attached_to_filename": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "parent_id": pd.StringDtype(),  # Optional[str],  # type: ignore
-        "category_depth": "Int64",  # Optional[int]
-        "image_path": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "languages": object,  # Optional[list[str]]
-        "page_number": "Int64",  # Optional[int]
-        "page_name": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "url": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "link_urls": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "link_texts": object,  # Optional[list[str]]
-        "links": object,
-        "sent_from": object,  # Optional[list[str]],
-        "sent_to": object,  # Optional[list[str]]
-        "subject": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "section": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "header_footer_type": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "emphasized_text_contents": object,  # Optional[list[str]]
-        "emphasized_text_tags": object,  # Optional[list[str]]
-        "text_as_html": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "max_characters": "Int64",  # Optional[int]
-        "is_continuation": "boolean",  # Optional[bool]
-        "detection_class_prob": float,  # Optional[float],
-        "sender": pd.StringDtype(),  # type: ignore
-        "coordinates_points": object,
-        "coordinates_system": pd.StringDtype(),  # type: ignore
-        "coordinates_layout_width": float,
-        "coordinates_layout_height": float,
-        "data_source_url": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "data_source_version": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "data_source_record_locator": object,
-        "data_source_date_created": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "data_source_date_modified": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "data_source_date_processed": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "data_source_permissions_data": object,
-        "embeddings": object,
-    }
+    cached = getattr(get_default_pandas_dtypes, "_cached_template", None)
+    if cached is None:
+        pd_string = pd.StringDtype()  # type: ignore  # one instance reused for all string columns
+        cached = {
+            "text": pd_string,  # type: ignore
+            "type": pd_string,  # type: ignore
+            "element_id": pd_string,  # type: ignore
+            "filename": pd_string,  # Optional[str]  # type: ignore
+            "filetype": pd_string,  # Optional[str]  # type: ignore
+            "file_directory": pd_string,  # Optional[str]  # type: ignore
+            "last_modified": pd_string,  # Optional[str]  # type: ignore
+            "attached_to_filename": pd_string,  # Optional[str]  # type: ignore
+            "parent_id": pd_string,  # Optional[str],  # type: ignore
+            "category_depth": "Int64",  # Optional[int]
+            "image_path": pd_string,  # Optional[str]  # type: ignore
+            "languages": object,  # Optional[list[str]]
+            "page_number": "Int64",  # Optional[int]
+            "page_name": pd_string,  # Optional[str]  # type: ignore
+            "url": pd_string,  # Optional[str]  # type: ignore
+            "link_urls": pd_string,  # Optional[str]  # type: ignore
+            "link_texts": object,  # Optional[list[str]]
+            "links": object,
+            "sent_from": object,  # Optional[list[str]],
+            "sent_to": object,  # Optional[list[str]]
+            "subject": pd_string,  # Optional[str]  # type: ignore
+            "section": pd_string,  # Optional[str]  # type: ignore
+            "header_footer_type": pd_string,  # Optional[str]  # type: ignore
+            "emphasized_text_contents": object,  # Optional[list[str]]
+            "emphasized_text_tags": object,  # Optional[list[str]]
+            "text_as_html": pd_string,  # Optional[str]  # type: ignore
+            "max_characters": "Int64",  # Optional[int]
+            "is_continuation": "boolean",  # Optional[bool]
+            "detection_class_prob": float,  # Optional[float],
+            "sender": pd_string,  # type: ignore
+            "coordinates_points": object,
+            "coordinates_system": pd_string,  # type: ignore
+            "coordinates_layout_width": float,
+            "coordinates_layout_height": float,
+            "data_source_url": pd_string,  # Optional[str]  # type: ignore
+            "data_source_version": pd_string,  # Optional[str]  # type: ignore
+            "data_source_record_locator": object,
+            "data_source_date_created": pd_string,  # Optional[str]  # type: ignore
+            "data_source_date_modified": pd_string,  # Optional[str]  # type: ignore
+            "data_source_date_processed": pd_string,  # Optional[str]  # type: ignore
+            "data_source_permissions_data": object,
+            "embeddings": object,
+        }
+        # Memoize the template as a function attribute so later calls skip rebuilding it.
+        get_default_pandas_dtypes._cached_template = cached
+    # Shallow copy: every call returns a fresh dict, but the dtype values inside are shared.
+    return dict(cached)
 
 
 @requires_dependencies(["pandas"])