forcedotcom
diff --git a/src/datacustomcode/__init__.py b/src/datacustomcode/__init__.py
Lines changed: 25 additions & 5 deletions
diff --git a/src/datacustomcode/client.py b/src/datacustomcode/client.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/datacustomcode/function/feature_types/chunking.py b/src/datacustomcode/function/feature_types/chunking.py
Lines changed: 83 additions & 45 deletions
diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py
Lines changed: 25 additions & 3 deletions
diff --git a/src/datacustomcode/io/reader/sf_cli.py b/src/datacustomcode/io/reader/sf_cli.py
Lines changed: 3 additions & 1 deletion
diff --git a/src/datacustomcode/io/reader/utils.py b/src/datacustomcode/io/reader/utils.py
Lines changed: 19 additions & 19 deletions
@@ -13,15 +13,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from datacustomcode.client import Client
-from datacustomcode.credentials import AuthType, Credentials
-from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
-from datacustomcode.io.writer.print import PrintDataCloudWriter
-
 __all__ = [
     "AuthType",
     "Client",
     "Credentials",
     "PrintDataCloudWriter",
     "QueryAPIDataCloudReader",
 ]
+
+
+def __getattr__(name: str):
+    """Lazily import and return a public attribute on first access (PEP 562)."""
+    if name == "Client":
+        from datacustomcode.client import Client
+
+        return Client
+    elif name == "AuthType":
+        from datacustomcode.credentials import AuthType
+
+        return AuthType
+    elif name == "Credentials":
+        from datacustomcode.credentials import Credentials
+
+        return Credentials
+    elif name == "PrintDataCloudWriter":
+        from datacustomcode.io.writer.print import PrintDataCloudWriter
+
+        return PrintDataCloudWriter
+    elif name == "QueryAPIDataCloudReader":
+        from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
+
+        return QueryAPIDataCloudReader
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -112,8 +112,8 @@ class Client:
     def __new__(
         cls,
         reader: Optional[BaseDataCloudReader] = None,
-        writer: Optional["BaseDataCloudWriter"] = None,
-        spark_provider: Optional["BaseSparkSessionProvider"] = None,
+        writer: Optional[BaseDataCloudWriter] = None,
+        spark_provider: Optional[BaseSparkSessionProvider] = None,
         code_type: str = "script",
     ) -> Client:
 
 
@@ -50,82 +50,114 @@ class ChunkType(str, Enum):
 class SearchIndexChunkingV1PrependField(BaseModel):
     """Field to prepend to chunk content"""
 
-    dmo_name: str = Field(
-        default="", description="Data Model Object name", examples=["udmo_1__dlm"]
+    dmo_name: Optional[str] = Field(
+        default=None, description="Data Model Object name", examples=["udmo_1__dlm"]
     )
-    field_name: str = Field(
-        default="",
+    field_name: Optional[str] = Field(
+        default=None,
         description="Field name to prepend",
         examples=["ResolvedFilePath__c"],
     )
-    value: str = Field(
-        default="",
+    value: Optional[str] = Field(
+        default=None,
         description="Field value to prepend",
         examples=["udlo_1__dll:quarterly_report.pdf"],
     )
     model_config = ConfigDict(extra="ignore")
 
 
 class SearchIndexChunkingV1TranscriptField(BaseModel):
-    """Field to prepend to chunk content"""
+    """Transcript timing and speaker metadata for audio/video documents"""
 
-    speaker: str = Field(
-        default="",
+    speaker: Optional[str] = Field(
+        default=None,
         description="Speaker name for audio/video transcripts",
         examples=["Agent"],
     )
-    start_timestamp: str = Field(
-        default="",
+    start_timestamp: Optional[str] = Field(
+        default=None,
         description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
         examples=["2026-03-25T02:01:24.918000"],
     )
-    end_timestamp: str = Field(
-        default="",
+    end_timestamp: Optional[str] = Field(
+        default=None,
         description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
         examples=["2026-03-25T02:01:30.500000"],
     )
     model_config = ConfigDict(extra="ignore")
 
 
 class SearchIndexChunkingV1Metadata(BaseModel):
-    """Metadata for input documents"""
+    """Metadata for input documents."""
 
-    type: DocumentType = Field(
-        default=DocumentType.TEXT, description="Document type (text)", examples=["text"]
-    )
-    transcript_fields: SearchIndexChunkingV1TranscriptField = Field(
-        default_factory=SearchIndexChunkingV1TranscriptField,
+    type: Optional[DocumentType] = Field(
+        default=DocumentType.TEXT,
         description=(
-            "Transcript information. Will only be there in case of audio-video files"
+            "Document type of the chunk input. Currently only 'text' is supported."
         ),
+        examples=["text"],
     )
-    page_number: int = Field(
-        default=0,
-        description="Page number in the source document (0-based)",
+    page_number: Optional[int] = Field(
+        default=None,
+        description=("Page number in the source document (0-based). "),
         examples=[1],
     )
+    transcript_fields: Optional[SearchIndexChunkingV1TranscriptField] = Field(
+        default=None,
+        description=(
+            "Speaker and timestamp metadata for audio/video transcripts. "
+            "Optional — only present when the source document is a transcript."
+        ),
+    )
     text_as_html: Optional[str] = Field(
         default=None,
-        description="HTML representation of the document text",
+        description=("HTML representation of the chunk text, if available. "),
         examples=["<p>Online Remittance Instructions</p>"],
     )
-    source_dmo_fields: Dict[str, Union[str, int]] = Field(
-        default_factory=dict,
+    source_dmo_fields: Optional[Dict[str, Union[str, int, float]]] = Field(
+        default=None,
         description=(
-            "Source Data Model Object fields as key-value pairs "
-            "(values can be string or int)"
+            "Source Data Model Object fields as key-value pairs. "
+            "Values can be string, int, or float."
         ),
         examples=[
             {
                 "FilePath__c": "quarterly_report.pdf",
-                "Size__c": 1377454,
+                "Size__c": 1377454.0,
                 "ContentType__c": "pdf",
                 "LastModified__c": "2026-03-25T02:01:24.918000",
             }
         ],
     )
-    prepend: List[SearchIndexChunkingV1PrependField] = Field(
-        default_factory=list, description="List of fields to prepend to each chunk"
+    prepend: Optional[List[SearchIndexChunkingV1PrependField]] = Field(
+        default=None,
+        description=(
+            "List of DMO fields whose values are prepended to the chunk "
+            "text before indexing"
+        ),
+    )
+    image_base64: Optional[str] = Field(
+        default=None,
+        description=(
+            "Base64-encoded image data associated with this chunk. "
+            "Optional — only applicable for image-type document elements."
+        ),
+    )
+    image_mime_type: Optional[str] = Field(
+        default=None,
+        description=(
+            "MIME type of the associated image (e.g., 'image/png', 'image/jpeg'). "
+            "Optional — should be provided alongside image_base64 when present."
+        ),
+        examples=["image/png", "image/jpeg"],
+    )
+    image_type: Optional[str] = Field(
+        default=None,
+        description=(
+            "Semantic category of the image content "
+            "(e.g., 'diagram', 'screenshot', 'chart'). Optional."
+        ),
+        examples=["diagram", "screenshot"],
     )
     model_config = ConfigDict(extra="ignore")
 
@@ -143,9 +175,12 @@ class SearchIndexChunkingV1DocElement(BaseModel):
             )
         ],
     )
-    metadata: SearchIndexChunkingV1Metadata = Field(
-        default_factory=SearchIndexChunkingV1Metadata,
-        description="Source document metadata",
+    metadata: Optional[SearchIndexChunkingV1Metadata] = Field(
+        default=None,
+        description=(
+            "Source document metadata. Optional — may be absent if no "
+            "metadata is available for the document element."
+        ),
     )
     model_config = ConfigDict(extra="ignore")
 
@@ -159,21 +194,25 @@ class SearchIndexChunkingV1Output(BaseModel):
         examples=["Online Remittance Instructions"],
     )
     seq_no: int = Field(
-        default=0, description="Sequential chunk number (1-based)", ge=1, examples=[1]
-    )
-    chunk_id: str = Field(
-        default="",
-        description="Unique identifier for this chunk (UUID format)",
-        examples=["550e8400-e29b-41d4-a716-446655440000"],
+        default=0,
+        description=(
+            "Sequential order of this chunk within the output. "
+            "Represents chunk ordering within the source document (1-based)."
+        ),
+        ge=1,
+        examples=[1],
     )
     chunk_type: ChunkType = Field(
         default=ChunkType.TEXT,
-        description="Type of chunk (e.g., 'text')",
+        description="Type of chunk. Fixed value — always 'text'.",
         examples=["text"],
     )
-    citations: Dict[str, str] = Field(
-        default_factory=dict,
-        description="Citation information as key-value pairs",
+    citations: Optional[Dict[str, str]] = Field(
+        default=None,
+        description=(
+            "Citation metadata associated with this chunk as key-value "
+            "pairs. Optional — defaults to None if no citations are present."
+        ),
         examples=[{"source": "quarterly_report.pdf"}],
     )
     model_config = ConfigDict(extra="ignore")
@@ -194,4 +233,3 @@ class SearchIndexChunkingV1Response(BaseModel):
     output: List[SearchIndexChunkingV1Output] = Field(
         default_factory=list, description="Flat list of chunks from all docs"
     )
-    model_config = ConfigDict(extra="ignore")
 
@@ -16,6 +16,7 @@
 """Utilities for inspecting and working with function entrypoints."""
 
 import ast
+from enum import Enum
 import importlib.util
 import inspect
 import json
@@ -278,11 +279,17 @@ def _generate_model_sample_data(model_type):
         # Use examples if available
         if field_info.examples and len(field_info.examples) > 0:
             sample_data[field_name] = field_info.examples[0]
-        # Check if field has a real default value
-        elif field_info.default is not PydanticUndefined:
+        # If field has a non-None, non-empty default value, use it
+        elif (
+            field_info.default is not PydanticUndefined
+            and field_info.default is not None
+            and field_info.default != []
+            and field_info.default != {}
+        ):
             sample_data[field_name] = field_info.default
+        # For all other fields (including default_factory, None defaults,
+        # empty defaults), generate sample data
         else:
-            # Required field or field without default - generate sample
             sample_data[field_name] = generate_sample_value(
                 field_info.annotation, field_name
             )
@@ -301,6 +308,17 @@ def generate_sample_value(field_type, field_name: str):
     """
     origin = typing.get_origin(field_type)
 
+    # Handle Optional[T] / Union[...] by sampling the first non-None member
+    if origin is typing.Union:
+        non_none_args = [
+            arg for arg in typing.get_args(field_type) if arg is not type(None)
+        ]
+        return (
+            generate_sample_value(non_none_args[0], field_name)
+            if non_none_args
+            else None
+        )
+
     if origin is list or field_type is list:
         args = typing.get_args(field_type)
         if args:
@@ -320,6 +338,10 @@ def generate_sample_value(field_type, field_name: str):
         return 1.0
     elif field_type is bool:
         return True
+    # Handle Enum types
+    elif isinstance(field_type, type) and issubclass(field_type, Enum):
+        # Return the first enum value
+        return next(iter(field_type)).value
     elif hasattr(field_type, "model_fields"):
         # Nested Pydantic model - use shared helper
         return _generate_model_sample_data(field_type)
 
@@ -23,14 +23,14 @@
     Union,
 )
 
-import pandas as pd
 import requests
 
 from datacustomcode.io.reader.base import BaseDataCloudReader
 from datacustomcode.io.reader.utils import _pandas_to_spark_schema
 from datacustomcode.token_provider import SFCLITokenProvider
 
 if TYPE_CHECKING:
+    import pandas as pd
     from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
     from pyspark.sql.types import AtomicType, StructType
 
@@ -97,6 +97,8 @@ def _execute_query(self, sql: str) -> pd.DataFrame:
         Raises:
             RuntimeError: On HTTP errors or unexpected response shapes.
         """
+        import pandas as pd
+
         access_token, instance_url = self._get_token()
 
         url = f"{instance_url}/services/data/{API_VERSION}/ssot/query-sql"
 
@@ -16,32 +16,32 @@
 
 from typing import TYPE_CHECKING
 
-import pandas.api.types as pd_types
-from pyspark.sql.types import (
-    BooleanType,
-    DoubleType,
-    LongType,
-    StringType,
-    StructField,
-    StructType,
-    TimestampType,
-)
-
 if TYPE_CHECKING:
     import pandas
-    from pyspark.sql.types import AtomicType
-
-PANDAS_TYPE_MAPPING = {
-    "object": StringType(),
-    "int64": LongType(),
-    "float64": DoubleType(),
-    "bool": BooleanType(),
-}
+    from pyspark.sql.types import AtomicType, StructType
 
 
 def _pandas_to_spark_schema(
     pandas_df: pandas.DataFrame, nullable: bool = True
 ) -> StructType:
+    import pandas.api.types as pd_types
+    from pyspark.sql.types import (
+        BooleanType,
+        DoubleType,
+        LongType,
+        StringType,
+        StructField,
+        StructType,
+        TimestampType,
+    )
+
+    PANDAS_TYPE_MAPPING = {
+        "object": StringType(),
+        "int64": LongType(),
+        "float64": DoubleType(),
+        "bool": BooleanType(),
+    }
+
     fields = []
     for column, dtype in pandas_df.dtypes.items():
         spark_type: AtomicType