Skip to content

Commit 2e974af

Browse files
Lazy load pyspark/numpy dependency instead of eager load, modify chunking contract as per UDS team request (#97)
* Making SDK lean for function * Enable lazy loading * Updated chunking contract with field addition * Updated chunking contract with field addition * sf script run failure fix
1 parent 2229898 commit 2e974af

13 files changed

Lines changed: 208 additions & 86 deletions

File tree

src/datacustomcode/__init__.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,35 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
from datacustomcode.client import Client
17-
from datacustomcode.credentials import AuthType, Credentials
18-
from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader
19-
from datacustomcode.io.writer.print import PrintDataCloudWriter
20-
2116
__all__ = [
2217
"AuthType",
2318
"Client",
2419
"Credentials",
2520
"PrintDataCloudWriter",
2621
"QueryAPIDataCloudReader",
2722
]
23+
24+
25+
def __getattr__(name: str):
    """Resolve the package's public names lazily (PEP 562).

    Python calls this hook only when ``name`` is not found in the module
    namespace. Deferring the imports keeps ``import datacustomcode`` cheap:
    the submodule that defines a public name is imported on first access
    rather than eagerly at package import time.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The object called ``name`` taken from the submodule that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    # Function-scope import so this block needs no new top-level import.
    import importlib

    # Public name -> submodule that defines it.
    lazy_exports = {
        "Client": "datacustomcode.client",
        "AuthType": "datacustomcode.credentials",
        "Credentials": "datacustomcode.credentials",
        "PrintDataCloudWriter": "datacustomcode.io.writer.print",
        "QueryAPIDataCloudReader": "datacustomcode.io.reader.query_api",
    }

    module_path = lazy_exports.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    resolved = getattr(importlib.import_module(module_path), name)
    # Cache in the module namespace: later lookups find the attribute
    # directly and never re-enter this hook.
    globals()[name] = resolved
    return resolved

src/datacustomcode/client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ class Client:
112112
def __new__(
113113
cls,
114114
reader: Optional[BaseDataCloudReader] = None,
115-
writer: Optional["BaseDataCloudWriter"] = None,
116-
spark_provider: Optional["BaseSparkSessionProvider"] = None,
115+
writer: Optional[BaseDataCloudWriter] = None,
116+
spark_provider: Optional[BaseSparkSessionProvider] = None,
117117
code_type: str = "script",
118118
) -> Client:
119119

src/datacustomcode/function/feature_types/chunking.py

Lines changed: 83 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -50,82 +50,114 @@ class ChunkType(str, Enum):
5050
class SearchIndexChunkingV1PrependField(BaseModel):
    """Field to prepend to chunk content"""

    # Keys not declared on this model are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Every field is optional and defaults to None.
    dmo_name: Optional[str] = Field(
        description="Data Model Object name",
        examples=["udmo_1__dlm"],
        default=None,
    )
    field_name: Optional[str] = Field(
        description="Field name to prepend",
        examples=["ResolvedFilePath__c"],
        default=None,
    )
    value: Optional[str] = Field(
        description="Field value to prepend",
        examples=["udlo_1__dll:quarterly_report.pdf"],
        default=None,
    )
6767

6868

6969
class SearchIndexChunkingV1TranscriptField(BaseModel):
    """Transcript timing and speaker metadata for audio/video documents"""

    # Keys not declared on this model are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Every field is optional and defaults to None. Timestamps are carried
    # as strings; the descriptions document the expected ISO8601 layout.
    speaker: Optional[str] = Field(
        description="Speaker name for audio/video transcripts",
        examples=["Agent"],
        default=None,
    )
    start_timestamp: Optional[str] = Field(
        description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
        examples=["2026-03-25T02:01:24.918000"],
        default=None,
    )
    end_timestamp: Optional[str] = Field(
        description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
        examples=["2026-03-25T02:01:30.500000"],
        default=None,
    )
8888

8989

9090
class SearchIndexChunkingV1Metadata(BaseModel):
    """Metadata for input documents."""

    # Except for `type`, which defaults to DocumentType.TEXT, every field is
    # optional and defaults to None.
    type: Optional[DocumentType] = Field(
        default=DocumentType.TEXT,
        description=(
            "Document type of the chunk input. Currently only 'text' is supported."
        ),
        examples=["text"],
    )
    page_number: Optional[int] = Field(
        default=None,
        description="Page number in the source document (0-based).",
        examples=[1],
    )
    transcript_fields: Optional[SearchIndexChunkingV1TranscriptField] = Field(
        default=None,
        description=(
            "Speaker and timestamp metadata for audio/video transcripts. "
            "Optional — only present when the source document is a transcript."
        ),
    )
    text_as_html: Optional[str] = Field(
        default=None,
        description="HTML representation of the chunk text, if available.",
        examples=["<p>Online Remittance Instructions</p>"],
    )
    source_dmo_fields: Optional[Dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=(
            "Source Data Model Object fields as key-value pairs. "
            "Values can be string, int, or float."
        ),
        examples=[
            {
                "FilePath__c": "quarterly_report.pdf",
                "Size__c": 1377454.0,
                "ContentType__c": "pdf",
                "LastModified__c": "2026-03-25T02:01:24.918000",
            }
        ],
    )
    prepend: Optional[List[SearchIndexChunkingV1PrependField]] = Field(
        default=None,
        description=(
            "List of DMO fields whose values are prepended to the chunk "
            "text before indexing"
        ),
    )
    image_base64: Optional[str] = Field(
        default=None,
        description=(
            "Base64-encoded image data associated with this chunk. "
            "Optional — only applicable for image-type document elements."
        ),
    )
    image_mime_type: Optional[str] = Field(
        default=None,
        description=(
            "MIME type of the associated image (e.g., 'image/png', 'image/jpeg'). "
            "Optional — should be provided alongside image_base64 when present."
        ),
        examples=["image/png", "image/jpeg"],
    )
    image_type: Optional[str] = Field(
        default=None,
        description=(
            # Trailing space keeps the two adjacent literals from fusing
            # into "content(e.g." when they are concatenated.
            "Semantic category of the image content "
            "(e.g., 'diagram', 'screenshot', 'chart'). Optional."
        ),
        examples=["diagram", "screenshot"],
    )
    # Keys not declared on this model are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")
131163

@@ -143,9 +175,12 @@ class SearchIndexChunkingV1DocElement(BaseModel):
143175
)
144176
],
145177
)
146-
metadata: SearchIndexChunkingV1Metadata = Field(
147-
default_factory=SearchIndexChunkingV1Metadata,
148-
description="Source document metadata",
178+
metadata: Optional[SearchIndexChunkingV1Metadata] = Field(
179+
default=None,
180+
description=(
181+
"Source document metadata. Optional — may be absent if no "
182+
"metadata is available for the document element."
183+
),
149184
)
150185
model_config = ConfigDict(extra="ignore")
151186

@@ -159,21 +194,25 @@ class SearchIndexChunkingV1Output(BaseModel):
159194
examples=["Online Remittance Instructions"],
160195
)
161196
seq_no: int = Field(
162-
default=0, description="Sequential chunk number (1-based)", ge=1, examples=[1]
163-
)
164-
chunk_id: str = Field(
165-
default="",
166-
description="Unique identifier for this chunk (UUID format)",
167-
examples=["550e8400-e29b-41d4-a716-446655440000"],
197+
default=0,
198+
description=(
199+
"Sequential order of this chunk within the output "
200+
"Represents chunk ordering within the source document (1-based)."
201+
),
202+
ge=1,
203+
examples=[1],
168204
)
169205
chunk_type: ChunkType = Field(
170206
default=ChunkType.TEXT,
171-
description="Type of chunk (e.g., 'text')",
207+
description="Type of chunk. Fixed value — always 'text'.",
172208
examples=["text"],
173209
)
174-
citations: Dict[str, str] = Field(
175-
default_factory=dict,
176-
description="Citation information as key-value pairs",
210+
citations: Optional[Dict[str, str]] = Field(
211+
default=None,
212+
description=(
213+
"Citation metadata associated with this chunk as key-value "
214+
"pairs. Optional — defaults to None if no citations are present."
215+
),
177216
examples=[{"source": "quarterly_report.pdf"}],
178217
)
179218
model_config = ConfigDict(extra="ignore")
@@ -194,4 +233,3 @@ class SearchIndexChunkingV1Response(BaseModel):
194233
output: List[SearchIndexChunkingV1Output] = Field(
195234
default_factory=list, description="Flat list of chunks from all docs"
196235
)
197-
model_config = ConfigDict(extra="ignore")

src/datacustomcode/function_utils.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""Utilities for inspecting and working with function entrypoints."""
1717

1818
import ast
19+
from enum import Enum
1920
import importlib.util
2021
import inspect
2122
import json
@@ -278,11 +279,17 @@ def _generate_model_sample_data(model_type):
278279
# Use examples if available
279280
if field_info.examples and len(field_info.examples) > 0:
280281
sample_data[field_name] = field_info.examples[0]
281-
# Check if field has a real default value
282-
elif field_info.default is not PydanticUndefined:
282+
# If field has a non-None, non-empty default value, use it
283+
elif (
284+
field_info.default is not PydanticUndefined
285+
and field_info.default is not None
286+
and field_info.default != []
287+
and field_info.default != {}
288+
):
283289
sample_data[field_name] = field_info.default
290+
# For all other fields (including default_factory, None defaults,
291+
# empty defaults), generate sample data
284292
else:
285-
# Required field or field without default - generate sample
286293
sample_data[field_name] = generate_sample_value(
287294
field_info.annotation, field_name
288295
)
@@ -301,6 +308,17 @@ def generate_sample_value(field_type, field_name: str):
301308
"""
302309
origin = typing.get_origin(field_type)
303310

311+
# Handle Optional[T] (Union[T, None]) by unwrapping to T
312+
if origin is typing.Union:
313+
non_none_args = [
314+
arg for arg in typing.get_args(field_type) if arg is not type(None)
315+
]
316+
return (
317+
generate_sample_value(non_none_args[0], field_name)
318+
if non_none_args
319+
else None
320+
)
321+
304322
if origin is list or field_type is list:
305323
args = typing.get_args(field_type)
306324
if args:
@@ -320,6 +338,10 @@ def generate_sample_value(field_type, field_name: str):
320338
return 1.0
321339
elif field_type is bool:
322340
return True
341+
# Handle Enum types
342+
elif isinstance(field_type, type) and issubclass(field_type, Enum):
343+
# Return the first enum value
344+
return next(iter(field_type)).value
323345
elif hasattr(field_type, "model_fields"):
324346
# Nested Pydantic model - use shared helper
325347
return _generate_model_sample_data(field_type)

src/datacustomcode/io/reader/sf_cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@
2323
Union,
2424
)
2525

26-
import pandas as pd
2726
import requests
2827

2928
from datacustomcode.io.reader.base import BaseDataCloudReader
3029
from datacustomcode.io.reader.utils import _pandas_to_spark_schema
3130
from datacustomcode.token_provider import SFCLITokenProvider
3231

3332
if TYPE_CHECKING:
33+
import pandas as pd
3434
from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
3535
from pyspark.sql.types import AtomicType, StructType
3636

@@ -97,6 +97,8 @@ def _execute_query(self, sql: str) -> pd.DataFrame:
9797
Raises:
9898
RuntimeError: On HTTP errors or unexpected response shapes.
9999
"""
100+
import pandas as pd
101+
100102
access_token, instance_url = self._get_token()
101103

102104
url = f"{instance_url}/services/data/{API_VERSION}/ssot/query-sql"

src/datacustomcode/io/reader/utils.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,32 +16,32 @@
1616

1717
from typing import TYPE_CHECKING
1818

19-
import pandas.api.types as pd_types
20-
from pyspark.sql.types import (
21-
BooleanType,
22-
DoubleType,
23-
LongType,
24-
StringType,
25-
StructField,
26-
StructType,
27-
TimestampType,
28-
)
29-
3019
if TYPE_CHECKING:
3120
import pandas
32-
from pyspark.sql.types import AtomicType
33-
34-
PANDAS_TYPE_MAPPING = {
35-
"object": StringType(),
36-
"int64": LongType(),
37-
"float64": DoubleType(),
38-
"bool": BooleanType(),
39-
}
21+
from pyspark.sql.types import AtomicType, StructType
4022

4123

4224
def _pandas_to_spark_schema(
4325
pandas_df: pandas.DataFrame, nullable: bool = True
4426
) -> StructType:
27+
import pandas.api.types as pd_types
28+
from pyspark.sql.types import (
29+
BooleanType,
30+
DoubleType,
31+
LongType,
32+
StringType,
33+
StructField,
34+
StructType,
35+
TimestampType,
36+
)
37+
38+
PANDAS_TYPE_MAPPING = {
39+
"object": StringType(),
40+
"int64": LongType(),
41+
"float64": DoubleType(),
42+
"bool": BooleanType(),
43+
}
44+
4545
fields = []
4646
for column, dtype in pandas_df.dtypes.items():
4747
spark_type: AtomicType

0 commit comments

Comments
 (0)