Skip to content

Commit e17397a

Browse files
committed
Update example
1 parent a24a51c commit e17397a

2 files changed

Lines changed: 107 additions & 79 deletions

File tree

src/datacustomcode/templates/function/chunking/payload/entrypoint.py

Lines changed: 3 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import logging
22

3+
from utility import extract_citations, split_text_into_chunks
4+
35
from datacustomcode.function import Runtime
46
from datacustomcode.function.feature_types.chunking import (
57
ChunkType,
@@ -15,80 +17,6 @@
1517
DEFAULT_MAX_CHUNK_SIZE = 50
1618

1719

18-
def split_text_into_chunks(text: str, max_size: int, overlap: int = 20):
    """Split text into overlapping chunks, preferring natural break points.

    Each chunk covers at most ``max_size`` characters of ``text``. Within a
    window the break point is chosen in this order of preference, and a
    candidate is only accepted when it lies past the middle of the window:

    1. Paragraph boundaries (\\n\\n)
    2. Line boundaries (\\n)
    3. Sentence boundaries (. ! ?) - the latest one in the window
    4. Word boundaries (space)
    5. Hard cut at ``max_size`` if no boundary qualifies

    Every chunk except the final one has surrounding whitespace stripped;
    the final chunk (and a text that already fits in ``max_size``) is
    returned as-is.

    Args:
        text: Text to split
        max_size: Maximum characters per chunk; must be positive
        overlap: Number of characters to overlap between chunks. Negative
            values are treated as 0. If the overlap is not smaller than a
            chunk, the window only advances one character at a time.

    Returns:
        List of text chunks

    Raises:
        ValueError: If ``max_size`` is not positive.
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be positive, got {max_size}")

    if len(text) <= max_size:
        return [text]

    # A negative overlap would push the next start beyond the break point
    # and silently drop the characters in between.
    overlap = max(overlap, 0)

    # Boundaries in the first half of the window are ignored so that chunks
    # do not become needlessly short.
    min_offset = max_size * 0.5

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = start + max_size

        if end >= text_len:
            # Last chunk: take whatever is left.
            chunks.append(text[start:])
            break

        window = text[start:end]
        break_point = None

        # Paragraph boundary first, then a single line boundary.
        for separator in ("\n\n", "\n"):
            offset = window.rfind(separator)
            if offset > min_offset:
                break_point = start + offset + len(separator)
                break

        # Sentence boundary: use the latest terminator of any kind rather
        # than favouring ". " over "! " and "? ".
        if break_point is None:
            offset = max(window.rfind(mark) for mark in (". ", "! ", "? "))
            if offset > min_offset:
                break_point = start + offset + 2  # skip the mark and the space

        # Word boundary.
        if break_point is None:
            offset = window.rfind(" ")
            if offset > min_offset:
                break_point = start + offset + 1

        # No acceptable boundary: hard cut at the window end.
        if break_point is None:
            break_point = end

        chunks.append(text[start:break_point].strip())

        # Step back by the overlap, but always advance at least one
        # character so the loop terminates.
        start = max(break_point - overlap, start + 1)

    return chunks
90-
91-
9220
def function(
9321
request: SearchIndexChunkingV1Request, runtime: Runtime
9422
) -> SearchIndexChunkingV1Response:
@@ -121,11 +49,7 @@ def function(
12149

12250
# Create chunk outputs
12351
for chunk_text in text_chunks:
124-
# Create citations from source_dmo_fields if available
125-
citations = {}
126-
if metadata and metadata.source_dmo_fields:
127-
for key, value in metadata.source_dmo_fields.items():
128-
citations[key] = str(value)
52+
citations = extract_citations(metadata)
12953

13054
chunk_output = SearchIndexChunkingV1Output(
13155
chunk_type=ChunkType.TEXT,
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""Utility functions for text chunking operations."""
2+
3+
import logging
4+
from typing import (
5+
Dict,
6+
List,
7+
Optional,
8+
)
9+
10+
from datacustomcode.function.feature_types.chunking import SearchIndexChunkingV1Metadata
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
def split_text_into_chunks(text: str, max_size: int, overlap: int = 20) -> List[str]:
    """Split text into overlapping chunks, preferring natural break points.

    Each chunk covers at most ``max_size`` characters of ``text``. Within a
    window the break point is chosen in this order of preference, and a
    candidate is only accepted when it lies past the middle of the window:

    1. Paragraph boundaries (\\n\\n)
    2. Line boundaries (\\n)
    3. Sentence boundaries (. ! ?) - the latest one in the window
    4. Word boundaries (space)
    5. Hard cut at ``max_size`` if no boundary qualifies

    Every chunk except the final one has surrounding whitespace stripped;
    the final chunk (and a text that already fits in ``max_size``) is
    returned as-is.

    Args:
        text: Text to split
        max_size: Maximum characters per chunk; must be positive
        overlap: Number of characters to overlap between chunks. Negative
            values are treated as 0. If the overlap is not smaller than a
            chunk, the window only advances one character at a time.

    Returns:
        List of text chunks

    Raises:
        ValueError: If ``max_size`` is not positive.
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be positive, got {max_size}")

    if len(text) <= max_size:
        return [text]

    # A negative overlap would push the next start beyond the break point
    # and silently drop the characters in between.
    overlap = max(overlap, 0)

    # Boundaries in the first half of the window are ignored so that chunks
    # do not become needlessly short.
    min_offset = max_size * 0.5

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = start + max_size

        if end >= text_len:
            # Last chunk: take whatever is left.
            chunks.append(text[start:])
            break

        window = text[start:end]
        break_point = None

        # Paragraph boundary first, then a single line boundary.
        for separator in ("\n\n", "\n"):
            offset = window.rfind(separator)
            if offset > min_offset:
                break_point = start + offset + len(separator)
                break

        # Sentence boundary: use the latest terminator of any kind rather
        # than favouring ". " over "! " and "? ".
        if break_point is None:
            offset = max(window.rfind(mark) for mark in (". ", "! ", "? "))
            if offset > min_offset:
                break_point = start + offset + 2  # skip the mark and the space

        # Word boundary.
        if break_point is None:
            offset = window.rfind(" ")
            if offset > min_offset:
                break_point = start + offset + 1

        # No acceptable boundary: hard cut at the window end.
        if break_point is None:
            break_point = end

        chunks.append(text[start:break_point].strip())

        # Step back by the overlap, but always advance at least one
        # character so the loop terminates.
        start = max(break_point - overlap, start + 1)

    return chunks
87+
88+
89+
def extract_citations(
    metadata: Optional[SearchIndexChunkingV1Metadata],
) -> Dict[str, str]:
    """Build a citation mapping from a document's source DMO fields.

    Args:
        metadata: Document metadata; may be None or falsy, in which case
            no citations are produced.

    Returns:
        A new dictionary with one entry per item in
        ``metadata.source_dmo_fields``, each value converted with ``str``.
        Empty when the metadata or its ``source_dmo_fields`` is falsy.
    """
    # Guard clauses: nothing to cite without metadata or source fields.
    if not metadata:
        return {}

    source_fields = metadata.source_dmo_fields
    if not source_fields:
        return {}

    return {field_name: str(raw) for field_name, raw in source_fields.items()}

0 commit comments

Comments
 (0)