|
1 | 1 | import logging |
2 | 2 |
|
| 3 | +from utility import extract_citations, split_text_into_chunks |
| 4 | + |
3 | 5 | from datacustomcode.function import Runtime |
4 | 6 | from datacustomcode.function.feature_types.chunking import ( |
5 | 7 | ChunkType, |
|
15 | 17 | DEFAULT_MAX_CHUNK_SIZE = 50 |
16 | 18 |
|
17 | 19 |
|
def _find_break_offset(window: str, threshold: float) -> int:
    """Return the offset inside *window* at which the current chunk should end.

    A boundary is only accepted when it lies past *threshold*, so a chunk is
    never cut unreasonably short. Falls back to ``len(window)`` (a hard cut)
    when no acceptable boundary exists.
    """
    # Paragraph boundary first, then a single line break.
    for separator in ("\n\n", "\n"):
        index = window.rfind(separator)
        if index > threshold:
            return index + len(separator)

    # Sentence boundary: use the LATEST of ". ", "! " and "? " so the chunk is
    # as long as possible, instead of whichever punctuation is checked first.
    index, mark = max((window.rfind(m), m) for m in (". ", "! ", "? "))
    if index > threshold:
        return index + len(mark)

    # Word boundary.
    index = window.rfind(" ")
    if index > threshold:
        return index + 1

    # No natural boundary past the threshold: hard cut at the window's end.
    return len(window)


def split_text_into_chunks(text: str, max_size: int, overlap: int = 20):
    """Split text into chunks with overlap, trying to break at natural boundaries.

    Tries to break at natural boundaries in order of preference:
    1. Paragraph boundaries (\\n\\n)
    2. Line boundaries (\\n)
    3. Sentence boundaries (. ! ?) - the latest one inside the window
    4. Word boundaries (space)
    5. Hard cut if no good boundary found

    A boundary is only used when it lies past the midpoint of the current
    window, so chunks are never shorter than roughly half of ``max_size``
    (before whitespace stripping).

    Args:
        text: Text to split
        max_size: Maximum characters per chunk; must be positive
        overlap: Number of characters to overlap between chunks. Negative
            values are treated as 0 and values above ``max_size // 2`` are
            capped there, so every step makes real forward progress.

    Returns:
        List of text chunks. Text that already fits in ``max_size`` is returned
        unchanged as a single chunk; otherwise every chunk is stripped of
        surrounding whitespace and whitespace-only chunks are omitted.

    Raises:
        ValueError: If ``max_size`` is not positive.
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be a positive integer, got {max_size!r}")

    if len(text) <= max_size:
        return [text]

    # Natural break points always lie past the window midpoint, so capping the
    # overlap at half the window guarantees the window keeps moving forward.
    # An uncapped overlap >= the chunk length would advance one character per
    # iteration and emit a near-duplicate chunk for every character of text;
    # a negative overlap would skip text entirely.
    overlap = max(0, min(overlap, max_size // 2))
    threshold = max_size / 2
    text_length = len(text)

    chunks = []
    start = 0

    while start < text_length:
        end = start + max_size

        if end >= text_length:
            # Last chunk: take everything that is left.
            tail = text[start:].strip()
            if tail:
                chunks.append(tail)
            break

        break_point = start + _find_break_offset(text[start:end], threshold)

        piece = text[start:break_point].strip()
        if piece:  # skip windows that contain only whitespace
            chunks.append(piece)

        # Step back by the overlap, but always advance at least one character.
        start = max(break_point - overlap, start + 1)

    return chunks
90 | | - |
91 | | - |
92 | 20 | def function( |
93 | 21 | request: SearchIndexChunkingV1Request, runtime: Runtime |
94 | 22 | ) -> SearchIndexChunkingV1Response: |
@@ -121,11 +49,7 @@ def function( |
121 | 49 |
|
122 | 50 | # Create chunk outputs |
123 | 51 | for chunk_text in text_chunks: |
124 | | - # Create citations from source_dmo_fields if available |
125 | | - citations = {} |
126 | | - if metadata and metadata.source_dmo_fields: |
127 | | - for key, value in metadata.source_dmo_fields.items(): |
128 | | - citations[key] = str(value) |
| 52 | + citations = extract_citations(metadata) |
129 | 53 |
|
130 | 54 | chunk_output = SearchIndexChunkingV1Output( |
131 | 55 | chunk_type=ChunkType.TEXT, |
|
0 commit comments