@@ -65,13 +65,13 @@ def _patch_tokenizer_and_max_tokens(self) -> Self:
65
65
)
66
66
return self
67
67
68
- def _count_tokens (self , text : Optional [Union [str , list [str ]]]):
68
+ def _count_text_tokens (self , text : Optional [Union [str , list [str ]]]):
69
69
if text is None :
70
70
return 0
71
71
elif isinstance (text , list ):
72
72
total = 0
73
73
for t in text :
74
- total += self ._count_tokens (t )
74
+ total += self ._count_text_tokens (t )
75
75
return total
76
76
return len (self ._tokenizer .tokenize (text , max_length = None ))
77
77
@@ -80,102 +80,83 @@ class _ChunkLengthInfo(BaseModel):
80
80
text_len : int
81
81
other_len : int
82
82
83
+ def _count_chunk_tokens (self , doc_chunk : DocChunk ):
84
+ ser_txt = self .serialize (chunk = doc_chunk )
85
+ return len (self ._tokenizer .tokenize (text = ser_txt , max_length = None ))
86
+
83
87
def _doc_chunk_length (self , doc_chunk : DocChunk ):
84
- text_length = self ._count_tokens (doc_chunk .text )
85
- headings_length = self ._count_tokens (doc_chunk .meta .headings )
86
- captions_length = self ._count_tokens (doc_chunk .meta .captions )
87
- total = text_length + headings_length + captions_length
88
+ text_length = self ._count_text_tokens (doc_chunk .text )
89
+ total = self ._count_chunk_tokens (doc_chunk = doc_chunk )
88
90
return self ._ChunkLengthInfo (
89
91
total_len = total ,
90
92
text_len = text_length ,
91
93
other_len = total - text_length ,
92
94
)
93
95
94
96
def _make_chunk_from_doc_items (
95
- self , doc_chunk : DocChunk , window_text : str , window_start : int , window_end : int
97
+ self , doc_chunk : DocChunk , window_start : int , window_end : int
96
98
):
99
+ doc_items = doc_chunk .meta .doc_items [window_start : window_end + 1 ]
97
100
meta = DocMeta (
98
- doc_items = doc_chunk . meta . doc_items [ window_start : window_end + 1 ] ,
101
+ doc_items = doc_items ,
99
102
headings = doc_chunk .meta .headings ,
100
103
captions = doc_chunk .meta .captions ,
101
104
origin = doc_chunk .meta .origin ,
102
105
)
106
+ window_text = (
107
+ doc_chunk .text
108
+ if len (doc_chunk .meta .doc_items ) == 1
109
+ else self .delim .join (
110
+ [
111
+ doc_item .text
112
+ for doc_item in doc_items
113
+ if isinstance (doc_item , TextItem )
114
+ ]
115
+ )
116
+ )
103
117
new_chunk = DocChunk (text = window_text , meta = meta )
104
118
return new_chunk
105
119
106
- def _merge_text (self , t1 , t2 ):
107
- if t1 == "" :
108
- return t2
109
- elif t2 == "" :
110
- return t1
111
- else :
112
- return f"{ t1 } { self .delim } { t2 } "
113
-
114
120
def _split_by_doc_items (self , doc_chunk : DocChunk ) -> list [DocChunk ]:
115
- if doc_chunk .meta .doc_items is None or len (doc_chunk .meta .doc_items ) <= 1 :
116
- return [doc_chunk ]
117
- length = self ._doc_chunk_length (doc_chunk )
118
- if length .total_len <= self .max_tokens :
119
- return [doc_chunk ]
120
- else :
121
- chunks = []
122
- window_start = 0
123
- window_end = 0
124
- window_text = ""
125
- window_text_length = 0
126
- other_length = length .other_len
127
- num_items = len (doc_chunk .meta .doc_items )
128
- while window_end < num_items :
129
- doc_item = doc_chunk .meta .doc_items [window_end ]
130
- if isinstance (doc_item , TextItem ):
131
- text = doc_item .text
132
- else :
133
- raise RuntimeError ("Non-TextItem split not implemented yet" )
134
- text_length = self ._count_tokens (text )
135
- if (
136
- text_length + window_text_length + other_length < self .max_tokens
137
- and window_end < num_items - 1
138
- ):
121
+ chunks = []
122
+ window_start = 0
123
+ window_end = 0 # an inclusive index
124
+ num_items = len (doc_chunk .meta .doc_items )
125
+ while window_end < num_items :
126
+ new_chunk = self ._make_chunk_from_doc_items (
127
+ doc_chunk = doc_chunk ,
128
+ window_start = window_start ,
129
+ window_end = window_end ,
130
+ )
131
+ if self ._count_chunk_tokens (doc_chunk = new_chunk ) <= self .max_tokens :
132
+ if window_end < num_items - 1 :
133
+ window_end += 1
139
134
# Still room left to add more to this chunk AND still at least one
140
135
# item left
141
- window_end += 1
142
- window_text_length += text_length
143
- window_text = self ._merge_text (window_text , text )
144
- elif text_length + window_text_length + other_length < self .max_tokens :
136
+ continue
137
+ else :
145
138
# All the items in the window fit into the chunk and there are no
146
139
# other items left
147
- window_text = self ._merge_text (window_text , text )
148
- new_chunk = self ._make_chunk_from_doc_items (
149
- doc_chunk , window_text , window_start , window_end
150
- )
151
- chunks .append (new_chunk )
152
- window_end = num_items
153
- elif window_start == window_end :
154
- # Only one item in the window and it doesn't fit into the chunk. So
155
- # we'll just make it a chunk for now and it will get split in the
156
- # plain text splitter.
157
- window_text = self ._merge_text (window_text , text )
158
- new_chunk = self ._make_chunk_from_doc_items (
159
- doc_chunk , window_text , window_start , window_end
160
- )
161
- chunks .append (new_chunk )
162
- window_start = window_end + 1
163
- window_end = window_start
164
- window_text = ""
165
- window_text_length = 0
166
- else :
167
- # Multiple items in the window but they don't fit into the chunk.
168
- # However, the existing items must have fit or we wouldn't have
169
- # gotten here. So we put everything but the last item into the chunk
170
- # and then start a new window INCLUDING the current window end.
171
- new_chunk = self ._make_chunk_from_doc_items (
172
- doc_chunk , window_text , window_start , window_end - 1
173
- )
174
- chunks .append (new_chunk )
175
- window_start = window_end
176
- window_text = ""
177
- window_text_length = 0
178
- return chunks
140
+ window_end = num_items # signalizing the last loop
141
+ elif window_start == window_end :
142
+ # Only one item in the window and it doesn't fit into the chunk. So
143
+ # we'll just make it a chunk for now and it will get split in the
144
+ # plain text splitter.
145
+ window_end += 1
146
+ window_start = window_end
147
+ else :
148
+ # Multiple items in the window but they don't fit into the chunk.
149
+ # However, the existing items must have fit or we wouldn't have
150
+ # gotten here. So we put everything but the last item into the chunk
151
+ # and then start a new window INCLUDING the current window end.
152
+ new_chunk = self ._make_chunk_from_doc_items (
153
+ doc_chunk = doc_chunk ,
154
+ window_start = window_start ,
155
+ window_end = window_end - 1 ,
156
+ )
157
+ window_start = window_end
158
+ chunks .append (new_chunk )
159
+ return chunks
179
160
180
161
def _split_using_plain_text (
181
162
self ,
@@ -204,53 +185,45 @@ def _split_using_plain_text(
204
185
def _merge_chunks_with_matching_metadata (self , chunks : list [DocChunk ]):
205
186
output_chunks = []
206
187
window_start = 0
207
- window_end = 0
188
+ window_end = 0 # an inclusive index
208
189
num_chunks = len (chunks )
209
190
while window_end < num_chunks :
210
191
chunk = chunks [window_end ]
211
- lengths = self ._doc_chunk_length (chunk )
212
192
headings_and_captions = (chunk .meta .headings , chunk .meta .captions )
213
193
ready_to_append = False
214
194
if window_start == window_end :
215
- # starting a new block of chunks to potentially merge
216
195
current_headings_and_captions = headings_and_captions
217
- window_text = chunk .text
218
- window_other_length = lengths .other_len
219
- window_text_length = lengths .text_len
220
- window_items = chunk .meta .doc_items
221
196
window_end += 1
222
197
first_chunk_of_window = chunk
223
- elif (
224
- headings_and_captions == current_headings_and_captions
225
- and window_text_length + window_other_length + lengths .text_len
226
- <= self .max_tokens
227
- ):
228
- # there is room to include the new chunk so add it to the window and
229
- # continue
230
- window_text = self ._merge_text (window_text , chunk .text )
231
- window_text_length += lengths .text_len
232
- window_items = window_items + chunk .meta .doc_items
233
- window_end += 1
234
198
else :
235
- ready_to_append = True
236
-
199
+ chks = chunks [window_start : window_end + 1 ]
200
+ doc_items = [it for chk in chks for it in chk .meta .doc_items ]
201
+ candidate = DocChunk (
202
+ text = self .delim .join ([chk .text for chk in chks ]),
203
+ meta = DocMeta (
204
+ doc_items = doc_items ,
205
+ headings = current_headings_and_captions [0 ],
206
+ captions = current_headings_and_captions [1 ],
207
+ origin = chunk .meta .origin ,
208
+ ),
209
+ )
210
+ if (
211
+ headings_and_captions == current_headings_and_captions
212
+ and self ._count_chunk_tokens (doc_chunk = candidate ) <= self .max_tokens
213
+ ):
214
+ # there is room to include the new chunk so add it to the window and
215
+ # continue
216
+ window_end += 1
217
+ new_chunk = candidate
218
+ else :
219
+ ready_to_append = True
237
220
if ready_to_append or window_end == num_chunks :
238
221
# no more room OR the start of new metadata. Either way, end the block
239
222
# and use the current window_end as the start of a new block
240
223
if window_start + 1 == window_end :
241
224
# just one chunk so use it as is
242
225
output_chunks .append (first_chunk_of_window )
243
226
else :
244
- new_meta = DocMeta (
245
- doc_items = window_items ,
246
- headings = current_headings_and_captions [0 ],
247
- captions = current_headings_and_captions [1 ],
248
- origin = chunk .meta .origin ,
249
- )
250
- new_chunk = DocChunk (
251
- text = window_text ,
252
- meta = new_meta ,
253
- )
254
227
output_chunks .append (new_chunk )
255
228
# no need to reset window_text, etc. because that will be reset in the
256
229
# next iteration in the if window_start == window_end block
0 commit comments