Skip to content

Commit f763699

Browse files
committed
fix: fix hybrid chunker token constraint
Signed-off-by: Panos Vagenas <[email protected]>
1 parent: 618df13; commit: f763699

File tree

3 files changed: 261 additions and 106 deletions.

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 79 additions & 106 deletions
Original file line number | Diff line number | Diff line change
@@ -65,13 +65,13 @@ def _patch_tokenizer_and_max_tokens(self) -> Self:
6565
)
6666
return self
6767

68-
def _count_tokens(self, text: Optional[Union[str, list[str]]]):
68+
def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
6969
if text is None:
7070
return 0
7171
elif isinstance(text, list):
7272
total = 0
7373
for t in text:
74-
total += self._count_tokens(t)
74+
total += self._count_text_tokens(t)
7575
return total
7676
return len(self._tokenizer.tokenize(text, max_length=None))
7777

@@ -80,102 +80,83 @@ class _ChunkLengthInfo(BaseModel):
8080
text_len: int
8181
other_len: int
8282

83+
def _count_chunk_tokens(self, doc_chunk: DocChunk):
84+
ser_txt = self.serialize(chunk=doc_chunk)
85+
return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
86+
8387
def _doc_chunk_length(self, doc_chunk: DocChunk):
84-
text_length = self._count_tokens(doc_chunk.text)
85-
headings_length = self._count_tokens(doc_chunk.meta.headings)
86-
captions_length = self._count_tokens(doc_chunk.meta.captions)
87-
total = text_length + headings_length + captions_length
88+
text_length = self._count_text_tokens(doc_chunk.text)
89+
total = self._count_chunk_tokens(doc_chunk=doc_chunk)
8890
return self._ChunkLengthInfo(
8991
total_len=total,
9092
text_len=text_length,
9193
other_len=total - text_length,
9294
)
9395

9496
def _make_chunk_from_doc_items(
95-
self, doc_chunk: DocChunk, window_text: str, window_start: int, window_end: int
97+
self, doc_chunk: DocChunk, window_start: int, window_end: int
9698
):
99+
doc_items = doc_chunk.meta.doc_items[window_start : window_end + 1]
97100
meta = DocMeta(
98-
doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
101+
doc_items=doc_items,
99102
headings=doc_chunk.meta.headings,
100103
captions=doc_chunk.meta.captions,
101104
origin=doc_chunk.meta.origin,
102105
)
106+
window_text = (
107+
doc_chunk.text
108+
if len(doc_chunk.meta.doc_items) == 1
109+
else self.delim.join(
110+
[
111+
doc_item.text
112+
for doc_item in doc_items
113+
if isinstance(doc_item, TextItem)
114+
]
115+
)
116+
)
103117
new_chunk = DocChunk(text=window_text, meta=meta)
104118
return new_chunk
105119

106-
def _merge_text(self, t1, t2):
107-
if t1 == "":
108-
return t2
109-
elif t2 == "":
110-
return t1
111-
else:
112-
return f"{t1}{self.delim}{t2}"
113-
114120
def _split_by_doc_items(self, doc_chunk: DocChunk) -> list[DocChunk]:
115-
if doc_chunk.meta.doc_items is None or len(doc_chunk.meta.doc_items) <= 1:
116-
return [doc_chunk]
117-
length = self._doc_chunk_length(doc_chunk)
118-
if length.total_len <= self.max_tokens:
119-
return [doc_chunk]
120-
else:
121-
chunks = []
122-
window_start = 0
123-
window_end = 0
124-
window_text = ""
125-
window_text_length = 0
126-
other_length = length.other_len
127-
num_items = len(doc_chunk.meta.doc_items)
128-
while window_end < num_items:
129-
doc_item = doc_chunk.meta.doc_items[window_end]
130-
if isinstance(doc_item, TextItem):
131-
text = doc_item.text
132-
else:
133-
raise RuntimeError("Non-TextItem split not implemented yet")
134-
text_length = self._count_tokens(text)
135-
if (
136-
text_length + window_text_length + other_length < self.max_tokens
137-
and window_end < num_items - 1
138-
):
121+
chunks = []
122+
window_start = 0
123+
window_end = 0 # an inclusive index
124+
num_items = len(doc_chunk.meta.doc_items)
125+
while window_end < num_items:
126+
new_chunk = self._make_chunk_from_doc_items(
127+
doc_chunk=doc_chunk,
128+
window_start=window_start,
129+
window_end=window_end,
130+
)
131+
if self._count_chunk_tokens(doc_chunk=new_chunk) <= self.max_tokens:
132+
if window_end < num_items - 1:
133+
window_end += 1
139134
# Still room left to add more to this chunk AND still at least one
140135
# item left
141-
window_end += 1
142-
window_text_length += text_length
143-
window_text = self._merge_text(window_text, text)
144-
elif text_length + window_text_length + other_length < self.max_tokens:
136+
continue
137+
else:
145138
# All the items in the window fit into the chunk and there are no
146139
# other items left
147-
window_text = self._merge_text(window_text, text)
148-
new_chunk = self._make_chunk_from_doc_items(
149-
doc_chunk, window_text, window_start, window_end
150-
)
151-
chunks.append(new_chunk)
152-
window_end = num_items
153-
elif window_start == window_end:
154-
# Only one item in the window and it doesn't fit into the chunk. So
155-
# we'll just make it a chunk for now and it will get split in the
156-
# plain text splitter.
157-
window_text = self._merge_text(window_text, text)
158-
new_chunk = self._make_chunk_from_doc_items(
159-
doc_chunk, window_text, window_start, window_end
160-
)
161-
chunks.append(new_chunk)
162-
window_start = window_end + 1
163-
window_end = window_start
164-
window_text = ""
165-
window_text_length = 0
166-
else:
167-
# Multiple items in the window but they don't fit into the chunk.
168-
# However, the existing items must have fit or we wouldn't have
169-
# gotten here. So we put everything but the last item into the chunk
170-
# and then start a new window INCLUDING the current window end.
171-
new_chunk = self._make_chunk_from_doc_items(
172-
doc_chunk, window_text, window_start, window_end - 1
173-
)
174-
chunks.append(new_chunk)
175-
window_start = window_end
176-
window_text = ""
177-
window_text_length = 0
178-
return chunks
140+
window_end = num_items # signalizing the last loop
141+
elif window_start == window_end:
142+
# Only one item in the window and it doesn't fit into the chunk. So
143+
# we'll just make it a chunk for now and it will get split in the
144+
# plain text splitter.
145+
window_end += 1
146+
window_start = window_end
147+
else:
148+
# Multiple items in the window but they don't fit into the chunk.
149+
# However, the existing items must have fit or we wouldn't have
150+
# gotten here. So we put everything but the last item into the chunk
151+
# and then start a new window INCLUDING the current window end.
152+
new_chunk = self._make_chunk_from_doc_items(
153+
doc_chunk=doc_chunk,
154+
window_start=window_start,
155+
window_end=window_end - 1,
156+
)
157+
window_start = window_end
158+
chunks.append(new_chunk)
159+
return chunks
179160

180161
def _split_using_plain_text(
181162
self,
@@ -204,53 +185,45 @@ def _split_using_plain_text(
204185
def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
205186
output_chunks = []
206187
window_start = 0
207-
window_end = 0
188+
window_end = 0 # an inclusive index
208189
num_chunks = len(chunks)
209190
while window_end < num_chunks:
210191
chunk = chunks[window_end]
211-
lengths = self._doc_chunk_length(chunk)
212192
headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
213193
ready_to_append = False
214194
if window_start == window_end:
215-
# starting a new block of chunks to potentially merge
216195
current_headings_and_captions = headings_and_captions
217-
window_text = chunk.text
218-
window_other_length = lengths.other_len
219-
window_text_length = lengths.text_len
220-
window_items = chunk.meta.doc_items
221196
window_end += 1
222197
first_chunk_of_window = chunk
223-
elif (
224-
headings_and_captions == current_headings_and_captions
225-
and window_text_length + window_other_length + lengths.text_len
226-
<= self.max_tokens
227-
):
228-
# there is room to include the new chunk so add it to the window and
229-
# continue
230-
window_text = self._merge_text(window_text, chunk.text)
231-
window_text_length += lengths.text_len
232-
window_items = window_items + chunk.meta.doc_items
233-
window_end += 1
234198
else:
235-
ready_to_append = True
236-
199+
chks = chunks[window_start : window_end + 1]
200+
doc_items = [it for chk in chks for it in chk.meta.doc_items]
201+
candidate = DocChunk(
202+
text=self.delim.join([chk.text for chk in chks]),
203+
meta=DocMeta(
204+
doc_items=doc_items,
205+
headings=current_headings_and_captions[0],
206+
captions=current_headings_and_captions[1],
207+
origin=chunk.meta.origin,
208+
),
209+
)
210+
if (
211+
headings_and_captions == current_headings_and_captions
212+
and self._count_chunk_tokens(doc_chunk=candidate) <= self.max_tokens
213+
):
214+
# there is room to include the new chunk so add it to the window and
215+
# continue
216+
window_end += 1
217+
new_chunk = candidate
218+
else:
219+
ready_to_append = True
237220
if ready_to_append or window_end == num_chunks:
238221
# no more room OR the start of new metadata. Either way, end the block
239222
# and use the current window_end as the start of a new block
240223
if window_start + 1 == window_end:
241224
# just one chunk so use it as is
242225
output_chunks.append(first_chunk_of_window)
243226
else:
244-
new_meta = DocMeta(
245-
doc_items=window_items,
246-
headings=current_headings_and_captions[0],
247-
captions=current_headings_and_captions[1],
248-
origin=chunk.meta.origin,
249-
)
250-
new_chunk = DocChunk(
251-
text=window_text,
252-
meta=new_meta,
253-
)
254227
output_chunks.append(new_chunk)
255228
# no need to reset window_text, etc. because that will be reset in the
256229
# next iteration in the if window_start == window_end block

0 commit comments

Comments
 (0)