Skip to content

Commit

Permalink
added merging_separator argument to allow for specifying chunk merge (#…
Browse files: browse the repository at this point in the history
  • Loading branch information
NoahChristiansen authored Mar 6, 2025
1 parent d569009 commit d3a861f
Showing 1 changed file with 22 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
merging_threshold (float): sets threshold for merging whole chunks
max_chunk_size (int): maximum size of chunk (in characters)
merging_range (int): How many chunks 'ahead' beyond the nearest neighbor to be merged if similar (1 or 2 available)
merging_separator (str): The separator to use when merging chunks. Defaults to a single space.
sentence_splitter (Optional[Callable]): splits text into sentences
"""

Expand Down Expand Up @@ -123,6 +124,11 @@ class SemanticDoubleMergingSplitterNodeParser(NodeParser):
),
)

merging_separator: str = Field(
default=" ",
description="The separator to use when merging chunks. Defaults to a single space.",
)

sentence_splitter: Callable[[str], List[str]] = Field(
default_factory=split_by_sentence_tokenizer,
description="The text splitter to use when splitting documents.",
Expand All @@ -142,6 +148,7 @@ def from_defaults(
merging_threshold: Optional[float] = 0.8,
max_chunk_size: Optional[int] = 1000,
merging_range: Optional[int] = 1,
merging_separator: Optional[str] = " ",
sentence_splitter: Optional[Callable[[str], List[str]]] = None,
original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
include_metadata: bool = True,
Expand All @@ -162,6 +169,7 @@ def from_defaults(
merging_threshold=merging_threshold,
max_chunk_size=max_chunk_size,
merging_range=merging_range,
merging_separator=merging_separator,
sentence_splitter=sentence_splitter,
original_text_metadata_key=original_text_metadata_key,
include_metadata=include_metadata,
Expand Down Expand Up @@ -239,14 +247,14 @@ def _create_initial_chunks(self, sentences: List[str]) -> List[str]:
chunk_sentences = [chunk]
if len(chunk) + len(sentence) + 1 <= self.max_chunk_size:
chunk_sentences.append(sentence)
chunk = " ".join(chunk_sentences)
chunk = self.merging_separator.join(chunk_sentences)
new = False
else:
new = True
initial_chunks.append(chunk)
chunk = sentence
continue
last_sentences = " ".join(chunk_sentences[-2:])
last_sentences = self.merging_separator.join(chunk_sentences[-2:])
# new = False

elif (
Expand All @@ -261,8 +269,8 @@ def _create_initial_chunks(self, sentences: List[str]) -> List[str]:
):
# elif nlp(last_sentences).similarity(nlp(sentence)) > self.threshold:
chunk_sentences.append(sentence)
last_sentences = " ".join(chunk_sentences[-2:])
chunk += " " + sentence
last_sentences = self.merging_separator.join(chunk_sentences[-2:])
chunk += self.merging_separator + sentence
else:
initial_chunks.append(chunk)
chunk = sentence # ""
Expand Down Expand Up @@ -302,7 +310,7 @@ def _merge_initial_chunks(self, initial_chunks: List[str]) -> List[str]:
> self.merging_threshold
and len(current) + len(initial_chunks[i]) + 1 <= self.max_chunk_size
):
current += " " + initial_chunks[i]
current += self.merging_separator + initial_chunks[i]

# check if 1st and 3rd chunk are similar, if yes then merge 1st, 2nd, 3rd together
elif (
Expand All @@ -319,7 +327,12 @@ def _merge_initial_chunks(self, initial_chunks: List[str]) -> List[str]:
+ 2
<= self.max_chunk_size
):
current += " " + initial_chunks[i] + " " + initial_chunks[i + 1]
current += (
self.merging_separator
+ initial_chunks[i]
+ self.merging_separator
+ initial_chunks[i + 1]
)
skip = 1

# check if 1st and 4th chunk are similar, if yes then merge 1st, 2nd, 3rd and 4th together
Expand All @@ -340,11 +353,11 @@ def _merge_initial_chunks(self, initial_chunks: List[str]) -> List[str]:
<= self.max_chunk_size
):
current += (
" "
self.merging_separator
+ initial_chunks[i]
+ " "
+ self.merging_separator
+ initial_chunks[i + 1]
+ " "
+ self.merging_separator
+ initial_chunks[i + 2]
)
skip = 2
Expand Down

0 comments on commit d3a861f

Please sign in to comment.