Skip to content

Commit

Permalink
[MarkdownNodeParser] Adding customizable header path separator char (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jpsoultanis-kapa authored Mar 5, 2025
1 parent 5731f91 commit a804b39
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 3 deletions.
20 changes: 17 additions & 3 deletions llama-index-core/llama_index/core/node_parser/file/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
from typing import Any, List, Optional, Sequence

from llama_index.core.bridge.pydantic import Field
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.node_parser.interface import NodeParser
from llama_index.core.node_parser.node_utils import build_nodes_from_splits
Expand All @@ -18,19 +19,26 @@ class MarkdownNodeParser(NodeParser):
Args:
include_metadata (bool): whether to include metadata in nodes
include_prev_next_rel (bool): whether to include prev/next relationships
header_path_separator (str): separator char used for section header path metadata
"""

# Separator string for the "header_path" node metadata: it joins the titles of
# the enclosing headers and is also placed before and after the joined path.
# Defaults to "/".
header_path_separator: str = Field(
default="/", description="Separator char used for section header path metadata."
)

@classmethod
def from_defaults(
    cls,
    include_metadata: bool = True,
    include_prev_next_rel: bool = True,
    header_path_separator: str = "/",
    callback_manager: Optional[CallbackManager] = None,
) -> "MarkdownNodeParser":
    """Create a parser from explicit settings.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships
        header_path_separator (str): separator used for section header path metadata
        callback_manager (Optional[CallbackManager]): manager passed on to the
            parser; when a falsy value is given, a ``CallbackManager``
            constructed from an empty list is used instead

    Returns:
        A new instance of ``cls`` configured with the given settings.
    """
    if not callback_manager:
        # Nothing usable was supplied: fall back to a manager built from an
        # empty list so the parser always receives a manager object.
        callback_manager = CallbackManager([])

    parser_settings = {
        "include_metadata": include_metadata,
        "include_prev_next_rel": include_prev_next_rel,
        "header_path_separator": header_path_separator,
        "callback_manager": callback_manager,
    }
    return cls(**parser_settings)

Expand Down Expand Up @@ -61,7 +69,9 @@ def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
self._build_node_from_split(
current_section.strip(),
node,
"/".join(h[1] for h in header_stack[:-1]),
self.header_path_separator.join(
h[1] for h in header_stack[:-1]
),
)
)

Expand All @@ -87,7 +97,7 @@ def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
self._build_node_from_split(
current_section.strip(),
node,
"/".join(h[1] for h in header_stack[:-1]),
self.header_path_separator.join(h[1] for h in header_stack[:-1]),
)
)

Expand All @@ -103,8 +113,12 @@ def _build_node_from_split(
node = build_nodes_from_splits([text_split], node, id_func=self.id_func)[0]

if self.include_metadata:
separator = self.header_path_separator
node.metadata["header_path"] = (
"/" + header_path + "/" if header_path else "/"
# ex: "/header1/header2/" || "/"
separator + header_path + separator
if header_path
else separator
)

return node
Expand Down
36 changes: 36 additions & 0 deletions llama-index-core/tests/node_parser/test_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,42 @@ def test_header_splits() -> None:
assert splits[1].text == "# Header 2\nHeader 2 content"


def test_header_splits_with_forwardslash() -> None:
# Verifies that a custom header_path_separator is used when building the
# "header_path" metadata, and that a "/" appearing inside a header title
# ("24/7 Support") is kept verbatim in the resulting path.
markdown_parser = MarkdownNodeParser(
header_path_separator="\u203A"
) # Unicode for "›", infrequently used char

splits = markdown_parser.get_nodes_from_documents(
[
Document(
text="""# Main Header
Header 1 content
## FAQ
FAQ content
### 24/7 Support
Support content
#### Contact info
Contact info content
"""
)
]
)
# One node is expected per header section (four headers in the document).
assert len(splits) == 4
# The first section has no parent headers, so its path is the bare separator;
# each deeper section lists its ancestors' titles wrapped in the separator.
assert splits[0].metadata == {"header_path": "›"}
assert splits[1].metadata == {"header_path": "›Main Header›"}
assert splits[2].metadata == {"header_path": "›Main Header›FAQ›"}
assert splits[3].metadata == {"header_path": "›Main Header›FAQ›24/7 Support›"}

# NOTE(review): the expected text for the first split contains a blank line
# after "# Main Header" that the document literal above does not show in this
# view — confirm against the original file.
assert splits[0].text == "# Main Header\n\nHeader 1 content"
assert splits[1].text == "## FAQ\nFAQ content"
assert splits[2].text == "### 24/7 Support\nSupport content"
assert splits[3].text == "#### Contact info\nContact info content"


def test_header_splits_with_indented_code_blocks() -> None:
markdown_parser = MarkdownNodeParser()

Expand Down

0 comments on commit a804b39

Please sign in to comment.