# Structural cue based chunking
class StructuralCueChunking(ChunkingStrategy):
    """
    Split text into chunks along structural cues (headings, lists, code, tables, ...).

    Inspired by https://jina.ai/tokenizer/#chunking, which leverages common structural
    cues to build a set of rules and heuristics intended to work across diverse types
    of content, including Markdown, HTML and LaTeX.

    The pattern uses Unicode property classes (``\\p{...}``), so it is compiled with
    the third-party ``regex`` module rather than the standard-library ``re`` module.

    Reference: https://gist.github.com/JeremiahZhang/2f8ae87dad836b25f40c02b8c43d16ec
    Original x post: https://x.com/JinaAI_/status/1823756993108304135
    """

    # Alternatives that may terminate a sentence-like span. Meant to be embedded
    # inside a group, e.g. ``(?:...)`` or ``(?=...)``.
    _SENTENCE_END = (
        r"[.!?…]|\.{3}|[\u2026\u2047-\u2049]"
        r"|[\p{Emoji_Presentation}\p{Extended_Pictographic}]"
    )

    # Bullet, numbered/lettered, or task-list markers (embedded inside a group).
    _LIST_MARKER = r"[-*+•]|\d{1,3}\.\w\.|\[[ xX]\]"

    def __init__(self, max_chunk_size: int = 500, **kwargs):
        """
        Args:
            max_chunk_size (int, optional): The maximum size of a chunk, in characters.
                Defaults to 500.
            **kwargs: Additional keyword arguments (accepted and ignored).

        Returns:
            None
        """
        self.max_chunk_size = max_chunk_size

        # Length/repetition bounds interpolated into the pattern by __pattern__().
        self.MAX_TABLE_ROWS = 20
        self.LOOKAHEAD_RANGE = 100
        self.MAX_HEADING_LENGTH = 7
        self.MAX_SENTENCE_LENGTH = 400
        self.MAX_NESTED_LIST_ITEMS = 6
        self.MAX_BLOCKQUOTE_LINES = 15
        self.MAX_NESTED_PARENTHESES = 5
        self.MAX_LIST_INDENT_SPACES = 7
        self.MAX_LIST_ITEM_LENGTH = 200
        self.MAX_TABLE_CELL_LENGTH = 200
        self.MAX_MATH_BLOCK_LENGTH = 500
        self.MAX_PARAGRAPH_LENGTH = 1000
        self.MAX_QUOTED_TEXT_LENGTH = 300
        self.MAX_INDENTED_CODE_LINES = 20
        self.MAX_CODE_BLOCK_LENGTH = 1500
        self.MAX_HTML_TABLE_LENGTH = 2000
        self.MAX_MATH_INLINE_LENGTH = 100
        self.MAX_CODE_LANGUAGE_LENGTH = 20
        self.MIN_HORIZONTAL_RULE_LENGTH = 3
        self.MAX_BLOCKQUOTE_LINE_LENGTH = 200
        self.MAX_HEADING_CONTENT_LENGTH = 200
        self.MAX_STANDALONE_LINE_LENGTH = 800
        self.MAX_HEADING_UNDERLINE_LENGTH = 200
        self.MAX_HTML_TAG_CONTENT_LENGTH = 1000
        self.MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
        self.MAX_PARENTHETICAL_CONTENT_LENGTH = 200
        self.MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100

        self.pattern = self.__pattern__()

    def _text_span(self, max_length, min_length=1, word_bounded=False):
        """
        Build the sub-pattern for one line-bounded run of text.

        The result is a group with three alternatives, tried in order:
          1. text followed by sentence-ending punctuation and then whitespace/end;
          2. text running up to a line break or the end of input;
          3. text followed by sentence-ending punctuation, optionally extended by up
             to ``LOOKAHEAD_RANGE`` characters to the next sentence end.

        Args:
            max_length (int): Upper bound on the number of non-newline characters.
            min_length (int, optional): Lower bound on that count. Defaults to 1.
            word_bounded (bool, optional): Wrap the text in ``\\b`` assertions.

        Returns:
            str: The regex source for the span.
        """
        end = self._SENTENCE_END
        edge = r"\b" if word_bounded else ""
        body = rf"{edge}[^\r\n]{{{min_length},{max_length}}}{edge}"
        return (
            rf"(?:(?:{body}(?:{end})(?=\s|$))"
            rf"|(?:{body}(?=[\r\n]|$))"
            rf"|(?:{body}(?={end})"
            rf"(?:.{{1,{self.LOOKAHEAD_RANGE}}}(?:{end})(?=\s|$))?))"
        )

    def __pattern__(self):
        """
        Assemble and compile the structural-cue pattern.

        The alternatives are joined in priority order inside a single capturing
        group, so the first capture of every match equals the whole match.

        Returns:
            A pattern object compiled by the ``regex`` module with the MULTILINE and
            DOTALL flags set.
        """
        # Standard-library `re` rejects \p{...}; the `regex` module supports it.
        import regex

        marker = self._LIST_MARKER
        nested = self.MAX_NESTED_LIST_ITEMS
        list_text = self._text_span(self.MAX_LIST_ITEM_LENGTH, word_bounded=True)
        quote_text = self._text_span(
            self.MAX_BLOCKQUOTE_LINE_LENGTH, min_length=0, word_bounded=True
        )
        paren_len = self.MAX_PARENTHETICAL_CONTENT_LENGTH

        # 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints)
        heading_regex = (
            rf"(?:^(?:[#*=-]{{1,{self.MAX_HEADING_LENGTH}}}"
            rf"|\w[^\r\n]{{0,{self.MAX_HEADING_CONTENT_LENGTH}}}\r?\n"
            rf"[-=]{{2,{self.MAX_HEADING_UNDERLINE_LENGTH}}}"
            rf"|<h[1-6][^>]{{0,{self.MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}}>)"
            rf"[^\r\n]{{1,{self.MAX_HEADING_CONTENT_LENGTH}}}(?:</h[1-6]>)?(?:\r?\n|$))"
        )

        # 2. Citations such as "[12] Some reference"
        citation_regex = (
            rf"(?:\[[0-9]+\][^\r\n]{{1,{self.MAX_STANDALONE_LINE_LENGTH}}})"
        )

        # 3. List items (bulleted, numbered, lettered, or task lists, including
        #    nested, up to three levels, with length constraints)
        list_item_regex = (
            rf"(?:(?:^|\r?\n)[ \t]{{0,3}}(?:{marker})[ \t]+{list_text}"
            rf"(?:(?:\r?\n[ \t]{{2,5}}(?:{marker})[ \t]+{list_text}){{0,{nested}}}"
            rf"(?:\r?\n[ \t]{{4,{self.MAX_LIST_INDENT_SPACES}}}(?:{marker})[ \t]+"
            rf"{list_text}){{0,{nested}}})?)"
        )

        # 4. Block quotes (including nested quotes and citations, up to three
        #    levels, with length constraints)
        block_regex = (
            rf"(?:(?:^>(?:>|\s{{2,}}){{0,2}}{quote_text}\r?\n?)"
            rf"{{1,{self.MAX_BLOCKQUOTE_LINES}}})"
        )

        # 5. Code blocks (fenced, indented, or HTML pre/code tags, with length constraints)
        code_block_regex = (
            rf"(?:(?:^|\r?\n)(?:\`\`\`|~~~)(?:\w{{0,{self.MAX_CODE_LANGUAGE_LENGTH}}})?\r?\n"
            rf"[\s\S]{{0,{self.MAX_CODE_BLOCK_LENGTH}}}?(?:\`\`\`|~~~)\r?\n?"
            rf"|(?:(?:^|\r?\n)(?: {{4}}|\t)[^\r\n]{{0,{self.MAX_LIST_ITEM_LENGTH}}}"
            rf"(?:\r?\n(?: {{4}}|\t)[^\r\n]{{0,{self.MAX_LIST_ITEM_LENGTH}}})"
            rf"{{0,{self.MAX_INDENTED_CODE_LINES}}}\r?\n?)"
            rf"|(?:<pre>(?:<code>)?[\s\S]{{0,{self.MAX_CODE_BLOCK_LENGTH}}}?"
            rf"(?:</code>)?</pre>))"
        )

        # 6. Tables (Markdown, grid tables, and HTML tables, with length constraints)
        table_regex = (
            rf"(?:(?:^|\r?\n)(?:\|[^\r\n]{{0,{self.MAX_TABLE_CELL_LENGTH}}}\|"
            rf"(?:\r?\n\|[-:]{{1,{self.MAX_TABLE_CELL_LENGTH}}}\|){{0,1}}"
            rf"(?:\r?\n\|[^\r\n]{{0,{self.MAX_TABLE_CELL_LENGTH}}}\|)"
            rf"{{0,{self.MAX_TABLE_ROWS}}}"
            rf"|<table>[\s\S]{{0,{self.MAX_HTML_TABLE_LENGTH}}}?</table>))"
        )

        # 7. Horizontal rules (Markdown and HTML hr tag)
        horizontal_rule_regex = (
            rf"(?:^(?:[-*_]){{{self.MIN_HORIZONTAL_RULE_LENGTH},}}\s*$|<hr\s*/?>)"
        )

        # 8. Standalone lines or phrases (including single-line blocks and HTML
        #    elements, with length constraints)
        single_line_regex = (
            rf"(?:^(?:<[a-zA-Z][^>]{{0,{self.MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}>)?"
            rf"{self._text_span(self.MAX_STANDALONE_LINE_LENGTH)}"
            rf"(?:</[a-zA-Z]+>)?(?:\r?\n|$))"
        )

        # 9. Sentences or phrases ending with punctuation (including ellipsis and
        #    Unicode punctuation)
        sentence_regex = self._text_span(self.MAX_SENTENCE_LENGTH)

        # 10. Quoted text, parenthetical phrases, or bracketed content (with
        #     length constraints)
        quoted_text = (
            "(?:"
            rf'(?<!\w)"""[^"]{{0,{self.MAX_QUOTED_TEXT_LENGTH}}}"""(?!\w)'
            rf"""|(?<!\w)(?P<quote>['"\`'"])[^\r\n]{{0,{self.MAX_QUOTED_TEXT_LENGTH}}}(?P=quote)(?!\w)"""
            rf"|\([^\r\n()]{{0,{paren_len}}}"
            rf"(?:\([^\r\n()]{{0,{paren_len}}}\)[^\r\n()]{{0,{paren_len}}})"
            rf"{{0,{self.MAX_NESTED_PARENTHESES}}}\)"
            rf"|\[[^\r\n\[\]]{{0,{paren_len}}}"
            rf"(?:\[[^\r\n\[\]]{{0,{paren_len}}}\][^\r\n\[\]]{{0,{paren_len}}})"
            rf"{{0,{self.MAX_NESTED_PARENTHESES}}}\]"
            rf"|\$[^\r\n$]{{0,{self.MAX_MATH_INLINE_LENGTH}}}\$"
            rf"|\`[^\`\r\n]{{0,{self.MAX_MATH_INLINE_LENGTH}}}\`"
            ")"
        )

        # 11. Paragraphs (with length constraints)
        paragraph_regex = (
            rf"(?:(?:^|\r?\n\r?\n)(?:<p>)?"
            rf"{self._text_span(self.MAX_PARAGRAPH_LENGTH)}"
            rf"(?:</p>)?(?=\r?\n\r?\n|$))"
        )

        # 12. HTML-like tags and their content (including self-closing tags and
        #     attributes, with length constraints)
        html_like_regex = (
            rf"(?:<[a-zA-Z][^>]{{0,{self.MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}"
            rf"(?:>[\s\S]{{0,{self.MAX_HTML_TAG_CONTENT_LENGTH}}}?</[a-zA-Z]+>|\s*/>))"
        )

        # 13. LaTeX-style math expressions (inline and block, with length constraints)
        latex_regex = (
            rf"(?:(?:\$\$[\s\S]{{0,{self.MAX_MATH_BLOCK_LENGTH}}}?\$\$)"
            rf"|(?:\$[^\$\r\n]{{0,{self.MAX_MATH_INLINE_LENGTH}}}\$))"
        )

        # 14. Fallback for any remaining content (with length constraints)
        fallback_regex = self._text_span(self.MAX_STANDALONE_LINE_LENGTH)

        alternatives = (
            heading_regex,
            citation_regex,
            list_item_regex,
            block_regex,
            code_block_regex,
            table_regex,
            horizontal_rule_regex,
            single_line_regex,
            sentence_regex,
            quoted_text,
            paragraph_regex,
            html_like_regex,
            latex_regex,
            fallback_regex,
        )
        return regex.compile(
            "(" + "|".join(alternatives) + ")",
            regex.MULTILINE | regex.DOTALL,
        )

    def _pack(self, pieces, separator=""):
        """
        Greedily merge consecutive pieces into chunks of at most ``max_chunk_size``.

        A piece that is longer than ``max_chunk_size`` on its own is kept whole.
        Chunks are stripped of surrounding whitespace and empty chunks are dropped.

        Args:
            pieces: Iterable of strings, in document order.
            separator (str, optional): Text inserted between merged pieces.

        Returns:
            list: The merged, stripped, non-empty chunks.
        """
        packed = []
        current = ""
        for piece in pieces:
            if not current:
                # Nothing buffered yet: never flush an empty chunk.
                current = piece
                continue
            candidate = f"{current}{separator}{piece}"
            if len(candidate) > self.max_chunk_size:
                packed.append(current)
                current = piece
            else:
                current = candidate
        if current:
            packed.append(current)

        stripped = (item.strip() for item in packed)
        return [item for item in stripped if item]

    def chunk(self, text: str) -> list:
        """
        Break a text into smaller chunks based on common structural cues and the
        maximum chunk size.

        The text is first segmented with the structural pattern and the segments
        are merged up to ``max_chunk_size``. Any merged chunk that is still too
        large is split on sentence boundaries and re-merged. A single sentence
        longer than ``max_chunk_size`` is returned as-is.

        Args:
            text (str): The input text to be chunked.

        Returns:
            list: A list of non-empty chunk strings; empty for empty input.
        """
        import regex

        if not text:
            return []

        # The outermost group spans every alternative, so group(0) is the segment.
        segments = [match.group(0) for match in self.pattern.finditer(text)]

        refined_chunks = []
        for candidate in self._pack(segments):
            if len(candidate) > self.max_chunk_size:
                # Too large even as a single segment: fall back to sentences.
                sentences = regex.split(r"(?<=[.!?]) +", candidate)
                refined_chunks.extend(self._pack(sentences, separator=" "))
            else:
                refined_chunks.append(candidate)

        return refined_chunks