-
Notifications
You must be signed in to change notification settings - Fork 257
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: modular parser and formatter v0 (#175)
* feat: create first format modules
* add: example file
* add: structured output formatter
* fix: all parsers output a list of elements & compatibility formatters
* feat: new basemodel for document
* add: structured output
* fix: test
* fix: add uncategorized text handling
* add: skip on flaky pdf
* add: section block
- Loading branch information
Showing
22 changed files
with
1,208 additions
and
405 deletions.
There are no files selected for viewing
Empty file.
This file was deleted.
Oops, something went wrong.
211 changes: 0 additions & 211 deletions
211
libs/megaparse/src/megaparse/checker/markdown_processor.py
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,39 @@ | ||
from megaparse import MegaParse | ||
import asyncio | ||
from pathlib import Path | ||
from typing import List | ||
|
||
from langchain_openai import ChatOpenAI | ||
from llama_index.core.schema import Document as LlamaDocument | ||
from llama_parse import LlamaParse | ||
from llama_parse.utils import Language, ResultType | ||
from megaparse.formatter.structured_formatter.custom_structured_formatter import ( | ||
CustomStructuredFormatter, | ||
) | ||
from megaparse.megaparse import MegaParse | ||
from megaparse.parser.doctr_parser import DoctrParser | ||
from megaparse.parser.unstructured_parser import UnstructuredParser | ||
import pypdfium2 as pdfium | ||
from megaparse_sdk.schema.extensions import FileExtension | ||
from pydantic import BaseModel, Field | ||
|
||
|
||
# Output schema handed to CustomStructuredFormatter in main() as `output_model`.
# Kept as `#` comments rather than a class docstring so the pydantic model
# definition itself (and anything derived from it) stays exactly as written.
class MyCustomFormat(BaseModel):
    # Each field is a required string carrying its own description.
    title: str = Field(description="The title of the document.")
    problem: str = Field(description="The problem statement.")
    solution: str = Field(description="The solution statement.")
|
||
|
||
async def main() -> None:
    """Parse the sample PDF with MegaParse and print the result.

    Builds a ``DoctrParser`` as the OCR parser and a
    ``CustomStructuredFormatter`` configured with a chat model and
    ``MyCustomFormat`` as its output model, then awaits ``MegaParse.aload``
    on the sample file and prints whatever it returns.
    """
    # Parse a file
    parser = DoctrParser()
    # ChatOpenAI selects the model through ``model``; ``name`` only labels the
    # runnable, so name="gpt-4o" would silently leave the default model in use.
    model = ChatOpenAI(model="gpt-4o")
    formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)

    megaparse = MegaParse(ocr_parser=parser, formatters=[formatter_1])

    file_path = Path("./tests/pdf/sample_pdf.pdf")
    result = await megaparse.aload(file_path=file_path)
    print(result)


if __name__ == "__main__":
    # main() is a coroutine function: it must be driven by an event loop.
    # Calling it bare would only create an un-awaited coroutine object.
    asyncio.run(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from abc import ABC | ||
from pathlib import Path | ||
from typing import List, Union | ||
|
||
from langchain_core.language_models.chat_models import BaseChatModel | ||
from megaparse.models.document import Document | ||
|
||
|
||
class BaseFormatter(ABC):
    """
    Base class for formatters that operate on a parsed ``Document``.

    The base implementations of ``format`` and ``aformat`` both raise
    ``NotImplementedError``; concrete formatters override them.

    Attributes
    ----------
    model : BaseChatModel | None
        Optional chat model stored on the instance for subclasses to use.

    Methods
    -------
    format(document, file_path=None) -> Document | str
        Synchronous formatting entry point.
    aformat(document, file_path=None) -> Document | str
        Asynchronous formatting entry point.
    """

    def __init__(self, model: BaseChatModel | None = None):
        """Store the optional chat model on the instance."""
        self.model = model

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        """Format ``document`` synchronously.

        Parameters
        ----------
        document : Document
            The document to format.
        file_path : Path | str | None
            Optional path associated with the document.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Union[Document, str]:
        """Format ``document`` asynchronously.

        Parameters
        ----------
        document : Document
            The document to format.
        file_path : Path | str | None
            Optional path associated with the document.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method")
25 changes: 25 additions & 0 deletions
25
libs/megaparse/src/megaparse/formatter/structured_formatter/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from pathlib import Path | ||
from langchain_core.language_models.chat_models import BaseChatModel | ||
from megaparse.formatter.base import BaseFormatter | ||
from megaparse.models.document import Document | ||
from pydantic import BaseModel | ||
|
||
|
||
class StructuredFormatter(BaseFormatter):
    """Formatter base that pairs a chat model with a pydantic output model.

    Both ``format`` and ``aformat`` raise ``NotImplementedError`` here;
    concrete structured formatters provide the implementations.
    """

    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
        """Pass ``model`` to the base formatter and keep ``output_model``."""
        super().__init__(model)
        self.output_model = output_model

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> str:
        """Asynchronous formatting hook; not implemented at this level."""
        # FIXME: Return a structured output of type BaseModel ?
        raise NotImplementedError()

    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> str:
        """Synchronous formatting hook; not implemented at this level."""
        # FIXME: Return a structured output of type BaseModel ?
        raise NotImplementedError()
Oops, something went wrong.