Skip to content

Commit

Permalink
feat: modular parser and formatter v0 (#175)
Browse files Browse the repository at this point in the history
* feat: create first format modules

* add: example file

* add: structured output formatter

* fix: all parsers output a list of elements & compatibility formatters

* feat: new basemodel for document

* add: structured output

* fix: test

* fix: add uncategorized text handling

* add: skip on flaky pdf

* add: section block
  • Loading branch information
chloedia authored Jan 10, 2025
1 parent c6c00bb commit 1f4dcf8
Show file tree
Hide file tree
Showing 22 changed files with 1,208 additions and 405 deletions.
Empty file.
26 changes: 0 additions & 26 deletions libs/megaparse/src/megaparse/checker/format_checker.py

This file was deleted.

211 changes: 0 additions & 211 deletions libs/megaparse/src/megaparse/checker/markdown_processor.py

This file was deleted.

41 changes: 31 additions & 10 deletions libs/megaparse/src/megaparse/examples/parse_file.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,39 @@
from megaparse import MegaParse
import asyncio
from pathlib import Path
from typing import List

from langchain_openai import ChatOpenAI
from llama_index.core.schema import Document as LlamaDocument
from llama_parse import LlamaParse
from llama_parse.utils import Language, ResultType
from megaparse.formatter.structured_formatter.custom_structured_formatter import (
CustomStructuredFormatter,
)
from megaparse.megaparse import MegaParse
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
import pypdfium2 as pdfium
from megaparse_sdk.schema.extensions import FileExtension
from pydantic import BaseModel, Field


# Example output schema: main() passes this class as ``output_model`` to
# CustomStructuredFormatter. Documented with comments rather than a class
# docstring so the model definition itself is left untouched.
class MyCustomFormat(BaseModel):
    # Each field is a required string carrying a human-readable description.
    title: str = Field(description="The title of the document.")
    problem: str = Field(description="The problem statement.")
    solution: str = Field(description="The solution statement.")


async def main():
    """Parse a sample PDF with MegaParse and print the formatted result.

    Builds a ``DoctrParser`` as the OCR parser and a
    ``CustomStructuredFormatter`` targeting ``MyCustomFormat``, hands both to
    ``MegaParse``, awaits ``aload`` on the sample file and prints whatever it
    returns.
    """
    # Parse a file
    parser = DoctrParser()
    # ``model=`` selects the OpenAI chat model; ``name=`` would only label the
    # runnable and leave the model at the library default.
    model = ChatOpenAI(model="gpt-4o")
    formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)

    megaparse = MegaParse(ocr_parser=parser, formatters=[formatter_1])

    file_path = Path("./tests/pdf/sample_pdf.pdf")
    result = await megaparse.aload(file_path=file_path)
    print(result)


if __name__ == "__main__":
    # main is a coroutine function: it must be driven by an event loop.
    # Calling it bare would only create a coroutine object that is never awaited.
    asyncio.run(main())
33 changes: 33 additions & 0 deletions libs/megaparse/src/megaparse/formatter/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from abc import ABC
from pathlib import Path
from typing import List, Union

from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.models.document import Document


class BaseFormatter(ABC):
    """
    Base class for document formatters.

    A formatter receives a ``Document`` and returns either a ``Document`` or a
    string. This base class only stores the optional chat model; both
    formatting entry points raise ``NotImplementedError`` and are meant to be
    overridden by subclasses.

    Attributes
    ----------
    model : BaseChatModel | None
        Optional chat model kept on the instance for subclasses to use.

    Methods
    -------
    format(document, file_path=None) -> Document | str
        Synchronous formatting entry point.
    aformat(document, file_path=None) -> Document | str
        Asynchronous formatting entry point.
    """

    def __init__(self, model: BaseChatModel | None = None):
        self.model = model

    # NOTE(review): the entry points are not marked @abstractmethod, so a
    # subclass that overrides only one of them can still be instantiated.
    # Presumably intentional; confirm before tightening.
    def format(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document | str:
        """Format ``document`` synchronously.

        Parameters
        ----------
        document : Document
            The document to format.
        file_path : Path | str | None
            Optional path associated with the document.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method")

    async def aformat(
        self, document: Document, file_path: Path | str | None = None
    ) -> Document | str:
        """Format ``document`` asynchronously.

        Parameters
        ----------
        document : Document
            The document to format.
        file_path : Path | str | None
            Optional path associated with the document.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError("Subclasses should implement this method")
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pathlib import Path
from langchain_core.language_models.chat_models import BaseChatModel
from megaparse.formatter.base import BaseFormatter
from megaparse.models.document import Document
from pydantic import BaseModel


class StructuredFormatter(BaseFormatter):
    """Formatter base that carries a pydantic output model.

    Keeps the chat model (via ``BaseFormatter``) and the pydantic class passed
    as ``output_model``. Neither formatting entry point is implemented at this
    level: both raise ``NotImplementedError``.
    """

    def __init__(self, model: BaseChatModel, output_model: type[BaseModel]):
        super().__init__(model=model)
        self.output_model = output_model

    # FIXME: Return a structured output of type BaseModel ?
    def format(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """Synchronous entry point; not implemented at this level."""
        raise NotImplementedError

    # FIXME: Return a structured output of type BaseModel ?
    async def aformat(
        self,
        document: Document,
        file_path: Path | str | None = None,
    ) -> str:
        """Asynchronous entry point; not implemented at this level."""
        raise NotImplementedError
Loading

0 comments on commit 1f4dcf8

Please sign in to comment.