Skip to content

Commit

Permalink
feat!: Massive quality improvements to v2 parser and new sanitize_cells API (#73)
Browse files Browse the repository at this point in the history

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
  • Loading branch information
PeterStaar-IBM and cau-git authored Dec 9, 2024
1 parent 7706471 commit 1fccb29
Show file tree
Hide file tree
Showing 325 changed files with 4,695,527 additions and 5,402,417 deletions.
2 changes: 1 addition & 1 deletion .github/scripts/build_rhel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ sudo -E XDG_RUNTIME_DIR= podman build --progress=plain \
--no-deps --no-build-isolation -w /dist/ \
/src/docling_parse*.tar.gz \
&& pip3.11 install /dist/docling_parse*.whl \
&& python3.11 -c 'from docling_parse.docling_parse import pdf_parser_v1, pdf_parser_v2'
&& python3.11 -c 'from docling_parse.pdf_parsers import pdf_parser_v1, pdf_parser_v2'
COPY ./tests /src/tests
Expand Down
28 changes: 26 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,42 @@ repos:
hooks:
- id: system
name: Black
entry: poetry run black docling_parse
entry: poetry run black docling_parse tests
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: system
name: isort
entry: poetry run isort docling_parse
entry: poetry run isort docling_parse tests
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: autoflake
name: autoflake
entry: poetry run autoflake docling_parse
pass_filenames: false
language: system
files: '\.py$'
- repo: local
hooks:
- id: mypy
name: MyPy
entry: poetry run mypy docling_parse tests
pass_filenames: false
language: system
files: '\.py$'
# - repo: local
# hooks:
# - id: pytest
# name: Pytest
# entry: poetry run pytest tests/
# pass_filenames: false
# language: system
# files: '\.py$'
- repo: local
hooks:
- id: system
Expand Down
18 changes: 12 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,18 +192,24 @@ target_link_libraries(parse_v2 ${DEPENDENCIES} ${LIB_LINK})
# https://pybind11.readthedocs.io/en/stable/compiling.html#configuration-variables
find_package(pybind11 CONFIG REQUIRED)

pybind11_add_module(docling_parse "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")
#pybind11_add_module(docling_parse "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")
pybind11_add_module(pdf_parsers "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")

add_dependencies(docling_parse parse_v1 parse_v2)
#add_dependencies(docling_parse parse_v1 parse_v2)
add_dependencies(pdf_parsers parse_v1 parse_v2)

target_include_directories(docling_parse INTERFACE ${DEPENDENCIES})
#target_include_directories(docling_parse INTERFACE ${DEPENDENCIES})
target_include_directories(pdf_parsers INTERFACE ${DEPENDENCIES})

target_compile_definitions(docling_parse PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})
#target_compile_definitions(docling_parse PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})
target_compile_definitions(pdf_parsers PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})

target_link_libraries(docling_parse PRIVATE parse_v1 parse_v2)
#target_link_libraries(docling_parse PRIVATE parse_v1 parse_v2)
target_link_libraries(pdf_parsers PRIVATE parse_v1 parse_v2)

# *****************
# *** Install ***
# *****************

install(TARGETS docling_parse DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")
#install(TARGETS docling_parse DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")
install(TARGETS pdf_parsers DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ Convert a PDF (look in the [visualise.py](docling_parse/visualise.py) for a more
from docling_parse.pdf_parsers import pdf_parser_v2

# Do this only once to load fonts (avoid initialising it many times)
parser = pdf_parser_v2()

# parser.set_loglevel(1) # 1=error, 2=warning, 3=success, 4=info
parser = pdf_parser_v2("error") # info, warning, error, fatal

doc_file = "my-doc.pdf" # filename
doc_key = f"key={doc_file}" # unique document key (eg hash, UUID, etc)
Expand Down Expand Up @@ -167,7 +165,7 @@ If you don't have an input file, then a template input file will be printed on th
To build the package, simply run (make sure [poetry](https://python-poetry.org/) is [installed](https://python-poetry.org/docs/#installing-with-the-official-installer)),

```
poetry build
poetry install
```

To test the package, run:
Expand Down
Loading

0 comments on commit 1fccb29

Please sign in to comment.