Skip to content

Commit b7e6e90

Browse files
authored
Merge pull request #297 from pymupdf/v0.0.27
Version 0.0.27
2 parents 2084a4f + ba94061 commit b7e6e90

File tree

5 files changed

+59
-41
lines changed

5 files changed

+59
-41
lines changed

CHANGES.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# Change Log
22

3+
## Changes in version 0.0.27
4+
5+
### Fixes:
6+
7+
* [296](https://github.com/pymupdf/RAG/issues/296) - [Bug] A specific diagram recognized as significant ...
8+
* [294](https://github.com/pymupdf/RAG/issues/294) - Unable to extract images from Page
9+
* [272](https://github.com/pymupdf/RAG/issues/272) - Disappeared page breaks
10+
11+
### Other Changes:
12+
13+
* Added new parameter to `to_markdown`: `page_separators=False`. If `True` and `page_chunks=False`, a line like `--- end of page=nnn ---` is appended to each page's markdown text. The page number is 0-based. Intended for debugging purposes.
14+
15+
316
## Changes in version 0.0.26
417

518
### Fixes:
@@ -14,7 +27,7 @@
1427

1528
* The class `TocHeaders` is now a top-level import and can now be directly used.
1629

17-
* Method `to_markdown` has a new parameter `detect_bg_color=True` which guesses the page's background color. If detection is successful, vectors having this fill color are ignored (default). Setting this to `False` will "fill" vectors to always be considered in vector graphics detection.
30+
* Method `to_markdown` has a new parameter `detect_bg_color=True` (default) which guesses the page's background color. If a background is detected, fill-only vectors having this color are ignored. Setting it to `False` causes "fill" vectors to always be considered in vector graphics detection.
1831

1932
* Text written with a `Type 3` font will now always be considered. Previously, this text was always treated as invisible and was hence suppressed.
2033

pdf4llm/setup.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf4llm==0.0.26"]
16+
requires = ["pymupdf4llm==0.0.27"]
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.26",
20+
version="0.0.27",
2121
author="Artifex",
2222
author_email="[email protected]",
2323
description="PyMuPDF Utilities for LLM/RAG",
@@ -29,13 +29,12 @@
2929
license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
3030
url="https://github.com/pymupdf/RAG",
3131
classifiers=classifiers,
32-
package_data={
33-
"pdf4llm": ["LICENSE"],
34-
},
32+
package_data={},
3533
project_urls={
3634
"Documentation": "https://pymupdf.readthedocs.io/",
3735
"Source": "https://github.com/pymupdf/RAG/tree/main/pdf4llm/pdf4llm",
3836
"Tracker": "https://github.com/pymupdf/RAG/issues",
3937
"Changelog": "https://github.com/pymupdf/RAG/blob/main/CHANGES.md",
38+
"License": "https://github.com/pymupdf/RAG/blob/main/LICENSE",
4039
},
4140
)

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,7 @@ def to_markdown(
316316
filename=None,
317317
force_text=True,
318318
page_chunks=False,
319+
page_separators=False,
319320
margins=0,
320321
dpi=150,
321322
page_width=612,
@@ -341,6 +342,7 @@ def to_markdown(
341342
image_format: (str) use this image format. Choose a supported one.
342343
force_text: (bool) output text despite an image background.
343344
page_chunks: (bool) whether to segment output by page.
345+
page_separators: (bool) whether to include page separators in output.
344346
margins: omit content overlapping margin areas.
345347
dpi: (int) desired resolution for generated images.
346348
page_width: (float) assumption if page layout is variable.
@@ -381,7 +383,7 @@ def to_markdown(
381383
IGNORE_IMAGES = ignore_images
382384
IGNORE_GRAPHICS = ignore_graphics
383385
DETECT_BG_COLOR = detect_bg_color
384-
if doc.is_form_pdf or doc.has_annots():
386+
if doc.is_form_pdf or (doc.is_pdf and doc.has_annots()):
385387
doc.bake()
386388

387389
# for reflowable documents allow making 1 page for the whole document
@@ -560,6 +562,7 @@ def write_text(
560562
)
561563
parms.line_rects.extend(cells)
562564
parms.written_tables.append(i)
565+
prev_hdr_string = None
563566

564567
# ------------------------------------------------------------
565568
# Pick up images / graphics ABOVE this text block
@@ -592,6 +595,7 @@ def write_text(
592595
if not is_white(img_txt):
593596
out_string += img_txt
594597
parms.written_images.append(i)
598+
prev_hdr_string = None
595599

596600
parms.line_rects.append(lrect)
597601
# if line rect is far away from the previous one, add a line break
@@ -751,7 +755,7 @@ def output_tables(parms, text_rect):
751755
):
752756
if i in parms.written_tables:
753757
continue
754-
this_md += parms.tabs[i].to_markdown(clean=False)
758+
this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
755759
if EXTRACT_WORDS:
756760
# for "words" extraction, add table cells as line rects
757761
cells = sorted(
@@ -772,7 +776,7 @@ def output_tables(parms, text_rect):
772776
for i, trect in parms.tab_rects.items():
773777
if i in parms.written_tables:
774778
continue
775-
this_md += parms.tabs[i].to_markdown(clean=False)
779+
this_md += parms.tabs[i].to_markdown(clean=False) + "\n"
776780
if EXTRACT_WORDS:
777781
# for "words" extraction, add table cells as line rects
778782
cells = sorted(
@@ -954,7 +958,7 @@ def get_page_output(
954958
) # accept invisible text
955959

956960
# determine background color
957-
parms.bg_color = get_bg_color(page) if DETECT_BG_COLOR else None
961+
parms.bg_color = None if not DETECT_BG_COLOR else get_bg_color(page)
958962

959963
left, top, right, bottom = margins
960964
parms.clip = page.rect + (left, top, -right, -bottom)
@@ -994,12 +998,12 @@ def get_page_output(
994998
if img_info:
995999
img_max_size = abs(parms.clip) * 0.9
9961000
sane = [i for i in img_info if abs(i["bbox"] & parms.clip) < img_max_size]
997-
if len(sane) < len(img_info): # found some
998-
img_info = sane # use those images instead
999-
# output full page image
1000-
name = save_image(parms, parms.clip, "full")
1001-
if name:
1002-
parms.md_string += GRAPHICS_TEXT % name
1001+
if len(sane) < len(img_info): # found some
1002+
img_info = sane # use those images instead
1003+
# output full page image
1004+
name = save_image(parms, parms.clip, "full")
1005+
if name:
1006+
parms.md_string += GRAPHICS_TEXT % name
10031007

10041008
img_info = img_info[:30] # only accept the largest up to 30 images
10051009
# run from back to front (= small to large)
@@ -1024,31 +1028,31 @@ def get_page_output(
10241028
# Locate all tables on page
10251029
parms.written_tables = [] # stores already written tables
10261030
omitted_table_rects = []
1031+
parms.tabs = []
10271032
if IGNORE_GRAPHICS or not table_strategy:
10281033
# do not try to extract tables
1029-
parms.tabs = None
1034+
pass
10301035
else:
1031-
parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
1032-
# remove tables with too few rows or columns
1033-
for i in range(len(parms.tabs.tables) - 1, -1, -1):
1034-
t = parms.tabs.tables[i]
1036+
tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
1037+
for t in tabs.tables:
1038+
# remove tables with too few rows or columns
10351039
if t.row_count < 2 or t.col_count < 2:
10361040
omitted_table_rects.append(pymupdf.Rect(t.bbox))
1037-
del parms.tabs.tables[i]
1038-
parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1]))
1041+
continue
1042+
parms.tabs.append(t)
1043+
parms.tabs.sort(key=lambda t: (t.bbox[0], t.bbox[1]))
10391044

10401045
# Make a list of table boundary boxes.
10411046
# Must include the header bbox (which may exist outside tab.bbox)
10421047
tab_rects = {}
1043-
if parms.tabs is not None:
1044-
for i, t in enumerate(parms.tabs.tables):
1045-
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
1046-
tab_dict = {
1047-
"bbox": tuple(tab_rects[i]),
1048-
"rows": t.row_count,
1049-
"columns": t.col_count,
1050-
}
1051-
parms.tables.append(tab_dict)
1048+
for i, t in enumerate(parms.tabs):
1049+
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
1050+
tab_dict = {
1051+
"bbox": tuple(tab_rects[i]),
1052+
"rows": t.row_count,
1053+
"columns": t.col_count,
1054+
}
1055+
parms.tables.append(tab_dict)
10521056
parms.tab_rects = tab_rects
10531057
# list of table rectangles
10541058
parms.tab_rects0 = list(tab_rects.values())
@@ -1064,15 +1068,12 @@ def get_page_output(
10641068
and p["rect"].width < parms.clip.width
10651069
and p["rect"].height < parms.clip.height
10661070
and (p["rect"].width > 3 or p["rect"].height > 3)
1067-
and not (p["fill"] == parms.bg_color and p["fill"] != None)
1068-
and not intersects_rects(
1069-
p["rect"], parms.tab_rects0 + omitted_table_rects
1070-
)
1071+
and not (p["type"] == "f" and p["fill"] == parms.bg_color)
1072+
and not intersects_rects(p["rect"], parms.tab_rects0)
10711073
and not intersects_rects(p["rect"], parms.annot_rects)
10721074
]
10731075
else:
10741076
paths = []
1075-
10761077
# catch too-many-graphics situation
10771078
if GRAPHICS_LIMIT and len(paths) > GRAPHICS_LIMIT:
10781079
paths = []
@@ -1168,6 +1169,9 @@ def get_page_output(
11681169
else:
11691170
words = []
11701171
parms.words = words
1172+
if page_separators:
1173+
# add page separators to output
1174+
parms.md_string += f"\n\n--- end of page={parms.page.number} ---\n\n"
11711175
return parms
11721176

11731177
if page_chunks is False:
pymupdf4llm/pymupdf4llm/versions_file.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1+
# Generated file - do not edit.
12
MINIMUM_PYMUPDF_VERSION = (1, 26, 3)
2-
VERSION = '0.0.26'
3+
VERSION = '0.0.27'

pymupdf4llm/setup.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414
"Topic :: Utilities",
1515
]
1616

17-
version = "0.0.26"
17+
version = "0.0.27"
1818
requires = ["pymupdf>=1.26.3"]
1919

2020
text = requires[0].split("=")[1]
2121
text = tuple(map(int, text.split(".")))
22-
text = f"MINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n"
22+
text = f"# Generated file - do not edit.\nMINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n"
2323
Path("pymupdf4llm/versions_file.py").write_text(text)
2424

2525
setuptools.setup(
@@ -37,12 +37,13 @@
3737
url="https://github.com/pymupdf/RAG",
3838
classifiers=classifiers,
3939
package_data={
40-
"pymupdf4llm": ["LICENSE", "helpers/*.py", "llama/*.py"],
40+
"pymupdf4llm": ["helpers/*.py", "llama/*.py"],
4141
},
4242
project_urls={
4343
"Documentation": "https://pymupdf.readthedocs.io/",
4444
"Source": "https://github.com/pymupdf/RAG/tree/main/pymupdf4llm/pymupdf4llm",
4545
"Tracker": "https://github.com/pymupdf/RAG/issues",
4646
"Changelog": "https://github.com/pymupdf/RAG/blob/main/CHANGES.md",
47+
"License": "https://github.com/pymupdf/RAG/blob/main/LICENSE",
4748
},
4849
)

0 commit comments

Comments
 (0)