@@ -316,6 +316,7 @@ def to_markdown(
316
316
filename = None ,
317
317
force_text = True ,
318
318
page_chunks = False ,
319
+ page_separators = False ,
319
320
margins = 0 ,
320
321
dpi = 150 ,
321
322
page_width = 612 ,
@@ -341,6 +342,7 @@ def to_markdown(
341
342
image_format: (str) use this image format. Choose a supported one.
342
343
force_text: (bool) output text despite image background.
343
344
page_chunks: (bool) whether to segment output by page.
345
+ page_separators: (bool) whether to include page separators in output.
344
346
margins: omit content overlapping margin areas.
345
347
dpi: (int) desired resolution for generated images.
346
348
page_width: (float) assumption if page layout is variable.
@@ -381,7 +383,7 @@ def to_markdown(
381
383
IGNORE_IMAGES = ignore_images
382
384
IGNORE_GRAPHICS = ignore_graphics
383
385
DETECT_BG_COLOR = detect_bg_color
384
- if doc .is_form_pdf or doc .has_annots ():
386
+ if doc .is_form_pdf or ( doc .is_pdf and doc . has_annots () ):
385
387
doc .bake ()
386
388
387
389
# for reflowable documents allow making 1 page for the whole document
@@ -560,6 +562,7 @@ def write_text(
560
562
)
561
563
parms .line_rects .extend (cells )
562
564
parms .written_tables .append (i )
565
+ prev_hdr_string = None
563
566
564
567
# ------------------------------------------------------------
565
568
# Pick up images / graphics ABOVE this text block
@@ -592,6 +595,7 @@ def write_text(
592
595
if not is_white (img_txt ):
593
596
out_string += img_txt
594
597
parms .written_images .append (i )
598
+ prev_hdr_string = None
595
599
596
600
parms .line_rects .append (lrect )
597
601
# if line rect is far away from the previous one, add a line break
@@ -751,7 +755,7 @@ def output_tables(parms, text_rect):
751
755
):
752
756
if i in parms .written_tables :
753
757
continue
754
- this_md += parms .tabs [i ].to_markdown (clean = False )
758
+ this_md += parms .tabs [i ].to_markdown (clean = False ) + " \n "
755
759
if EXTRACT_WORDS :
756
760
# for "words" extraction, add table cells as line rects
757
761
cells = sorted (
@@ -772,7 +776,7 @@ def output_tables(parms, text_rect):
772
776
for i , trect in parms .tab_rects .items ():
773
777
if i in parms .written_tables :
774
778
continue
775
- this_md += parms .tabs [i ].to_markdown (clean = False )
779
+ this_md += parms .tabs [i ].to_markdown (clean = False ) + " \n "
776
780
if EXTRACT_WORDS :
777
781
# for "words" extraction, add table cells as line rects
778
782
cells = sorted (
@@ -954,7 +958,7 @@ def get_page_output(
954
958
) # accept invisible text
955
959
956
960
# determine background color
957
- parms .bg_color = get_bg_color ( page ) if DETECT_BG_COLOR else None
961
+ parms .bg_color = None if not DETECT_BG_COLOR else get_bg_color ( page )
958
962
959
963
left , top , right , bottom = margins
960
964
parms .clip = page .rect + (left , top , - right , - bottom )
@@ -994,12 +998,12 @@ def get_page_output(
994
998
if img_info :
995
999
img_max_size = abs (parms .clip ) * 0.9
996
1000
sane = [i for i in img_info if abs (i ["bbox" ] & parms .clip ) < img_max_size ]
997
- if len (sane ) < len (img_info ): # found some
998
- img_info = sane # use those images instead
999
- # output full page image
1000
- name = save_image (parms , parms .clip , "full" )
1001
- if name :
1002
- parms .md_string += GRAPHICS_TEXT % name
1001
+ if len (sane ) < len (img_info ): # found some
1002
+ img_info = sane # use those images instead
1003
+ # output full page image
1004
+ name = save_image (parms , parms .clip , "full" )
1005
+ if name :
1006
+ parms .md_string += GRAPHICS_TEXT % name
1003
1007
1004
1008
img_info = img_info [:30 ] # only accept the largest up to 30 images
1005
1009
# run from back to front (= small to large)
@@ -1024,31 +1028,31 @@ def get_page_output(
1024
1028
# Locate all tables on page
1025
1029
parms .written_tables = [] # stores already written tables
1026
1030
omitted_table_rects = []
1031
+ parms .tabs = []
1027
1032
if IGNORE_GRAPHICS or not table_strategy :
1028
1033
# do not try to extract tables
1029
- parms . tabs = None
1034
+ pass
1030
1035
else :
1031
- parms .tabs = page .find_tables (clip = parms .clip , strategy = table_strategy )
1032
- # remove tables with too few rows or columns
1033
- for i in range (len (parms .tabs .tables ) - 1 , - 1 , - 1 ):
1034
- t = parms .tabs .tables [i ]
1036
+ tabs = page .find_tables (clip = parms .clip , strategy = table_strategy )
1037
+ for t in tabs .tables :
1038
+ # remove tables with too few rows or columns
1035
1039
if t .row_count < 2 or t .col_count < 2 :
1036
1040
omitted_table_rects .append (pymupdf .Rect (t .bbox ))
1037
- del parms .tabs .tables [i ]
1038
- parms .tabs .tables .sort (key = lambda t : (t .bbox [0 ], t .bbox [1 ]))
1041
+ continue
1042
+ parms .tabs .append (t )
1043
+ parms .tabs .sort (key = lambda t : (t .bbox [0 ], t .bbox [1 ]))
1039
1044
1040
1045
# Make a list of table boundary boxes.
1041
1046
# Must include the header bbox (which may exist outside tab.bbox)
1042
1047
tab_rects = {}
1043
- if parms .tabs is not None :
1044
- for i , t in enumerate (parms .tabs .tables ):
1045
- tab_rects [i ] = pymupdf .Rect (t .bbox ) | pymupdf .Rect (t .header .bbox )
1046
- tab_dict = {
1047
- "bbox" : tuple (tab_rects [i ]),
1048
- "rows" : t .row_count ,
1049
- "columns" : t .col_count ,
1050
- }
1051
- parms .tables .append (tab_dict )
1048
+ for i , t in enumerate (parms .tabs ):
1049
+ tab_rects [i ] = pymupdf .Rect (t .bbox ) | pymupdf .Rect (t .header .bbox )
1050
+ tab_dict = {
1051
+ "bbox" : tuple (tab_rects [i ]),
1052
+ "rows" : t .row_count ,
1053
+ "columns" : t .col_count ,
1054
+ }
1055
+ parms .tables .append (tab_dict )
1052
1056
parms .tab_rects = tab_rects
1053
1057
# list of table rectangles
1054
1058
parms .tab_rects0 = list (tab_rects .values ())
@@ -1064,15 +1068,12 @@ def get_page_output(
1064
1068
and p ["rect" ].width < parms .clip .width
1065
1069
and p ["rect" ].height < parms .clip .height
1066
1070
and (p ["rect" ].width > 3 or p ["rect" ].height > 3 )
1067
- and not (p ["fill" ] == parms .bg_color and p ["fill" ] != None )
1068
- and not intersects_rects (
1069
- p ["rect" ], parms .tab_rects0 + omitted_table_rects
1070
- )
1071
+ and not (p ["type" ] == "f" and p ["fill" ] == parms .bg_color )
1072
+ and not intersects_rects (p ["rect" ], parms .tab_rects0 )
1071
1073
and not intersects_rects (p ["rect" ], parms .annot_rects )
1072
1074
]
1073
1075
else :
1074
1076
paths = []
1075
-
1076
1077
# catch too-many-graphics situation
1077
1078
if GRAPHICS_LIMIT and len (paths ) > GRAPHICS_LIMIT :
1078
1079
paths = []
@@ -1168,6 +1169,9 @@ def get_page_output(
1168
1169
else :
1169
1170
words = []
1170
1171
parms .words = words
1172
+ if page_separators :
1173
+ # add page separators to output
1174
+ parms .md_string += f"\n \n --- end of page={ parms .page .number } ---\n \n "
1171
1175
return parms
1172
1176
1173
1177
if page_chunks is False :
0 commit comments