@@ -157,6 +157,8 @@ def get_header_id(self, span: dict, page=None) -> str:
157157 markdown header prefix string of 0 to n concatenated '#' characters.
158158 """
159159 fontsize = round (span ["size" ]) # compute fontsize
160+ if fontsize <= self .body_limit :
161+ return ""
160162 hdr_id = self .header_id .get (fontsize , "" )
161163 return hdr_id
162164
@@ -278,7 +280,7 @@ def to_markdown(
278280 ignore_code: (bool) suppress code-like formatting (mono-space fonts)
279281 extract_words: (bool) include "words"-like output in page chunks
280282 show_progress: (bool) print progress as each page is processed.
281- glyph_fallback : (bool) replace the Invalid Unicode by glyph number .
283+ use_glyphs : (bool) replace invalid Unicode characters by their glyph numbers .
282284
283285 """
284286 if write_images is False and embed_images is False and force_text is False :
@@ -427,8 +429,8 @@ def write_text(
427429 if clip is None :
428430 clip = parms .clip
429431 out_string = ""
430- # This is a list of tuples (linerect, spanlist)
431432
433+ # This is a list of tuples (linerect, spanlist)
432434 nlines = get_raw_lines (parms .textpage , clip = clip , tolerance = 3 )
433435 nlines = [
434436 l for l in nlines if not intersects_rects (l [0 ], parms .tab_rects .values ())
@@ -450,21 +452,18 @@ def write_text(
450452 # Pick up tables ABOVE this text block
451453 # ------------------------------------------------------------
452454 if tables :
453- tab_candidates = sorted (
454- [
455- (i , tab_rect )
456- for i , tab_rect in parms .tab_rects .items ()
457- if tab_rect .y1 <= lrect .y0
458- and i not in parms .deleted_tables
459- and (
460- 0
461- or lrect .x0 <= tab_rect .x0 < lrect .x1
462- or lrect .x0 < tab_rect .x1 <= lrect .x1
463- or tab_rect .x0 <= lrect .x0 < lrect .x1 <= tab_rect .x1
464- )
465- ],
466- key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
467- )
455+ tab_candidates = [
456+ (i , tab_rect )
457+ for i , tab_rect in parms .tab_rects .items ()
458+ if tab_rect .y1 <= lrect .y0
459+ and i not in parms .written_tables
460+ and (
461+ 0
462+ or lrect .x0 <= tab_rect .x0 < lrect .x1
463+ or lrect .x0 < tab_rect .x1 <= lrect .x1
464+ or tab_rect .x0 <= lrect .x0 < lrect .x1 <= tab_rect .x1
465+ )
466+ ]
468467 for i , _ in tab_candidates :
469468 out_string += "\n " + parms .tabs [i ].to_markdown (clean = False ) + "\n "
470469 if EXTRACT_WORDS :
@@ -481,14 +480,14 @@ def write_text(
481480 key = lambda c : (c .y1 , c .x0 ),
482481 )
483482 parms .line_rects .extend (cells )
484- parms .deleted_tables .append (i )
483+ parms .written_tables .append (i )
485484
486485 # ------------------------------------------------------------
487486 # Pick up images / graphics ABOVE this text block
488487 # ------------------------------------------------------------
489488 if images :
490489 for i in range (len (parms .img_rects )):
491- if i in parms .deleted_images :
490+ if i in parms .written_images :
492491 continue
493492 r = parms .img_rects [i ]
494493 if r .y1 <= lrect .y0 and (
@@ -502,7 +501,7 @@ def write_text(
502501 out_string += GRAPHICS_TEXT % pathname
503502
504503 # recursive invocation
505- if force_text :
504+ if force_text is True :
506505 img_txt = write_text (
507506 parms ,
508507 r ,
@@ -513,7 +512,7 @@ def write_text(
513512
514513 if not is_white (img_txt ):
515514 out_string += img_txt
516- parms .deleted_images .append (i )
515+ parms .written_images .append (i )
517516
518517 parms .line_rects .append (lrect )
519518
@@ -668,7 +667,7 @@ def output_tables(parms, text_rect):
668667 [j for j in parms .tab_rects .items () if j [1 ].y1 <= text_rect .y0 ],
669668 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
670669 ):
671- if i in parms .deleted_tables :
670+ if i in parms .written_tables :
672671 continue
673672 this_md += parms .tabs [i ].to_markdown (clean = False )
674673 if EXTRACT_WORDS :
@@ -685,14 +684,11 @@ def output_tables(parms, text_rect):
685684 key = lambda c : (c .y1 , c .x0 ),
686685 )
687686 parms .line_rects .extend (cells )
688- del parms .tab_rects [ i ] # do not touch this table twice
687+ parms .written_tables . append ( i ) # do not touch this table twice
689688
690689 else : # output all remaining tables
691- for i , trect in sorted (
692- parms .tab_rects .items (),
693- key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
694- ):
695- if i in parms .deleted_tables :
690+ for i , trect in parms .tab_rects .items ():
691+ if i in parms .written_tables :
696692 continue
697693 this_md += parms .tabs [i ].to_markdown (clean = False )
698694 if EXTRACT_WORDS :
@@ -709,10 +705,10 @@ def output_tables(parms, text_rect):
709705 key = lambda c : (c .y1 , c .x0 ),
710706 )
711707 parms .line_rects .extend (cells )
712- del parms .tab_rects [ i ] # do not touch this table twice
708+ parms .written_tables . append ( i ) # do not touch this table twice
713709 return this_md
714710
715- def output_images (parms , text_rect ):
711+ def output_images (parms , text_rect , force_text ):
716712 """Output images and graphics above text rectangle."""
717713 if not parms .img_rects :
718714 return ""
@@ -723,10 +719,10 @@ def output_images(parms, text_rect):
723719 continue
724720 if img_rect .x0 >= text_rect .x1 or img_rect .x1 <= text_rect .x0 :
725721 continue
726- if i in parms .deleted_images :
722+ if i in parms .written_images :
727723 continue
728724 pathname = save_image (parms , img_rect , i )
729- parms .deleted_images .append (i ) # do not touch this image twice
725+ parms .written_images .append (i ) # do not touch this image twice
730726 if pathname :
731727 this_md += GRAPHICS_TEXT % pathname
732728 if force_text :
@@ -741,10 +737,10 @@ def output_images(parms, text_rect):
741737 this_md += img_txt
742738 else : # output all remaining images
743739 for i , img_rect in enumerate (parms .img_rects ):
744- if i in parms .deleted_images :
740+ if i in parms .written_images :
745741 continue
746742 pathname = save_image (parms , img_rect , i )
747- parms .deleted_images .append (i ) # do not touch this image twice
743+ parms .written_images .append (i ) # do not touch this image twice
748744 if pathname :
749745 this_md += GRAPHICS_TEXT % pathname
750746 if force_text :
@@ -867,6 +863,9 @@ def get_page_output(
867863 # extract external links on page
868864 parms .links = [l for l in page .get_links () if l ["kind" ] == pymupdf .LINK_URI ]
869865
866+ # extract annotation rectangles on page
867+ parms .annot_rects = [a .rect for a in page .annots ()]
868+
870869 # make a TextPage for all later extractions
871870 parms .textpage = page .get_textpage (flags = textflags , clip = parms .clip )
872871
@@ -904,10 +903,20 @@ def get_page_output(
904903 parms .img_rects = [i ["bbox" ] for i in parms .images ]
905904
906905 # Locate all tables on page
906+ parms .written_tables = [] # stores already written tables
907907 if table_strategy is None :
908908 parms .tabs = []
909909 else :
910910 parms .tabs = page .find_tables (clip = parms .clip , strategy = table_strategy )
911+ del_this = []
912+ for i , t in enumerate (parms .tabs ):
913+ if t .row_count < 2 or t .col_count < 2 :
914+ # ignore tables with too few rows or columns
915+ del_this .append (i )
916+ for i in sorted (del_this , reverse = True ):
917+ del parms .tabs .tables [i ]
918+ parms .tabs .tables .sort (key = lambda t : (t .bbox [0 ], t .bbox [1 ]))
919+
911920 # Make a list of table boundary boxes.
912921 # Must include the header bbox (which may exist outside tab.bbox)
913922 tab_rects = {}
@@ -930,11 +939,13 @@ def get_page_output(
930939 paths = [
931940 p
932941 for p in page .get_drawings ()
933- if not intersects_rects (p ["rect" ], parms .tab_rects0 )
934- and p ["rect" ] in parms .clip
935- and 3 < p ["rect" ].width < parms .clip .width
936- and 3 < p ["rect" ].height < parms .clip .height
937- and not (p ["type" ] == "f" and p ["fill" ] == parms .bg_color )
942+ if p ["rect" ] in parms .clip
943+ and p ["rect" ].width < parms .clip .width
944+ and p ["rect" ].height < parms .clip .height
945+ and (p ["rect" ].width > 3 or p ["rect" ].height > 3 )
946+ and not (p ["fill" ] == parms .bg_color and p ["fill" ] != None )
947+ and not intersects_rects (p ["rect" ], parms .tab_rects0 )
948+ and not intersects_rects (p ["rect" ], parms .annot_rects )
938949 ]
939950 else :
940951 paths = []
@@ -948,19 +959,19 @@ def get_page_output(
948959 vg_clusters0 = [] # worthwhile vector graphics go here
949960
950961 # walk through all vector graphics outside any table
951- for bbox in refine_boxes (page .cluster_drawings (drawings = paths )):
962+ clusters = page .cluster_drawings (drawings = paths )
963+ for bbox in clusters :
952964 if is_significant (bbox , paths ):
953965 vg_clusters0 .append (bbox )
954966
955967 # remove paths that are not in some relevant graphic
956968 parms .actual_paths = [p for p in paths if is_in_rects (p ["rect" ], vg_clusters0 )]
957969
958- # also add image rectangles to the list
970+ # also add image rectangles to the list and vice versa
959971 vg_clusters0 .extend (parms .img_rects )
960972 parms .img_rects .extend (vg_clusters0 )
961973 parms .img_rects = sorted (set (parms .img_rects ), key = lambda r : (r .y1 , r .x0 ))
962- parms .deleted_images = []
963- parms .deleted_tables = []
974+ parms .written_images = []
964975 # these may no longer be pairwise disjoint:
965976 # remove area overlaps by joining into larger rects
966977 parms .vg_clusters0 = refine_boxes (vg_clusters0 )
@@ -989,7 +1000,7 @@ def get_page_output(
9891000 for text_rect in text_rects :
9901001 # output tables above this rectangle
9911002 parms .md_string += output_tables (parms , text_rect )
992- parms .md_string += output_images (parms , text_rect )
1003+ parms .md_string += output_images (parms , text_rect , force_text )
9931004
9941005 # output text inside this rectangle
9951006 parms .md_string += write_text (
@@ -1004,7 +1015,7 @@ def get_page_output(
10041015
10051016 # write any remaining tables and images
10061017 parms .md_string += output_tables (parms , None )
1007- parms .md_string += output_images (parms , None )
1018+ parms .md_string += output_images (parms , None , force_text )
10081019
10091020 parms .md_string += "\n -----\n \n "
10101021 while parms .md_string .startswith ("\n " ):
@@ -1153,7 +1164,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
11531164 import time
11541165
11551166 try :
1156- filename = "slide12 .pdf"
1167+ filename = "sample_document .pdf"
11571168 except IndexError :
11581169 print (f"Usage:\n python { os .path .basename (__file__ )} input.pdf" )
11591170 sys .exit ()
0 commit comments