@@ -86,7 +86,7 @@ def __init__(
8686 self ,
8787 doc : str ,
8888 pages : list = None ,
89- body_limit : float = 11 , # default if no text found
89+ body_limit : float = 12 , # force this to be body text
9090 max_levels : int = 6 , # accept this many header levels
9191 ):
9292 """Read all text and make a dictionary of fontsizes.
@@ -135,7 +135,7 @@ def __init__(
135135 )
136136 if temp :
137137 # most frequent font size
138- self .body_limit = min (body_limit , temp [- 1 ][0 ])
138+ self .body_limit = max (body_limit , temp [- 1 ][0 ])
139139 else :
140140 self .body_limit = body_limit
141141
@@ -203,7 +203,7 @@ def get_header_id(self, span: dict, page=None) -> str:
203203 return ""
204204 # check if the span matches a TOC entry
205205 text = span ["text" ].strip ()
206- for t in toc :
206+ for t in my_toc :
207207 title = t [1 ].strip () # title of TOC entry
208208 lvl = t [0 ] # level of TOC entry
209209 if text .startswith (title ) or title .startswith (text ):
@@ -494,7 +494,12 @@ def write_text(
494494 out_string = ""
495495
496496 # This is a list of tuples (linerect, spanlist)
497- nlines = get_raw_lines (parms .textpage , clip = clip , tolerance = 3 )
497+ nlines = get_raw_lines (
498+ parms .textpage ,
499+ clip = clip ,
500+ tolerance = 3 ,
501+ ignore_invisible = not parms .accept_invisible ,
502+ )
498503 nlines = [
499504 l for l in nlines if not intersects_rects (l [0 ], parms .tab_rects .values ())
500505 ]
@@ -821,6 +826,16 @@ def output_images(parms, text_rect, force_text):
821826
822827 return this_md
823828
def page_is_ocr(page):
    """Check if the page exclusively contains OCR text.

    For this to be true, all text must be written as "ignore-text".

    Args:
        page: object providing ``get_bboxlog()``; the first item of each
            returned entry is treated as that entry's type name.

    Returns:
        bool: True if at least one entry has a type name containing "text"
        and every such entry is of type "ignore-text"; False otherwise
        (including when there are no text entries at all).
    """
    # Collect the distinct type names of all text-related entries; entries
    # whose type name does not contain "text" are not considered.
    text_types = {b[0] for b in page.get_bboxlog() if "text" in b[0]}
    # An empty set never equals {"ignore-text"}, so a page without any text
    # entries is reported as non-OCR.
    return text_types == {"ignore-text"}
838+
824839 def get_bg_color (page ):
825840 """Determine the background color of the page.
826841
@@ -919,6 +934,7 @@ def get_page_output(
919934 parms .graphics = []
920935 parms .words = []
921936 parms .line_rects = []
937+ parms .accept_invisible = page_is_ocr (page ) # accept invisible text
922938
923939 # determine background color
924940 parms .bg_color = get_bg_color (page )
@@ -968,11 +984,17 @@ def get_page_output(
968984
969985 parms .img_rects = [i ["bbox" ] for i in parms .images ]
970986
987+ # catch too-many-graphics situation
988+ graphics_count = len ([b for b in page .get_bboxlog () if "path" in b [0 ]])
989+ if GRAPHICS_LIMIT and graphics_count > GRAPHICS_LIMIT :
990+ IGNORE_GRAPHICS = True
991+ table_strategy = None
992+
971993 # Locate all tables on page
972994 parms .written_tables = [] # stores already written tables
973995 omitted_table_rects = []
974996 if table_strategy is None :
975- parms .tabs = []
997+ parms .tabs = None
976998 else :
977999 parms .tabs = page .find_tables (clip = parms .clip , strategy = table_strategy )
9781000 # remove tables with too few rows or columns
@@ -986,14 +1008,15 @@ def get_page_output(
9861008 # Make a list of table boundary boxes.
9871009 # Must include the header bbox (which may exist outside tab.bbox)
9881010 tab_rects = {}
989- for i , t in enumerate (parms .tabs .tables ):
990- tab_rects [i ] = pymupdf .Rect (t .bbox ) | pymupdf .Rect (t .header .bbox )
991- tab_dict = {
992- "bbox" : tuple (tab_rects [i ]),
993- "rows" : t .row_count ,
994- "columns" : t .col_count ,
995- }
996- parms .tables .append (tab_dict )
1011+ if parms .tabs is not None :
1012+ for i , t in enumerate (parms .tabs .tables ):
1013+ tab_rects [i ] = pymupdf .Rect (t .bbox ) | pymupdf .Rect (t .header .bbox )
1014+ tab_dict = {
1015+ "bbox" : tuple (tab_rects [i ]),
1016+ "rows" : t .row_count ,
1017+ "columns" : t .col_count ,
1018+ }
1019+ parms .tables .append (tab_dict )
9971020 parms .tab_rects = tab_rects
9981021 # list of table rectangles
9991022 parms .tab_rects0 = list (tab_rects .values ())
@@ -1084,7 +1107,6 @@ def get_page_output(
10841107 parms .md_string += output_tables (parms , None )
10851108 parms .md_string += output_images (parms , None , force_text )
10861109
1087- parms .md_string += "\n -----\n \n "
10881110 while parms .md_string .startswith ("\n " ):
10891111 parms .md_string = parms .md_string [1 :]
10901112 parms .md_string = parms .md_string .replace (chr (0 ), chr (0xFFFD ))
0 commit comments