1515
1616Text will be sorted in Western reading order. Any table will be included in
1717the text in markdown format as well.
18-
18+
1919Dependencies
2020-------------
21- PyMuPDF v1.24.2 or later
21+ PyMuPDF v1.24.3 or later
2222
2323Copyright and License
2424----------------------
@@ -247,6 +247,8 @@ def to_markdown(
247247 page_height: (float) assumption if page layout is variable.
248248 table_strategy: choose table detection strategy
249249 graphics_limit: (int) ignore page with too many vector graphics.
250+ ignore_code: (bool) suppress extra formatting for mono-space fonts
251+ extract_words: (bool) include "words"-like output in page chunks
250252 show_progress: (bool) print progress as each page is processed.
251253
252254 """
@@ -403,6 +405,13 @@ def write_text(
403405 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
404406 ):
405407 out_string += "\n " + tabs [i ].to_markdown (clean = False ) + "\n "
408+ if EXTRACT_WORDS : # determine raw line rects within this table
409+ line_rects .extend (
410+ [
411+ pymupdf .Rect (rl [0 ])
412+ for rl in get_raw_lines (textpage , clip = tab_rects [i ])
413+ ]
414+ )
406415 del tab_rects [i ]
407416
408417 # ------------------------------------------------------------
@@ -548,7 +557,7 @@ def intersects_rects(rect, rect_list):
548557 return i
549558 return 0
550559
551- def output_tables (tabs , text_rect , tab_rects ):
560+ def output_tables (tabs , text_rect , tab_rects , line_rects , textpage ):
552561 """Output tables above a text rectangle."""
553562 this_md = "" # markdown string for table content
554563 if text_rect is not None : # select tables above the text block
@@ -557,6 +566,13 @@ def output_tables(tabs, text_rect, tab_rects):
557566 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
558567 ):
559568 this_md += tabs [i ].to_markdown (clean = False )
569+ if EXTRACT_WORDS : # determine raw line rects within this table
570+ line_rects .extend (
571+ [
572+ pymupdf .Rect (rl [0 ])
573+ for rl in get_raw_lines (textpage , clip = tab_rects [i ])
574+ ]
575+ )
560576 del tab_rects [i ] # do not touch this table twice
561577
562578 else : # output all remaining table
@@ -565,6 +581,13 @@ def output_tables(tabs, text_rect, tab_rects):
565581 key = lambda j : (j [1 ].y1 , j [1 ].x0 ),
566582 ):
567583 this_md += tabs [i ].to_markdown (clean = False )
584+ if EXTRACT_WORDS : # determine raw line rects within this table
585+ line_rects .extend (
586+ [
587+ pymupdf .Rect (rl [0 ])
588+ for rl in get_raw_lines (textpage , clip = tab_rects [i ])
589+ ]
590+ )
568591 del tab_rects [i ] # do not touch this table twice
569592 return this_md
570593
@@ -748,7 +771,7 @@ def get_page_output(doc, pno, margins, textflags):
748771 """
749772 for text_rect in text_rects :
750773 # output tables above this block of text
751- md_string += output_tables (tabs , text_rect , tab_rects )
774+ md_string += output_tables (tabs , text_rect , tab_rects , line_rects , textpage )
752775 md_string += output_images (
753776 page , textpage , text_rect , vg_clusters , line_rects
754777 )
@@ -768,24 +791,36 @@ def get_page_output(doc, pno, margins, textflags):
768791
769792 md_string = md_string .replace (" ," , "," ).replace ("-\n " , "" )
770793 # write any remaining tables and images
771- md_string += output_tables (tabs , None , tab_rects )
794+ md_string += output_tables (tabs , None , tab_rects , line_rects , textpage )
772795 md_string += output_images (page , textpage , None , vg_clusters , line_rects )
773796 md_string += "\n -----\n \n "
774797 while md_string .startswith ("\n " ):
775798 md_string = md_string [1 :]
776799 md_string = md_string .replace (chr (0 ), chr (0xFFFD ))
800+
777801 if EXTRACT_WORDS is True :
802+ # output words in sequence compliant with Markdown text
778803 rawwords = textpage .extractWORDS ()
779804 words = []
780805 for lrect in line_rects :
781806 lwords = []
782807 for w in rawwords :
783808 wrect = pymupdf .Rect (w [:4 ])
784809 if wrect in lrect :
785- wrect .y0 = lrect .y0
786- wrect .y1 = lrect .y1
810+ wrect .y0 = lrect .y0 # set upper coord to line
811+ wrect .y1 = lrect .y1 # set lower coord to line
787812 lwords .append (list (wrect ) + list (w [4 :]))
813+ # append sorted words of this line
788814 words .extend (sorted (lwords , key = lambda w : w [0 ]))
815+
816+ # remove word duplicates without spoiling the sequence
817+ # duplicates may occur for multiple reasons
818+ nwords = [] # words w/o duplicates
819+ for w in words :
820+ if w not in nwords :
821+ nwords .append (w )
822+ words = nwords
823+
789824 else :
790825 words = []
791826 return md_string , images , tables , graphics , words
0 commit comments