Skip to content

Commit cdd42b8

Browse files
authored
Merge pull request #248 from pymupdf/Version-0.0.21
Version 0.0.21
2 parents 17c00a3 + 5a0679d commit cdd42b8

File tree

5 files changed

+77
-57
lines changed

5 files changed

+77
-57
lines changed

CHANGES.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
# Change Log
22

3+
## Changes in version 0.0.21
4+
5+
### Fixes:
6+
7+
* [116](https://github.com/pymupdf/RAG/issues/116) - Handling Graphical Images & Superscripts
8+
9+
### Other Changes:
10+
11+
312
## Changes in version 0.0.20
413

514
### Fixes:

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.20"
3+
__version__ = "0.0.21"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464

6565
import pymupdf
6666

67-
pymupdf.TOOLS.set_small_glyph_heights(True)
67+
pymupdf.TOOLS.unset_quad_corrections(True)
6868

6969

7070
def column_boxes(
@@ -237,7 +237,7 @@ def join_rects_phase2(bboxes):
237237
if (
238238
abs(r.x0 - r0.x0) <= 3
239239
and abs(r.x1 - r0.x1) <= 3
240-
and abs(r0.y1 - r.y0) <= 12
240+
and abs(r0.y1 - r.y0) <= 10
241241
):
242242
r0 |= r
243243
new_rects[-1] = r0
@@ -344,7 +344,7 @@ def join_rects_phase3(bboxes, path_rects, cache):
344344
]
345345

346346
if textpage is None:
347-
textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)
347+
textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXT_ACCURATE_BBOXES)
348348

349349
bboxes = []
350350

@@ -417,7 +417,6 @@ def join_rects_phase3(bboxes, path_rects, cache):
417417
# immediately return if no text found
418418
if bboxes == []:
419419
return []
420-
421420
# --------------------------------------------------------------------
422421
# Join bboxes to establish some column structure
423422
# --------------------------------------------------------------------
@@ -467,7 +466,8 @@ def join_rects_phase3(bboxes, path_rects, cache):
467466
return nblocks
468467

469468
# several phases of rectangle joining
470-
nblocks = join_rects_phase1(nblocks)
469+
# TODO: disabled for now as too aggressive:
470+
# nblocks = join_rects_phase1(nblocks)
471471
nblocks = join_rects_phase2(nblocks)
472472
nblocks = join_rects_phase3(nblocks, path_rects, cache)
473473

@@ -491,14 +491,14 @@ def join_rects_phase3(bboxes, path_rects, cache):
491491
# check if footer margin is given
492492
if len(sys.argv) > 2:
493493
footer_margin = int(sys.argv[2])
494-
else: # use default vaue
495-
footer_margin = 50
494+
else:
495+
footer_margin = 0
496496

497497
# check if header margin is given
498498
if len(sys.argv) > 3:
499499
header_margin = int(sys.argv[3])
500-
else: # use default vaue
501-
header_margin = 50
500+
else:
501+
header_margin = 0
502502

503503
# open document
504504
doc = pymupdf.open(filename)

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 57 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ def get_header_id(self, span: dict, page=None) -> str:
157157
markdown header prefix string of 0 to n concatenated '#' characters.
158158
"""
159159
fontsize = round(span["size"]) # compute fontsize
160+
if fontsize <= self.body_limit:
161+
return ""
160162
hdr_id = self.header_id.get(fontsize, "")
161163
return hdr_id
162164

@@ -278,7 +280,7 @@ def to_markdown(
278280
ignore_code: (bool) suppress code-like formatting (mono-space fonts)
279281
extract_words: (bool) include "words"-like output in page chunks
280282
show_progress: (bool) print progress as each page is processed.
281-
glyph_fallback: (bool) replace the Invalid Unicode by glyph number.
283+
use_glyphs: (bool) replace invalid Unicode characters by glyph numbers.
282284
283285
"""
284286
if write_images is False and embed_images is False and force_text is False:
@@ -427,8 +429,8 @@ def write_text(
427429
if clip is None:
428430
clip = parms.clip
429431
out_string = ""
430-
# This is a list of tuples (linerect, spanlist)
431432

433+
# This is a list of tuples (linerect, spanlist)
432434
nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3)
433435
nlines = [
434436
l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values())
@@ -450,21 +452,18 @@ def write_text(
450452
# Pick up tables ABOVE this text block
451453
# ------------------------------------------------------------
452454
if tables:
453-
tab_candidates = sorted(
454-
[
455-
(i, tab_rect)
456-
for i, tab_rect in parms.tab_rects.items()
457-
if tab_rect.y1 <= lrect.y0
458-
and i not in parms.deleted_tables
459-
and (
460-
0
461-
or lrect.x0 <= tab_rect.x0 < lrect.x1
462-
or lrect.x0 < tab_rect.x1 <= lrect.x1
463-
or tab_rect.x0 <= lrect.x0 < lrect.x1 <= tab_rect.x1
464-
)
465-
],
466-
key=lambda j: (j[1].y1, j[1].x0),
467-
)
455+
tab_candidates = [
456+
(i, tab_rect)
457+
for i, tab_rect in parms.tab_rects.items()
458+
if tab_rect.y1 <= lrect.y0
459+
and i not in parms.written_tables
460+
and (
461+
0
462+
or lrect.x0 <= tab_rect.x0 < lrect.x1
463+
or lrect.x0 < tab_rect.x1 <= lrect.x1
464+
or tab_rect.x0 <= lrect.x0 < lrect.x1 <= tab_rect.x1
465+
)
466+
]
468467
for i, _ in tab_candidates:
469468
out_string += "\n" + parms.tabs[i].to_markdown(clean=False) + "\n"
470469
if EXTRACT_WORDS:
@@ -481,14 +480,14 @@ def write_text(
481480
key=lambda c: (c.y1, c.x0),
482481
)
483482
parms.line_rects.extend(cells)
484-
parms.deleted_tables.append(i)
483+
parms.written_tables.append(i)
485484

486485
# ------------------------------------------------------------
487486
# Pick up images / graphics ABOVE this text block
488487
# ------------------------------------------------------------
489488
if images:
490489
for i in range(len(parms.img_rects)):
491-
if i in parms.deleted_images:
490+
if i in parms.written_images:
492491
continue
493492
r = parms.img_rects[i]
494493
if r.y1 <= lrect.y0 and (
@@ -502,7 +501,7 @@ def write_text(
502501
out_string += GRAPHICS_TEXT % pathname
503502

504503
# recursive invocation
505-
if force_text:
504+
if force_text is True:
506505
img_txt = write_text(
507506
parms,
508507
r,
@@ -513,7 +512,7 @@ def write_text(
513512

514513
if not is_white(img_txt):
515514
out_string += img_txt
516-
parms.deleted_images.append(i)
515+
parms.written_images.append(i)
517516

518517
parms.line_rects.append(lrect)
519518

@@ -668,7 +667,7 @@ def output_tables(parms, text_rect):
668667
[j for j in parms.tab_rects.items() if j[1].y1 <= text_rect.y0],
669668
key=lambda j: (j[1].y1, j[1].x0),
670669
):
671-
if i in parms.deleted_tables:
670+
if i in parms.written_tables:
672671
continue
673672
this_md += parms.tabs[i].to_markdown(clean=False)
674673
if EXTRACT_WORDS:
@@ -685,14 +684,11 @@ def output_tables(parms, text_rect):
685684
key=lambda c: (c.y1, c.x0),
686685
)
687686
parms.line_rects.extend(cells)
688-
del parms.tab_rects[i] # do not touch this table twice
687+
parms.written_tables.append(i) # do not touch this table twice
689688

690689
else: # output all remaining tables
691-
for i, trect in sorted(
692-
parms.tab_rects.items(),
693-
key=lambda j: (j[1].y1, j[1].x0),
694-
):
695-
if i in parms.deleted_tables:
690+
for i, trect in parms.tab_rects.items():
691+
if i in parms.written_tables:
696692
continue
697693
this_md += parms.tabs[i].to_markdown(clean=False)
698694
if EXTRACT_WORDS:
@@ -709,10 +705,10 @@ def output_tables(parms, text_rect):
709705
key=lambda c: (c.y1, c.x0),
710706
)
711707
parms.line_rects.extend(cells)
712-
del parms.tab_rects[i] # do not touch this table twice
708+
parms.written_tables.append(i) # do not touch this table twice
713709
return this_md
714710

715-
def output_images(parms, text_rect):
711+
def output_images(parms, text_rect, force_text):
716712
"""Output images and graphics above text rectangle."""
717713
if not parms.img_rects:
718714
return ""
@@ -723,10 +719,10 @@ def output_images(parms, text_rect):
723719
continue
724720
if img_rect.x0 >= text_rect.x1 or img_rect.x1 <= text_rect.x0:
725721
continue
726-
if i in parms.deleted_images:
722+
if i in parms.written_images:
727723
continue
728724
pathname = save_image(parms, img_rect, i)
729-
parms.deleted_images.append(i) # do not touch this image twice
725+
parms.written_images.append(i) # do not touch this image twice
730726
if pathname:
731727
this_md += GRAPHICS_TEXT % pathname
732728
if force_text:
@@ -741,10 +737,10 @@ def output_images(parms, text_rect):
741737
this_md += img_txt
742738
else: # output all remaining images
743739
for i, img_rect in enumerate(parms.img_rects):
744-
if i in parms.deleted_images:
740+
if i in parms.written_images:
745741
continue
746742
pathname = save_image(parms, img_rect, i)
747-
parms.deleted_images.append(i) # do not touch this image twice
743+
parms.written_images.append(i) # do not touch this image twice
748744
if pathname:
749745
this_md += GRAPHICS_TEXT % pathname
750746
if force_text:
@@ -867,6 +863,9 @@ def get_page_output(
867863
# extract external links on page
868864
parms.links = [l for l in page.get_links() if l["kind"] == pymupdf.LINK_URI]
869865

866+
# extract annotation rectangles on page
867+
parms.annot_rects = [a.rect for a in page.annots()]
868+
870869
# make a TextPage for all later extractions
871870
parms.textpage = page.get_textpage(flags=textflags, clip=parms.clip)
872871

@@ -904,10 +903,20 @@ def get_page_output(
904903
parms.img_rects = [i["bbox"] for i in parms.images]
905904

906905
# Locate all tables on page
906+
parms.written_tables = [] # stores already written tables
907907
if table_strategy is None:
908908
parms.tabs = []
909909
else:
910910
parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
911+
del_this = []
912+
for i, t in enumerate(parms.tabs):
913+
if t.row_count < 2 or t.col_count < 2:
914+
# ignore tables with too few rows or columns
915+
del_this.append(i)
916+
for i in sorted(del_this, reverse=True):
917+
del parms.tabs.tables[i]
918+
parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1]))
919+
911920
# Make a list of table boundary boxes.
912921
# Must include the header bbox (which may exist outside tab.bbox)
913922
tab_rects = {}
@@ -930,11 +939,13 @@ def get_page_output(
930939
paths = [
931940
p
932941
for p in page.get_drawings()
933-
if not intersects_rects(p["rect"], parms.tab_rects0)
934-
and p["rect"] in parms.clip
935-
and 3 < p["rect"].width < parms.clip.width
936-
and 3 < p["rect"].height < parms.clip.height
937-
and not (p["type"] == "f" and p["fill"] == parms.bg_color)
942+
if p["rect"] in parms.clip
943+
and p["rect"].width < parms.clip.width
944+
and p["rect"].height < parms.clip.height
945+
and (p["rect"].width > 3 or p["rect"].height > 3)
946+
and not (p["fill"] == parms.bg_color and p["fill"] != None)
947+
and not intersects_rects(p["rect"], parms.tab_rects0)
948+
and not intersects_rects(p["rect"], parms.annot_rects)
938949
]
939950
else:
940951
paths = []
@@ -948,19 +959,19 @@ def get_page_output(
948959
vg_clusters0 = [] # worthwhile vector graphics go here
949960

950961
# walk through all vector graphics outside any table
951-
for bbox in refine_boxes(page.cluster_drawings(drawings=paths)):
962+
clusters = page.cluster_drawings(drawings=paths)
963+
for bbox in clusters:
952964
if is_significant(bbox, paths):
953965
vg_clusters0.append(bbox)
954966

955967
# remove paths that are not in some relevant graphic
956968
parms.actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)]
957969

958-
# also add image rectangles to the list
970+
# also add image rectangles to the list and vice versa
959971
vg_clusters0.extend(parms.img_rects)
960972
parms.img_rects.extend(vg_clusters0)
961973
parms.img_rects = sorted(set(parms.img_rects), key=lambda r: (r.y1, r.x0))
962-
parms.deleted_images = []
963-
parms.deleted_tables = []
974+
parms.written_images = []
964975
# these may no longer be pairwise disjoint:
965976
# remove area overlaps by joining into larger rects
966977
parms.vg_clusters0 = refine_boxes(vg_clusters0)
@@ -989,7 +1000,7 @@ def get_page_output(
9891000
for text_rect in text_rects:
9901001
# output tables above this rectangle
9911002
parms.md_string += output_tables(parms, text_rect)
992-
parms.md_string += output_images(parms, text_rect)
1003+
parms.md_string += output_images(parms, text_rect, force_text)
9931004

9941005
# output text inside this rectangle
9951006
parms.md_string += write_text(
@@ -1004,7 +1015,7 @@ def get_page_output(
10041015

10051016
# write any remaining tables and images
10061017
parms.md_string += output_tables(parms, None)
1007-
parms.md_string += output_images(parms, None)
1018+
parms.md_string += output_images(parms, None, force_text)
10081019

10091020
parms.md_string += "\n-----\n\n"
10101021
while parms.md_string.startswith("\n"):
@@ -1153,7 +1164,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
11531164
import time
11541165

11551166
try:
1156-
filename = "slide12.pdf"
1167+
filename = "sample_document.pdf"
11571168
except IndexError:
11581169
print(f"Usage:\npython {os.path.basename(__file__)} input.pdf")
11591170
sys.exit()

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.20",
20+
version="0.0.21",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)