Skip to content

Commit 1f06abe

Browse files
authored
Merge pull request #269 from pymupdf/v0.0.23
Changes for v0.0.23
2 parents 1565e67 + 13c84c6 commit 1f06abe

File tree

5 files changed

+50
-20
lines changed

5 files changed

+50
-20
lines changed

pdf4llm/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf4llm==0.0.22"]
16+
requires = ["pymupdf4llm==0.0.23"]
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.22",
20+
version="0.0.23",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.22"
3+
__version__ = "0.0.23"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,12 @@ def is_white(text):
2424
return WHITE.issuperset(text)
2525

2626

27-
def get_raw_lines(textpage, clip=None, tolerance=3):
27+
def get_raw_lines(
28+
textpage,
29+
clip=None,
30+
tolerance=3,
31+
ignore_invisible=True,
32+
):
2833
"""Extract the text spans from a TextPage in natural reading sequence.
2934
3035
All spans roughly on the same line are joined to generate an improved line.
@@ -43,6 +48,8 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
4348
turn may be based on a sub-rectangle of the full page).
4449
tolerance: (float) put spans on the same line if their top or bottom
4550
coordinate differ by no more than this value.
51+
ignore_invisible: (bool) if True, invisible text is ignored. This may
52+
have been set to False for pages with OCR text.
4653
4754
Returns:
4855
A sorted list of items (rect, [spans]), each representing one line. The
@@ -109,7 +116,8 @@ def sanitize_spans(line):
109116
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
110117
if is_white(s["text"]): # ignore white text
111118
continue
112-
if s["alpha"] == 0: # ignore invisible text
119+
# ignore invisible text
120+
if s["alpha"] == 0 and ignore_invisible:
113121
continue
114122
if abs(sbbox & clip) < abs(sbbox) * 0.8: # if not in clip
115123
continue

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def __init__(
8686
self,
8787
doc: str,
8888
pages: list = None,
89-
body_limit: float = 11, # default if no text found
89+
body_limit: float = 12, # force this to be body text
9090
max_levels: int = 6, # accept this many header levels
9191
):
9292
"""Read all text and make a dictionary of fontsizes.
@@ -135,7 +135,7 @@ def __init__(
135135
)
136136
if temp:
137137
# most frequent font size
138-
self.body_limit = min(body_limit, temp[-1][0])
138+
self.body_limit = max(body_limit, temp[-1][0])
139139
else:
140140
self.body_limit = body_limit
141141

@@ -203,7 +203,7 @@ def get_header_id(self, span: dict, page=None) -> str:
203203
return ""
204204
# check if the span matches a TOC entry
205205
text = span["text"].strip()
206-
for t in toc:
206+
for t in my_toc:
207207
title = t[1].strip() # title of TOC entry
208208
lvl = t[0] # level of TOC entry
209209
if text.startswith(title) or title.startswith(text):
@@ -494,7 +494,12 @@ def write_text(
494494
out_string = ""
495495

496496
# This is a list of tuples (linerect, spanlist)
497-
nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3)
497+
nlines = get_raw_lines(
498+
parms.textpage,
499+
clip=clip,
500+
tolerance=3,
501+
ignore_invisible=not parms.accept_invisible,
502+
)
498503
nlines = [
499504
l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values())
500505
]
@@ -821,6 +826,16 @@ def output_images(parms, text_rect, force_text):
821826

822827
return this_md
823828

829+
def page_is_ocr(page):
830+
"""Check if page exclusively contains OCR text.
831+
832+
For this to be true, all text must be written as "ignore-text".
833+
"""
834+
text_types = set([b[0] for b in page.get_bboxlog() if "text" in b[0]])
835+
if text_types == {"ignore-text"}:
836+
return True
837+
return False
838+
824839
def get_bg_color(page):
825840
"""Determine the background color of the page.
826841
@@ -919,6 +934,7 @@ def get_page_output(
919934
parms.graphics = []
920935
parms.words = []
921936
parms.line_rects = []
937+
parms.accept_invisible = page_is_ocr(page) # accept invisible text
922938

923939
# determine background color
924940
parms.bg_color = get_bg_color(page)
@@ -968,11 +984,17 @@ def get_page_output(
968984

969985
parms.img_rects = [i["bbox"] for i in parms.images]
970986

987+
# catch too-many-graphics situation
988+
graphics_count = len([b for b in page.get_bboxlog() if "path" in b[0]])
989+
if GRAPHICS_LIMIT and graphics_count > GRAPHICS_LIMIT:
990+
IGNORE_GRAPHICS = True
991+
table_strategy = None
992+
971993
# Locate all tables on page
972994
parms.written_tables = [] # stores already written tables
973995
omitted_table_rects = []
974996
if table_strategy is None:
975-
parms.tabs = []
997+
parms.tabs = None
976998
else:
977999
parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
9781000
# remove tables with too few rows or columns
@@ -986,14 +1008,15 @@ def get_page_output(
9861008
# Make a list of table boundary boxes.
9871009
# Must include the header bbox (which may exist outside tab.bbox)
9881010
tab_rects = {}
989-
for i, t in enumerate(parms.tabs.tables):
990-
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
991-
tab_dict = {
992-
"bbox": tuple(tab_rects[i]),
993-
"rows": t.row_count,
994-
"columns": t.col_count,
995-
}
996-
parms.tables.append(tab_dict)
1011+
if parms.tabs is not None:
1012+
for i, t in enumerate(parms.tabs.tables):
1013+
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
1014+
tab_dict = {
1015+
"bbox": tuple(tab_rects[i]),
1016+
"rows": t.row_count,
1017+
"columns": t.col_count,
1018+
}
1019+
parms.tables.append(tab_dict)
9971020
parms.tab_rects = tab_rects
9981021
# list of table rectangles
9991022
parms.tab_rects0 = list(tab_rects.values())
@@ -1084,7 +1107,6 @@ def get_page_output(
10841107
parms.md_string += output_tables(parms, None)
10851108
parms.md_string += output_images(parms, None, force_text)
10861109

1087-
parms.md_string += "\n-----\n\n"
10881110
while parms.md_string.startswith("\n"):
10891111
parms.md_string = parms.md_string[1:]
10901112
parms.md_string = parms.md_string.replace(chr(0), chr(0xFFFD))

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.22",
20+
version="0.0.23",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)