Merge pull request #269 from pymupdf/v0.0.23

JorjMcKie · web-flow · commit 1f06abeb7a8d · 2025-05-09T11:41:52.000-04:00
Changes for v0.0.23
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -13,11 +13,11 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf4llm==0.0.22"]
+requires = ["pymupdf4llm==0.0.23"]
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.22",
+    version="0.0.23",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.22"
+__version__ = "0.0.23"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -24,7 +24,12 @@ def is_white(text):
     return WHITE.issuperset(text)
 
 
-def get_raw_lines(textpage, clip=None, tolerance=3):
+def get_raw_lines(
+    textpage,
+    clip=None,
+    tolerance=3,
+    ignore_invisible=True,
+):
     """Extract the text spans from a TextPage in natural reading sequence.
 
     All spans roughly on the same line are joined to generate an improved line.
@@ -43,6 +48,8 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
               turn may be based on a sub-rectangle of the full page).
         tolerance: (float) put spans on the same line if their top or bottom
               coordinate differ by no more than this value.
+        ignore_invisible: (bool) if True, invisible text is ignored. Callers
+              may set this to False for pages containing OCR text.
 
     Returns:
         A sorted list of items (rect, [spans]), each representing one line. The
@@ -109,7 +116,8 @@ def sanitize_spans(line):
                 sbbox = pymupdf.Rect(s["bbox"])  # span bbox as a Rect
                 if is_white(s["text"]):  # ignore white text
                     continue
-                if s["alpha"] == 0:  # ignore invisible text
+                # ignore invisible text
+                if s["alpha"] == 0 and ignore_invisible:
                     continue
                 if abs(sbbox & clip) < abs(sbbox) * 0.8:  # if not in clip
                     continue
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -86,7 +86,7 @@ def __init__(
         self,
         doc: str,
         pages: list = None,
-        body_limit: float = 11,  # default if no text found
+        body_limit: float = 12,  # force this to be body text
         max_levels: int = 6,  # accept this many header levels
     ):
         """Read all text and make a dictionary of fontsizes.
@@ -135,7 +135,7 @@ def __init__(
         )
         if temp:
             # most frequent font size
-            self.body_limit = min(body_limit, temp[-1][0])
+            self.body_limit = max(body_limit, temp[-1][0])
         else:
             self.body_limit = body_limit
 
@@ -203,7 +203,7 @@ def get_header_id(self, span: dict, page=None) -> str:
             return ""
         # check if the span matches a TOC entry
         text = span["text"].strip()
-        for t in toc:
+        for t in my_toc:
             title = t[1].strip()  # title of TOC entry
             lvl = t[0]  # level of TOC entry
             if text.startswith(title) or title.startswith(text):
@@ -494,7 +494,12 @@ def write_text(
         out_string = ""
 
         # This is a list of tuples (linerect, spanlist)
-        nlines = get_raw_lines(parms.textpage, clip=clip, tolerance=3)
+        nlines = get_raw_lines(
+            parms.textpage,
+            clip=clip,
+            tolerance=3,
+            ignore_invisible=not parms.accept_invisible,
+        )
         nlines = [
             l for l in nlines if not intersects_rects(l[0], parms.tab_rects.values())
         ]
@@ -821,6 +826,16 @@ def output_images(parms, text_rect, force_text):
 
         return this_md
 
+    def page_is_ocr(page):
+        """Check if page exclusively contains OCR text.
+
+        For this to be true, all text must be written as "ignore-text".
+        """
+        text_types = set([b[0] for b in page.get_bboxlog() if "text" in b[0]])
+        if text_types == {"ignore-text"}:
+            return True
+        return False
+
     def get_bg_color(page):
         """Determine the background color of the page.
 
@@ -919,6 +934,7 @@ def get_page_output(
         parms.graphics = []
         parms.words = []
         parms.line_rects = []
+        parms.accept_invisible = page_is_ocr(page)  # accept invisible text
 
         # determine background color
         parms.bg_color = get_bg_color(page)
@@ -968,11 +984,17 @@ def get_page_output(
 
         parms.img_rects = [i["bbox"] for i in parms.images]
 
+        # catch too-many-graphics situation
+        graphics_count = len([b for b in page.get_bboxlog() if "path" in b[0]])
+        if GRAPHICS_LIMIT and graphics_count > GRAPHICS_LIMIT:
+            IGNORE_GRAPHICS = True
+            table_strategy = None
+
         # Locate all tables on page
         parms.written_tables = []  # stores already written tables
         omitted_table_rects = []
         if table_strategy is None:
-            parms.tabs = []
+            parms.tabs = None
         else:
             parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
             # remove tables with too few rows or columns
@@ -986,14 +1008,15 @@ def get_page_output(
         # Make a list of table boundary boxes.
         # Must include the header bbox (which may exist outside tab.bbox)
         tab_rects = {}
-        for i, t in enumerate(parms.tabs.tables):
-            tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
-            tab_dict = {
-                "bbox": tuple(tab_rects[i]),
-                "rows": t.row_count,
-                "columns": t.col_count,
-            }
-            parms.tables.append(tab_dict)
+        if parms.tabs is not None:
+            for i, t in enumerate(parms.tabs.tables):
+                tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
+                tab_dict = {
+                    "bbox": tuple(tab_rects[i]),
+                    "rows": t.row_count,
+                    "columns": t.col_count,
+                }
+                parms.tables.append(tab_dict)
         parms.tab_rects = tab_rects
         # list of table rectangles
         parms.tab_rects0 = list(tab_rects.values())
@@ -1084,7 +1107,6 @@ def get_page_output(
         parms.md_string += output_tables(parms, None)
         parms.md_string += output_images(parms, None, force_text)
 
-        parms.md_string += "\n-----\n\n"
         while parms.md_string.startswith("\n"):
             parms.md_string = parms.md_string[1:]
         parms.md_string = parms.md_string.replace(chr(0), chr(0xFFFD))
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
@@ -17,7 +17,7 @@
 
 setuptools.setup(
     name="pymupdf4llm",
-    version="0.0.22",
+    version="0.0.23",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",