pymupdf
diff --git a/‎CHANGES.md‎
Lines changed: 41 additions & 3 deletions b/‎CHANGES.md‎
Lines changed: 41 additions & 3 deletions
diff --git a/‎pdf4llm/setup.py‎
Lines changed: 1 addition & 1 deletion b/‎pdf4llm/setup.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pymupdf4llm/pymupdf4llm/__init__.py‎
Lines changed: 22 additions & 0 deletions b/‎pymupdf4llm/pymupdf4llm/__init__.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎pymupdf4llm/pymupdf4llm/helpers/check_ocr.py‎
Lines changed: 65 additions & 41 deletions b/‎pymupdf4llm/pymupdf4llm/helpers/check_ocr.py‎
Lines changed: 65 additions & 41 deletions
@@ -1,15 +1,53 @@
 # Change Log
 
-## Changes in version 0.0.28
+## Changes in version 0.2.1
 
 ### Fixes:
 
-* [xxx](https://github.com/pymupdf/RAG/issues/xxx) - 
+* [320](https://github.com/pymupdf/RAG/issues/320) - [Bug] ValueError: min() iterable argument is empty ...
+* [319](https://github.com/pymupdf/RAG/issues/319) - [Bug] ValueError: min() arg is an empty sequence 
 
 ### Other Changes:
 
-* xxx
+* OCR invocation now differentiates between full-page OCR and text-only OCR: If the page does contain text but the percentage of unreadable characters exceeds a certain threshold (90%), we only OCR the bounding boxes of text spans and replace span text with OCR'ed text where necessary.
 
+------
+
+## Changes in version 0.2.0
+
+This version introduces full support of the [PyMuPDF-Layout](https://pypi.org/project/pymupdf-layout/) package. This entails a radically new approach for detecting the layout of document pages using the AI-based features of the layout package.
+
+Improvements include:
+
+* Greatly improved table detection
+* Support of list item hierarchy levels
+* Detection of page headers and footers
+* Improved detection of text paragraphs, titles and section headers
+* New output options beyond Markdown: plain text and JSON
+* Automatically detect whether a page needs OCR and invoke Tesseract if both Tesseract is installed and OpenCV (package [opencv-python](https://pypi.org/project/opencv-python/)) is available. Invocation criteria include the absence of readable text, full-page coverage with images, and the presence of many character-sized vector graphics.
+
+The PyMuPDF-Layout package is not open-source and has its own license, which is different from that of PyMuPDF4LLM. It also depends on a number of other, fairly large packages like [onnxruntime](https://pypi.org/project/onnxruntime/), [numpy](https://pypi.org/project/numpy/), [sympy](https://pypi.org/project/sympy/) and [OpenCV](https://pypi.org/project/opencv-python/), each of which in turn has its own dependencies.
+
+We therefore keep the use of the layout feature optional. To activate PyMuPDF-Layout support, the following import statement **_must be included before_** importing PyMuPDF4LLM itself:
+
+```python
+import pymupdf.layout
+import pymupdf4llm
+```
+
+Thereafter, PyMuPDF's namespace is available. The familiar method `pymupdf4llm.to_markdown()` then automatically makes use of the AI-based layout detection.
+In addition, two new methods become available:
+* `pymupdf4llm.to_text()` - which works much like markdown output but produces plain text.
+* `pymupdf4llm.to_json()` - which outputs the document's metadata and the selected pages in JSON format.
+
+### Fixes:
+
+
+### Other Changes:
+
+* If `show_progress=True`, Python package [tqdm](https://pypi.org/project/tqdm/) is automatically used when available to display a progress bar. If tqdm is not installed, our own text-based progress bar is used.
+
+------
 
 ## Changes in version 0.0.27
 
 
@@ -6,7 +6,7 @@
 with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
     readme = f.read()
 
-version = "0.2.0"
+version = "0.2.1"
 
 classifiers = [
     "Development Status :: 5 - Production/Stable",
 
@@ -26,6 +26,9 @@ def parse_document(
         image_format="png",
         image_path="",
         pages=None,
+        output_images=True,
+        show_progress=False,
+        force_text=True,
     ):
         return DL.parse_document(
             doc,
@@ -34,6 +37,9 @@ def parse_document(
             image_format=image_format,
             image_path=image_path,
             pages=pages,
+            output_images=output_images,
+            show_progress=show_progress,
+            force_text=force_text,
         )
 
     def to_markdown(
@@ -75,13 +81,17 @@ def to_markdown(
             image_format=image_format,
             image_path=image_path,
             pages=pages,
+            output_images=embed_images or write_images,
+            show_progress=show_progress,
+            force_text=force_text,
         )
         return parsed_doc.to_markdown(
             header=header,
             footer=footer,
             write_images=write_images,
             embed_images=embed_images,
             ignore_code=ignore_code,
+            show_progress=show_progress,
         )
 
     def to_json(
@@ -92,13 +102,19 @@ def to_json(
         image_format="png",
         image_path="",
         pages=None,
+        output_images=False,
+        show_progress=False,
+        force_text=True,
     ):
         parsed_doc = parse_document(
             doc,
             image_dpi=image_dpi,
             image_format=image_format,
             image_path=image_path,
             pages=pages,
+            output_images=output_images,
+            show_progress=show_progress,
+            force_text=force_text,
         )
         return parsed_doc.to_json()
 
@@ -109,6 +125,8 @@ def to_text(
         footer=True,
         pages=None,
         ignore_code=False,
+        show_progress=False,
+        force_text=True,
     ):
         parsed_doc = parse_document(
             doc,
@@ -117,11 +135,15 @@ def to_text(
             image_format="png",
             image_path="",
             pages=pages,
+            output_images=False,
+            show_progress=show_progress,
+            force_text=force_text,
         )
         return parsed_doc.to_text(
             header=header,
             footer=footer,
             ignore_code=ignore_code,
+            show_progress=show_progress,
         )
 
 
 
@@ -1,28 +1,42 @@
-import pymupdf  # PyMuPDF
-import numpy as np
 import cv2
+import numpy as np
+import pymupdf  # PyMuPDF
+from pymupdf4llm.helpers.utils import WHITE_CHARS
 
 
-WHITE_CHARS = set(
-    [chr(i) for i in range(33)]
-    + [
-        "\u00a0",  # Non-breaking space
-        "\u2000",  # En quad
-        "\u2001",  # Em quad
-        "\u2002",  # En space
-        "\u2003",  # Em space
-        "\u2004",  # Three-per-em space
-        "\u2005",  # Four-per-em space
-        "\u2006",  # Six-per-em space
-        "\u2007",  # Figure space
-        "\u2008",  # Punctuation space
-        "\u2009",  # Thin space
-        "\u200a",  # Hair space
-        "\u202f",  # Narrow no-break space
-        "\u205f",  # Medium mathematical space
-        "\u3000",  # Ideographic space
-    ]
-)
+def get_tessocr(page, bbox, dpi=300):
+    """Return OCR-ed span text using Tesseract.
+
+    Args:
+        page: pymupdf Page
+        bbox: pymupdf Rect or its sequence
+        dpi: resolution for OCR image
+    Returns:
+        The OCR-ed text of the bbox.
+    """
+    # Step 1: Make a high-resolution image of the bbox.
+    pix = page.get_pixmap(dpi=dpi, clip=bbox)
+    ocrpdf = pymupdf.open("pdf", pix.pdfocr_tobytes())
+    ocrpage = ocrpdf[0]
+    text = ocrpage.get_text()
+    text = text.replace("\n", " ").strip()
+    return text
+
+
+def repair_blocks(input_blocks, page):
+    repaired_blocks = []
+    for block in input_blocks:
+        if block["type"] == 0:  # text block
+            for line in block["lines"]:
+                for span in line["spans"]:
+                    if not chr(0xFFFD) in span["text"]:
+                        continue
+                    text = get_tessocr(page, span["bbox"])
+                    span["text"] = text
+            repaired_blocks.append(block)
+        else:
+            repaired_blocks.append(block)
+    return repaired_blocks
 
 
 def detect_qr_codes(img):
@@ -152,23 +166,38 @@ def should_ocr_page(
     # Check for text
     text = page.get_text(flags=0)
     decision["has_text"] = not WHITE_CHARS.issuperset(text)
-    if decision["has_text"]:
-        not_readable_count = len([c for c in text if c == chr(0xFFFD)])
-        readability = 1 - not_readable_count / len(text)
-        decision["readable_text"] = readability >= text_readability_thresh
 
     all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]]
     ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"]
     decision["has_ocr_text"] = bool(ocr_text_bboxes)
+
+    if decision["has_text"]:
+        unreadable_count = len([c for c in text if c == chr(0xFFFD)])
+        readability = 1 - unreadable_count / len(text)
+        decision["readable_text"] = readability >= text_readability_thresh
+
+    if decision["has_text"] and not decision["readable_text"]:
+        decision["should_ocr"] = True
+        decision["image"], decision["transform"], decision["pixmap"] = get_page_image(
+            page, dpi=dpi
+        )
+
+    if decision["has_text"]:
+        # early exit if any text exists
+        print(
+            f"{decision['has_text']=}, {decision['readable_text']=}, {decision['should_ocr']=}"
+        )
+        return decision
+
     # Check for image coverage
-    image_rects=[page_rect&img["bbox"] for img in page.get_image_info()]
-    image_rect=pymupdf.EMPTY_RECT()
+    image_rects = [page_rect & img["bbox"] for img in page.get_image_info()]
+    image_rect = pymupdf.EMPTY_RECT()
     for r in image_rects:
-        image_rect|=r
-    image_area=abs(image_rect)
+        image_rect |= r
+    image_area = abs(image_rect)
     if image_area:
         images_cover = image_area / page_area
-    else:        
+    else:
         images_cover = 0.0
     decision["image_covers_page"] = images_cover >= image_coverage_thresh
 
@@ -189,16 +218,11 @@ def should_ocr_page(
 
     # Final decision
     if (
-        1
-        and not decision["has_text"]
-        and not decision["readable_text"]
-        and (
-            0
-            or decision["image_covers_page"]
-            or decision["has_vector_drawings"]
-            or decision["edge_density"] > edge_thresh
-        )
+        0
+        or decision["image_covers_page"]
+        or decision["has_vector_drawings"]
+        or decision["edge_density"] > edge_thresh
     ):
         decision["should_ocr"] = True
-    
+
     return decision