Skip to content

Commit 83bb17c

Browse files
authored
Merge pull request #324 from pymupdf/v0.2.1
Version 0.2.1
2 parents c9229b7 + 92235cd commit 83bb17c

File tree

13 files changed

+483
-200
lines changed

13 files changed

+483
-200
lines changed

CHANGES.md

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,53 @@
11
# Change Log
22

3-
## Changes in version 0.0.28
3+
## Changes in version 0.2.1
44

55
### Fixes:
66

7-
* [xxx](https://github.com/pymupdf/RAG/issues/xxx) -
7+
* [320](https://github.com/pymupdf/RAG/issues/320) - [Bug] ValueError: min() iterable argument is empty ...
8+
* [319](https://github.com/pymupdf/RAG/issues/319) - [Bug] ValueError: min() arg is an empty sequence
89

910
### Other Changes:
1011

11-
* xxx
12+
* OCR invocation now differentiates between full-page OCR and text-only OCR: If the page does contain text but the percentage of unreadable characters exceeds a certain threshold (90%), we only OCR the bounding boxes of text spans and replace span text with OCR'ed text where necessary.
1213

14+
------
15+
16+
## Changes in version 0.2.0
17+
18+
This version introduces full support of the [PyMuPDF-Layout](https://pypi.org/project/pymupdf-layout/) package. This entails a radically new approach for detecting the layout of document pages using the AI-based features of the layout package.
19+
20+
Improvements include:
21+
22+
* Greatly improved table detection
23+
* Support of list item hierarchy levels
24+
* Detection of page headers and footers
25+
* Improved detection of text paragraphs, titles and section headers
26+
* New output options beyond Markdown: plain text and JSON
27+
* Automatically detect whether a page needs OCR and invoke Tesseract if both Tesseract is installed and OpenCV (package [opencv-python](https://pypi.org/project/opencv-python/)) is available. Invocation criteria include absence of readable text, full-page coverage with images, and presence of many character-sized vector graphics.
28+
29+
The PyMuPDF-Layout package is not open-source and has its own license, which is different from PyMuPDF4LLM. It also is dependent on a number of other, fairly large packages like [onnxruntime](https://pypi.org/project/onnxruntime/), [numpy](https://pypi.org/project/numpy/), [sympy](https://pypi.org/project/sympy/) and [OpenCV](https://pypi.org/project/opencv-python/), which each in turn have their own dependencies.
30+
31+
We therefore keep the use of the layout feature optional. To activate PyMuPDF-Layout support, the following import statement **_must be included before_** importing PyMuPDF4LLM itself:
32+
33+
```python
34+
import pymupdf.layout
35+
import pymupdf4llm
36+
```
37+
38+
Thereafter, PyMuPDF's namespace is available. The known method `pymupdf4llm.to_markdown()` automatically works with AI-based empowerment.
39+
In addition, two new methods become available:
40+
* `pymupdf4llm.to_text()` - which works much like markdown output but produces plain text.
41+
* `pymupdf4llm.to_json()` - which outputs the document's metadata and the selected pages in JSON format.
42+
43+
### Fixes:
44+
45+
46+
### Other Changes:
47+
48+
* If `show_progress=True`, Python package [tqdm](https://pypi.org/project/tqdm/) is automatically used when available to display a progress bar. If tqdm is not installed, our own text-based progress bar is used.
49+
50+
------
1351

1452
## Changes in version 0.0.27
1553

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.2.0"
9+
version = "0.2.1"
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ def parse_document(
2626
image_format="png",
2727
image_path="",
2828
pages=None,
29+
output_images=True,
30+
show_progress=False,
31+
force_text=True,
2932
):
3033
return DL.parse_document(
3134
doc,
@@ -34,6 +37,9 @@ def parse_document(
3437
image_format=image_format,
3538
image_path=image_path,
3639
pages=pages,
40+
output_images=output_images,
41+
show_progress=show_progress,
42+
force_text=force_text,
3743
)
3844

3945
def to_markdown(
@@ -75,13 +81,17 @@ def to_markdown(
7581
image_format=image_format,
7682
image_path=image_path,
7783
pages=pages,
84+
output_images=embed_images or write_images,
85+
show_progress=show_progress,
86+
force_text=force_text,
7887
)
7988
return parsed_doc.to_markdown(
8089
header=header,
8190
footer=footer,
8291
write_images=write_images,
8392
embed_images=embed_images,
8493
ignore_code=ignore_code,
94+
show_progress=show_progress,
8595
)
8696

8797
def to_json(
@@ -92,13 +102,19 @@ def to_json(
92102
image_format="png",
93103
image_path="",
94104
pages=None,
105+
output_images=False,
106+
show_progress=False,
107+
force_text=True,
95108
):
96109
parsed_doc = parse_document(
97110
doc,
98111
image_dpi=image_dpi,
99112
image_format=image_format,
100113
image_path=image_path,
101114
pages=pages,
115+
output_images=output_images,
116+
show_progress=show_progress,
117+
force_text=force_text,
102118
)
103119
return parsed_doc.to_json()
104120

@@ -109,6 +125,8 @@ def to_text(
109125
footer=True,
110126
pages=None,
111127
ignore_code=False,
128+
show_progress=False,
129+
force_text=True,
112130
):
113131
parsed_doc = parse_document(
114132
doc,
@@ -117,11 +135,15 @@ def to_text(
117135
image_format="png",
118136
image_path="",
119137
pages=pages,
138+
output_images=False,
139+
show_progress=show_progress,
140+
force_text=force_text,
120141
)
121142
return parsed_doc.to_text(
122143
header=header,
123144
footer=footer,
124145
ignore_code=ignore_code,
146+
show_progress=show_progress,
125147
)
126148

127149

pymupdf4llm/pymupdf4llm/helpers/check_ocr.py

Lines changed: 65 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,42 @@
1-
import pymupdf # PyMuPDF
2-
import numpy as np
31
import cv2
2+
import numpy as np
3+
import pymupdf # PyMuPDF
4+
from pymupdf4llm.helpers.utils import WHITE_CHARS
45

56

6-
WHITE_CHARS = set(
7-
[chr(i) for i in range(33)]
8-
+ [
9-
"\u00a0", # Non-breaking space
10-
"\u2000", # En quad
11-
"\u2001", # Em quad
12-
"\u2002", # En space
13-
"\u2003", # Em space
14-
"\u2004", # Three-per-em space
15-
"\u2005", # Four-per-em space
16-
"\u2006", # Six-per-em space
17-
"\u2007", # Figure space
18-
"\u2008", # Punctuation space
19-
"\u2009", # Thin space
20-
"\u200a", # Hair space
21-
"\u202f", # Narrow no-break space
22-
"\u205f", # Medium mathematical space
23-
"\u3000", # Ideographic space
24-
]
25-
)
7+
def get_tessocr(page, bbox, dpi=300):
    """Return OCR-ed span text using Tesseract.

    The area ``bbox`` of ``page`` is rendered to a pixmap, the pixmap is
    converted to a one-page OCR PDF in memory, and the text of that page
    is extracted.

    Args:
        page: pymupdf Page
        bbox: pymupdf Rect or its sequence
        dpi: resolution for OCR image
    Returns:
        The OCR-ed text of the bbox as a single line: newlines are replaced
        by spaces and leading / trailing whitespace is stripped.
    """
    # Render only the requested clip at the requested resolution.
    pix = page.get_pixmap(dpi=dpi, clip=bbox)
    # The OCR result is a temporary in-memory PDF. Open it in a context
    # manager so the document is closed once its text has been read.
    with pymupdf.open("pdf", pix.pdfocr_tobytes()) as ocrpdf:
        text = ocrpdf[0].get_text()
    return text.replace("\n", " ").strip()
24+
25+
26+
def repair_blocks(input_blocks, page):
    """Replace unreadable span text in text blocks with OCR-ed text.

    Every span of a text block (``block["type"] == 0``) whose text contains
    the Unicode replacement character U+FFFD is passed to ``get_tessocr``
    and its ``"text"`` entry is overwritten with the OCR result. Blocks of
    any other type are passed through untouched.

    Args:
        input_blocks: iterable of block dictionaries. Text blocks must
            provide ``"lines"``, each with ``"spans"`` that carry
            ``"text"`` and ``"bbox"``.
        page: the page the blocks belong to; forwarded to ``get_tessocr``.
    Returns:
        A new list containing the same block objects in the same order.
        Note that the span dictionaries are modified in place.
    """
    unreadable = chr(0xFFFD)  # Unicode replacement character
    repaired_blocks = []
    for block in input_blocks:
        if block["type"] == 0:  # text block
            for line in block["lines"]:
                for span in line["spans"]:
                    # Only spans with unreadable characters need OCR.
                    if unreadable in span["text"]:
                        span["text"] = get_tessocr(page, span["bbox"])
        # Every block is kept, whether or not it was repaired.
        repaired_blocks.append(block)
    return repaired_blocks
2640

2741

2842
def detect_qr_codes(img):
@@ -152,23 +166,38 @@ def should_ocr_page(
152166
# Check for text
153167
text = page.get_text(flags=0)
154168
decision["has_text"] = not WHITE_CHARS.issuperset(text)
155-
if decision["has_text"]:
156-
not_readable_count = len([c for c in text if c == chr(0xFFFD)])
157-
readability = 1 - not_readable_count / len(text)
158-
decision["readable_text"] = readability >= text_readability_thresh
159169

160170
all_text_bboxes = [b for b in page.get_bboxlog() if "text" in b[0]]
161171
ocr_text_bboxes = [b for b in all_text_bboxes if b[0] == "ignore-text"]
162172
decision["has_ocr_text"] = bool(ocr_text_bboxes)
173+
174+
if decision["has_text"]:
175+
unreadable_count = len([c for c in text if c == chr(0xFFFD)])
176+
readability = 1 - unreadable_count / len(text)
177+
decision["readable_text"] = readability >= text_readability_thresh
178+
179+
if decision["has_text"] and not decision["readable_text"]:
180+
decision["should_ocr"] = True
181+
decision["image"], decision["transform"], decision["pixmap"] = get_page_image(
182+
page, dpi=dpi
183+
)
184+
185+
if decision["has_text"]:
186+
# early exit if any text exists
187+
print(
188+
f"{decision['has_text']=}, {decision['readable_text']=}, {decision['should_ocr']=}"
189+
)
190+
return decision
191+
163192
# Check for image coverage
164-
image_rects=[page_rect&img["bbox"] for img in page.get_image_info()]
165-
image_rect=pymupdf.EMPTY_RECT()
193+
image_rects = [page_rect & img["bbox"] for img in page.get_image_info()]
194+
image_rect = pymupdf.EMPTY_RECT()
166195
for r in image_rects:
167-
image_rect|=r
168-
image_area=abs(image_rect)
196+
image_rect |= r
197+
image_area = abs(image_rect)
169198
if image_area:
170199
images_cover = image_area / page_area
171-
else:
200+
else:
172201
images_cover = 0.0
173202
decision["image_covers_page"] = images_cover >= image_coverage_thresh
174203

@@ -189,16 +218,11 @@ def should_ocr_page(
189218

190219
# Final decision
191220
if (
192-
1
193-
and not decision["has_text"]
194-
and not decision["readable_text"]
195-
and (
196-
0
197-
or decision["image_covers_page"]
198-
or decision["has_vector_drawings"]
199-
or decision["edge_density"] > edge_thresh
200-
)
221+
0
222+
or decision["image_covers_page"]
223+
or decision["has_vector_drawings"]
224+
or decision["edge_density"] > edge_thresh
201225
):
202226
decision["should_ocr"] = True
203-
227+
204228
return decision

0 commit comments

Comments
 (0)