Skip to content

Commit 327e61c

Browse files
authored
Merge pull request #314 from pymupdf/v0.1.9
Version 0.1.9
2 parents 70c90a4 + 61696fe commit 327e61c

File tree

6 files changed

+35
-9
lines changed

6 files changed

+35
-9
lines changed

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.1.8"
9+
version = "0.1.9"
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/helpers/document_layout.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
import pymupdf
99
import tabulate
1010
from pymupdf4llm.helpers.get_text_lines import get_raw_lines
11-
from pymupdf4llm.helpers import utils, check_ocr
11+
from pymupdf4llm.helpers import utils
1212

1313
try:
1414
import cv2
15+
from pymupdf4llm.helpers import check_ocr
1516
except ImportError:
1617
cv2 = None
1718

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,12 @@ def sanitize_spans(line):
9393
if s0["bbox"].x1 + delta < s1["bbox"].x0 or (
9494
s0["flags"],
9595
s0["char_flags"] & ~2,
96-
s0["size"],
97-
) != (s1["flags"], s1["char_flags"] & ~2, s1["size"]):
96+
# s0["size"],
97+
) != (
98+
s1["flags"],
99+
s1["char_flags"] & ~2,
100+
# s1["size"],
101+
):
98102
continue # no joining
99103
# We need to join bbox and text of two consecutive spans
100104
# On occasion, spans may also be duplicated.

pymupdf4llm/pymupdf4llm/helpers/utils.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,11 +221,31 @@ def cluster_stripes(boxes, vertical_gap: float = 12):
221221
Returns:
222222
List of disjoint horizontal stripes. Each stripe is a list of boxes.
223223
"""
224+
225+
def is_multi_column_layout(boxes):
226+
sorted_boxes = sorted(boxes, key=lambda b: b[0])
227+
columns = []
228+
current_column = [sorted_boxes[0]]
229+
for box in sorted_boxes[1:]:
230+
prev_right = max([b[2] for b in current_column])
231+
if box[0] - prev_right > 3:
232+
columns.append(current_column)
233+
current_column = [box]
234+
else:
235+
current_column.append(box)
236+
columns.append(current_column)
237+
return len(columns) > 1
238+
224239
# Sort top to bottom
225240
sorted_boxes = sorted(boxes, key=lambda b: b[1])
226241
stripes = []
227242
if not sorted_boxes:
228243
return stripes
244+
245+
# Early exit for clean multi-column layouts
246+
if is_multi_column_layout(sorted_boxes):
247+
return [boxes]
248+
229249
current_stripe = [sorted_boxes[0]]
230250

231251
for box in sorted_boxes[1:]:
@@ -257,7 +277,7 @@ def cluster_columns_in_stripe(stripe: list):
257277

258278
for box in sorted_boxes[1:]:
259279
prev_right = max([b[2] for b in current_column])
260-
if box[0] - prev_right >= -1:
280+
if box[0] - prev_right > 1:
261281
columns.append(sorted(current_column, key=lambda b: b[3]))
262282
current_column = [box]
263283
else:
@@ -292,14 +312,15 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
292312
return ordered
293313

294314

295-
def find_reading_order(boxes, vertical_gap: float = 12) -> list:
315+
def find_reading_order(boxes, vertical_gap: float = 36) -> list:
296316
"""Given page layout information, return the boxes in reading order.
297317
298318
Args:
299319
boxes: List of classified bounding boxes with class info as defined
300320
by pymupdf_layout: (x0, y0, x1, y1, "class").
301321
vertical_gap: Minimum vertical gap to separate stripes. The default
302-
value of 12 works well for most documents.
322+
value of 36 works well for most documents. It roughly
323+
corresponds to 2-3 text line heights.
303324
304325
Returns:
305326
List of boxes in reading order.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Generated file - do not edit.
22
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
3-
VERSION = '0.1.8'
3+
VERSION = '0.1.9'

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"Topic :: Utilities",
1515
]
1616

17-
version = "0.1.8"
17+
version = "0.1.9"
1818
requires = ["pymupdf>=1.26.6", "tabulate"]
1919

2020
text = requires[0].split("=")[1]

0 commit comments

Comments
 (0)