Skip to content

Commit 65130d2

Browse files
authored
Merge pull request #126 from pymupdf/v0.0.14
Updates for v0.0.14
2 parents f626f2a + 5a11865 commit 65130d2

File tree

6 files changed

+87
-24
lines changed

6 files changed

+87
-24
lines changed

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.13"
3+
__version__ = "0.0.14"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ def sanitize_spans(line):
9898
spans = [] # all spans in TextPage here
9999
for bno, b in enumerate(blocks): # the numbered blocks
100100
for lno, line in enumerate(b["lines"]): # the numbered lines
101+
if abs(1-line["dir"][0]) > 1e-3: # only accept horizontal text
102+
continue
101103
for sno, s in enumerate(line["spans"]): # the numbered spans
102104
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
103105
mpoint = (sbbox.tl + sbbox.br) / 2 # middle point

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,11 @@ def can_extend(temp, bb, bboxlist, vert_bboxes):
152152
def join_rects_phase1(bboxes):
153153
"""Postprocess identified text blocks, phase 1.
154154
155-
Joins any rectangles that "touch" each other. This means that
156-
their intersection is valid (but may be empty).
155+
Joins any rectangles that "touch" each other.
156+
This means that their intersection is valid (but may be empty).
157+
To prefer vertical joins, we will ignore small gaps above and below a rectangle.
157158
"""
159+
delta=(0,-3,0,3) # allow this gap above and below
158160
prects = bboxes[:]
159161
new_rects = []
160162
while prects:
@@ -163,7 +165,7 @@ def join_rects_phase1(bboxes):
163165
while repeat:
164166
repeat = False
165167
for i in range(len(prects) - 1, 0, -1):
166-
if (prect0 & prects[i]).is_valid:
168+
if ((prect0+delta) & (prects[i]+delta)).is_valid:
167169
prect0 |= prects[i]
168170
del prects[i]
169171
repeat = True

pymupdf4llm/pymupdf4llm/helpers/progress.py

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
"""
2+
This script defines a text-based progress bar to allow watching the advancement
3+
of Markdown conversion of document pages.
4+
5+
Dependencies
6+
-------------
7+
None
8+
9+
Copyright and License
10+
----------------------
11+
Copyright 2024 Artifex Software, Inc.
12+
License GNU Affero GPL 3.0
13+
"""
14+
115
import sys
216
from typing import List, Any
317

@@ -15,11 +29,9 @@ def __init__(self, items: List[Any], progress_width: int = 40):
1529
self._increment = self._progress_width / self._len if self._len else 1
1630

1731
# Init progress bar
18-
sys.stdout.write("[%s] (0/%d)" %
19-
(" " * self._progress_width, self._len))
32+
sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len))
2033
sys.stdout.flush()
21-
sys.stdout.write(
22-
"\b" * (self._progress_width + len(str(self._len)) + 6))
34+
sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6))
2335

2436
def __iter__(self):
2537
return self
@@ -45,17 +57,29 @@ def __next__(self):
4557
# Update the numerical progress
4658
padded_index = str(self._current_index).rjust(self._len_digits)
4759
progress_info = f" ({padded_index}/{self._len})"
48-
sys.stdout.write(
49-
"\b" * (self._progress_width + len(progress_info) + 1))
60+
sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1))
5061
sys.stdout.write("[")
51-
sys.stdout.write("=" * int(self._current_index *
52-
self._progress_width / self._len))
53-
sys.stdout.write(" " * (self._progress_width -
54-
int(self._current_index * self._progress_width / self._len)))
62+
sys.stdout.write(
63+
"=" * int(self._current_index * self._progress_width / self._len)
64+
)
65+
sys.stdout.write(
66+
" "
67+
* (
68+
self._progress_width
69+
- int(self._current_index * self._progress_width / self._len)
70+
)
71+
)
5572
sys.stdout.write("]" + progress_info)
5673
sys.stdout.flush()
57-
sys.stdout.write("\b" * (self._progress_width - int(self._current_index * self._progress_width / self._len)
58-
+ len(progress_info) + 1))
74+
sys.stdout.write(
75+
"\b"
76+
* (
77+
self._progress_width
78+
- int(self._current_index * self._progress_width / self._len)
79+
+ len(progress_info)
80+
+ 1
81+
)
82+
)
5983

6084
return result
6185

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
1616
Text will be sorted in Western reading order. Any table will be included in
1717
the text in markdown format as well.
18-
18+
1919
Dependencies
2020
-------------
21-
PyMuPDF v1.24.2 or later
21+
PyMuPDF v1.24.3 or later
2222
2323
Copyright and License
2424
----------------------
@@ -247,6 +247,8 @@ def to_markdown(
247247
page_height: (float) assumption if page layout is variable.
248248
table_strategy: choose table detection strategy
249249
graphics_limit: (int) ignore page with too many vector graphics.
250+
ignore_code: (bool) suppress extra formatting for mono-space fonts
251+
extract_words: (bool) include "words"-like output in page chunks
250252
show_progress: (bool) print progress as each page is processed.
251253
252254
"""
@@ -403,6 +405,13 @@ def write_text(
403405
key=lambda j: (j[1].y1, j[1].x0),
404406
):
405407
out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n"
408+
if EXTRACT_WORDS: # determine raw line rects within this table
409+
line_rects.extend(
410+
[
411+
pymupdf.Rect(rl[0])
412+
for rl in get_raw_lines(textpage, clip=tab_rects[i])
413+
]
414+
)
406415
del tab_rects[i]
407416

408417
# ------------------------------------------------------------
@@ -548,7 +557,7 @@ def intersects_rects(rect, rect_list):
548557
return i
549558
return 0
550559

551-
def output_tables(tabs, text_rect, tab_rects):
560+
def output_tables(tabs, text_rect, tab_rects, line_rects, textpage):
552561
"""Output tables above a text rectangle."""
553562
this_md = "" # markdown string for table content
554563
if text_rect is not None: # select tables above the text block
@@ -557,6 +566,13 @@ def output_tables(tabs, text_rect, tab_rects):
557566
key=lambda j: (j[1].y1, j[1].x0),
558567
):
559568
this_md += tabs[i].to_markdown(clean=False)
569+
if EXTRACT_WORDS: # determine raw line rects within this table
570+
line_rects.extend(
571+
[
572+
pymupdf.Rect(rl[0])
573+
for rl in get_raw_lines(textpage, clip=tab_rects[i])
574+
]
575+
)
560576
del tab_rects[i] # do not touch this table twice
561577

562578
else: # output all remaining table
@@ -565,6 +581,13 @@ def output_tables(tabs, text_rect, tab_rects):
565581
key=lambda j: (j[1].y1, j[1].x0),
566582
):
567583
this_md += tabs[i].to_markdown(clean=False)
584+
if EXTRACT_WORDS: # determine raw line rects within this table
585+
line_rects.extend(
586+
[
587+
pymupdf.Rect(rl[0])
588+
for rl in get_raw_lines(textpage, clip=tab_rects[i])
589+
]
590+
)
568591
del tab_rects[i] # do not touch this table twice
569592
return this_md
570593

@@ -748,7 +771,7 @@ def get_page_output(doc, pno, margins, textflags):
748771
"""
749772
for text_rect in text_rects:
750773
# output tables above this block of text
751-
md_string += output_tables(tabs, text_rect, tab_rects)
774+
md_string += output_tables(tabs, text_rect, tab_rects, line_rects, textpage)
752775
md_string += output_images(
753776
page, textpage, text_rect, vg_clusters, line_rects
754777
)
@@ -768,24 +791,36 @@ def get_page_output(doc, pno, margins, textflags):
768791

769792
md_string = md_string.replace(" ,", ",").replace("-\n", "")
770793
# write any remaining tables and images
771-
md_string += output_tables(tabs, None, tab_rects)
794+
md_string += output_tables(tabs, None, tab_rects, line_rects, textpage)
772795
md_string += output_images(page, textpage, None, vg_clusters, line_rects)
773796
md_string += "\n-----\n\n"
774797
while md_string.startswith("\n"):
775798
md_string = md_string[1:]
776799
md_string = md_string.replace(chr(0), chr(0xFFFD))
800+
777801
if EXTRACT_WORDS is True:
802+
# output words in sequence compliant with Markdown text
778803
rawwords = textpage.extractWORDS()
779804
words = []
780805
for lrect in line_rects:
781806
lwords = []
782807
for w in rawwords:
783808
wrect = pymupdf.Rect(w[:4])
784809
if wrect in lrect:
785-
wrect.y0 = lrect.y0
786-
wrect.y1 = lrect.y1
810+
wrect.y0 = lrect.y0 # set upper coord to line
811+
wrect.y1 = lrect.y1 # set lower coord to line
787812
lwords.append(list(wrect) + list(w[4:]))
813+
# append sorted words of this line
788814
words.extend(sorted(lwords, key=lambda w: w[0]))
815+
816+
# remove word duplicates without spoiling the sequence
817+
# duplicates may occur for multiple reasons
818+
nwords = [] # words w/o duplicates
819+
for w in words:
820+
if w not in nwords:
821+
nwords.append(w)
822+
words = nwords
823+
789824
else:
790825
words = []
791826
return md_string, images, tables, graphics, words

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.13",
20+
version="0.0.14",
2121
author="Artifex",
2222
author_email="support@artifex.com",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)