@@ -221,11 +221,31 @@ def cluster_stripes(boxes, vertical_gap: float = 12):
221221 Returns:
222222 List of disjoint horizontal stripes. Each stripe is a list of boxes.
223223 """
224+
def is_multi_column_layout(boxes, min_gap: float = 3):
    """Return True if the boxes split into more than one column.

    The boxes are scanned left to right (ordered by their left edge,
    index 0). A new column starts when a box's left edge lies more than
    ``min_gap`` to the right of the rightmost right edge (index 2) seen
    so far in the current column.

    Args:
        boxes: Iterable of indexable boxes; index 0 is read as the left
            edge and index 2 as the right edge.
        min_gap: Horizontal distance that must be exceeded for a box to
            start a new column. Defaults to 3.

    Returns:
        True if at least one such gap exists, otherwise False. Empty
        input yields False.
    """
    ordered = sorted(boxes, key=lambda b: b[0])
    if not ordered:
        # No boxes means no columns; avoid indexing an empty list.
        return False

    # Running maximum of the right edges in the current column. The first
    # gap found already proves there are two columns, so stop there.
    right_edge = ordered[0][2]
    for box in ordered[1:]:
        if box[0] - right_edge > min_gap:
            return True
        right_edge = max(right_edge, box[2])
    return False
238+
224239 # Sort top to bottom
225240 sorted_boxes = sorted (boxes , key = lambda b : b [1 ])
226241 stripes = []
227242 if not sorted_boxes :
228243 return stripes
244+
245+ # Early exit for clean multi-column layouts
246+ if is_multi_column_layout (sorted_boxes ):
247+ return [boxes ]
248+
229249 current_stripe = [sorted_boxes [0 ]]
230250
231251 for box in sorted_boxes [1 :]:
@@ -257,7 +277,7 @@ def cluster_columns_in_stripe(stripe: list):
257277
258278 for box in sorted_boxes [1 :]:
259279 prev_right = max ([b [2 ] for b in current_column ])
260- if box [0 ] - prev_right >= - 1 :
280+ if box [0 ] - prev_right > 1 :
261281 columns .append (sorted (current_column , key = lambda b : b [3 ]))
262282 current_column = [box ]
263283 else :
@@ -292,14 +312,15 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
292312 return ordered
293313
294314
295- def find_reading_order (boxes , vertical_gap : float = 12 ) -> list :
315+ def find_reading_order (boxes , vertical_gap : float = 36 ) -> list :
296316 """Given page layout information, return the boxes in reading order.
297317
298318 Args:
299319 boxes: List of classified bounding boxes with class info as defined
300320 by pymupdf_layout: (x0, y0, x1, y1, "class").
301321 vertical_gap: Minimum vertical gap to separate stripes. The default
302- value of 12 works well for most documents.
322+ value of 36 works well for most documents. It roughly
323+ corresponds to 2-3 text line heights.
303324
304325 Returns:
305326 List of boxes in reading order.
0 commit comments