pymupdf · JorjMcKie · Sep 23, 2024 · Sep 22, 2024
diff --git a/src/__init__.py b/src/__init__.py
@@ -13319,14 +13319,16 @@ def width(self):
 TEXT_OUTPUT_XML = 3
 TEXT_OUTPUT_XHTML = 4
 
-TEXT_PRESERVE_LIGATURES = 1
-TEXT_PRESERVE_WHITESPACE = 2
-TEXT_PRESERVE_IMAGES = 4
-TEXT_INHIBIT_SPACES = 8
-TEXT_DEHYPHENATE = 16
-TEXT_PRESERVE_SPANS = 32
-TEXT_MEDIABOX_CLIP = 64
-TEXT_CID_FOR_UNKNOWN_UNICODE = 128
+TEXT_PRESERVE_LIGATURES = mupdf.FZ_STEXT_PRESERVE_LIGATURES
+TEXT_PRESERVE_WHITESPACE = mupdf.FZ_STEXT_PRESERVE_WHITESPACE
+TEXT_PRESERVE_IMAGES = mupdf.FZ_STEXT_PRESERVE_IMAGES
+TEXT_INHIBIT_SPACES = mupdf.FZ_STEXT_INHIBIT_SPACES
+TEXT_DEHYPHENATE = mupdf.FZ_STEXT_DEHYPHENATE
+TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS
+TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP
+TEXT_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE
+TEXT_COLLECT_STRUCTURE = 256  # literal for mupdf.FZ_STEXT_COLLECT_STRUCTURE
+TEXT_ACCURATE_BBOXES = 512  # literal for mupdf.FZ_STEXT_ACCURATE_BBOXES
 
 TEXTFLAGS_WORDS = (0
         | TEXT_PRESERVE_LIGATURES

diff --git a/src/utils.py b/src/utils.py
@@ -507,34 +507,195 @@ def get_text_words(
     textpage: pymupdf.TextPage = None,
     sort: bool = False,
     delimiters=None,
+    tolerance=3,
 ) -> list:
     """Return the text words as a list with the bbox for each word.
 
     Args:
+        page: pymupdf.Page
+        clip: (rect-like) area on page to consider
         flags: (int) control the amount of data parsed into the textpage.
-        delimiters: (str,list) characters to use as word delimiters
+        textpage: (pymupdf.TextPage) either passed-in or None.
+        sort: (bool) sort the words in reading sequence.
+        delimiters: (str,list) characters to use as word delimiters.
+        tolerance: (float) consider words to be part of the same line if
+            top or bottom coordinate are not larger than this. Relevant
+            only if sort=True.
 
     Returns:
         Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
     """
+
+    def sort_words(words):
+        """Return words in reading order: line by line, each left to right.
+
+        A word joins the current line if its top or its bottom is within
+        'tolerance' of the line rectangle's top or bottom, respectively.
+        """
+        words.sort(key=lambda item: (item[3], item[0]))  # by bottom, then left
+        ordered = []  # resulting word list
+        row = [words[0]]  # words of the line being collected
+        row_rect = pymupdf.Rect(words[0][:4])  # joined bbox of that line
+        for item in words[1:]:
+            rect = pymupdf.Rect(item[:4])
+            top_near = abs(rect.y0 - row_rect.y0) <= tolerance
+            bottom_near = abs(rect.y1 - row_rect.y1) <= tolerance
+            if not (top_near or bottom_near):  # word opens a new line
+                ordered.extend(sorted(row, key=lambda item: item[0]))
+                row = [item]
+                row_rect = rect
+            else:  # same line: collect word and enlarge the line bbox
+                row.append(item)
+                row_rect |= rect
+        # flush the last line, again sorted left to right
+        ordered.extend(sorted(row, key=lambda item: item[0]))
+        return ordered
+
     pymupdf.CheckParent(page)
     if flags is None:
-        flags = pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_MEDIABOX_CLIP
+        flags = pymupdf.TEXTFLAGS_WORDS
     tp = textpage
     if tp is None:
         tp = page.get_textpage(clip=clip, flags=flags)
     elif getattr(tp, "parent") != page:
         raise ValueError("not a textpage of this page")
 
     words = tp.extractWORDS(delimiters)
+
+    # if textpage was given, we subselect the words in clip
+    if textpage is not None and clip is not None:
+        # sub-select words contained in clip
+        clip = pymupdf.Rect(clip)
+        words = [
+            w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
+        ]
+
     if textpage is None:
         del tp
-    if sort is True:
-        words.sort(key=lambda w: (w[3], w[0]))
+    if words and sort is True:
+        # advanced sort if any words found
+        words = sort_words(words)
 
     return words
 
 
+def get_sorted_text(
+    page: pymupdf.Page,
+    clip: rect_like = None,
+    flags: OptInt = None,
+    textpage: pymupdf.TextPage = None,
+    tolerance=3,
+) -> str:
+    """Extract plain text avoiding unacceptable line breaks.
+
+    Text contained in clip will be sorted in reading sequence. Some effort
+    is also spent to simulate layout vertically and horizontally.
+
+    Args:
+        page: pymupdf.Page
+        clip: (rect-like) only consider text inside
+        flags: (int) text extraction flags
+        textpage: pymupdf.TextPage
+        tolerance: (float) consider words to be on the same line if their top
+            or bottom coordinates do not differ more than this.
+
+    Notes:
+        If a TextPage is provided, all text is checked for being inside clip
+        with at least 50% of its bbox.
+        This allows using some "global" TextPage in conjunction with sub-
+        selecting words in parts of the defined TextPage rectangle.
+
+    Returns:
+        Text in reading sequence, empty if there is none. Left indentation of
+        each line, inter-line and inter-word distances strive to reflect the layout.
+    """
+
+    def line_text(clip, line):
+        """Create the string of one text line.
+
+        We are trying to simulate some horizontal layout here, too.
+
+        Args:
+            clip: (pymupdf.Rect) the area from which all text is being read.
+            line: (list) word tuples (rect, text) contained in the line
+        Returns:
+            Text in this line. Generated from words in 'line'. Distance from
+            predecessor is translated to multiple spaces, thus simulating
+            text indentations and large horizontal distances.
+        """
+        line.sort(key=lambda w: w[0].x0)
+        ltext = ""  # text in the line
+        x1 = clip.x0  # end coordinate of ltext
+        for r, t in line:
+            # at least one space between words, none at the left border
+            min_spaces = 0 if x1 == clip.x0 else 1
+            # Convert distance to previous word to multiple spaces, using
+            # this word's average character width as the unit. A zero-width
+            # bbox would divide by zero, so it only gets the minimum.
+            spaces = 0
+            if r.width > 0:
+                spaces = int(round((r.x0 - x1) / r.width * len(t)))
+            ltext += " " * max(spaces, min_spaces) + t  # append word string
+            x1 = r.x1  # update new end position
+        return ltext
+
+    # Extract words in correct sequence first.
+    words = [
+        (pymupdf.Rect(w[:4]), w[4])
+        for w in get_text_words(
+            page,
+            clip=clip,
+            flags=flags,
+            textpage=textpage,
+            sort=True,
+            tolerance=tolerance,
+        )
+    ]
+
+    if not words:  # no text present
+        return ""
+    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
+    for wr, _ in words:
+        totalbox |= wr
+
+    lines = []  # list of reconstituted lines
+    line = [words[0]]  # current line
+    lrect = words[0][0]  # the line's rectangle
+
+    # walk through the words
+    for wr, text in words[1:]:  # start with second word
+        # if this word matches top or bottom of the line, append it
+        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
+            line.append((wr, text))
+            lrect |= wr
+        else:
+            # output current line and re-initialize
+            ltext = line_text(totalbox, line)
+            lines.append((lrect, ltext))
+            line = [(wr, text)]
+            lrect = wr
+
+    # also append unfinished last line
+    ltext = line_text(totalbox, line)
+    lines.append((lrect, ltext))
+
+    # sort all lines vertically
+    lines.sort(key=lambda ln: (ln[0].y1))
+
+    text = lines[0][1]  # text of first line
+    y1 = lines[0][0].y1  # its bottom coordinate
+    for lrect, ltext in lines[1:]:
+        # Gap to previous line in units of this line's height. Overlapping
+        # lines (negative gap) or a zero-height bbox still get one break.
+        gap = (lrect.y0 - y1) / lrect.height if lrect.height > 0 else 0
+        distance = max(0, min(int(round(gap)), 5))
+        text += "\n" * (distance + 1) + ltext
+        y1 = lrect.y1
+
+    # return text in clip
+    return text
+
+
 def get_textbox(
     page: pymupdf.Page,
     rect: rect_like,
@@ -731,14 +892,15 @@ def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
 
 
 def get_text(
-        page: pymupdf.Page,
-        option: str = "text",
-        clip: rect_like = None,
-        flags: OptInt = None,
-        textpage: pymupdf.TextPage = None,
-        sort: bool = False,
-        delimiters=None,
-        ):
+    page: pymupdf.Page,
+    option: str = "text",
+    clip: rect_like = None,
+    flags: OptInt = None,
+    textpage: pymupdf.TextPage = None,
+    sort: bool = False,
+    delimiters=None,
+    tolerance=3,
+):
     """Extract text from a page or an annotation.
 
     This is a unifying wrapper for various methods of the pymupdf.TextPage class.
@@ -787,6 +949,16 @@ def get_text(
         return get_text_blocks(
             page, clip=clip, flags=flags, textpage=textpage, sort=sort
         )
+
+    if option == "text" and sort is True:
+        return get_sorted_text(
+            page,
+            clip=clip,
+            flags=flags,
+            textpage=textpage,
+            tolerance=tolerance,
+        )
+
     pymupdf.CheckParent(page)
     cb = None
     if option in ("html", "xml", "xhtml"):  # no clipping for MuPDF functions

diff --git a/tests/resources/test-linebreaks.pdf b/tests/resources/test-linebreaks.pdf
diff --git a/tests/test_linebreaks.py b/tests/test_linebreaks.py
@@ -0,0 +1,16 @@
+import pymupdf
+
+import os.path
+
+
+def test_linebreaks():
+    """Sorted text extraction must need less than half the lines of unsorted."""
+    pdf_path = os.path.abspath(f"{__file__}/../../tests/resources/test-linebreaks.pdf")
+    doc = pymupdf.open(pdf_path)
+    page = doc[0]
+    textpage = page.get_textpage(flags=pymupdf.TEXTFLAGS_WORDS)
+    n_words = len(page.get_text("words", textpage=textpage))
+    n_plain_lines = len(page.get_text(textpage=textpage).splitlines())
+    n_sorted_lines = len(page.get_text(sort=True, textpage=textpage).splitlines())
+    assert n_words == n_plain_lines  # unsorted output: one line per word
+    assert n_sorted_lines < n_plain_lines / 2