From 7790aff7113c704c34ec5c902f3b88191c06745d Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Sun, 22 Sep 2024 18:23:51 -0400 Subject: [PATCH] Prevent line breaks, deliver reading order. Refactor plain text and "words" extraction with sort=True: We previously simply sorted the output by ascending bottom and left coordinate. This change collects words (and respectively text) that are approximately on the same line. Apart from extremely malformed pages, words (and respectively text) are returned in "natural" reading sequence. This change also suppresses line breaks generated by MuPDF just because of large horizontal distances (as often happens, e.g., between table cell contents of the same row). --- src/__init__.py | 18 +-- src/utils.py | 169 +++++++++++++++++++++++++++- tests/resources/test-linebreaks.pdf | Bin 0 -> 16052 bytes tests/test_linebreaks.py | 15 +++ 4 files changed, 190 insertions(+), 12 deletions(-) create mode 100644 tests/resources/test-linebreaks.pdf create mode 100644 tests/test_linebreaks.py diff --git a/src/__init__.py b/src/__init__.py index 2ee9bd6ff..4502f91e3 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -13319,14 +13319,16 @@ def width(self): TEXT_OUTPUT_XML = 3 TEXT_OUTPUT_XHTML = 4 -TEXT_PRESERVE_LIGATURES = 1 -TEXT_PRESERVE_WHITESPACE = 2 -TEXT_PRESERVE_IMAGES = 4 -TEXT_INHIBIT_SPACES = 8 -TEXT_DEHYPHENATE = 16 -TEXT_PRESERVE_SPANS = 32 -TEXT_MEDIABOX_CLIP = 64 -TEXT_CID_FOR_UNKNOWN_UNICODE = 128 +TEXT_PRESERVE_LIGATURES = mupdf.FZ_STEXT_PRESERVE_LIGATURES +TEXT_PRESERVE_WHITESPACE = mupdf.FZ_STEXT_PRESERVE_WHITESPACE +TEXT_PRESERVE_IMAGES = mupdf.FZ_STEXT_PRESERVE_IMAGES +TEXT_INHIBIT_SPACES = mupdf.FZ_STEXT_INHIBIT_SPACES +TEXT_DEHYPHENATE = mupdf.FZ_STEXT_DEHYPHENATE +TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS +TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP +TEXT_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE +TEXT_COLLECT_STRUCTURE = 256 # mupdf.FZ_STEXT_COLLECT_STRUCTURE 
+TEXT_ACCURATE_BBOXES = 512 # mupdf.FZ_STEXT_ACCURATE_BBOXES TEXTFLAGS_WORDS = (0 | TEXT_PRESERVE_LIGATURES diff --git a/src/utils.py b/src/utils.py index 670403d92..490eeb658 100644 --- a/src/utils.py +++ b/src/utils.py @@ -507,19 +507,53 @@ def get_text_words( textpage: pymupdf.TextPage = None, sort: bool = False, delimiters=None, + tolerance=3, ) -> list: """Return the text words as a list with the bbox for each word. Args: + page: pymupdf.Page + clip: (rect-like) area on page to consider flags: (int) control the amount of data parsed into the textpage. - delimiters: (str,list) characters to use as word delimiters + textpage: (pymupdf.TextPage) either passed-in or None. + sort: (bool) sort the words in reading sequence. + delimiters: (str,list) characters to use as word delimiters. + tolerance: (float) consider words to be part of the same line if + top or bottom coordinate are not larger than this. Relevant + only if sort=True. Returns: Word tuples (x0, y0, x1, y1, "word", bno, lno, wno). 
""" + + def sort_words(words): + """Sort words line-wise, forgiving small deviations.""" + words.sort(key=lambda w: (w[3], w[0])) + nwords = [] # final word list + line = [words[0]] # collects words roughly in same line + lrect = pymupdf.Rect(words[0][:4]) # start the line rectangle + for w in words[1:]: + wrect = pymupdf.Rect(w[:4]) + if ( + abs(wrect.y0 - lrect.y0) <= tolerance + or abs(wrect.y1 - lrect.y1) <= tolerance + ): + line.append(w) + lrect |= wrect + else: + line.sort(key=lambda w: w[0]) # sort words in line l-t-r + nwords.extend(line) # append to final words list + line = [w] # start next line + lrect = wrect # start next line rect + + line.sort(key=lambda w: w[0]) # sort words in line l-t-r + nwords.extend(line) # append to final words list + + return nwords + pymupdf.CheckParent(page) if flags is None: - flags = pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_MEDIABOX_CLIP + flags = pymupdf.TEXTFLAGS_WORDS tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=flags) @@ -527,14 +561,141 @@ def get_text_words( raise ValueError("not a textpage of this page") words = tp.extractWORDS(delimiters) + + # if textpage was given, we subselect the words in clip + if textpage is not None and clip is not None: + # sub-select words contained in clip + clip = pymupdf.Rect(clip) + words = [ + w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4])) + ] + if textpage is None: del tp - if sort is True: - words.sort(key=lambda w: (w[3], w[0])) + if words and sort is True: + # advanced sort if any words found + words = sort_words(words) return words +def get_sorted_text( + page: pymupdf.Page, + clip: rect_like = None, + flags: OptInt = None, + textpage: pymupdf.TextPage = None, + tolerance=3, +) -> str: + """Extract plain text avoiding unacceptable line breaks. + + Text contained in clip will be sorted in reading sequence. Some effort + is also spent to simulate layout vertically and horizontally. 
+ + Args: + page: pymupdf.Page + clip: (rect-like) only consider text inside + flags: (int) text extraction flags + textpage: pymupdf.TextPage + tolerance: (float) consider words to be on the same line if their top + or bottom coordinates do not differ more than this. + + Notes: + If a TextPage is provided, all text is checked for being inside clip + with at least 50% of its bbox. + This allows to use some "global" TextPage in conjunction with sub- + selecting words in parts of the defined TextPage rectangle. + + Returns: + A text string in reading sequence. Left indentation of each line, + inter-line and inter-word distances strive to reflect the layout. + """ + + def line_text(clip, line): + """Create the string of one text line. + + We are trying to simulate some horizontal layout here, too. + + Args: + clip: (pymupdf.Rect) the area from which all text is being read. + line: (list) words contained in one text line + Returns: + Text in this line. Generated from words in 'line'. Inter-word + distances are translated to multiple spaces, thus simulating + text indentations and large horizontal distances. + """ + line.sort(key=lambda w: w[0].x0) + ltext = "" # text in the line + x1 = 0 # end coordinate of ltext + lrect = pymupdf.EMPTY_RECT() # bbox of this line + for r, t in line: + lrect |= r # update line bbox + # convert distance to previous word to multiple spaces + dist = max( + int(round((r.x0 - clip.x0 - x1) / r.width * len(t))), + 0 if x1 == 0 else 1, + ) # number of space characters + + ltext += " " * dist + t # append word string + x1 = r.x1 - clip.x0 # update new end position + return ltext + + # Extract words in correct sequence first. 
+ words = [ + (pymupdf.Rect(w[:4]), w[4]) + for w in get_text_words( + page, + clip=clip, + flags=flags, + textpage=textpage, + sort=True, + tolerance=tolerance, + ) + ] + + if not words: # no text present + return "" + totalbox = pymupdf.EMPTY_RECT() # area covering all text + for w in words: + totalbox |= w[0] + + lines = [] # list of reconstituted lines + line = [words[0]] # current line + lrect = words[0][0] # the line's rectangle + + # walk through the words + for wr, text in words[1:]: # start with second word + w0r, _ = line[-1] # read previous word in current line + + # if this word matches top or bottom of the line, append it + if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance: + line.append((wr, text)) + lrect |= wr + else: + # output current line and re-initialize + ltext = line_text(totalbox, line) + lines.append((lrect, ltext)) + line = [(wr, text)] + lrect = wr + + # also append unfinished last line + ltext = line_text(totalbox, line) + lines.append((lrect, ltext)) + + # sort all lines vertically + lines.sort(key=lambda l: (l[0].y1)) + + text = lines[0][1] # text of first line + y1 = lines[0][0].y1 # its bottom coordinate + for lrect, ltext in lines[1:]: + distance = min(int(round((lrect.y0 - y1) / lrect.height)), 5) + breaks = "\n" * (distance + 1) + text += breaks + ltext + y1 = lrect.y1 + + # return text in clip + return text + + def get_textbox( page: pymupdf.Page, rect: rect_like, diff --git a/tests/resources/test-linebreaks.pdf b/tests/resources/test-linebreaks.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8696f885335b6d4e5dcbe9f57eeb29eabd2b3564 GIT binary patch literal 16052 zcmcJ$1yCGO(>9ugK(L?*?kw)^u8X@%aCdhI1P!n(?(VM1;tB5V?h@Pyc6sId?sw~d z|N5)$ty|SIea@La-E*GluAP~khf-NWijkFx3yJb~)9+>^BsNkOQb!XTBmn_td2CYdbe{7iK9tV>fdNb5ln%b0HxlS2q`PV|yg8%%lWXxlmS=uvb}&@YHMD zgik%9%9ceNm20#sq$`Xs{?S_K1gQA3a^aWf9ZClnu|X*7B0-5LvW%&XLFQ6eVbjEh 
z)f*L^KDsN{t%^C@mrn?uCx3&3@s>W6nv~eGeyW)8;OgMr|XU3rg0TbI|mn4J`}^0XOo8KX(M%bjzTU$rVi0+2DR|c2&Yv z=|JA??WUMSHe^X;6o1f3toWK2u+A(en~b&9hif(&arU*Y1p8GG&wK{Me1?0g)B48U zl;#JP>U~9>GSG+8SrKar8YW1B{Mdg=I1DA+btrwMKB5U&h#p<+8p5wqbYw}##2m%0 zhaA{UIUk2EH3U zrWR!rlk&FyG#+*?6ftGkcqC2VXQ}sX02e5qZMgxD?B#Q(0qg!`Gk?35p91&~S zlI*+ZKWSWYChE|u_(1%KFpl(gmVZ`clPTq1a|FJ^%Z@|#2d;J=XOM0UE(zLlD94Wi z73~V)?-pQduN2JC0$vD={xU$M9I{^euk#KAt+?Ywmn1u3TLZ8)AJB^;2-um^pHB8{zSq*}1r;E)ik1z9nbC=|E7w48c2V|s94QJ3^>G@2!oVO(t zPfoE@hR%?rVJb z*b%J#ICmQ6V`*rJRzu;guQj5K!|q}r8#J2cr<}$RV7Dc%hB`I>ry1oZykBdJsV)hh zf)xQ3Rdp04)Ks$iul8hoj-|=qi*{W_2icy7=wfng8o)|E=gf%~E08j^?a#>B-u(Rx zzIexWO{hw94<7p_bmzTmH%eVcn*dh|nxeN12~yp*UaO=Ing1BiMPiGuNxI?f4(kJO zbU2$ZMaiP&uH|w;lk>W~^FuhhY5!xQd2g-I>u5i7Ha)AosH4#Kk(NV5EV<9}J)(=|dSx6tm*_Q@ zl*jjqIOMPAq7m5)s;gG-5|XBC6qD)4)aXIHhsIYCMf1jLNcdDOAK*=)?PGxZ{oJXO zk!Ww9FCD~MKRUxtAj25+KA*dHFEG5BD^J?y>RI)mtm}ntwwX)B&3dKE_!oQS2H_u2 z?QoduiK+}r+|;P!M_HnlzSepBC{OIo+ndBSR@$yoSsE&3d$o@zW*%i;Kd?>{8lb9= zjDWshWvgLTErv!V!UsNQB95t*Yf+cl_}+Qjq%zI~Pj>e-yC(P*j`j~qX2IKdeO~UB zr<7Vsa!6U9-TsQ^z0|t6lv?%h;&A9@zt~ccan^i9X>cXG^?&HmiliC+HKF$ieR;N&T>-!9<;S}I4*nI;CSqwH|oA1i@)yhmI7%<1!3Z)k<}m*2CC?PG(~>y!Q?-k~vB z^34;B22ILEWK9W7J~wk9wDuNP>APoCzhr1j1%Gv3ddu2U_F%8qoPVcSMEJaoHe-bu z+_%rE#i*a;+uE-J(8&59)QD;}c3)^^y|O@aDpSHm81AU7$^eo9aSpG1EG2>ZCRBGS z6;J&}!~ABh&Bzcz@1PBGvZp*R1x$-CO8IN1mq0HD{0ev~a}DPifuLr@#LRkSdUr)J z%MUf#`Tf*b0%MQRV$`Nx=SDO4e#;{k*Ezdb^6%8xr&$KUp3--;$iV?GvDi(=aLglw zVQ8B|*2?gyuS6$!*#{@alOcuOS{HJrGflxg9MvCF>waO%xjopLT4*s=)L7ii=C5@f z#hm-4E7mPC94KD5phq~Hv_vLEH58lEC6yf>R-d+&WLo-MIwgCEdbsstuKZlNe#n?~ zphz%Ocl=Itc30<(?#TVecEs`M>=!fMXg08=B$Hm+|kJ=aXurdHMmE){bUk3FiY4=|L#TD zu*d!2)EfC>G4pe7V2ld;{b+e?zhqT>)e1~21yEjLeUmSfd;iGsM6w}7f7EhYKX%Cj}mg<-QSQNyyqGs_<^U|zyI?xJ9Wr0UUxbWLvQA!m~Y zWbO#697Iea##p6nwux?CPQ)}O!;wUeT0XvshK0p$ww{LBSS1*xDCA1+4+|O- zOmov6AtiJjh0_}5I1o(Y9w9p& zr{ivTj+|UR=pYFWuvrP8Ek_)>{9&;lgU1=GCz)MIW+4Rt?ABpRGr(=m$*%Yl8x2aO+VQXCK*JS0VXR930&enTc z_;$l09ZEMnBYZk6904#=UTjfkIMrY$L{mspt6b 
z9#0op8u2FR>-dBvzuY%z8M;1Q92O2uCDpW_FGcMtD%m98bda`a{yat2wE9T>b*a#1 zZao3L^497*^;czx&dHe1(z9FHqc!);&y<>jJdig?U)3JJfOU35S3SRshC#c@#e5N7 zLDDZdj{~PpXf4mz03%9DE_hbkwfDrkcDJmXB#svv_;s6CK#7sCANO&iH)Im&A%q zjdjx}+?(N`y#~{cM@u(lg^5oPooBZjq@#Lyh1b7{A{+c7JCwntd{w~-m=^0X6Q=z_GH(sou&J6{I zwU_F=qOg6B<8%%hux;vqmYzW08$(qCgO87ALQci$$t|nQKQDY@+M&aQbk*7&%icED z;gZ*An8*tAW4GgT<)+xl*mHAjm%|DEQ?>j{|qu#CQor7ZPYinAPV6?jw6RgSQ?zHbz`NO}(1f=~9NqX^tLpi&`pm$i_4YjPrj?69EY=|P$L?2qN^2c! zspYR`wo$7lhTydN&4HQtgXa{E8EK)}ESJN6+4Sd%v>VS~e`jl7Wc8N=x97Buo1jfa zO}egPQc$>Cddoc~%P{7BF6Ny4z`F$*ZYx(S% z!tc~F&;tF2+d3E7A!~1M(C8HH(e*-(fdmfYlzFFNz4!BQGBy!ymF7tRRis|vxP$de z|EjNtRx%PbbA^7LAKI}ar{%=uTm8welUK6{%3x2r5xFIsbrLfy+UdNyfDcC*s<8RH zn!lELob9bS2JIpv(b1ta@YmMow@I)Qv~eBWXXEN9=-T`ps@JG!nvMN7R;j(!##aoh#7>ndAH>l&JgeP^NtTR_5MNKQBc zJ$E!ed>A?T=@-Y;>jzX1oocy{B`KFnhd$6gBCD41AQagg&}oQkm9^*<9Wf=txkA}cyyZ%MbK6O7iiO(MfznFNyRPT2_@;EwN9v*2w95U%FMAdQe|-4{ z_$j7X5(@5Cc-ZHs%n$L{0aAL(qznvWy*4KbeTCo37^EEAoGo9&o0)0OnudE{b0vINQ$Ff>2zw0Cbm;$ zU&F_&<@p<;n;D}cqnx%z2e|y_m}y6w(+%nkS>ls^I;&uVm?Mu_i7xJ+6w9-SE*m91GF#H80y1x7DWB%V*A6> z=3$>YM&s;H^e(zxh#k31_3F4?!03WO$ZS?BY>cw`gGej3yiAO7v(?sz28xj|1@Yvw z7g6f{^r|shg}ONRXcnvF9+pckY&s?K?7rO8b-6K2T{w!OmYA)0L~RUzNKEO19KA3n zwl&SjXb06A|0Y%x46sVZB!n@k`WILJJy!b!Na@TLyVrh@R?PX5x2&+@b~iHYP1n0^ zf`5uA_AU4@;zQp1yjB$JakZ(ytHSL)O1zKop&!p@AIN`zRI?~~*?HAKTx|~Zlm~y*>ItO^DCjA9>1jnDL;hg7&t-luln?Jc zFVzgEhnsws1NpwK8tL!`s}_xk#DC978e}3#~ZmPP9IZ^a|`d4 z&6rMOoi5=RJ~c>-BT+3omDz<&kB-zyoj^(O8DUTX>8h2{B#qJPx||KUiGxZCT%wrb zJwiRRMVk@6(DvJ&meA45Ns{g*uZ%Jrwkeg;81sR-kr?5NE#?v@Hn5|{ejFSneq6fx zT`k&R7;&%Ra>+evn0{FTXhmVwDaxr$*#Zw{-{)zl;e>8azo%rHdd?L z6t^Lsk8Hz~8TTJ1#^?&7`A~~FV9$BI< z=J=f5GdK2iZcbj}gWX7tAe@zI*Cj5R4YGi4M#7C-eYWD-^q>#l?Dj@az^b_X@%xKM zwP`WC^5x~31T)j+tgoXDRX4dm=eiRUtBanrOZ&SM2I%IgsuX06I3-83dy2=lgz2L? 
z%Gil|LC;)v^l~uc>U_0z3-Uh*_CxKNMG1Pgp!m5MCXV5-x_EBUMGeyo-ExKD^1nSw;yl}fx8~J7aVN<4 zJgwoKFfixFYvi&Ole8@D@W>5_Z@qFF{zg=^(4Sm{^9VFRX83E#!llvR7dpWa;j6mDf zBWKr&W%wr|Aa?7+pf8!x=%D6bMp%X0kk$ooU$t9@Qnh zN83G*P`$#Qx#HxM+OM`m7_*0x5ZbKIEwnAy>b2y?yr^FUQoDX^ICEn;FYFbkhBfly zC@f{Ft6Tn}mAyG`uRY9nG@73B?)GFhY;cq=m@b^Jm+RJ?^T@Sdjtrjt{5*eZ7iq`% zg8=K}(j#HoVDn7AnB}~E;M29`+RJ4%hk4m_G9)Q8Dnv_Lcfn~_E7tJ-yq6QM2rjIU zFxmUGYtRSo&H;%QxW0p7Z|sg?@S8aihMuu@0tSnbWvWP5w=j>loYqxcZLd@(Al7Dx za5j;bA#IRWp)j(H0j0Q}J$Z;}f5xm))_hUr2sT|pteB*ATv4~L))~2&CT5wtwYbzm zb^W(uFP?`vf?P_JUCzVvh$O4ErGesylZ=~6Ed4C%`zt2Lv_692{bj93S&bdJ&53`SPxttayN}{CtRtTp+5L2Rb0wh{bJMA zmdC-F>V@M7Bx7_^#Ze-=WVa50XOt;{vu-3!!sY>cP4@b#Vaw$jlTz3 zT1&OyBjK%ue=cTbpmmm4=sX#eLjJJq`VSe7A!6Y6;HeGh4@n2!rzx6Y-P7DxK3N@r z%yK@u56dBEQ3}W(t$3qKn;1&xr<6GUEY-}WrI(W}K8DTlEpqzcsWUZHvh%_I=9d3# zS3u*y%_(VbgD@*iwXC&L{u%cmUE$KAT8uA4xlj#fbXq7U-H%Af_)3B2v8C5|))pmF z>TjkHWeE?pqh#!MOJSpH&Sd5?=q%UN8*YCMOdvr7FRf@HJN6Z&f(viCa5ke7(NXwt z>S@pJ^A%8dZ1bZJ!DZABNkf&N>Dbj<;Y!0-QNEJ8b-IGV6>0Qo&P|m8@3;)UD(HS) zqC6(nHL;~fPBPFhW*oL4h#%!5&oFXYJpMWy@Xk0^1i5u8H`U)z8cDD>mRz5mlG63_ zW69^_t@2KqW30RmFWbGg3Y$AT4~?XjvTezmFdx4=XRU-@`f4H@@%7kn{O#(jqhnuS!3#} zdgJ?~(e+3$X#q5!L_u=ZJKa;0C4pb|)=quy%}`sXGcY5jd#Hq~RD9NM4HA76=a z71MR8cFnN7Rn4d}%#nVkwe(jUe&MXDDBhiQbLOndo|^GiWkkl&ja8rU|Li6QS5=;S ziliI-;f@@E&guQlmpZUB$#UK&7?FNc-uGU`l(x zWm$ySjrMo`9L`R#uDK4({bA#HVqz=ha7pR;A(?Ug$kNC=7}ur-?|b{xX8WysASJT= zQw9s#%k6=uMJJ9cQ@*WqB^40BUlO1JzdFDxMw0Vb6^`b;amHadZ|IN+#aovo&2s#(xmq7QlOv@O*DvBYh~U@NKTNU2A`m`Y4B`%?u?2 zBHxOsNl68yq6|0}Pa3M?3NaSh<*Mg(&uOGokP)nMk^OFsR-q63@^h#O$bPc?=JmyrZXQc+ z=?@)u?u#QGH>AnhQp17V`p{NATZHhCkxr@bZttt{<6EZ zSNXuVe)L>9p@(6k+FO6?$3G4^YCw~~Pskdm{2_tyG=(UJT2h)pP3MkMVW0>J(hy)& z3*SqO>P5RMi4Dw`g2MMoHZthKNQWY;=y-h%2+R+`0gXZtcYwx5^4||25}T(TblpbL z3=+PU0pqbxQz)b1d7hbZxbF$J*i}f$Ub05cM{tKmGE)+gzQC~;%@(G{orQ$4Hfpd` zqiwSgJ@csLpI9V@sSIbwng+zD-MC#ihDqI{tl0{Mte_&5NkY+-w5#Iuj9>7-WrJH( zzs0e|mOT5lPqlL>3_9MFQ%3*hRe<gw+Y1F49$(eE_nK;alij5 zDf)ZdgU?r=BXmw+)|4uJ 
zgoh~4&iBQt?L%#OMow6&rJA9)$!k32*)J@2Nu@2r@oBCn`@U7B9kiMhdpw2B#+JJg z<$_Tf5k!1E4RS)K6vqtTNAyyzHv&d_`uoqc`k6PAqb^q~!~&F~h{-zoDOb++a)L2$ zDignw?6q?AG}U?e;5HqvE1)~8*nH*-dHP*|8R7;)B^27Pn{Uk^T@v3KMtb(I<=c$LD(0!=$sUwE8UD z9A4gdUc96Q$4$5ZcLt0)=~;i~JzLZYyn>PN9!A!}(}ru)O)XQKHBu>r3x=Nld0N3< zX$$tNtL??WZo?7nf&TfiV{B#9=O7==$Jr6n#Bq0rY=R9aDD<(Ium5gM?|ke5%Inc; z6&nM;wcKia&njHfV?ff_nHSVo^yw`s^Qs5M7NO3idTO;1jco}iiO)0HJwBn67 z*?QT@6Y)~0hhacJA`y|7qa-xE|EE;`IUBvb=%eO=i9dl1H%S3tiO+i&wYn9qX|xNz ziRh0?I6m{xcgGzSoWJEH&EJW7FQ{(fDU)d;Xj=da?PElVc02EeD$eAb?63 zR9N*NHz@f3zKs~I!4qz%DBt8-=&@wC?d1EyZ!vc5*fY%(Al{f~DQIL({lhTChP`>o z!Z6(+X`zg-1K&c!BE;t$d!P?*+5Oy;H6@mJX>ZwqKTv2{wz5Xbzh*nxcAK(}z`bpz z_by71QS+`oq-Og`<%g@1-A0woeH6c?Kw-RZO=|nKBCFJLW?IC>!rE9?^tH{d|5(HX zd5-UFRzQhC>a6Wgg@ragOQ`_n(PO)p6n~`WeAVVQyKhsN2Wa8sdA~|kaggG^{qC?L z<qFt?{=a2@BCM0Kg}slXt-|4Fvt%*jfi{Jpq%a6pkT zhX>Dy4~_iQWs)o)WVTl}oSBEK((W?mHvI2hfrqP=m--@y5W4ZU+s6;&vQE|6zm&Dh zK3>DYK6m8i$3}3Gp_EIAZEps2{x0?%z`3R0E`WWh#U#3=XH+ndbNRoJ?4vcehrrT z08=1di=Qs_OmV==7v$VskQxv<^+^N%x#PfVfkfqZKq@bKFYa@^2_@TBci-J(Uzbiw z{KXdz7WkxsZHgBbU8yIPYSR4g-^5c{Ggulwgye}2v3M9EezAX6p=2qSNMrwEaqv|E zw^v)+*XT2bI2?_h*wmrh5C zU`|#P4%9ppUE@~Np{<-kHSiBl0o zk0K5qF9wcJw=o|{m~L)5O}G_kZm5Sh6c14lkw#D zQ9X$1(}ehIj?G(Lzy8$1##`}BFzbt>b_t3d=?*j20)tdGvPLetN5Vgdg-eByt#|qT zc?d15?blXB5a09{;0p_4-LM=svk!azRri1`i4cMlxo0O1vtk+!>mN*mY(B>%@6FQl zl&qk`>0URrAN{2|Z!#^HoZC3^2>E3aA|Cu4%=#+QrG<%vmQ=4wt9z=^t%T8>(h7Gc zRlK*lz1!Lg-Jii8t?a!T8F>`b#$x~2#utG*Fe)gV&!#pOU8ZTvznqDHrmoE`zajlr zC=Be%l_+5tBszq@<8gndr^Oi4AJeqZ=Lz-ns#ts?i94Jcf4R}9MSLKSR}Spze}dgd z`DMy+gq6pQ9%`P<5_6?l;Z`pC{6H>L03c535o)vLC%(|WA@rozT#Xqu6hejHC^4}s zre|Q{{N9Czp=Zn*T^=kN7T^G`b7r7n8$n}Gzl&+;CNNGaPxE-6)zhOQ_wZ&MwzfH`2)#i)298B(tEc4^@@h3H-7L^sXL(2f}3RUi-Yk!O9Vq_WrHZ_yMS^WluPE~V#`&e3T zV{>3Oq>ImY#FeM5wmh>kGpkm~!R6k>mKCF{ThKV~=EgbeongzD=;0{st7i!tRujH7 zS3K8;`h99*YGUczaLs3T*>0{+iDL?!qoaPM_{dUI29e|{! 
zkEuEe_O2ngNYUlC)z&t&nn$*}He6iihV~OvzcR`imOILVwT#>FpXS^-o0{s)XPTNk z+;g?RYk0Vgx~;CwEGk#dxN*3%g>ZN@iVCt*`!qUJ9?UP1HnOPEUwOP&&|X|#WZ_Tw z8tBByomS|S`p{qi`z_4?9q-}%b$awZ)8`j0O4Tib}c<350dY&va6 z_L8Y)@V&i4$sR#E^&mc^;-MukMG@P6*8p_1<8kpi4Rk z4VHx5Fn{_fB2m1BbP12^9rWWRjD*syI2NVFbqP@8N<$S5FGzxnB;;W77DJ%*BR;JP zM@o+3Kw(A_?Wj3KpKg#6LYT5+0%C&fqynWWp~?^n2x0OL3Ii94Kzgy3J2;@TBj5$Kr}(RKg$vH32JtW#B@6ND?FwR4)eEg(S+Vk`JeV6S zpkx_U8c;Cs0m6{nLpewex&*2K2a|x(5IYdI>{cRB4gv=;lYKkw7N9kxQ1)#`JrD%p z0oKWxB=pclN@6wuBOyQ;6G#tDq$p-PkPp-du>$5n(32*tL1Q2nO~?;QZ#f7zc~XFM%3k7tUKj>*fDZ*gN14kHpaAAUJWv4jpd+QX0c2g~(iRjzImk@;@C_)O zvSSH(Qisd~y)_^?GM7<6;p822PyldH4)UZ7A(6TC2N@>r_(Ix92d^Lk)PtUs-kK0g zptmlh1o#jOTuVa2dX!K}NCxFYBaj<(NedF99gKnai9zDzTta~Ja+j7cwGgkKRT7s7+{W@aG1Pwikh5EnjqI>6g~n!xpR0C4nVjRyGu++l2b8}KK`DROnMmr1|02f63(>H;+K zeT#xPVKkAoDRaeW=y7UQt!S}o#kySbD%C|I4}Qpt)V=!=R0I?Co&iIdMjZ!X0EZ6; z*$Gh_sJ@S=EYYbC;DxD&WB9c33H*87295z8j5lrrYla|$lVlB}Drp`SWCj3jrHKa_ zs4ueUYI&z6}3;(b97cTa$E!eG#>#QIU zfE8eb?ersv8S$rF7YN|lH;qwqi1RI(8R4h6K&wI5KHLh`hAWH^{RTUX!)K>xWsio5 zOKSjkq-74~IS9awbRT+f?yS-x+I0gXM3wEYZUt5BlJ04+h0~|l=!Q{5xCdK3h^nNc zp926|$UkMfk}fz%Hlnxy?ch&)s+B%hy| zJ~;uix!&1_YfP+F{`rstsBtnmB4+2EGv<0e2&@qRkB!@8oqJT%ZV=Q#^=P)>w z$h419rvU)ieDlsAA%GB|9p)Bkfdac-Z^SMoD9dD$KlF&0RryEIiAl?HkBDJjk(@|L z5Gvs9+g@T3UxB2U32Y_v(^Y(>c9aQBB?EXXY=vfo1wtiBLy-&#>x7_~{Cd`GABfB5^b<}zjb5GoTWIlP}iI7cYOt|iO<{+LybQ{;Y8NZj?7&KJNv{3MvUo&+u! z&HjWIK*bh_|J~;;9cWO}MWKmEniZ5hy9c{jAiHPGqAUn=(mxG1PdiADMvpF6hDMJ( zS6L)D=*bJSVaf!+l;;{hkw%?JD<)SRL^#Q;*Chr}1n9tE_Yk!L(wzH=MWj~haKKEY z3L@n}p8$jT;*f(4^dt`e7eEt$JvqgoOBFTr9zX?<0>DqIGV2~m6ln%Yz56U8Fn!yh zPqlyi47EIgg*t|G!g*s|D$P8@x(9|oeZ15fFj9WC9LPC8j-%-WJizYUf!Rx6iFST? 
z-SQ1}20!6n(ho4(i$gnWKkmN}Ua}7;k=5j#YHTQx)`$drzZ~u&5jFh!gnkM8=Vj$g zPiv*H>7Dn0>&ALhi)+p|0a$PR@f@*L&q@mDI{Yj4J@>CpyE$thv0E{>eMri93w}Ez<16{__`_9gKnQWLPMg&TlB1E;$rUJ0E%{UQBwI_hL98 z=;eVwu9(M!nikqaYQ#k{*1qib_H(mW0QF3Z+s)=MO7x}Pi3T%^oCd9+KD2r3oZ5Gd ziwZ>xs8+x~(D@he0zL12PxLN*TmP56^So}a-k3uo9gq*D`ElO)7hAkSn2+z zK)e#_e(Pr*10h9DvlZ6r@3xF8jtd05c()KI-+M0}?76C#Z$=+&Wm2i?Uo|xs(xwMq z_}R6-RmJwfu7N#|%?9}Sc(zu<5#i@hyu+1Hy+6#Ocs)~pXuA!b0w}zu3)MI()Oeny zudWGFd-{4-EpD!QyM)ho6u5==O*IrRw-Oors5-p#QS$KSpSQd5$Xq$H*PA}Fm-HV_ zXI$G$&}++>x4Y!ZTG?6sSvBXC9*4MK6MMy=VeD8FzmNJX2p)pJ0 z53e!uN;K`sZqOg{X)^uEsb&6_-wla|&&PUOlTPI!X{j1s@7f#%<{~ zUU;8FiN|B3!FHYLrH5X%Libv+y>6U+?~zn1=X|-bTptGSyijjk1!K^X}Y)E*)@5vs?9!UWMEM^F72p8h;q|acX2(&5W zMdU@;#gB_{i-?PW!(cb@X7OgxHZYS6bs=p1=P6X{kVY|G(orxYcoa-EKs6vGLs1A& zMJo%L5wjsBmfM|N>4gWRs_ZzV3T3(`KW_L z`;{U1lNb`|1GpC*#|#6AmM`K$#tXI^5R1Tq!0Egj3^fy(BC`go%OC;qAn%<;j?oEv zkm^K6z!EYrJ?~k?P|yV>wAf=j86{xz(8b_sjt+PSbOx5eKS&+LUJxHX?3{r+MPHB} zkaxzx!lEw--msS+2F}PVNjpQHurK*`xWPu^0dNnLJ2>DR@c{S-+8xe;3UH2yVek{~ zCEJcPIE7R=^y%HD|3EbOOw=$)0L>fq(qNzgY)Pslp8eJD!=?Se57PBeKiEt6fdnu$ zX`FZm`~%$%J-DAVPK-EM0L2^Ol5__!@C>FM@CW;n-H9lQD~fJ`K{6X3^8H znV~Ch59m9{V1Cjc;zDR|4@kTNmSj1>D@af{7xtZLa0Y44S3gV_iXAL)oVY`fKH3BB zP67B2DM_$CG8F#OcmQ!AoRlO~UPR~}6yN3Zj_kl^v)nzN-iH#N*H8O7TTg14fl*gV z&M&lM!vFh04ir(8+z7xY)*y0t<(_v@xFQ&^G$!vP|Gy3Xcf4b@O#i3`CFfh(F`?eY z5}wCTmN{E*gU_#rJWriZvsc1`!LPf_n2{r(kp+5J=0~OEXJL6ky+8kfy2tguQ1{q) zIR682?|)MD4B|l!J*4Q8cbQ+!EhjG*#DT-A{X}wxX3Cj_@5C2HqX5Qj=O?g|)baJT zH&K5()%_sFtsb&1JF_-xAq55Mzklaz4F{vLfF$cPkp@?*ehcTMkwxC!E^mD~ohhw| z0y=j5;Ax#)l{^5P%3PYjX_ry8JX-k}-Bqv}v4psAP`WF?eGeGJVjs5)dvEZlc;D={ zyr%c^{wxEKZkpAr@e{iix9=AlC!lPqu~=hy?tS9@{P2(!p47`=Wy;$--{M{9Z6*Z? 
zT5$A$&IX$c-hGhY5G+YN@=PZA_5-gA#=X#(bVm=b{YLnJ!Uolm^9v{2={fF01P;`m zu=*3*Yh={rtS$+b5Xv4FBpxYXeas3=|K1nZy9KOoG?@_$3D?(JmGEN<*(Z0Bgn ztZZy)?n=u3c2xDBZG|^>DYJ&Fxstn^owb9xE3=Z5xr3;wo3*2Z9_v4?v~_ez9o+5g z{RJ<*m!Y@sBp|+s65?v43c8HX&y5H?FJOe|zBM{`c%ROYXOs|7E4@;%Me> zYX0AW{>#~$=YIx!bH?#@0$g1G-u@AY{a@4n5uE+aEt}+<=KOcSw>3+=QEq7@_}Ez3 zI9OQOSXo#(-ZUc%2Q>=|_5WkF|AV>vPkfb3Y}DNB-*VW+)s2*$gIST3>%S|8wm zhur?AI%w0)g8rNYO_6BDK?r6@O=b%1u}7o&(2Eh-RTlyj|9eLS%|mQg&$Zth&FCLt z$t4Y{d;iHvmJBXlWp`Q#0wtB$; ze>MG`+2g8g&y=Wq(5?+b=>Yoxkib$W+kOxx1Uv>ya&KY33}<9tGX0qOlKkPbjDffx z`Y7R~yjv)h0Gl_Jf*kNPX%Bpw)LNN6C>MfTASvv>Pgcj9xF+giT)#4fNDE*+jS0F% zUp}1}QY>D#q6-)O8U2!F>j*z?R5tfe%VQIcX?T4z2z+RKbh_3q3PlEcBeC@+yTFN= zgjl;Qi{%Q!ja~UZ-YQi&NtiqgpSCx-|KJ8cm3Q1c^C9zyVYIp!w6o@sSD1^+x*{&D zIG*k`ApNIu`A0$gJ9+IEto+L7LKI<%xhIgmBLP6Cd34%R2xS>7MbI_YfF#E(-AI0jx1QU@`d^or^uy43JPM;#?sQ wW~x{wo%#!mXw3Tc?w!u$`?UXnf~%Xci<_5=xdjp{CpQ-l5+$Xik`&Va1#P~o%m4rY literal 0 HcmV?d00001 diff --git a/tests/test_linebreaks.py b/tests/test_linebreaks.py new file mode 100644 index 000000000..d6319313c --- /dev/null +++ b/tests/test_linebreaks.py @@ -0,0 +1,15 @@ +import pymupdf + +import os.path + + +def test_linebreaks(): + """Test avoidance of linebreaks.""" + path = os.path.abspath(f"{__file__}/../../tests/resources/test-linebreaks.pdf") + doc = pymupdf.open(path) + page = doc[0] + word_count = len(page.get_text("words")) + line_count1 = len(page.get_text().splitlines()) + line_count2 = len(page.get_text(sort=True).splitlines()) + assert word_count == line_count1 + assert line_count2 < line_count1 / 2