pymupdf · julian-smith-artifex-com · Sep 2, 2024 · Sep 2, 2024 · Sep 2, 2024 · Sep 2, 2024
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -47,6 +47,7 @@ body:
       label: PyMuPDF version
       options:
         - 
+        - 1.24.10
         - 1.24.9
         - 1.24.8
         - 1.24.7

diff --git a/changes.txt b/changes.txt
@@ -2,9 +2,36 @@ Change Log
 ==========
 
 
+**Changes in version 1.24.10 (2024-09-02)**
+
+* Use MuPDF-1.24.9.
+
+* Fixed issues:
+
+  * **Fixed** `3450 <https://github.com/pymupdf/PyMuPDF/issues/3450>`_: get_pixmap function takes too long to process
+  * **Fixed** `3569 <https://github.com/pymupdf/PyMuPDF/issues/3569>`_: Invalid OCGs not ignored by SVG image creation
+  * **Fixed** `3603 <https://github.com/pymupdf/PyMuPDF/issues/3603>`_: ObjStm compression and PDF linearization doesn't work together
+  * **Fixed** `3650 <https://github.com/pymupdf/PyMuPDF/issues/3650>`_: Linebreak inserted between each letter
+  * **Fixed** `3698 <https://github.com/pymupdf/PyMuPDF/issues/3698>`_: documentation issue - old code in the annotations documentation
+  * **Fixed** `3705 <https://github.com/pymupdf/PyMuPDF/issues/3705>`_: Document.select() behaves weirdly in some particular kind of pdf files
+  * **Fixed** `3706 <https://github.com/pymupdf/PyMuPDF/issues/3706>`_: extend Document.__getitem__ type annotation to reflect that the method also accepts slices
+  * **Fixed** `3727 <https://github.com/pymupdf/PyMuPDF/issues/3727>`_: Method get_pixmap() make the program exit without any exceptions or messages
+  * **Fixed** `3767 <https://github.com/pymupdf/PyMuPDF/issues/3767>`_: Cannot get Tessdata with Tesseract-OCR 5
+  * **Fixed** `3773 <https://github.com/pymupdf/PyMuPDF/issues/3773>`_: Link.set_border gives TypeError: '<' not supported between instances of 'NoneType' and 'int'
+  * **Fixed** `3774 <https://github.com/pymupdf/PyMuPDF/issues/3774>`_: ``fitz.__version__`` does not work anymore
+  * **Fixed** `3789 <https://github.com/pymupdf/PyMuPDF/issues/3789>`_: ValueError: not enough values to unpack (expected 3, got 2) is thrown when call insert_pdf
+  * **Fixed** `3820 <https://github.com/pymupdf/PyMuPDF/issues/3820>`_: class improves namedDest handling
+
+* Other:
+
+  * Object streams and linearization cannot be used together; attempting to do
+    so will raise an exception. (#3603)
+  * Fixed handling of non-existing /Contents object.
+
+
 **Changes in version 1.24.9 (2024-07-24)**
 
-* Incremented MyPDF version to 1.24.9.
+* Use MuPDF-1.24.8.
 
 
 **Changes in version 1.24.8 (2024-07-22)**
@@ -33,6 +60,8 @@ Other:
 
 **Changes in version 1.24.6 (2024-06-25)**
 
+* Use MuPDF-1.24.4.
+
 * Fixed issues:
 
   * **Fixed** `3599 <https://github.com/pymupdf/PyMuPDF/issues/3599>`_: Story.fit_width() has a weird line

diff --git a/docs/version.rst b/docs/version.rst
@@ -1,6 +1,6 @@
 ----
 
-This documentation covers **PyMuPDF v1.24.9** features as of **2024-07-24 00:00:01**.
+This documentation covers **PyMuPDF v1.24.10** features as of **2024-09-02 00:00:01**.
 
 The major and minor versions of **PyMuPDF** and **MuPDF** will always be the same. Only the third qualifier (patch level) may deviate from that of **MuPDF**.
 

diff --git a/setup.py b/setup.py
@@ -386,7 +386,7 @@ def get_mupdf_internal(out, location=None, sha=None, local_tgz=None):
     log(f'get_mupdf_internal(): {out=} {location=} {sha=}')
     assert out in ('dir', 'tgz')
     if location is None:
-        location = 'https://mupdf.com/downloads/archive/mupdf-1.24.8-source.tar.gz'
+        location = 'https://mupdf.com/downloads/archive/mupdf-1.24.9-source.tar.gz'
         #location = 'git:--branch master https://github.com/ArtifexSoftware/mupdf.git'
 
     if location == '':
@@ -430,7 +430,8 @@ def get_mupdf_internal(out, location=None, sha=None, local_tgz=None):
             log(f'Not downloading from {location} because already present: {local_tgz!r}')
         else:
             log(f'Downloading from {location=} to {local_tgz=}.')
-            urllib.request.urlretrieve( location, local_tgz)
+            urllib.request.urlretrieve( location, local_tgz + '-')
+            os.rename(local_tgz + '-', local_tgz)
             assert os.path.exists( local_tgz)
             tar_check( local_tgz, 'r:gz', prefix=f'{name}/')
     else:
@@ -1044,8 +1045,8 @@ def sdist():
 # We generate different wheels depending on PYMUPDF_SETUP_FLAVOUR.
 #
 
-version_p = '1.24.9'
-version_b = '1.24.9'
+version_p = '1.24.10'
+version_b = '1.24.10'
 
 if os.path.exists(f'{g_root}/{g_pymupdfb_sdist_marker}'):
 

diff --git a/src/__init__.py b/src/__init__.py
@@ -209,9 +209,9 @@ def _int_rc(text):
 
 # Basic version information.
 #
-pymupdf_version = "1.24.9"
+pymupdf_version = "1.24.10"
 mupdf_version = mupdf.FZ_VERSION
-pymupdf_date = "2024-07-24 00:00:01"
+pymupdf_date = "2024-09-02 00:00:01"
 
 # Versions as tuples; useful when comparing versions.
 #

diff --git a/tests/resources/test_3450.pdf b/tests/resources/test_3450.pdf
diff --git a/tests/resources/test_3569.pdf b/tests/resources/test_3569.pdf
diff --git a/tests/resources/test_3650.pdf b/tests/resources/test_3650.pdf
diff --git a/tests/resources/test_3705.pdf b/tests/resources/test_3705.pdf
diff --git a/tests/resources/test_3727.pdf b/tests/resources/test_3727.pdf
diff --git a/tests/test_general.py b/tests/test_general.py
@@ -11,6 +11,7 @@
 import pathlib
 import pickle
 import platform
+import time
 
 scriptdir = os.path.abspath(os.path.dirname(__file__))
 filename = os.path.join(scriptdir, "resources", "001003ED.pdf")
@@ -1270,3 +1271,59 @@ def test_3654():
     wt = pymupdf.TOOLS.mupdf_warnings()
     assert wt == 'dropping unclosed output'
 
+def test_3727():  # Regression test for issue #3727: get_pixmap() made the program exit.
+    if pymupdf.mupdf_version_tuple < (1, 24, 9):  # Older MuPDF is known to segv here, so skip.
+        print(f'test_3727(): not running because known to segv: {pymupdf.mupdf_version=}')
+        return
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3727.pdf')
+    doc = pymupdf.open(path)
+    for page in doc:  # Passing means every page renders without the process dying.
+        page.get_pixmap(matrix = pymupdf.Matrix(2,2))
+
+def test_3569():  # Regression test for issue #3569: invalid OCGs not ignored by SVG image creation.
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3569.pdf')
+    document = pymupdf.open(path)
+    page = document[0]
+    svg = page.get_svg_image(text_as_path=False)
+    print(f'{svg=}')
+    assert svg == (  # Exact expected SVG text for the first page.
+            '<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" version="1.1" width="3024" height="2160" viewBox="0 0 3024 2160">\n'
+            '<defs>\n'
+            '<clipPath id="clip_1">\n'
+            '<path transform="matrix(0,-.06,-.06,-0,3024,2160)" d="M25432 10909H29692V15642H25432V10909"/>\n'
+            '</clipPath>\n'
+            '<clipPath id="clip_2">\n'
+            '<path transform="matrix(0,-.06,-.06,-0,3024,2160)" d="M28526 38017 31807 40376V40379L31312 41314V42889H28202L25092 42888V42887L28524 38017H28526"/>\n'
+            '</clipPath>\n'
+            '</defs>\n'
+            '<g clip-path="url(#clip_1)">\n'
+            '<g inkscape:groupmode="layer" inkscape:label="CED - Text">\n'
+            '<text xml:space="preserve" transform="matrix(.06 0 0 .06 3024 2160)" font-size="174.644" font-family="ArialMT"><tspan y="-28538" x="-14909 -14841.063 -14773.127 -14676.024 -14578.922 -14520.766 -14423.663">**L1-13</tspan></text>\n'
+            '</g>\n'
+            '</g>\n'
+            '<g clip-path="url(#clip_2)">\n'
+            '<g inkscape:groupmode="layer" inkscape:label="Level 03|S-COLS">\n'
+            '<path transform="matrix(0,-.06,-.06,-0,3024,2160)" d="M31130 41483V42083L30530 41483ZM31130 42083 30530 41483V42083Z" fill="#7f7f7f"/>\n'
+            '<path transform="matrix(0,-.06,-.06,-0,3024,2160)" stroke-width="0" stroke-linecap="butt" stroke-miterlimit="10" stroke-linejoin="miter" fill="none" stroke="#7f7f7f" d="M31130 41483V42083L30530 41483ZM31130 42083 30530 41483V42083Z"/>\n'
+            '<path transform="matrix(0,-.06,-.06,-0,3024,2160)" stroke-width="9" stroke-linecap="round" stroke-linejoin="round" fill="none" stroke="#7f7f7f" d="M30530 41483H31130V42083H30530V41483"/>\n'
+            '</g>\n'
+            '</g>\n'
+            '</svg>\n'
+            )
+    wt = pymupdf.TOOLS.mupdf_warnings()  # The warnings text is also compared exactly below.
+    assert wt == 'unknown cid collection: PDFAUTOCAD-Indentity0\nnon-embedded font using identity encoding: ArialMT (mapping via )\ninvalid marked content and clip nesting'
+
+def test_3450():
+    # Issue #3450 is a slow-down in get_pixmap(), so this test only prints the
+    # time taken. Failing when the test takes too long would not be safe,
+    # because that can give spurious failures depending on hardware etc.
+    #
+    # On a mac-mini, PyMuPDF-1.24.8 takes 60s, PyMuPDF-1.24.9 takes 4s.
+    #
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3450.pdf')
+    pdf = pymupdf.open(path)
+    page = pdf[0]
+    t = time.time()  # Start of the timed region.
+    pix = page.get_pixmap(alpha=False, dpi=150)
+    t = time.time() - t  # Elapsed wall-clock seconds for the single get_pixmap() call.
+    print(f'test_3450(): {t=}')
diff --git a/tests/test_insertpdf.py b/tests/test_insertpdf.py
@@ -153,9 +153,6 @@ def test_2871():
 
 def test_3789():
 
-    print('test_3789(): Disabled because known to fail.')
-    return
-
     file_path = os.path.abspath(f'{__file__}/../../tests/resources/test_3789.pdf')
     result_path = os.path.abspath(f'{__file__}/../../tests/test_3789_out')
     pages_per_split = 5

diff --git a/tests/test_textextract.py b/tests/test_textextract.py
@@ -345,3 +345,45 @@ def test_3687():
         wt = pymupdf.TOOLS.mupdf_warnings()
         print(f'{wt=}')
         assert wt == 'unknown epub version: 3.0'
+
+def test_3705():  # Regression test for issue #3705: Document.select() behaving weirdly on some PDFs.
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3705.pdf')
+    def get_all_page_from_pdf(document, last_page=None):  # Trims the document with select(), then iterates its pages.
+        if last_page:
+            document.select(list(range(0, last_page)))
+        if document.page_count > 30:  # Keep only the first 30 pages.
+            document.select(list(range(0, 30)))
+        return iter(page for page in document)
+
+    filename = os.path.basename(path)  # NOTE(review): not used anywhere below in this test.
+
+    doc = pymupdf.open(path)
+    texts0 = list()  # Page texts obtained after select().
+    for i, page in enumerate(get_all_page_from_pdf(doc)):
+        text = page.get_text()
+        print(i, text)    
+        texts0.append(text)
+
+    texts1 = list()  # Page texts obtained from a freshly opened document without select().
+    doc = pymupdf.open(path)
+    for page in doc:
+        if page.number >= 30:  # leave the iterator immediately
+            break
+        text = page.get_text()
+        texts1.append(text)
+
+    assert texts1 == texts0  # Both extraction paths must give the same text.
+
+    wt = pymupdf.TOOLS.mupdf_warnings()
+    assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...'
+
+def test_3650():  # Regression test for issue #3650: linebreak inserted between each letter.
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf')
+    doc = pymupdf.Document(path)
+    blocks = doc[0].get_text("blocks")
+    t = [block[4] for block in blocks]  # Item 4 of each block is what gets compared to the expected lines.
+    print(f'{t=}')
+    assert t == [  # Each block must come out as one whole line of text.
+            'RECUEIL DES ACTES ADMINISTRATIFS\n',
+            'n° 78 du 28 avril 2023\n',
+            ]