From 96bea83da5411c3f4736a114076e7ea468335889 Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Mon, 8 May 2023 21:12:33 +0200
Subject: [PATCH] New `MultiHash` class

This has been factored out from the `UrlOperations` classes. It
simplifies that code and can also be used elsewhere.

Closes #344
---
 ...20230508_211747_michael.hanke_multihash.md |  4 ++
 datalad_next/url_operations/__init__.py       | 29 +++------
 datalad_next/url_operations/file.py           |  5 +-
 datalad_next/url_operations/http.py           |  5 +-
 datalad_next/url_operations/ssh.py            | 10 ++-
 datalad_next/utils/multihash.py               | 61 +++++++++++++++++++
 datalad_next/utils/tests/test_multihash.py    | 26 ++++++++
 docs/source/pyutils.rst                       |  1 +
 8 files changed, 107 insertions(+), 34 deletions(-)
 create mode 100644 changelog.d/20230508_211747_michael.hanke_multihash.md
 create mode 100644 datalad_next/utils/multihash.py
 create mode 100644 datalad_next/utils/tests/test_multihash.py

diff --git a/changelog.d/20230508_211747_michael.hanke_multihash.md b/changelog.d/20230508_211747_michael.hanke_multihash.md
new file mode 100644
index 00000000..d60126c0
--- /dev/null
+++ b/changelog.d/20230508_211747_michael.hanke_multihash.md
@@ -0,0 +1,4 @@
+### 💫 Enhancements and new features
+
+- New `MultiHash` helper to compute multiple hashes in one go.
+  Fixes https://github.com/datalad/datalad-next/issues/345 (by @mih)
diff --git a/datalad_next/url_operations/__init__.py b/datalad_next/url_operations/__init__.py
index 4b8f59d9..d6f1edf3 100644
--- a/datalad_next/url_operations/__init__.py
+++ b/datalad_next/url_operations/__init__.py
@@ -12,6 +12,10 @@
 import datalad
 
 from datalad_next.utils import log_progress
+from datalad_next.utils.multihash import (
+    MultiHash,
+    NoOpHash,
+)
 
 lgr = logging.getLogger('datalad.ext.next.url_operations')
 
@@ -319,28 +323,9 @@ def _progress_report_stop(self, pid: str, log_msg: tuple):
             noninteractive_level=logging.DEBUG,
         )
 
-    def _get_hasher(self, hash: list[str] | None) -> list[callable]:
-        if not hash:
-            return []
-
-        import hashlib
-        # yes, this will crash, if an invalid hash algorithm name
-        # is given
-        _hasher = []
-        for h in hash:
-            hr = getattr(hashlib, h.lower(), None)
-            if hr is None:
-                raise ValueError(f'unsupported hash algorithm {h}')
-            _hasher.append(hr())
-        return _hasher
-
-    def _get_hash_report(self,
-                         hash_names: list[str] | None,
-                         hashers: list) -> Dict:
-        if not hash_names:
-            return {}
-        else:
-            return dict(zip(hash_names, [h.hexdigest() for h in hashers]))
+    def _get_hasher(self, hash: list[str] | None) -> NoOpHash | MultiHash:
+        return MultiHash(hash) if hash is not None else NoOpHash()
+
 
 #
 # Exceptions to be used by all handlers
diff --git a/datalad_next/url_operations/file.py b/datalad_next/url_operations/file.py
index 82bdc09e..7f72b4ad 100644
--- a/datalad_next/url_operations/file.py
+++ b/datalad_next/url_operations/file.py
@@ -250,10 +250,9 @@ def _copyfp(self,
             self._progress_report_update(
                 progress_id, update_log, chunk_size)
             # compute hash simultaneously
-            for h in hasher:
-                h.update(chunk)
+            hasher.update(chunk)
             copy_size += chunk_size
-        props.update(self._get_hash_report(hash, hasher))
+        props.update(hasher.get_hexdigest())
         # return how much was copied. we could compare with
         # `expected_size` and error on mismatch, but not all
         # sources can provide that (e.g. stdin)
diff --git a/datalad_next/url_operations/http.py b/datalad_next/url_operations/http.py
index 6b964aab..8d31c038 100644
--- a/datalad_next/url_operations/http.py
+++ b/datalad_next/url_operations/http.py
@@ -266,9 +266,8 @@ def _stream_download_from_request(
                 self._progress_report_update(
                     progress_id, ('Downloaded chunk',), len(chunk))
                 # compute hash simultaneously
-                for h in hasher:
-                    h.update(chunk)
-        props.update(self._get_hash_report(hash, hasher))
+                hasher.update(chunk)
+        props.update(hasher.get_hexdigest())
         return props
     finally:
         if fp and to_path is not None:
diff --git a/datalad_next/url_operations/ssh.py b/datalad_next/url_operations/ssh.py
index f4ef5256..10ec5f41 100644
--- a/datalad_next/url_operations/ssh.py
+++ b/datalad_next/url_operations/ssh.py
@@ -185,11 +185,10 @@ def download(self,
                 # write data
                 dst_fp_write(chunk)
                 # compute hash simultaneously
-                for h in hasher:
-                    h.update(chunk)
+                hasher.update(chunk)
                 self._progress_report_update(
                     progress_id, ('Downloaded chunk',), len(chunk))
-            props.update(self._get_hash_report(hash, hasher))
+            props.update(hasher.get_hexdigest())
             return props
         except CommandError as e:
             if e.code == 244:
@@ -283,8 +282,7 @@ def _perform_upload(self,
                 break
             chunk_size = len(chunk)
             # compute hash simultaneously
-            for h in hasher:
-                h.update(chunk)
+            hasher.update(chunk)
             # we are just putting stuff in the queue, and rely on
             # its maxsize to cause it to block the next call to
             # have the progress reports be anyhow valid
@@ -313,7 +311,7 @@ def _perform_upload(self,
             f"return value: {ssh_runner_generator.return_code}"
 
         return {
-            **self._get_hash_report(hash_names, hasher),
+            **hasher.get_hexdigest(),
             # return how much was copied. we could compare with
             # `expected_size` and error on mismatch, but not all
             # sources can provide that (e.g. stdin)
diff --git a/datalad_next/utils/multihash.py b/datalad_next/utils/multihash.py
new file mode 100644
index 00000000..b691695a
--- /dev/null
+++ b/datalad_next/utils/multihash.py
@@ -0,0 +1,61 @@
+"""Compute more than one hash for the same data in one go"""
+
+from __future__ import annotations
+
+import hashlib
+from typing import (
+    ByteString,
+    Dict,
+)
+
+
+class NoOpHash:
+    """Companion of :class:`MultiHash` that computes no hash at all
+
+    This can be used wherever ``MultiHash`` would be used, because
+    it implements its API. However, no hash is computed and no
+    hexdigest is reported.
+    """
+    def __init__(self, algorithms: None = None):
+        pass
+
+    def update(self, data):
+        pass
+
+    def get_hexdigest(self):
+        return {}
+
+
+class MultiHash:
+    """Compute any number of hashes as if computing just one
+
+    Supports any hash algorithm supported by the ``hashlib`` module
+    of the standard library.
+    """
+    def __init__(self, algorithms: list[str]):
+        """
+        Parameters
+        ----------
+        algorithms: list
+          Hash names, must match the name of the algorithms in the
+          ``hashlib`` module (case insensitive).
+        """
+        # yes, this will crash, if an invalid hash algorithm name
+        # is given
+        _hasher = []
+        for h in algorithms:
+            hr = getattr(hashlib, h.lower(), None)
+            if hr is None:
+                raise ValueError(f'unsupported hash algorithm {h}')
+            _hasher.append(hr())
+        self._hasher = dict(zip(algorithms, _hasher))
+
+    def update(self, data: ByteString) -> None:
+        """Updates all configured digests"""
+        for h in self._hasher.values():
+            h.update(data)
+
+    def get_hexdigest(self) -> Dict[str, str]:
+        """Returns a mapping of algorithm name to hexdigest for all algorithms
+        """
+        return {a: h.hexdigest() for a, h in self._hasher.items()}
diff --git a/datalad_next/utils/tests/test_multihash.py b/datalad_next/utils/tests/test_multihash.py
new file mode 100644
index 00000000..ed09811a
--- /dev/null
+++ b/datalad_next/utils/tests/test_multihash.py
@@ -0,0 +1,26 @@
+import pytest
+
+from ..multihash import (
+    MultiHash,
+    NoOpHash,
+)
+
+
+def test_multihash():
+    mh = MultiHash(['sha1', 'MD5'])
+    mh.update(b'')
+    hd = mh.get_hexdigest()
+    assert len(hd) == 2
+    # algorithm label preserves original casing
+    assert hd['MD5'] == 'd41d8cd98f00b204e9800998ecf8427e'
+    assert hd['sha1'] == 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+
+    with pytest.raises(ValueError):
+        MultiHash(['bogus'])
+
+
+
+def test_noophash():
+    mh = NoOpHash()
+    mh.update(b'')
+    assert mh.get_hexdigest() == {}
diff --git a/docs/source/pyutils.rst b/docs/source/pyutils.rst
index 10e41624..ff349a2c 100644
--- a/docs/source/pyutils.rst
+++ b/docs/source/pyutils.rst
@@ -18,5 +18,6 @@ Python utilities
    url_operations.ssh
    utils
    utils.http_helpers
+   utils.multihash
    utils.requests_auth
    tests.fixtures
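
Usage sketch (reviewer note, appended after the patch): a minimal example of
how the two new helpers compose, based only on the API added above. The
`digest_chunks` helper and the sample data are illustrative, not something
this patch ships.

    from datalad_next.utils.multihash import MultiHash, NoOpHash

    def digest_chunks(chunks, algorithms=None):
        # illustrative helper, not part of the patch: pick the no-op
        # implementation when no hashes were requested, so the loop
        # below needs no conditionals; this is the pattern that
        # `_get_hasher()` now uses in the `UrlOperations` classes
        hasher = MultiHash(algorithms) \
            if algorithms is not None else NoOpHash()
        for chunk in chunks:
            # feed each chunk to all configured digests at once
            hasher.update(chunk)
        # mapping of algorithm name (original casing preserved) to
        # hexdigest; an empty dict when NoOpHash was used
        return hasher.get_hexdigest()

    digest_chunks([b'abc'], ['md5', 'SHA256'])
    # -> {'md5': '900150983cd24fb0d6963f7d28e17f72',
    #     'SHA256': 'ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad'}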