From 96bea83da5411c3f4736a114076e7ea468335889 Mon Sep 17 00:00:00 2001
From: Michael Hanke
Date: Mon, 8 May 2023 21:12:33 +0200
Subject: [PATCH] New `MultiHash` class

This has been factored out from the `UrlOperations` classes. It
simplifies that code and can also be used elsewhere.

Closes #344
---
 ...20230508_211747_michael.hanke_multihash.md |  4 ++
 datalad_next/url_operations/__init__.py       | 29 +++------
 datalad_next/url_operations/file.py           |  5 +-
 datalad_next/url_operations/http.py           |  5 +-
 datalad_next/url_operations/ssh.py            | 10 ++-
 datalad_next/utils/multihash.py               | 61 +++++++++++++++++++
 datalad_next/utils/tests/test_multihash.py    | 26 ++++++++
 docs/source/pyutils.rst                       |  1 +
 8 files changed, 107 insertions(+), 34 deletions(-)
 create mode 100644 changelog.d/20230508_211747_michael.hanke_multihash.md
 create mode 100644 datalad_next/utils/multihash.py
 create mode 100644 datalad_next/utils/tests/test_multihash.py

diff --git a/changelog.d/20230508_211747_michael.hanke_multihash.md b/changelog.d/20230508_211747_michael.hanke_multihash.md
new file mode 100644
index 00000000..d60126c0
--- /dev/null
+++ b/changelog.d/20230508_211747_michael.hanke_multihash.md
@@ -0,0 +1,4 @@
+### 💫 Enhancements and new features
+
+- New `MultiHash` helper to compute multiple hashes in one go.
+  Fixes https://github.com/datalad/datalad-next/issues/345 (by @mih)
diff --git a/datalad_next/url_operations/__init__.py b/datalad_next/url_operations/__init__.py
index 4b8f59d9..d6f1edf3 100644
--- a/datalad_next/url_operations/__init__.py
+++ b/datalad_next/url_operations/__init__.py
@@ -12,6 +12,10 @@
 import datalad
 
 from datalad_next.utils import log_progress
+from datalad_next.utils.multihash import (
+    MultiHash,
+    NoOpHash,
+)
 
 lgr = logging.getLogger('datalad.ext.next.url_operations')
 
@@ -319,28 +323,9 @@ def _progress_report_stop(self, pid: str, log_msg: tuple):
             noninteractive_level=logging.DEBUG,
         )
 
-    def _get_hasher(self, hash: list[str] | None) -> list[callable]:
-        if not hash:
-            return []
-
-        import hashlib
-        # yes, this will crash, if an invalid hash algorithm name
-        # is given
-        _hasher = []
-        for h in hash:
-            hr = getattr(hashlib, h.lower(), None)
-            if hr is None:
-                raise ValueError(f'unsupported hash algorithm {h}')
-            _hasher.append(hr())
-        return _hasher
-
-    def _get_hash_report(self,
-                         hash_names: list[str] | None,
-                         hashers: list) -> Dict:
-        if not hash_names:
-            return {}
-        else:
-            return dict(zip(hash_names, [h.hexdigest() for h in hashers]))
+    def _get_hasher(self, hash: list[str] | None) -> NoOpHash | MultiHash:
+        return MultiHash(hash) if hash is not None else NoOpHash()
+
 
 #
 # Exceptions to be used by all handlers
diff --git a/datalad_next/url_operations/file.py b/datalad_next/url_operations/file.py
index 82bdc09e..7f72b4ad 100644
--- a/datalad_next/url_operations/file.py
+++ b/datalad_next/url_operations/file.py
@@ -250,10 +250,9 @@ def _copyfp(self,
             self._progress_report_update(
                 progress_id, update_log, chunk_size)
             # compute hash simultaneously
-            for h in hasher:
-                h.update(chunk)
+            hasher.update(chunk)
             copy_size += chunk_size
-        props.update(self._get_hash_report(hash, hasher))
+        props.update(hasher.get_hexdigest())
         # return how much was copied. we could compare with
         # `expected_size` and error on mismatch, but not all
         # sources can provide that (e.g. stdin)
diff --git a/datalad_next/url_operations/http.py b/datalad_next/url_operations/http.py
index 6b964aab..8d31c038 100644
--- a/datalad_next/url_operations/http.py
+++ b/datalad_next/url_operations/http.py
@@ -266,9 +266,8 @@ def _stream_download_from_request(
                 self._progress_report_update(
                     progress_id, ('Downloaded chunk',), len(chunk))
                 # compute hash simultaneously
-                for h in hasher:
-                    h.update(chunk)
-        props.update(self._get_hash_report(hash, hasher))
+                hasher.update(chunk)
+        props.update(hasher.get_hexdigest())
         return props
     finally:
         if fp and to_path is not None:
diff --git a/datalad_next/url_operations/ssh.py b/datalad_next/url_operations/ssh.py
index f4ef5256..10ec5f41 100644
--- a/datalad_next/url_operations/ssh.py
+++ b/datalad_next/url_operations/ssh.py
@@ -185,11 +185,10 @@ def download(self,
                 # write data
                 dst_fp_write(chunk)
                 # compute hash simultaneously
-                for h in hasher:
-                    h.update(chunk)
+                hasher.update(chunk)
                 self._progress_report_update(
                     progress_id, ('Downloaded chunk',), len(chunk))
-            props.update(self._get_hash_report(hash, hasher))
+            props.update(hasher.get_hexdigest())
             return props
         except CommandError as e:
             if e.code == 244:
@@ -283,8 +282,7 @@ def _perform_upload(self,
                 break
             chunk_size = len(chunk)
             # compute hash simultaneously
-            for h in hasher:
-                h.update(chunk)
+            hasher.update(chunk)
             # we are just putting stuff in the queue, and rely on
             # its maxsize to cause it to block the next call to
             # have the progress reports be anyhow valid
@@ -313,7 +311,7 @@ def _perform_upload(self,
             f"return value: {ssh_runner_generator.return_code}"
 
         return {
-            **self._get_hash_report(hash_names, hasher),
+            **hasher.get_hexdigest(),
             # return how much was copied. we could compare with
             # `expected_size` and error on mismatch, but not all
             # sources can provide that (e.g. stdin)
diff --git a/datalad_next/utils/multihash.py b/datalad_next/utils/multihash.py
new file mode 100644
index 00000000..b691695a
--- /dev/null
+++ b/datalad_next/utils/multihash.py
@@ -0,0 +1,61 @@
+"""Compute more than one hash for the same data in one go"""
+
+from __future__ import annotations
+
+import hashlib
+from typing import (
+    ByteString,
+    Dict,
+)
+
+
+class NoOpHash:
+    """Companion of :class:`MultiHash` that computes no hash at all
+
+    This can be used wherever ``MultiHash`` would be used, because
+    it implements its API. However, no hash is computed and no
+    hexdigest is reported.
+    """
+    def __init__(self, algorithms: None = None):
+        pass
+
+    def update(self, data):
+        pass
+
+    def get_hexdigest(self):
+        return {}
+
+
+class MultiHash:
+    """Compute any number of hashes as if computing just one
+
+    Supports any hash algorithm supported by the ``hashlib`` module
+    of the standard library.
+    """
+    def __init__(self, algorithms: list[str]):
+        """
+        Parameters
+        ----------
+        algorithms: list
+          Hash names, must match the name of the algorithms in the
+          ``hashlib`` module (case insensitive).
+        """
+        # yes, this will crash, if an invalid hash algorithm name
+        # is given
+        _hasher = []
+        for h in algorithms:
+            hr = getattr(hashlib, h.lower(), None)
+            if hr is None:
+                raise ValueError(f'unsupported hash algorithm {h}')
+            _hasher.append(hr())
+        self._hasher = dict(zip(algorithms, _hasher))
+
+    def update(self, data: ByteString) -> None:
+        """Updates all configured digests"""
+        for h in self._hasher.values():
+            h.update(data)
+
+    def get_hexdigest(self) -> Dict[str, str]:
+        """Returns a mapping of algorithm name to hexdigest for all algorithms
+        """
+        return {a: h.hexdigest() for a, h in self._hasher.items()}
diff --git a/datalad_next/utils/tests/test_multihash.py b/datalad_next/utils/tests/test_multihash.py
new file mode 100644
index 00000000..ed09811a
--- /dev/null
+++ b/datalad_next/utils/tests/test_multihash.py
@@ -0,0 +1,26 @@
+import pytest
+
+from ..multihash import (
+    MultiHash,
+    NoOpHash,
+)
+
+
+def test_multihash():
+    mh = MultiHash(['sha1', 'MD5'])
+    mh.update(b'')
+    hd = mh.get_hexdigest()
+    assert len(hd) == 2
+    # algorithm label preserves original casing
+    assert hd['MD5'] == 'd41d8cd98f00b204e9800998ecf8427e'
+    assert hd['sha1'] == 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+
+    with pytest.raises(ValueError):
+        MultiHash(['bogus'])
+
+
+
+def test_noophash():
+    mh = NoOpHash()
+    mh.update(b'')
+    assert mh.get_hexdigest() == {}
diff --git a/docs/source/pyutils.rst b/docs/source/pyutils.rst
index 10e41624..ff349a2c 100644
--- a/docs/source/pyutils.rst
+++ b/docs/source/pyutils.rst
@@ -18,5 +18,6 @@ Python utilities
    url_operations.ssh
    utils
    utils.http_helpers
+   utils.multihash
    utils.requests_auth
    tests.fixtures
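
Usage sketch (reviewer note, appended after the patch): a minimal example of
how the two new helpers compose, based only on the API added above. The
`digest_chunks` helper and the sample data are illustrative, not something
this patch ships.

    from datalad_next.utils.multihash import MultiHash, NoOpHash

    def digest_chunks(chunks, algorithms=None):
        # illustrative helper, not part of the patch: pick the no-op
        # implementation when no hashes were requested, so the loop
        # below needs no conditionals; this is the pattern that
        # `_get_hasher()` now uses in the `UrlOperations` classes
        hasher = MultiHash(algorithms) \
            if algorithms is not None else NoOpHash()
        for chunk in chunks:
            # feed each chunk to all configured digests at once
            hasher.update(chunk)
        # mapping of algorithm name (original casing preserved) to
        # hexdigest; an empty dict when NoOpHash was used
        return hasher.get_hexdigest()

    digest_chunks([b'abc'], ['md5', 'SHA256'])
    # -> {'md5': '900150983cd24fb0d6963f7d28e17f72',
    #     'SHA256': 'ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad'}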