Skip to content

Commit

Permalink
Merge pull request datalad#345 from mih/multihash
Browse files Browse the repository at this point in the history
New `MultiHash` class
  • Loading branch information
mih committed May 9, 2023
2 parents 9e0d341 + 96bea83 commit 9c65b96
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 34 deletions.
4 changes: 4 additions & 0 deletions changelog.d/20230508_211747_michael.hanke_multihash.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
### 💫 Enhancements and new features

- New `MultiHash` helper to compute multiple hashes in one go.
Fixes https://github.com/datalad/datalad-next/issues/345 (by @mih)
29 changes: 7 additions & 22 deletions datalad_next/url_operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@

import datalad
from datalad_next.utils import log_progress
from datalad_next.utils.multihash import (
MultiHash,
NoOpHash,
)

lgr = logging.getLogger('datalad.ext.next.url_operations')

Expand Down Expand Up @@ -319,28 +323,9 @@ def _progress_report_stop(self, pid: str, log_msg: tuple):
noninteractive_level=logging.DEBUG,
)

def _get_hasher(self, hash: list[str] | None) -> list[callable]:
if not hash:
return []

import hashlib
# yes, this will crash, if an invalid hash algorithm name
# is given
_hasher = []
for h in hash:
hr = getattr(hashlib, h.lower(), None)
if hr is None:
raise ValueError(f'unsupported hash algorithm {h}')
_hasher.append(hr())
return _hasher

def _get_hash_report(self,
hash_names: list[str] | None,
hashers: list) -> Dict:
if not hash_names:
return {}
else:
return dict(zip(hash_names, [h.hexdigest() for h in hashers]))
def _get_hasher(self, hash: list[str] | None) -> NoOpHash | MultiHash:
    """Return a hasher instance for the requested algorithms.

    When ``hash`` is ``None``, a :class:`NoOpHash` is returned, which
    implements the same API but computes nothing. Note that an empty
    list still yields a (digest-less) :class:`MultiHash`.
    """
    if hash is None:
        return NoOpHash()
    return MultiHash(hash)


#
# Exceptions to be used by all handlers
Expand Down
5 changes: 2 additions & 3 deletions datalad_next/url_operations/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,9 @@ def _copyfp(self,
self._progress_report_update(
progress_id, update_log, chunk_size)
# compute hash simultaneously
for h in hasher:
h.update(chunk)
hasher.update(chunk)
copy_size += chunk_size
props.update(self._get_hash_report(hash, hasher))
props.update(hasher.get_hexdigest())
# return how much was copied. we could compare with
# `expected_size` and error on mismatch, but not all
# sources can provide that (e.g. stdin)
Expand Down
5 changes: 2 additions & 3 deletions datalad_next/url_operations/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,9 +266,8 @@ def _stream_download_from_request(
self._progress_report_update(
progress_id, ('Downloaded chunk',), len(chunk))
# compute hash simultaneously
for h in hasher:
h.update(chunk)
props.update(self._get_hash_report(hash, hasher))
hasher.update(chunk)
props.update(hasher.get_hexdigest())
return props
finally:
if fp and to_path is not None:
Expand Down
10 changes: 4 additions & 6 deletions datalad_next/url_operations/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,10 @@ def download(self,
# write data
dst_fp_write(chunk)
# compute hash simultaneously
for h in hasher:
h.update(chunk)
hasher.update(chunk)
self._progress_report_update(
progress_id, ('Downloaded chunk',), len(chunk))
props.update(self._get_hash_report(hash, hasher))
props.update(hasher.get_hexdigest())
return props
except CommandError as e:
if e.code == 244:
Expand Down Expand Up @@ -283,8 +282,7 @@ def _perform_upload(self,
break
chunk_size = len(chunk)
# compute hash simultaneously
for h in hasher:
h.update(chunk)
hasher.update(chunk)
# we are just putting stuff in the queue, and rely on
# its maxsize to cause it to block the next call to
# have the progress reports be anyhow valid
Expand Down Expand Up @@ -313,7 +311,7 @@ def _perform_upload(self,
f"return value: {ssh_runner_generator.return_code}"

return {
**self._get_hash_report(hash_names, hasher),
**hasher.get_hexdigest(),
# return how much was copied. we could compare with
# `expected_size` and error on mismatch, but not all
# sources can provide that (e.g. stdin)
Expand Down
61 changes: 61 additions & 0 deletions datalad_next/utils/multihash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Compute more than one hash for the same data in one go"""

from __future__ import annotations

import hashlib
from typing import (
ByteString,
Dict,
)


class NoOpHash:
    """Companion of :class:`MultiHash` that computes no hash at all

    This can be used wherever ``MultiHash`` would be used, because
    it implements its API. However, no hash is computed and no
    hexdigest is reported.
    """
    def __init__(self, algorithms: None = None):
        # accepted (and ignored) for signature-compatibility with MultiHash
        pass

    def update(self, data):
        """Accept data, but do nothing with it"""
        pass

    def get_hexdigest(self):
        """Always returns an empty mapping (no digests were computed)"""
        return {}


class MultiHash:
    """Compute any number of hashes as if computing just one

    Supports any hash algorithm supported by the ``hashlib`` module
    of the standard library.
    """
    def __init__(self, algorithms: list[str]):
        """
        Parameters
        ----------
        algorithms: list
          Hash names, must match the name of the algorithms in the
          ``hashlib`` module (case insensitive).

        Raises
        ------
        ValueError
          If any of the given names does not match a ``hashlib``
          algorithm.
        """
        # map each requested name (original casing preserved) to a
        # live hasher instance; an unknown algorithm is a hard error
        hashers = {}
        for h in algorithms:
            factory = getattr(hashlib, h.lower(), None)
            if factory is None:
                raise ValueError(f'unsupported hash algorithm {h}')
            hashers[h] = factory()
        self._hasher = hashers

    def update(self, data: ByteString) -> None:
        """Updates all configured digests"""
        for hasher in self._hasher.values():
            hasher.update(data)

    def get_hexdigest(self) -> Dict[str, str]:
        """Returns a mapping of algorithm name to hexdigest for all algorithms
        """
        return {
            name: hasher.hexdigest()
            for name, hasher in self._hasher.items()
        }
26 changes: 26 additions & 0 deletions datalad_next/utils/tests/test_multihash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pytest

from ..multihash import (
MultiHash,
NoOpHash,
)


def test_multihash():
    # request two digests with deliberately mixed-case names
    hasher = MultiHash(['sha1', 'MD5'])
    hasher.update(b'')
    digests = hasher.get_hexdigest()
    assert len(digests) == 2
    # algorithm label preserves original casing; digests are for empty input
    assert digests['MD5'] == 'd41d8cd98f00b204e9800998ecf8427e'
    assert digests['sha1'] == 'da39a3ee5e6b4b0d3255bfef95601890afd80709'

    # an unknown algorithm name is rejected on construction
    with pytest.raises(ValueError):
        MultiHash(['bogus'])



def test_noophash():
    # the no-op stand-in accepts data but reports no digests
    hasher = NoOpHash()
    hasher.update(b'')
    assert hasher.get_hexdigest() == {}
1 change: 1 addition & 0 deletions docs/source/pyutils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ Python utilities
url_operations.ssh
utils
utils.http_helpers
utils.multihash
utils.requests_auth
tests.fixtures

0 comments on commit 9c65b96

Please sign in to comment.