Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New MultiHash class #345

Merged
merged 1 commit into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog.d/20230508_211747_michael.hanke_multihash.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
### 💫 Enhancements and new features

- New `MultiHash` helper to compute multiple hashes in one go.
Fixes https://github.com/datalad/datalad-next/issues/345 (by @mih)
29 changes: 7 additions & 22 deletions datalad_next/url_operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@

import datalad
from datalad_next.utils import log_progress
from datalad_next.utils.multihash import (
MultiHash,
NoOpHash,
)

lgr = logging.getLogger('datalad.ext.next.url_operations')

Expand Down Expand Up @@ -319,28 +323,9 @@ def _progress_report_stop(self, pid: str, log_msg: tuple):
noninteractive_level=logging.DEBUG,
)

def _get_hasher(self, hash: list[str] | None) -> list[callable]:
if not hash:
return []

import hashlib
# yes, this will crash, if an invalid hash algorithm name
# is given
_hasher = []
for h in hash:
hr = getattr(hashlib, h.lower(), None)
if hr is None:
raise ValueError(f'unsupported hash algorithm {h}')
_hasher.append(hr())
return _hasher

def _get_hash_report(self,
hash_names: list[str] | None,
hashers: list) -> Dict:
if not hash_names:
return {}
else:
return dict(zip(hash_names, [h.hexdigest() for h in hashers]))
def _get_hasher(self, hash: list[str] | None) -> NoOpHash | MultiHash:
    """Return a hash helper for the requested algorithms

    Parameters
    ----------
    hash: list or None
      Names of hash algorithms to compute, passed on to ``MultiHash``.
      If ``None``, a ``NoOpHash`` instance is returned that computes
      no hash at all (but implements the same API).

    Returns
    -------
    NoOpHash or MultiHash
      Either object supports ``update()`` and ``get_hexdigest()``,
      so callers need not distinguish the two cases.
    """
    return MultiHash(hash) if hash is not None else NoOpHash()


#
# Exceptions to be used by all handlers
Expand Down
5 changes: 2 additions & 3 deletions datalad_next/url_operations/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,9 @@ def _copyfp(self,
self._progress_report_update(
progress_id, update_log, chunk_size)
# compute hash simultaneously
for h in hasher:
h.update(chunk)
hasher.update(chunk)
copy_size += chunk_size
props.update(self._get_hash_report(hash, hasher))
props.update(hasher.get_hexdigest())
# return how much was copied. we could compare with
# `expected_size` and error on mismatch, but not all
# sources can provide that (e.g. stdin)
Expand Down
5 changes: 2 additions & 3 deletions datalad_next/url_operations/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,9 +266,8 @@ def _stream_download_from_request(
self._progress_report_update(
progress_id, ('Downloaded chunk',), len(chunk))
# compute hash simultaneously
for h in hasher:
h.update(chunk)
props.update(self._get_hash_report(hash, hasher))
hasher.update(chunk)
props.update(hasher.get_hexdigest())
return props
finally:
if fp and to_path is not None:
Expand Down
10 changes: 4 additions & 6 deletions datalad_next/url_operations/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,10 @@ def download(self,
# write data
dst_fp_write(chunk)
# compute hash simultaneously
for h in hasher:
h.update(chunk)
hasher.update(chunk)
self._progress_report_update(
progress_id, ('Downloaded chunk',), len(chunk))
props.update(self._get_hash_report(hash, hasher))
props.update(hasher.get_hexdigest())
return props
except CommandError as e:
if e.code == 244:
Expand Down Expand Up @@ -283,8 +282,7 @@ def _perform_upload(self,
break
chunk_size = len(chunk)
# compute hash simultaneously
for h in hasher:
h.update(chunk)
hasher.update(chunk)
# we are just putting stuff in the queue, and rely on
# its maxsize to cause it to block the next call to
# have the progress reports be anyhow valid
Expand Down Expand Up @@ -313,7 +311,7 @@ def _perform_upload(self,
f"return value: {ssh_runner_generator.return_code}"

return {
**self._get_hash_report(hash_names, hasher),
**hasher.get_hexdigest(),
# return how much was copied. we could compare with
# `expected_size` and error on mismatch, but not all
# sources can provide that (e.g. stdin)
Expand Down
61 changes: 61 additions & 0 deletions datalad_next/utils/multihash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Compute more than one hash for the same data in one go"""

from __future__ import annotations

import hashlib
from typing import (
ByteString,
Dict,
)


class NoOpHash:
    """Companion of :class:`MultiHash` that computes no hash at all

    This can be used wherever ``MultiHash`` would be used, because
    it implements its API. However, no hash is computed and no
    hexdigest is reported.
    """
    def __init__(self, algorithms: None = None):
        # the parameter exists solely for API-compatibility with
        # MultiHash and is ignored
        pass

    def update(self, data) -> None:
        # intentionally a no-op, nothing is hashed
        pass

    def get_hexdigest(self) -> dict:
        # no algorithms were configured, hence no digests to report
        return {}


class MultiHash:
    """Compute any number of hashes as if computing just one

    Supports any hash algorithm supported by the ``hashlib`` module
    of the standard library.
    """
    def __init__(self, algorithms: list[str]):
        """
        Parameters
        ----------
        algorithms: list
          Hash names, must match the name of the algorithms in the
          ``hashlib`` module (case insensitive).

        Raises
        ------
        ValueError
          For any unsupported algorithm name.
        """
        hashers = {}
        for name in algorithms:
            # use hashlib.new() rather than a getattr()-based attribute
            # lookup: the latter would resolve any hashlib attribute
            # (e.g. 'new', 'algorithms_guaranteed') and crash with a
            # confusing TypeError later, instead of rejecting the name
            # up-front with a ValueError
            try:
                hashers[name] = hashlib.new(name.lower())
            except ValueError as e:
                raise ValueError(f'unsupported hash algorithm {name}') from e
        # maps algorithm names (in their original casing) to hasher
        # instances; insertion order matches the requested order
        self._hasher = hashers

    def update(self, data: ByteString) -> None:
        """Updates all configured digests"""
        for h in self._hasher.values():
            h.update(data)

    def get_hexdigest(self) -> Dict[str, str]:
        """Returns a mapping of algorithm name to hexdigest for all algorithms
        """
        return {a: h.hexdigest() for a, h in self._hasher.items()}
26 changes: 26 additions & 0 deletions datalad_next/utils/tests/test_multihash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pytest

from ..multihash import (
MultiHash,
NoOpHash,
)


def test_multihash():
    # request two algorithms, deliberately with mixed-case labels
    hashers = MultiHash(['sha1', 'MD5'])
    hashers.update(b'')
    digests = hashers.get_hexdigest()
    assert len(digests) == 2
    # algorithm label preserves original casing, values are the
    # well-known digests of the empty byte string
    assert digests == {
        'MD5': 'd41d8cd98f00b204e9800998ecf8427e',
        'sha1': 'da39a3ee5e6b4b0d3255bfef95601890afd80709',
    }

    # unsupported algorithm names are rejected immediately
    with pytest.raises(ValueError):
        MultiHash(['bogus'])



def test_noophash():
    # the no-op variant accepts data but reports no digests
    hasher = NoOpHash()
    hasher.update(b'')
    assert not hasher.get_hexdigest()
1 change: 1 addition & 0 deletions docs/source/pyutils.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@ Python utilities
url_operations.ssh
utils
utils.http_helpers
utils.multihash
utils.requests_auth
tests.fixtures