From a22f5565c5e7f7876f18216f967a7292ffbe551c Mon Sep 17 00:00:00 2001 From: Gionata Ghiggi Date: Thu, 22 Feb 2024 17:02:27 +0100 Subject: [PATCH] Avoid unnecessary copy to intermediate store (#150) * Fix wasted compute * Add test unit * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- rechunker/algorithm.py | 2 +- rechunker/api.py | 2 +- tests/test_rechunk.py | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/rechunker/algorithm.py b/rechunker/algorithm.py index 3512eb8..ae2c327 100644 --- a/rechunker/algorithm.py +++ b/rechunker/algorithm.py @@ -294,7 +294,7 @@ def multistage_rechunking_plan( "achieving the minimum memory requirement due to increasing IO " f"requirements. Smallest intermediates have size {int_mem}. " f"Consider decreasing min_mem ({min_mem}) or increasing " - f"({max_mem}) to find a more efficient plan.", + f"max_mem ({max_mem}) to find a more efficient plan.", category=ExcessiveIOWarning, ) assert prev_plan is not None diff --git a/rechunker/api.py b/rechunker/api.py index ead05dd..0b3ced7 100644 --- a/rechunker/api.py +++ b/rechunker/api.py @@ -609,7 +609,7 @@ def _setup_array_rechunk( except AttributeError: pass - if read_chunks == write_chunks: + if read_chunks == write_chunks or read_chunks == int_chunks: int_array = None else: # do intermediate store diff --git a/tests/test_rechunk.py b/tests/test_rechunk.py index 675da98..3b3a393 100644 --- a/tests/test_rechunk.py +++ b/tests/test_rechunk.py @@ -792,6 +792,27 @@ def test_no_intermediate_fused(tmp_path): assert num_tasks < 20 # less than if no fuse +def test_no_intermediate_store(tmp_path): + """Test behaviour when read_chunks == int_chunks.""" + shape = (1000, 2000, 2000) + source_chunks = (1, 2000, 2000) + dtype = "f4" + max_mem = 20000000000 + target_chunks = (1000, 4, 4) + + store_source = str(tmp_path / "source.zarr") + source_array = zarr.ones( + shape, chunks=source_chunks, dtype=dtype, store=store_source + ) + + target_store = str(tmp_path / "target.zarr") + temp_store = str(tmp_path / "temp_store.zarr") + rechunked = api.rechunk( + source_array, target_chunks, max_mem, target_store, temp_store=temp_store + ) + assert "Intermediate" not in repr(rechunked) + + def test_rechunk_array_to_group_no_name(tmp_path): a = sample_zarr_array(tmp_path) target_chunks = a.chunks