Add benchmark for NetCDF --> Zarr cloud-optimization (#1551)

Co-authored-by: Hendrik Makait <hendrik@coiled.io>
coiled · Sep 20, 2024 · c87787c · c87787c
1 parent ff76354
commit c87787c
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 0 deletions.
diff --git a/ci/environment.yml b/ci/environment.yml
@@ -48,6 +48,7 @@ dependencies:
   - gilknocker ==0.4.1
   - openssl >1.1.0g
   - rioxarray ==0.17.0
+  - h5netcdf ==1.3.0
 
 ########################################################
 # PLEASE READ:

diff --git a/tests/geospatial/test_cloud_optimize.py b/tests/geospatial/test_cloud_optimize.py
@@ -0,0 +1,101 @@
+import xarray as xr
+
+
+def test_cloud_optimize(
+    scale,
+    s3,
+    s3_url,
+    client_factory,
+    cluster_kwargs={
+        "workspace": "dask-benchmarks",
+        "region": "us-west-2",
+        "wait_for_workers": True,
+    },
+    scale_kwargs={
+        "small": {"n_workers": 10},
+        "medium": {"n_workers": 100},
+        "large": {"n_workers": 200},
+    },
+):
+    with client_factory(
+        **scale_kwargs[scale], **cluster_kwargs
+    ) as client:  # noqa: F841
+        # Define models and variables of interest
+        models = [
+            "ACCESS-CM2",
+            "ACCESS-ESM1-5",
+            "CMCC-ESM2",
+            "CNRM-CM6-1",
+            "CNRM-ESM2-1",
+            "CanESM5",
+            "EC-Earth3",
+            "EC-Earth3-Veg-LR",
+            "FGOALS-g3",
+            "GFDL-ESM4",
+            "GISS-E2-1-G",
+            "INM-CM4-8",
+            "INM-CM5-0",
+            "KACE-1-0-G",
+            "MIROC-ES2L",
+            "MPI-ESM1-2-HR",
+            "MPI-ESM1-2-LR",
+            "MRI-ESM2-0",
+            "NorESM2-LM",
+            "NorESM2-MM",
+            "TaiESM1",
+            "UKESM1-0-LL",
+        ]
+        variables = [
+            "hurs",
+            "huss",
+            "pr",
+            "rlds",
+            "rsds",
+            "sfcWind",
+            "tas",
+            "tasmax",
+            "tasmin",
+        ]
+
+        if scale == "small":
+            # 130 files (152.83 GiB). One model and one variable.
+            models = models[:1]
+            variables = variables[:1]
+        elif scale == "medium":
+            # 390 files. Two models and two variables.
+            # Currently fails after hitting 20 minute idle timeout
+            # sending large graph to the scheduler.
+            models = models[:2]
+            variables = variables[:2]
+        else:
+            # 11635 files. All models and variables.
+            pass
+
+        # Get netCDF data files -- see https://registry.opendata.aws/nex-gddp-cmip6
+        # for dataset details.
+        file_list = []
+        for model in models:
+            for variable in variables:
+                data_dir = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1f1/{variable}/*.nc"
+                file_list += [f"s3://{path}" for path in s3.glob(data_dir)]
+        files = [s3.open(f) for f in file_list]
+        print(f"Processing {len(files)} NetCDF files")
+
+        # Load input NetCDF data files
+        # TODO: Reduce explicit settings once https://github.com/pydata/xarray/issues/8778 is completed.
+        ds = xr.open_mfdataset(
+            files,
+            engine="h5netcdf",
+            combine="nested",
+            concat_dim="time",
+            data_vars="minimal",
+            coords="minimal",
+            compat="override",
+            parallel=True,
+        )
+
+        # Rechunk from "pancake" to "pencil" format
+        ds = ds.chunk({"time": -1, "lon": "auto", "lat": "auto"})
+
+        # Write out to a Zarr dataset
+        ds.to_zarr(s3_url)