From bf7795c26f10dfb72a5d1646074ae6ee6f0d1ed2 Mon Sep 17 00:00:00 2001 From: Marco Braun Date: Wed, 5 Jul 2023 18:04:56 -0400 Subject: [PATCH 01/17] generalize dims as X/Y when lat/lon or rlat/rlon --- xscen/io.py | 61 +++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index 13f4da3b..149d1486 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -304,22 +304,7 @@ def save_to_netcdf( xarray.Dataset.to_netcdf """ if rechunk: - for rechunk_var in ds.data_vars: - # Support for chunks varying per variable - if rechunk_var in rechunk: - rechunk_dims = rechunk[rechunk_var] - else: - rechunk_dims = rechunk - - ds[rechunk_var] = ds[rechunk_var].chunk( - { - d: chnks - for d, chnks in rechunk_dims.items() - if d in ds[rechunk_var].dims - } - ) - ds[rechunk_var].encoding.pop("chunksizes", None) - ds[rechunk_var].encoding.pop("chunks", None) + ds = _rechunk_for_saving(ds, rechunk) path = Path(filename) path.parent.mkdir(parents=True, exist_ok=True) @@ -404,22 +389,7 @@ def save_to_zarr( ds[v].encoding.clear() if rechunk: - for rechunk_var in ds.data_vars: - # Support for chunks varying per variable - if rechunk_var in rechunk: - rechunk_dims = rechunk[rechunk_var] - else: - rechunk_dims = rechunk - - ds[rechunk_var] = ds[rechunk_var].chunk( - { - d: chnks - for d, chnks in rechunk_dims.items() - if d in ds[rechunk_var].dims - } - ) - ds[rechunk_var].encoding.pop("chunksizes", None) - ds[rechunk_var].encoding.pop("chunks", None) + ds = _rechunk_for_saving(ds, rechunk) path = Path(filename) path.parent.mkdir(parents=True, exist_ok=True) @@ -518,6 +488,33 @@ def coerce_attrs(attrs): raise +def _rechunk_for_saving(ds, rechunk): + """Do chunking before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon""" + for rechunk_var in ds.data_vars: + # Support for chunks varying per variable + if rechunk_var in rechunk: + rechunk_dims = rechunk[rechunk_var].copy() + else: + rechunk_dims = rechunk.copy() + + # get actual axes labels + if {'X', 'Y'}.issubset(rechunk_dims): + rechunk_dims[ds.cf.axes['X'][0]] = rechunk_dims.pop('X') + rechunk_dims[ds.cf.axes['Y'][0]] = rechunk_dims.pop('Y') + + ds[rechunk_var] = ds[rechunk_var].chunk( + { + d: chnks + for d, chnks in rechunk_dims.items() + if d in ds[rechunk_var].dims + } + ) + ds[rechunk_var].encoding.pop("chunksizes", None) + ds[rechunk_var].encoding.pop("chunks", None) + + return ds + + @parse_config def rechunk( path_in: Union[os.PathLike, str, xr.Dataset], From 7611f8337b9e4e8b0beae09281c8c86d060ca57a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Jul 2023 22:45:14 +0000 Subject: [PATCH 02/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xscen/io.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index 149d1486..63a6d24a 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -498,16 +498,12 @@ def _rechunk_for_saving(ds, rechunk): rechunk_dims = rechunk.copy() # get actual axes labels - if {'X', 'Y'}.issubset(rechunk_dims): - rechunk_dims[ds.cf.axes['X'][0]] = rechunk_dims.pop('X') - rechunk_dims[ds.cf.axes['Y'][0]] = rechunk_dims.pop('Y') + if {"X", "Y"}.issubset(rechunk_dims): + rechunk_dims[ds.cf.axes["X"][0]] = rechunk_dims.pop("X") + rechunk_dims[ds.cf.axes["Y"][0]] = rechunk_dims.pop("Y") ds[rechunk_var] = ds[rechunk_var].chunk( - { - d: chnks - for d, chnks in rechunk_dims.items() - if d in ds[rechunk_var].dims - } + {d: chnks for d, chnks in rechunk_dims.items() if d in ds[rechunk_var].dims} ) ds[rechunk_var].encoding.pop("chunksizes", None) ds[rechunk_var].encoding.pop("chunks", None) From ee8e507b9cd2bacfeaae53971a19fcc131cb1f61 Mon Sep 17 00:00:00 2001 From: Marco Braun Date: Thu, 6 Jul 2023 18:46:05 -0400 Subject: [PATCH 03/17] updated AUTHORS.rst and HISTORY.rst --- AUTHORS.rst | 1 + HISTORY.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index fd562276..be4540bd 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -21,3 +21,4 @@ Contributors * Louis-Philippe Caron * Sarah Gammon `@SarahG-579462 `_ * Yannick Rousseau +* Marco Braun `@vindelico `_ diff --git a/HISTORY.rst b/HISTORY.rst index 6d9f34e0..9151b81c 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -16,6 +16,7 @@ New features and enhancements * New function `get_warming_level` to search within the IPCC CMIP global temperatures CSV without requiring data. (:issue:`208`, :pull:`210`). * File re-structuration from catalogs with ``xscen.catutils.build_path``. (:pull:`205`). * New scripting functions `save_and_update` and `move_and_delete`. (:pull:`214`). +* spatial dimensions can be generalized as X/Y when rechunking and will be mapped to rlon/rlat or lon/lat accordingly. Breaking changes ^^^^^^^^^^^^^^^^ From b2509b7e0da7888ff4da4a8e2d937340808aa349 Mon Sep 17 00:00:00 2001 From: Marco Braun Date: Thu, 6 Jul 2023 19:14:53 -0400 Subject: [PATCH 04/17] update .zenodo.json --- .zenodo.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.zenodo.json b/.zenodo.json index d74202de..9ad96650 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -34,6 +34,11 @@ "name": "Caron, Louis-Philippe", "affiliation": "Ouranos", "orcid": "0000-0001-5221-0147" + }, + { + "name": "Braun, Marco", + "affiliation": "Ouranos", + "orcid": "0000-0001-5061-3217" } ], "keywords": [ From 1ed3f3257afce4a6087503dc0579d51f7f046e57 Mon Sep 17 00:00:00 2001 From: Marco Braun <43412203+vindelico@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:26:25 -0400 Subject: [PATCH 05/17] Update xscen/io.py Co-authored-by: Pascal Bourgault --- xscen/io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xscen/io.py b/xscen/io.py index 63a6d24a..60d6f3ea 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -498,8 +498,9 @@ def _rechunk_for_saving(ds, rechunk): rechunk_dims = rechunk.copy() # get actual axes labels - if {"X", "Y"}.issubset(rechunk_dims): + if "X" in rechunk_dims and "X" not in ds.dims: rechunk_dims[ds.cf.axes["X"][0]] = rechunk_dims.pop("X") + if "Y" in rechunk_dims and "Y" not in ds.dims: rechunk_dims[ds.cf.axes["Y"][0]] = rechunk_dims.pop("Y") ds[rechunk_var] = ds[rechunk_var].chunk( From c63b0054ee5c44453253147fdeb637c8a91648e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Jul 2023 20:26:43 +0000 Subject: [PATCH 06/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xscen/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index 60d6f3ea..bf891b7a 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -498,9 +498,9 @@ def _rechunk_for_saving(ds, rechunk): rechunk_dims = rechunk.copy() # get actual axes labels - if "X" in rechunk_dims and "X" not in ds.dims: + if "X" in rechunk_dims and "X" not in ds.dims: rechunk_dims[ds.cf.axes["X"][0]] = rechunk_dims.pop("X") - if "Y" in rechunk_dims and "Y" not in ds.dims: + if "Y" in rechunk_dims and "Y" not in ds.dims: rechunk_dims[ds.cf.axes["Y"][0]] = rechunk_dims.pop("Y") ds[rechunk_var] = ds[rechunk_var].chunk( From 82ab63a857019bb767b02f8b00b4dc61fc28f4a0 Mon Sep 17 00:00:00 2001 From: Marco Braun <43412203+vindelico@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:28:39 -0400 Subject: [PATCH 07/17] Update xscen/io.py Co-authored-by: Pascal Bourgault --- xscen/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/io.py b/xscen/io.py index bf891b7a..9cc3edfe 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -489,7 +489,7 @@ def coerce_attrs(attrs): def _rechunk_for_saving(ds, rechunk): - """Do chunking before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon""" + """Rechunk before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon.""" for rechunk_var in ds.data_vars: # Support for chunks varying per variable if rechunk_var in rechunk: From 4fe31a43947dd1d38ac11ab6aa5994982e871209 Mon Sep 17 00:00:00 2001 From: Marco Braun <43412203+vindelico@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:29:46 -0400 Subject: [PATCH 08/17] Update HISTORY.rst Co-authored-by: juliettelavoie --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index 9151b81c..eea0d55d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -16,7 +16,7 @@ New features and enhancements * New function `get_warming_level` to search within the IPCC CMIP global temperatures CSV without requiring data. (:issue:`208`, :pull:`210`). * File re-structuration from catalogs with ``xscen.catutils.build_path``. (:pull:`205`). * New scripting functions `save_and_update` and `move_and_delete`. (:pull:`214`). -* spatial dimensions can be generalized as X/Y when rechunking and will be mapped to rlon/rlat or lon/lat accordingly. +* Spatial dimensions can be generalized as X/Y when rechunking and will be mapped to rlon/rlat or lon/lat accordingly. (:pull: `221`). Breaking changes ^^^^^^^^^^^^^^^^ From aea398ef99d6485c5d20c8ba19134415b6265697 Mon Sep 17 00:00:00 2001 From: Marco Braun <43412203+vindelico@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:32:39 -0400 Subject: [PATCH 09/17] Update HISTORY.rst --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index eea0d55d..d77ef66c 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -16,7 +16,7 @@ New features and enhancements * New function `get_warming_level` to search within the IPCC CMIP global temperatures CSV without requiring data. (:issue:`208`, :pull:`210`). * File re-structuration from catalogs with ``xscen.catutils.build_path``. (:pull:`205`). * New scripting functions `save_and_update` and `move_and_delete`. (:pull:`214`). -* Spatial dimensions can be generalized as X/Y when rechunking and will be mapped to rlon/rlat or lon/lat accordingly. (:pull: `221`). +* Spatial dimensions can be generalized as X/Y when rechunking and will be mapped to rlon/rlat or lon/lat accordingly. (:pull:`221`). Breaking changes ^^^^^^^^^^^^^^^^ From fafe3eccd941da4f35973096d0164769b88078da Mon Sep 17 00:00:00 2001 From: Marco Braun Date: Mon, 24 Jul 2023 15:10:21 -0400 Subject: [PATCH 10/17] documentation and tests for generalize_rlatrlon_latlon --- docs/notebooks/2_getting_started.ipynb | 39 ++++----- tests/test_io.py | 106 +++++++++++++++++++++++++ xscen/io.py | 4 + 3 files changed, 131 insertions(+), 18 deletions(-) create mode 100644 tests/test_io.py diff --git a/docs/notebooks/2_getting_started.ipynb b/docs/notebooks/2_getting_started.ipynb index d9cb7569..958df1d3 100644 --- a/docs/notebooks/2_getting_started.ipynb +++ b/docs/notebooks/2_getting_started.ipynb @@ -61,7 +61,7 @@ "### Searching a subset of datasets within *DataCatalogs*\n", "\n", "
INFO\n", - " \n", + "\n", "At this stage, the search criteria should be for variables that will be **bias corrected**, not necessarily the variables required for the final product. For example, if `sfcWindfromdir` is the final product, then `uas` and `vas` should be searched for since these are the variables that will be bias corrected.\n", "
\n", "\n", @@ -130,8 +130,8 @@ "source": [ "## Extracting data\n", "\n", - "
WARNING \n", - " \n", + "
WARNING\n", + "\n", "It is heavily recommended to stop and analyse the results of `search_data_catalogs` before proceeding to the extraction function.\n", "
\n", "\n", @@ -221,8 +221,8 @@ "\n", "**NOTE:** Calling the extraction function without passing by `search_data_catalogs` beforehand is possible, but will not support *DerivedVariables*.\n", "\n", - "
NOTE \n", - " \n", + "
NOTE\n", + "\n", "`extract_dataset` currently only accepts a single unique ID at a time.\n", "
" ] @@ -259,11 +259,14 @@ "`xscen` has two functions for the purpose of saving data: `save_to_netcdf` and `save_to_zarr`. If possible for a given project, it is strongly recommended to use Zarr files since these are often orders of magnitude faster to read and create compared to NetCDF. They do have a few quirks, however:\n", "\n", "- Chunk size must separate the dataset in *exactly* equal chunks (with the exception of the last). While it is recommended to calculate ideal chunking and provide them explicitely to the function, `io.estimate_chunks()` can be used to automatically estimate a decent chunking. In a similar way, `io.subset_maxsize()` can be used to roughly cut a dataset along the *time* dimension into subsets of a given size (in Gb), which is especially useful for saving NetCDF files.\n", + " Chunk sizes can be passed to the two saving functions in a dictionary. Spatial dimensions can be generalized as `'X'` and `'Y'` which will be mapped to the *xarray.Dataset*'s actual grid type's dimension names.\n", "\n", "\n", "- Default behaviour for a Zarr is to act like a directory, with each new variable being assigned a subfolder. This is great when all required variables have the same dimensions and frequency, but will crash otherwise. If you have daily *tasmax* and subdaily *pr*, for example, they need to be assigned different paths.\n", "\n", "\n", + "\n", + "\n", "#### Updating the catalog\n", "\n", "`intake-esm` will automatically copy the catalog's entry in the dataset's metadata, with a `cat:attr` format. Where appropriate, `xscen` updates that information to keep the metadata up to date with the manipulations. `ProjectCatalog.update_from_ds` will in turn read the metadata of a Dataset and fill in the information into a new catalog entry.\n", @@ -376,7 +379,7 @@ "source": [ "## Regridding data\n", "\n", - "
NOTE \n", + "
NOTE\n", "\n", "Regridding in `xscen` is built upon `xESMF`. For more information on basic usage and available regridding methods, [please consult their documentation](https://xesmf.readthedocs.io/en/latest/). Their [masking and extrapolation tutorial](https://xesmf.readthedocs.io/en/latest/notebooks/Masking.html) is of particular interest.\n", "\n", @@ -481,8 +484,8 @@ "source": [ "### Preparing arguments for xESMF.Regridder\n", "\n", - "
NOTE \n", - " \n", + "
NOTE\n", + "\n", "xESMF's API appears to be broken on their ReadTheDocs. For a list of available arguments and options in `Regridder()`, please consult their [Github page](https://github.com/pangeo-data/xESMF/blob/master/xesmf/frontend.py) directly.\n", "
\n", "\n", @@ -522,7 +525,7 @@ "Other options exist in `ESMF/ESMPy`, but not `xESMF`. As they get implemented, they should automatically get supported by `xscen`.\n", "\n", "
NOTE\n", - " \n", + "\n", "Some utilities that exist in `xESMF` have not yet been explicitely added to `xscen`. If *conservative* regridding is desired, for instance, some additional scripts might be required on the User's side to create the lon/lat boundaries\n", "
" ] @@ -642,8 +645,8 @@ "source": [ "## Bias adjusting data\n", "\n", - "
NOTE \n", - " \n", + "
NOTE\n", + "\n", "Bias adjustment in `xscen` is built upon `xclim.sdba`. For more information on basic usage and available methods, [please consult their documentation](https://xclim.readthedocs.io/en/stable/sdba.html).\n", "
\n", "\n", @@ -694,9 +697,9 @@ "- `periods` defines the period(s) to bias adjust.\n", "- `xclim_adjust_kwargs` is described above.\n", "\n", - "
NOTE \n", - " \n", - "These functions currently do not support multiple variables due to the fact that train and adjust arguments might vary. The function must be called separately for every variable. \n", + "
NOTE\n", + "\n", + "These functions currently do not support multiple variables due to the fact that train and adjust arguments might vary. The function must be called separately for every variable.\n", "
" ] }, @@ -804,8 +807,8 @@ "source": [ "## Computing indicators\n", "\n", - "
NOTE \n", - " \n", + "
NOTE\n", + "\n", "`xscen` relies heavily on `xclim`'s YAML support for calculating indicators. For more information on how to build the YAML file, consult [this Notebook](https://xclim.readthedocs.io/en/latest/notebooks/extendxclim.html?highlight=yaml#YAML-file).\n", "
\n", "\n", @@ -1077,7 +1080,7 @@ "\n", "Usually, we would create an ensemble out of different models, but for this toy example, it will be made out of the 2 experiments of the same model that we have available.\n", "\n", - "
WARNING \n", + "
WARNING\n", "\n", "If given a set of paths, `xclim.ensembles.create_ensemble` will ignore the chunking on disk and open the datasets with only a chunking along the new `realization` dimension. Thus, for large datasets, `create_kwargs` should be used to explicitely specify chunks.\n", "
" @@ -1246,7 +1249,7 @@ "\n", "print(\"\")\n", "print(\"Inspect final attributes\")\n", - "display(ds_clean)" + "display(ds_clean)\n" ] } ], diff --git a/tests/test_io.py b/tests/test_io.py new file mode 100644 index 00000000..ab9df5dc --- /dev/null +++ b/tests/test_io.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest +from xclim.testing.helpers import test_timeseries as timeseries +import pandas as pd +import xarray as xr +import numpy as np + +import xscen as xs + +import xclim.xclim.sdba.measures + + +class TestGenericRechunkingDims: + + def get_test_dataset(self, data: np.ndarray = None): + + """Create a 3D test Dataset. + + Parameters + ---------- + data : a 3D numpy array to populate the dataset, dimensionsa are interpreted [lat, lon, time] + (TODO: do with xarray.DataArray) + + Returns + ------- + ds : Dataset with cf information. + """ + + if isinstance(data, np.ndarray) and len(data.shape) == 3: + tas = data + lat = np.arange(0, data.shape[0]) * 90 / data.shape[0] + lon = np.arange(0, data.shape[1]) * 180 / data.shape[1] + time = pd.date_range("2010-10-18", periods=data.shape[2]) + else: + tas = [[[14.7, 10.1, 17.0], + [3.4, 23.5, 15.1]], + [[8.5, 12.4, 25.8], + [14.5, 5.8, 15.3]]] + # pr = [[[9.81, 0.00, 8.88], + # [0.00, 8.81, 1.22]], + # [[2.76, 3.31, 2.21], + # [6.16, 6.03, 3.35]]] + lon = [-88.83, -88.32] + lat = [44.25, 44.21] + time = pd.date_range("2010-10-18", periods=3) + ds = xr.Dataset( + data_vars=dict( + tas=(["lat", "lon", "time"], tas, {'standard_name': 'air_temperature'}), + # pr=(["lat", "lon", "time"], pr, {'standard_name': 'precipitation'}), + ), + coords=dict( + lon=(["lon"], lon, {'standard_name': 'Longitude', 'axis': 'X'}), + lat=(["lat"], lat, {'standard_name': 'Latitude', 'axis': 'Y'}), + time=(['time'], time, {'standard_name': 'time'}) + ), + attrs=dict(description="Climate data dummy."), + ) + return ds + + def get_test_dataset_rlat_rlon(self, data: np.ndarray = None): + + ds = self.get_test_dataset(data) + # make dummy rlat / rlon + ds = ds.assign_coords({'rlon': ('lon', ds.lon.values), 'rlat': ('lat', ds.lat.values)}) + ds = ds.swap_dims({'lat': 'rlat', 'lon': 'rlon'}) + ds.coords['rlon'] = ds.coords['rlon'].assign_attrs(ds['lon'].attrs) + ds.coords['rlon'] = ds.coords['rlon'].assign_attrs({'standard_name': 'grid_longitude'}) + ds.coords['rlat'] = ds.coords['rlat'].assign_attrs(ds['lat'].attrs) + ds.coords['rlat'] = ds.coords['rlat'].assign_attrs({'standard_name': 'grid_latitude'}) + ds = ds.drop_vars(['lat', 'lon']) + + return ds + + def test_rechunk_for_saving_lat_lon(self): + + ds = self.get_test_dataset(np.random.random((30, 30, 50))) + new_chunks = {'lat': 10, 'lon': 10, 'time': 20} + ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + for dim, chunks in ds_ch.chunks.items(): + assert chunks[0] == new_chunks[dim] + + def test_rechunk_for_saving_XY_lat_lon(self): + + ds = self.get_test_dataset(np.random.random((30, 30, 50))) + new_chunks = {'X': 10, 'Y': 10, 'time': 20} + ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): + assert chunks[0] == new_chunks[dim] + + def test_rechunk_for_saving_rlat_rlon(self): + + ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) + new_chunks = {'rlat': 10, 'rlon': 10, 'time': 20} + ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + for dim, chunks in ds_ch.chunks.items(): + assert chunks[0] == new_chunks[dim] + + def test_rechunk_for_saving_XY_rlat_lon(self): + + ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) + new_chunks = {'X': 10, 'Y': 10, 'time': 20} + ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): + assert chunks[0] == new_chunks[dim] + + diff --git a/xscen/io.py b/xscen/io.py index 9cc3edfe..4eb030c7 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -291,6 +291,8 @@ def save_to_netcdf( Name of the NetCDF file to be saved. rechunk : dict, optional This is a mapping from dimension name to new chunks (in any format understood by dask). + Spatial dimensions can be generalized as 'X' and 'Y' which will be mapped to the actual grid type's + dimension names. Rechunking is only done on *data* variables sharing dimensions with this argument. netcdf_kwargs : dict, optional Additional arguments to send to_netcdf() @@ -356,6 +358,8 @@ def save_to_zarr( Name of the Zarr file to be saved. rechunk : dict, optional This is a mapping from dimension name to new chunks (in any format understood by dask). + Spatial dimensions can be generalized as 'X' and 'Y' which will be mapped to the actual grid type's + dimension names. Rechunking is only done on *data* variables sharing dimensions with this argument. zarr_kwargs : dict, optional Additional arguments to send to_zarr() From 412309fd48df5ada676048527eb86554c51f542e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 19:12:17 +0000 Subject: [PATCH 11/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_io.py | 65 ++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index ab9df5dc..24b6e6dc 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,19 +1,15 @@ import numpy as np -import pytest -from xclim.testing.helpers import test_timeseries as timeseries import pandas as pd +import pytest import xarray as xr -import numpy as np +import xclim.xclim.sdba.measures +from xclim.testing.helpers import test_timeseries as timeseries import xscen as xs -import xclim.xclim.sdba.measures - class TestGenericRechunkingDims: - def get_test_dataset(self, data: np.ndarray = None): - """Create a 3D test Dataset. Parameters @@ -32,10 +28,10 @@ def get_test_dataset(self, data: np.ndarray = None): lon = np.arange(0, data.shape[1]) * 180 / data.shape[1] time = pd.date_range("2010-10-18", periods=data.shape[2]) else: - tas = [[[14.7, 10.1, 17.0], - [3.4, 23.5, 15.1]], - [[8.5, 12.4, 25.8], - [14.5, 5.8, 15.3]]] + tas = [ + [[14.7, 10.1, 17.0], [3.4, 23.5, 15.1]], + [[8.5, 12.4, 25.8], [14.5, 5.8, 15.3]], + ] # pr = [[[9.81, 0.00, 8.88], # [0.00, 8.81, 1.22]], # [[2.76, 3.31, 2.21], @@ -45,62 +41,61 @@ def get_test_dataset(self, data: np.ndarray = None): time = pd.date_range("2010-10-18", periods=3) ds = xr.Dataset( data_vars=dict( - tas=(["lat", "lon", "time"], tas, {'standard_name': 'air_temperature'}), + tas=(["lat", "lon", "time"], tas, {"standard_name": "air_temperature"}), # pr=(["lat", "lon", "time"], pr, {'standard_name': 'precipitation'}), ), coords=dict( - lon=(["lon"], lon, {'standard_name': 'Longitude', 'axis': 'X'}), - lat=(["lat"], lat, {'standard_name': 'Latitude', 'axis': 'Y'}), - time=(['time'], time, {'standard_name': 'time'}) + lon=(["lon"], lon, {"standard_name": "Longitude", "axis": "X"}), + lat=(["lat"], lat, {"standard_name": "Latitude", "axis": "Y"}), + time=(["time"], time, {"standard_name": "time"}), ), attrs=dict(description="Climate data dummy."), ) return ds def get_test_dataset_rlat_rlon(self, data: np.ndarray = None): - ds = self.get_test_dataset(data) # make dummy rlat / rlon - ds = ds.assign_coords({'rlon': ('lon', ds.lon.values), 'rlat': ('lat', ds.lat.values)}) - ds = ds.swap_dims({'lat': 'rlat', 'lon': 'rlon'}) - ds.coords['rlon'] = ds.coords['rlon'].assign_attrs(ds['lon'].attrs) - ds.coords['rlon'] = ds.coords['rlon'].assign_attrs({'standard_name': 'grid_longitude'}) - ds.coords['rlat'] = ds.coords['rlat'].assign_attrs(ds['lat'].attrs) - ds.coords['rlat'] = ds.coords['rlat'].assign_attrs({'standard_name': 'grid_latitude'}) - ds = ds.drop_vars(['lat', 'lon']) + ds = ds.assign_coords( + {"rlon": ("lon", ds.lon.values), "rlat": ("lat", ds.lat.values)} + ) + ds = ds.swap_dims({"lat": "rlat", "lon": "rlon"}) + ds.coords["rlon"] = ds.coords["rlon"].assign_attrs(ds["lon"].attrs) + ds.coords["rlon"] = ds.coords["rlon"].assign_attrs( + {"standard_name": "grid_longitude"} + ) + ds.coords["rlat"] = ds.coords["rlat"].assign_attrs(ds["lat"].attrs) + ds.coords["rlat"] = ds.coords["rlat"].assign_attrs( + {"standard_name": "grid_latitude"} + ) + ds = ds.drop_vars(["lat", "lon"]) return ds def test_rechunk_for_saving_lat_lon(self): - ds = self.get_test_dataset(np.random.random((30, 30, 50))) - new_chunks = {'lat': 10, 'lon': 10, 'time': 20} + new_chunks = {"lat": 10, "lon": 10, "time": 20} ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) for dim, chunks in ds_ch.chunks.items(): assert chunks[0] == new_chunks[dim] def test_rechunk_for_saving_XY_lat_lon(self): - ds = self.get_test_dataset(np.random.random((30, 30, 50))) - new_chunks = {'X': 10, 'Y': 10, 'time': 20} + new_chunks = {"X": 10, "Y": 10, "time": 20} ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) - for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): + for dim, chunks in zip(["X", "Y", "time"], ds_ch.chunks.values()): assert chunks[0] == new_chunks[dim] def test_rechunk_for_saving_rlat_rlon(self): - ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) - new_chunks = {'rlat': 10, 'rlon': 10, 'time': 20} + new_chunks = {"rlat": 10, "rlon": 10, "time": 20} ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) for dim, chunks in ds_ch.chunks.items(): assert chunks[0] == new_chunks[dim] def test_rechunk_for_saving_XY_rlat_lon(self): - ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) - new_chunks = {'X': 10, 'Y': 10, 'time': 20} + new_chunks = {"X": 10, "Y": 10, "time": 20} ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) - for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): + for dim, chunks in zip(["X", "Y", "time"], ds_ch.chunks.values()): assert chunks[0] == new_chunks[dim] - - From ba67c106fb9cd8dd63110bfcd89d390beb6cf9ae Mon Sep 17 00:00:00 2001 From: Marco Braun <43412203+vindelico@users.noreply.github.com> Date: Tue, 25 Jul 2023 11:29:27 -0400 Subject: [PATCH 12/17] Update docs/notebooks/2_getting_started.ipynb Co-authored-by: RondeauG <38501935+RondeauG@users.noreply.github.com> --- docs/notebooks/2_getting_started.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/notebooks/2_getting_started.ipynb b/docs/notebooks/2_getting_started.ipynb index 958df1d3..69a97796 100644 --- a/docs/notebooks/2_getting_started.ipynb +++ b/docs/notebooks/2_getting_started.ipynb @@ -259,7 +259,7 @@ "`xscen` has two functions for the purpose of saving data: `save_to_netcdf` and `save_to_zarr`. If possible for a given project, it is strongly recommended to use Zarr files since these are often orders of magnitude faster to read and create compared to NetCDF. They do have a few quirks, however:\n", "\n", "- Chunk size must separate the dataset in *exactly* equal chunks (with the exception of the last). While it is recommended to calculate ideal chunking and provide them explicitely to the function, `io.estimate_chunks()` can be used to automatically estimate a decent chunking. In a similar way, `io.subset_maxsize()` can be used to roughly cut a dataset along the *time* dimension into subsets of a given size (in Gb), which is especially useful for saving NetCDF files.\n", - " Chunk sizes can be passed to the two saving functions in a dictionary. Spatial dimensions can be generalized as `'X'` and `'Y'` which will be mapped to the *xarray.Dataset*'s actual grid type's dimension names.\n", + " Chunk sizes can be passed to the two saving functions in a dictionary. Spatial dimensions can be generalized as `'X'` and `'Y'`, which will be mapped to the *xarray.Dataset*'s actual grid type's dimension names.\n", "\n", "\n", "- Default behaviour for a Zarr is to act like a directory, with each new variable being assigned a subfolder. This is great when all required variables have the same dimensions and frequency, but will crash otherwise. If you have daily *tasmax* and subdaily *pr*, for example, they need to be assigned different paths.\n", From ee324d24f7d641148038db41e2504e91df6b0df1 Mon Sep 17 00:00:00 2001 From: Marco Braun <43412203+vindelico@users.noreply.github.com> Date: Tue, 25 Jul 2023 11:30:46 -0400 Subject: [PATCH 13/17] Update xscen/io.py Co-authored-by: RondeauG <38501935+RondeauG@users.noreply.github.com> --- xscen/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/io.py b/xscen/io.py index 4eb030c7..563bd6fd 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -291,7 +291,7 @@ def save_to_netcdf( Name of the NetCDF file to be saved. rechunk : dict, optional This is a mapping from dimension name to new chunks (in any format understood by dask). - Spatial dimensions can be generalized as 'X' and 'Y' which will be mapped to the actual grid type's + Spatial dimensions can be generalized as 'X' and 'Y', which will be mapped to the actual grid type's dimension names. Rechunking is only done on *data* variables sharing dimensions with this argument. netcdf_kwargs : dict, optional From ab3c08fde881731e2e48712b8db4b77416143302 Mon Sep 17 00:00:00 2001 From: Marco Braun Date: Tue, 25 Jul 2023 11:35:50 -0400 Subject: [PATCH 14/17] repairs following github discussion --- HISTORY.rst | 2 +- tests/test_io.py | 18 +++++------------- xscen/io.py | 23 +++++++++++++++++++---- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 933a00b0..611524a9 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -4,7 +4,7 @@ History v0.7.0 (unreleased) ------------------- -Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`), Trevor James Smith (:user:`Zeitsperre`), Juliette Lavoie (:user:`juliettelavoie`). +Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`), Trevor James Smith (:user:`Zeitsperre`), Juliette Lavoie (:user:`juliettelavoie`), Marco Braun (:user:`vindelico``. Announcements ^^^^^^^^^^^^^ diff --git a/tests/test_io.py b/tests/test_io.py index ab9df5dc..5053ceb6 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -4,11 +4,8 @@ import pandas as pd import xarray as xr import numpy as np - import xscen as xs -import xclim.xclim.sdba.measures - class TestGenericRechunkingDims: @@ -36,17 +33,12 @@ def get_test_dataset(self, data: np.ndarray = None): [3.4, 23.5, 15.1]], [[8.5, 12.4, 25.8], [14.5, 5.8, 15.3]]] - # pr = [[[9.81, 0.00, 8.88], - # [0.00, 8.81, 1.22]], - # [[2.76, 3.31, 2.21], - # [6.16, 6.03, 3.35]]] lon = [-88.83, -88.32] lat = [44.25, 44.21] time = pd.date_range("2010-10-18", periods=3) ds = xr.Dataset( data_vars=dict( tas=(["lat", "lon", "time"], tas, {'standard_name': 'air_temperature'}), - # pr=(["lat", "lon", "time"], pr, {'standard_name': 'precipitation'}), ), coords=dict( lon=(["lon"], lon, {'standard_name': 'Longitude', 'axis': 'X'}), @@ -75,7 +67,7 @@ def test_rechunk_for_saving_lat_lon(self): ds = self.get_test_dataset(np.random.random((30, 30, 50))) new_chunks = {'lat': 10, 'lon': 10, 'time': 20} - ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) for dim, chunks in ds_ch.chunks.items(): assert chunks[0] == new_chunks[dim] @@ -83,7 +75,7 @@ def test_rechunk_for_saving_XY_lat_lon(self): ds = self.get_test_dataset(np.random.random((30, 30, 50))) new_chunks = {'X': 10, 'Y': 10, 'time': 20} - ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): assert chunks[0] == new_chunks[dim] @@ -91,7 +83,7 @@ def test_rechunk_for_saving_rlat_rlon(self): ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) new_chunks = {'rlat': 10, 'rlon': 10, 'time': 20} - ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) for dim, chunks in ds_ch.chunks.items(): assert chunks[0] == new_chunks[dim] @@ -99,8 +91,8 @@ def test_rechunk_for_saving_XY_rlat_lon(self): ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) new_chunks = {'X': 10, 'Y': 10, 'time': 20} - ds_ch = xs.io._rechunk_for_saving(ds, new_chunks) + ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): assert chunks[0] == new_chunks[dim] - + diff --git a/xscen/io.py b/xscen/io.py index 4eb030c7..ffc69327 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -29,6 +29,7 @@ "save_to_netcdf", "save_to_zarr", "subset_maxsize", + "rechunk_for_saving", ] @@ -306,7 +307,7 @@ def save_to_netcdf( xarray.Dataset.to_netcdf """ if rechunk: - ds = _rechunk_for_saving(ds, rechunk) + ds = rechunk_for_saving(ds, rechunk) path = Path(filename) path.parent.mkdir(parents=True, exist_ok=True) @@ -393,7 +394,7 @@ def save_to_zarr( ds[v].encoding.clear() if rechunk: - ds = _rechunk_for_saving(ds, rechunk) + ds = rechunk_for_saving(ds, rechunk) path = Path(filename) path.parent.mkdir(parents=True, exist_ok=True) @@ -492,8 +493,22 @@ def coerce_attrs(attrs): raise -def _rechunk_for_saving(ds, rechunk): - """Rechunk before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon.""" +def rechunk_for_saving(ds, rechunk): + """Rechunk before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon. + + Parameters + ---------- + ds : xr.Dataset + The xr.Dataset to be rechunked. + rechunk : dict + A dictionary with the dimension names of ds and the new chunk size. Spatial dimensions + can be provided as X/Y. + + Returns + ------- + xr.Dataset + The dataset with new chunking. + """ for rechunk_var in ds.data_vars: # Support for chunks varying per variable if rechunk_var in rechunk: From 112abc8dd2ea100148e9c8395488a51516b1d828 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Jul 2023 15:46:38 +0000 Subject: [PATCH 15/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_io.py | 24 ++++++++++++------------ xscen/io.py | 26 +++++++++++++------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index e4851464..4ba89ebf 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -2,7 +2,7 @@ import pandas as pd import pytest import xarray as xr -import numpy as np + import xscen as xs @@ -26,16 +26,16 @@ def get_test_dataset(self, data: np.ndarray = None): lon = np.arange(0, data.shape[1]) * 180 / data.shape[1] time = pd.date_range("2010-10-18", periods=data.shape[2]) else: - tas = [[[14.7, 10.1, 17.0], - [3.4, 23.5, 15.1]], - [[8.5, 12.4, 25.8], - [14.5, 5.8, 15.3]]] + tas = [ + [[14.7, 10.1, 17.0], [3.4, 23.5, 15.1]], + [[8.5, 12.4, 25.8], [14.5, 5.8, 15.3]], + ] lon = [-88.83, -88.32] lat = [44.25, 44.21] time = pd.date_range("2010-10-18", periods=3) ds = xr.Dataset( data_vars=dict( - tas=(["lat", "lon", "time"], tas, {'standard_name': 'air_temperature'}) + tas=(["lat", "lon", "time"], tas, {"standard_name": "air_temperature"}) ), coords=dict( lon=(["lon"], lon, {"standard_name": "Longitude", "axis": "X"}), @@ -67,28 +67,28 @@ def get_test_dataset_rlat_rlon(self, data: np.ndarray = None): def test_rechunk_for_saving_lat_lon(self): ds = self.get_test_dataset(np.random.random((30, 30, 50))) - new_chunks = {'lat': 10, 'lon': 10, 'time': 20} + new_chunks = {"lat": 10, "lon": 10, "time": 20} ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) for dim, chunks in ds_ch.chunks.items(): assert chunks[0] == new_chunks[dim] def test_rechunk_for_saving_XY_lat_lon(self): ds = self.get_test_dataset(np.random.random((30, 30, 50))) - new_chunks = {'X': 10, 'Y': 10, 'time': 20} + new_chunks = {"X": 10, "Y": 10, "time": 20} ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) - for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): + for dim, chunks in zip(["X", "Y", "time"], ds_ch.chunks.values()): assert chunks[0] == new_chunks[dim] def test_rechunk_for_saving_rlat_rlon(self): ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) - new_chunks = {'rlat': 10, 'rlon': 10, 'time': 20} + new_chunks = {"rlat": 10, "rlon": 10, "time": 20} ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) for dim, chunks in ds_ch.chunks.items(): assert chunks[0] == new_chunks[dim] def test_rechunk_for_saving_XY_rlat_lon(self): ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) - new_chunks = {'X': 10, 'Y': 10, 'time': 20} + new_chunks = {"X": 10, "Y": 10, "time": 20} ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) - for dim, chunks in zip(['X', 'Y', 'time'], ds_ch.chunks.values()): + for dim, chunks in zip(["X", "Y", "time"], ds_ch.chunks.values()): assert chunks[0] == new_chunks[dim] diff --git a/xscen/io.py b/xscen/io.py index b8fb28aa..54796fef 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -496,19 +496,19 @@ def coerce_attrs(attrs): def rechunk_for_saving(ds, rechunk): """Rechunk before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon. - Parameters - ---------- - ds : xr.Dataset - The xr.Dataset to be rechunked. - rechunk : dict - A dictionary with the dimension names of ds and the new chunk size. Spatial dimensions - can be provided as X/Y. - - Returns - ------- - xr.Dataset - The dataset with new chunking. - """ + Parameters + ---------- + ds : xr.Dataset + The xr.Dataset to be rechunked. + rechunk : dict + A dictionary with the dimension names of ds and the new chunk size. Spatial dimensions + can be provided as X/Y. + + Returns + ------- + xr.Dataset + The dataset with new chunking. + """ for rechunk_var in ds.data_vars: # Support for chunks varying per variable if rechunk_var in rechunk: From 633469d57797c31ad37c48cc918e064b758d3c19 Mon Sep 17 00:00:00 2001 From: Marco Braun <43412203+vindelico@users.noreply.github.com> Date: Tue, 25 Jul 2023 11:47:28 -0400 Subject: [PATCH 16/17] Update HISTORY.rst --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index 611524a9..48508595 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -4,7 +4,7 @@ History v0.7.0 (unreleased) ------------------- -Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`), Trevor James Smith (:user:`Zeitsperre`), Juliette Lavoie (:user:`juliettelavoie`), Marco Braun (:user:`vindelico``. +Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`), Trevor James Smith (:user:`Zeitsperre`), Juliette Lavoie (:user:`juliettelavoie`), Marco Braun (:user:`vindelico`). Announcements ^^^^^^^^^^^^^ From bf9d90601255cf7dbcbb7b95450164f6f7b58f3d Mon Sep 17 00:00:00 2001 From: RondeauG Date: Tue, 25 Jul 2023 17:02:14 -0400 Subject: [PATCH 17/17] reformatted tests --- tests/conftest.py | 120 ++++++++++++++++++++++++++++++++++++++++ tests/test_io.py | 136 +++++++++++++++++++--------------------------- 2 files changed, 175 insertions(+), 81 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a1e19f0f..47993f20 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,11 @@ import shutil from pathlib import Path +import numpy as np +import pandas as pd import pytest +import xarray as xr +from xclim.testing.helpers import test_timeseries as timeseries import xscen as xs @@ -43,3 +47,119 @@ def samplecat(): xr_open_kwargs={"engine": "h5netcdf"}, ) return xs.DataCatalog({"esmcat": xs.catalog.esm_col_data, "df": df}) + + +@pytest.fixture +def datablock_3d(): + def _datablock_3d( + values, + variable, + x, + x_start, + y, + y_start, + x_step=0.1, + y_step=0.1, + start="7/1/2000", + freq="D", + units=None, + as_dataset=False, + ): + """ + Create a generic timeseries object based on pre-defined dictionaries of existing variables. + + Parameters + ---------- + values : np.ndarray + The values to be assigned to the variable. Dimensions are interpreted [T, Y, X]. + variable : str + The variable name. + x : str + The name of the x coordinate. + x_start : float + The starting value of the x coordinate. + y : str + The name of the y coordinate. + y_start : float + The starting value of the y coordinate. + x_step : float + The step between x values. + y_step : float + The step between y values. + start : str + The starting date of the time coordinate. + freq : str + The frequency of the time coordinate. + units : str + The units of the variable. + as_dataset : bool + If True, return a Dataset, else a DataArray. + """ + attrs = { + "lat": { + "units": "degrees_north", + "description": "Latitude.", + "standard_name": "latitude", + }, + "lon": { + "units": "degrees_east", + "description": "Longitude.", + "standard_name": "longitude", + }, + "rlon": { + "units": "degrees", + "description": "Rotated longitude.", + "standard_name": "grid_longitude", + }, + "rlat": { + "units": "degrees", + "description": "Rotated latitude.", + "standard_name": "grid_latitude", + }, + "x": { + "units": "m", + "description": "Projection x coordinate.", + "standard_name": "projection_x_coordinate", + }, + "y": { + "units": "m", + "description": "Projection y coordinate.", + "standard_name": "projection_y_coordinate", + }, + } + + dims = { + "time": xr.DataArray( + pd.date_range(start, periods=values.shape[0], freq=freq), dims="time" + ), + y: xr.DataArray( + np.arange(y_start, y_start + values.shape[1] * y_step, y_step), + dims=y, + attrs=attrs[y], + ), + x: xr.DataArray( + np.arange(x_start, x_start + values.shape[2] * x_step, x_step), + dims=x, + attrs=attrs[x], + ), + } + + # Get the attributes using xclim, then create the DataArray + ts_1d = timeseries(values[:, 0, 0], variable, start, units=units, freq=freq) + da = xr.DataArray( + values, coords=dims, dims=dims, name=variable, attrs=ts_1d.attrs + ) + + # Add the axis information + da[x].attrs["axis"] = "X" + da[y].attrs["axis"] = "Y" + da["time"].attrs["axis"] = "T" + + # TODO: Fully support rotated grids (2D coordinates + grid_mapping) + + if as_dataset: + return da.to_dataset() + else: + return da + + return _datablock_3d diff --git a/tests/test_io.py b/tests/test_io.py index 4ba89ebf..6fa6efaa 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,94 +1,68 @@ import numpy as np -import pandas as pd import pytest -import xarray as xr import xscen as xs -class TestGenericRechunkingDims: - def get_test_dataset(self, data: np.ndarray = None): - """Create a 3D test Dataset. - - Parameters - ---------- - data : a 3D numpy array to populate the dataset, dimensionsa are interpreted [lat, lon, time] - (TODO: do with xarray.DataArray) - - Returns - ------- - ds : Dataset with cf information. - """ - - if isinstance(data, np.ndarray) and len(data.shape) == 3: - tas = data - lat = np.arange(0, data.shape[0]) * 90 / data.shape[0] - lon = np.arange(0, data.shape[1]) * 180 / data.shape[1] - time = pd.date_range("2010-10-18", periods=data.shape[2]) - else: - tas = [ - [[14.7, 10.1, 17.0], [3.4, 23.5, 15.1]], - [[8.5, 12.4, 25.8], [14.5, 5.8, 15.3]], - ] - lon = [-88.83, -88.32] - lat = [44.25, 44.21] - time = pd.date_range("2010-10-18", periods=3) - ds = xr.Dataset( - data_vars=dict( - tas=(["lat", "lon", "time"], tas, {"standard_name": "air_temperature"}) - ), - coords=dict( - lon=(["lon"], lon, {"standard_name": "Longitude", "axis": "X"}), - lat=(["lat"], lat, {"standard_name": "Latitude", "axis": "Y"}), - time=(["time"], time, {"standard_name": "time"}), - ), - attrs=dict(description="Climate data dummy."), - ) - return ds - - def get_test_dataset_rlat_rlon(self, data: np.ndarray = None): - ds = self.get_test_dataset(data) - # make dummy rlat / rlon - ds = ds.assign_coords( - {"rlon": ("lon", ds.lon.values), "rlat": ("lat", ds.lat.values)} +class TestRechunkForSaving: + @pytest.mark.parametrize( + "dims, xy", + [ + (["lon", "lat"], True), + (["lon", "lat"], False), + (["rlon", "rlat"], True), + (["rlon", "rlat"], False), + ], + ) + def test_options(self, datablock_3d, dims, xy): + ds = datablock_3d( + np.random.random((30, 30, 50)), + variable="tas", + x=dims[0], + x_start=-70 if dims[0] == "lon" else 0, + y=dims[1], + y_start=45 if dims[1] == "lat" else 0, + as_dataset=True, ) - ds = ds.swap_dims({"lat": "rlat", "lon": "rlon"}) - ds.coords["rlon"] = ds.coords["rlon"].assign_attrs(ds["lon"].attrs) - ds.coords["rlon"] = ds.coords["rlon"].assign_attrs( - {"standard_name": "grid_longitude"} - ) - ds.coords["rlat"] = ds.coords["rlat"].assign_attrs(ds["lat"].attrs) - ds.coords["rlat"] = ds.coords["rlat"].assign_attrs( - {"standard_name": "grid_latitude"} - ) - ds = ds.drop_vars(["lat", "lon"]) - - return ds - - def test_rechunk_for_saving_lat_lon(self): - ds = self.get_test_dataset(np.random.random((30, 30, 50))) - new_chunks = {"lat": 10, "lon": 10, "time": 20} + x = "X" if xy else dims[0] + y = "Y" if xy else dims[1] + new_chunks = {y: 10, x: 10, "time": 20} ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) for dim, chunks in ds_ch.chunks.items(): + dim = ( + dim + if (xy is False or dim == "time") + else "X" + if dim == dims[0] + else "Y" + ) assert chunks[0] == new_chunks[dim] - def test_rechunk_for_saving_XY_lat_lon(self): - ds = self.get_test_dataset(np.random.random((30, 30, 50))) - new_chunks = {"X": 10, "Y": 10, "time": 20} - ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) - for dim, chunks in zip(["X", "Y", "time"], ds_ch.chunks.values()): - assert chunks[0] == new_chunks[dim] - - def test_rechunk_for_saving_rlat_rlon(self): - ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) - new_chunks = {"rlat": 10, "rlon": 10, "time": 20} - ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) - for dim, chunks in ds_ch.chunks.items(): - assert chunks[0] == new_chunks[dim] + def test_variables(self, datablock_3d): + ds = datablock_3d( + np.random.random((30, 30, 50)), + variable="tas", + x="lon", + x_start=-70, + y="lat", + y_start=45, + as_dataset=True, + ) + ds["pr"] = datablock_3d( + np.random.random((30, 30, 50)), + variable="pr", + x="lon", + x_start=-70, + y="lat", + y_start=45, + as_dataset=False, + ) - def test_rechunk_for_saving_XY_rlat_lon(self): - ds = self.get_test_dataset_rlat_rlon(np.random.random((30, 30, 50))) - new_chunks = {"X": 10, "Y": 10, "time": 20} + new_chunks = { + "tas": {"time": 20, "lon": 10, "lat": 7}, + "pr": {"time": 10, "lon": 5, "lat": 3}, + } ds_ch = xs.io.rechunk_for_saving(ds, new_chunks) - for dim, chunks in zip(["X", "Y", "time"], ds_ch.chunks.values()): - assert chunks[0] == new_chunks[dim] + for v in ds_ch.data_vars: + for dim, chunks in zip(list(ds.dims), ds_ch[v].chunks): + assert chunks[0] == new_chunks[v][dim]