Commit

Merge pull request #63 from Ouranosinc/reduce
Ensemble reduction and changes to Ensembles
RondeauG committed Sep 15, 2022
2 parents c5e583c + 9e7a387 commit a3919b5
Showing 9 changed files with 443 additions and 52 deletions.
10 changes: 9 additions & 1 deletion HISTORY.rst
@@ -25,10 +25,15 @@ New features and enhancements
* Do not fail for any grid mapping problem, including if a grid_mapping attribute mentions a variable that doesn't exist.
* Default email sent to the local user. (:pull:`68`).
* Special accelerated pathway for parsing catalogs with all dates within the datetime64[ns] range (:pull:`75`).
* New functions ``reduce_ensemble`` and ``build_reduction_data`` to support KKZ and k-means clustering (:issue:`4`, :pull:`63`).
* `ensemble_stats` can now loop through multiple statistics, support functions located in `xclim.ensembles._robustness`, and supports weighted realizations (:pull:`63`).
* New function `ensemble_stats.generate_weights` that estimates weights based on simulation metadata (:pull:`63`).
* New function `catalog.unstack_id` to reverse-engineer IDs (:pull:`63`).
* `generate_id` now accepts Datasets (:pull:`63`).
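
The KKZ method named in the first bullet is a greedy selection scheme. As a rough illustration only (this is not the xscen/xclim implementation, and the function name is hypothetical), it starts from the ensemble member closest to the centroid and then repeatedly adds the member farthest, in Euclidean distance, from everything already selected:

```python
import numpy as np

def kkz_select(points: np.ndarray, n: int) -> list:
    """Sketch of KKZ selection on an (n_members, n_criteria) array.

    Start from the member closest to the ensemble centroid, then greedily
    add the member whose distance to the already-selected set is largest.
    """
    centroid = points.mean(axis=0)
    selected = [int(np.argmin(np.linalg.norm(points - centroid, axis=1)))]
    while len(selected) < n:
        # Distance of every member to its nearest already-selected member
        dists = np.min(
            np.linalg.norm(points[:, None, :] - points[selected][None, :, :], axis=-1),
            axis=1,
        )
        dists[selected] = -np.inf  # never re-select a member
        selected.append(int(np.argmax(dists)))
    return selected

members = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0], [-1.0, -1.0]])
print(kkz_select(members, 3))  # [0, 2, 3]
```

``reduce_ensemble`` itself operates on labelled xarray data and presumably builds on established clustering code; the sketch above only shows the selection principle on raw coordinates.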

Breaking changes
^^^^^^^^^^^^^^^^
* N/A
* In `ensemble_stats`, the `statistics` argument has changed and `stats_kwargs` has been removed (:pull:`63`).

Bug fixes
^^^^^^^^^
@@ -41,6 +46,9 @@ Internal changes
* Default method of `xs.extract.resample` now depends on frequency. (:issue:`57`, :pull:`58`).
* Bugfix for `_restrict_by_resolution` with CMIP6 datasets (:pull:`71`).
* More complete check of coverage in ``_subset_file_coverage`` (:issue:`70`, :pull:`72`).
* The code that performs `common_attrs_only` in `ensemble_stats` has been moved to `clean_up` (:pull:`63`).
* Removed the default `to_level` in `clean_up` (:pull:`63`).


v0.3.0 (2022-08-23)
-------------------
7 changes: 7 additions & 0 deletions docs/api.rst
@@ -29,6 +29,13 @@ Controlled Vocabulary and Mappings
:members:
:noindex:

Reduction
----------

.. automodule:: xscen.reduce
:members:
:noindex:

Regridding
----------

1 change: 1 addition & 0 deletions docs/index.rst
@@ -31,6 +31,7 @@ Features
notebooks/getting_started
notebooks/config_usage
notebooks/diagnostics
notebooks/ensemble_reduction
columns
api
contributing
2 changes: 1 addition & 1 deletion docs/notebooks
Submodule notebooks updated from 1a181b to dad3e4
2 changes: 2 additions & 0 deletions xscen/__init__.py
@@ -12,6 +12,7 @@
extract,
indicators,
io,
reduce,
regrid,
scripting,
utils,
@@ -27,6 +28,7 @@
from .extract import extract_dataset, search_data_catalogs # noqa
from .indicators import compute_indicators # noqa
from .io import save_to_netcdf, save_to_zarr # noqa
from .reduce import build_reduction_data, reduce_ensemble  # noqa
from .regrid import *
from .scripting import (
TimeoutException,
56 changes: 54 additions & 2 deletions xscen/catalog.py
@@ -44,6 +44,7 @@
"generate_id",
"parse_directory",
"parse_from_ds",
"unstack_id",
]


@@ -1233,20 +1234,71 @@ def _parse_date(date, fmts):
    return date


def generate_id(df: pd.DataFrame, id_columns: Optional[list] = None):
def generate_id(df: Union[pd.DataFrame, xr.Dataset], id_columns: Optional[list] = None):
    """Utility to create an ID from column entries.

    Parameters
    ----------
    df : pd.DataFrame, xr.Dataset
        Data for which to create an ID.
    id_columns : list
        List of column names on which to base the dataset definition. Empty columns will be skipped.
        If None (default), uses :py:data:`ID_COLUMNS`.
    """
    if isinstance(df, xr.Dataset):
        df = pd.DataFrame.from_dict(
            {
                key[4:]: [value]
                for key, value in df.attrs.items()
                if key.startswith("cat:")
            }
        )

    id_columns = [x for x in (id_columns or ID_COLUMNS) if x in df.columns]

    return df[id_columns].apply(
        lambda row: "_".join(map(str, filter(pd.notna, row.values))), axis=1
    )
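
The new ``xr.Dataset`` branch above can be tried in isolation. The following is a minimal sketch of the same logic on a plain attribute dictionary; the ``ID_COLUMNS`` list here is a hypothetical subset of the real module-level constant in ``xscen.catalog``:

```python
import pandas as pd

# Hypothetical subset of xscen's ID_COLUMNS, for illustration only.
ID_COLUMNS = ["activity", "institution", "source", "experiment", "member"]

def generate_id_sketch(attrs: dict, id_columns=None) -> str:
    """Mimic generate_id's Dataset pathway: keep only 'cat:'-prefixed
    attributes, build a one-row DataFrame, and join non-null entries with '_'."""
    df = pd.DataFrame.from_dict(
        {key[4:]: [value] for key, value in attrs.items() if key.startswith("cat:")}
    )
    cols = [c for c in (id_columns or ID_COLUMNS) if c in df.columns]
    return (
        df[cols]
        .apply(lambda row: "_".join(map(str, filter(pd.notna, row.values))), axis=1)
        .iloc[0]
    )

attrs = {
    "cat:activity": "CMIP6",
    "cat:source": "CanESM5",
    "cat:experiment": "ssp585",
    "cat:member": "r1i1p1f1",
    "history": "not part of the ID",  # ignored: no 'cat:' prefix
}
print(generate_id_sketch(attrs))  # CMIP6_CanESM5_ssp585_r1i1p1f1
```

Note how the missing ``institution`` column is simply skipped rather than producing an empty ``_`` segment.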


def unstack_id(df: Union[pd.DataFrame, ProjectCatalog, DataCatalog]) -> dict:
    """Utility that reverse-engineers an ID using catalog entries.

    Parameters
    ----------
    df : Union[pd.DataFrame, ProjectCatalog, DataCatalog]
        Either a Project/DataCatalog or the pandas DataFrame.

    Returns
    -------
    dict
        Dictionary with one entry per unique ID, which are themselves dictionaries of all the individual parts of the ID.
    """
    if isinstance(df, (ProjectCatalog, DataCatalog)):
        df = df.df

    out = {}
    for ids in pd.unique(df["id"]):
        subset = df[df["id"] == ids]

        # Only keep relevant columns
        subset = subset[
            [
                col
                for col in subset.columns
                if bool(
                    re.search(
                        f"((_)|(^)){re.escape(str(subset[col].iloc[0]))}((_)|($))", ids
                    )
                )
            ]
        ].drop("id", axis=1)

        # Make sure that all elements are the same, if there are multiple lines
        if len(subset) > 1:
            if not all(subset[col].nunique() == 1 for col in subset.columns):
                raise ValueError(
                    "Not all elements of the columns are the same for a given ID!"
                )

        out[ids] = {attr: subset[attr].iloc[0] for attr in subset.columns}

    return out
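
Here is a self-contained sketch of the same reverse-engineering logic on a made-up catalog table (the helper name and data are hypothetical; ``re.escape`` is a defensive addition so that values containing regex metacharacters cannot break the match):

```python
import re
import pandas as pd

def unstack_id_sketch(df: pd.DataFrame) -> dict:
    """Mimic unstack_id: for each unique ID, keep the columns whose value
    appears as an underscore-delimited token of the ID string."""
    out = {}
    for ids in pd.unique(df["id"]):
        subset = df[df["id"] == ids]
        keep = [
            col
            for col in subset.columns
            if col != "id"
            and re.search(
                f"((_)|(^)){re.escape(str(subset[col].iloc[0]))}((_)|($))", ids
            )
        ]
        out[ids] = {col: subset[col].iloc[0] for col in keep}
    return out

cat_df = pd.DataFrame(
    {
        "id": ["CMIP6_CanESM5_ssp585", "CMIP6_CanESM5_historical"],
        "source": ["CanESM5", "CanESM5"],
        "experiment": ["ssp585", "historical"],
        "path": ["/data/a.nc", "/data/b.nc"],  # not a token of the ID, dropped
    }
)
print(unstack_id_sketch(cat_df)["CMIP6_CanESM5_ssp585"])
# {'source': 'CanESM5', 'experiment': 'ssp585'}
```

The delimiter-anchored regex is what prevents false positives such as a value matching only a substring in the middle of another ID part.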
