Open-EO · m-mohr · May 15, 2023 · May 15, 2023 · May 15, 2023 · May 16, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,7 +16,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
     - `filter_vector`
     - `flatten_dimensions`
     - `load_geojson`
+    - `load_ml_model`
     - `load_url`
+    - `ml_fit_class_random_forest`
+    - `ml_fit_regr_random_forest`
+    - `ml_predict`
+    - `save_ml_model`
     - `unflatten_dimension`
     - `vector_buffer`
     - `vector_reproject`

diff --git a/meta/subtype-schemas.json b/meta/subtype-schemas.json
@@ -232,6 +232,12 @@
                 }
             }
         },
+        "ml-model": {
+            "type": "object",
+            "subtype": "ml-model",
+            "title": "Machine Learning Model",
+            "description": "A machine learning model, accompanied by STAC metadata that implements the STAC ml-model extension."
+        },
         "output-format": {
             "type": "string",
             "subtype": "output-format",

diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json
@@ -0,0 +1,46 @@
+{
+    "id": "load_ml_model",
+    "summary": "Load an ML model",
+    "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit_regr_random_forest()`` and ``save_ml_model()``.",
+    "categories": [
+        "machine learning",
+        "import"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "uri",
+            "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.",
+            "schema": [
+                {
+                    "title": "URL",
+                    "type": "string",
+                    "format": "uri",
+                    "subtype": "uri",
+                    "pattern": "^https?://"
+                },
+                {
+                    "title": "User-uploaded File",
+                    "type": "string",
+                    "subtype": "file-path",
+                    "pattern": "^[^\r\n\\:'\"]+$"
+                }
+            ]
+        }
+    ],
+    "returns": {
+        "description": "A machine learning model to be used with machine learning processes such as ``ml_predict()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://github.com/stac-extensions/ml-model",
+            "title": "STAC ml-model extension",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/ml_fit_class_random_forest.json b/proposals/ml_fit_class_random_forest.json
@@ -0,0 +1,110 @@
+{
+    "id": "ml_fit_class_random_forest",
+    "summary": "Train a random forest classification model",
+    "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
+            "schema": [
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "bands"
+                        }
+                    ]
+                },
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "other"
+                        }
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "target",
+            "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "max_variables",
+            "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables is considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables is considered for each split.\n- `onethird`: A third of the number of variables is considered for each split.\n- `sqrt`: The square root of the number of variables is considered for each split. This is often the default for classification.",
+            "schema": [
+                {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                {
+                    "type": "string",
+                    "enum": [
+                        "all",
+                        "log2",
+                        "onethird",
+                        "sqrt"
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "num_trees",
+            "description": "The number of trees built within the Random Forest classification.",
+            "optional": true,
+            "default": 100,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://doi.org/10.1023/A:1010933404324",
+            "title": "Breiman (2001): Random Forests",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/ml_fit_regr_random_forest.json b/proposals/ml_fit_regr_random_forest.json
@@ -0,0 +1,110 @@
+{
+    "id": "ml_fit_regr_random_forest",
+    "summary": "Train a random forest regression model",
+    "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the regression model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
+            "schema": [
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "bands"
+                        }
+                    ]
+                },
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "other"
+                        }
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "target",
+            "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "max_variables",
+            "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables is considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables is considered for each split.\n- `onethird`: A third of the number of variables is considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables is considered for each split.",
+            "schema": [
+                {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                {
+                    "type": "string",
+                    "enum": [
+                        "all",
+                        "log2",
+                        "onethird",
+                        "sqrt"
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "num_trees",
+            "description": "The number of trees built within the Random Forest regression.",
+            "optional": true,
+            "default": 100,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://doi.org/10.1023/A:1010933404324",
+            "title": "Breiman (2001): Random Forests",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/ml_predict.json b/proposals/ml_predict.json
@@ -0,0 +1,49 @@
+{
+    "id": "ml_predict",
+    "summary": "Predict using ML",
+    "description": "Applies a machine learning model to a data cube of input features and returns the predicted values.",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "data",
+            "description": "The data cube containing the input features.",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube"
+            }
+        },
+        {
+            "name": "model",
+            "description": "An ML model that was trained with one of the ML training processes such as ``ml_fit_regr_random_forest()``.",
+            "schema": {
+                "type": "object",
+                "subtype": "ml-model"
+            }
+        },
+        {
+            "name": "dimensions",
+            "description": "Zero or more dimensions that will be reduced by the model. Fails with a `DimensionNotAvailable` exception if one of the specified dimensions does not exist.",
+            "schema": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                }
+            }
+        }
+    ],
+    "returns": {
+        "description": "A data cube with the predicted values. It removes the specified dimensions and adds a new dimension for the predicted values. The new dimension has the name `predictions` and is of type `other`. If a single value is returned, the dimension has a single label with name `0`.",
+        "schema": {
+            "type": "object",
+            "subtype": "datacube",
+            "dimensions": [
+                {
+                    "type": "other"
+                }
+            ]
+        }
+    }
+}
diff --git a/proposals/predict_curve.json b/proposals/predict_curve.json
@@ -1,7 +1,7 @@
 {
     "id": "predict_curve",
-    "summary": "Predict values",
-    "description": "Predict values using a model function and pre-computed parameters. The process is intended to compute values for new labels.",
+    "summary": "Predict values using a model function",
+    "description": "Predict values using a model function and pre-computed parameters. The process is primarily intended to compute values for new labels, but it can also fill gaps where existing labels contain no-data (`null`) values.",
     "categories": [
         "cubes",
         "math"