Skip to content

Commit

Permalink
Add ML processes for 2.1.0 #416
Browse files Browse the repository at this point in the history
  • Loading branch information
m-mohr committed Mar 14, 2023
1 parent 5345c19 commit 7e2d30e
Show file tree
Hide file tree
Showing 6 changed files with 365 additions and 0 deletions.
6 changes: 6 additions & 0 deletions meta/subtype-schemas.json
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,12 @@
}
}
},
"ml-model": {
"type": "object",
"subtype": "ml-model",
"title": "Machine Learning Model",
"description": "A machine learning model, accompanied by STAC metadata that implements the STAC ml-model extension."
},
"output-format": {
"type": "string",
"subtype": "output-format",
Expand Down
110 changes: 110 additions & 0 deletions proposals/fit_class_random_forest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
{
"id": "fit_class_random_forest",
"summary": "Train a random forest classification model",
"description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data into test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).",
"categories": [
"machine learning"
],
"experimental": true,
"parameters": [
{
"name": "predictors",
"description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
"schema": [
{
"type": "object",
"subtype": "datacube",
"dimensions": [
{
"type": "geometry"
},
{
"type": "bands"
}
]
},
{
"type": "object",
"subtype": "datacube",
"dimensions": [
{
"type": "geometry"
},
{
"type": "other"
}
]
}
]
},
{
"name": "target",
"description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
"schema": {
"type": "object",
"subtype": "datacube",
"dimensions": [
{
"type": "geometry"
}
]
}
},
{
"name": "max_variables",
"description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. This is often the default for classification.",
"schema": [
{
"type": "integer",
"minimum": 1
},
{
"type": "string",
"enum": [
"all",
"log2",
"onethird",
"sqrt"
]
}
]
},
{
"name": "num_trees",
"description": "The number of trees built within the Random Forest classification.",
"optional": true,
"default": 100,
"schema": {
"type": "integer",
"minimum": 1
}
},
{
"name": "seed",
"description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
"optional": true,
"default": null,
"schema": {
"type": [
"integer",
"null"
]
}
}
],
"returns": {
"description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
"schema": {
"type": "object",
"subtype": "ml-model"
}
},
"links": [
{
"href": "https://doi.org/10.1023/A:1010933404324",
"title": "Breiman (2001): Random Forests",
"type": "text/html",
"rel": "about"
}
]
}
110 changes: 110 additions & 0 deletions proposals/fit_regr_random_forest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
{
"id": "fit_regr_random_forest",
"summary": "Train a random forest regression model",
"description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data into test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).",
"categories": [
"machine learning"
],
"experimental": true,
"parameters": [
{
"name": "predictors",
"description": "The predictors for the regression model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
"schema": [
{
"type": "object",
"subtype": "datacube",
"dimensions": [
{
"type": "geometry"
},
{
"type": "bands"
}
]
},
{
"type": "object",
"subtype": "datacube",
"dimensions": [
{
"type": "geometry"
},
{
"type": "other"
}
]
}
]
},
{
"name": "target",
"description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
"schema": {
"type": "object",
"subtype": "datacube",
"dimensions": [
{
"type": "geometry"
}
]
}
},
{
"name": "max_variables",
"description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.",
"schema": [
{
"type": "integer",
"minimum": 1
},
{
"type": "string",
"enum": [
"all",
"log2",
"onethird",
"sqrt"
]
}
]
},
{
"name": "num_trees",
"description": "The number of trees built within the Random Forest regression.",
"optional": true,
"default": 100,
"schema": {
"type": "integer",
"minimum": 1
}
},
{
"name": "seed",
"description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
"optional": true,
"default": null,
"schema": {
"type": [
"integer",
"null"
]
}
}
],
"returns": {
"description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
"schema": {
"type": "object",
"subtype": "ml-model"
}
},
"links": [
{
"href": "https://doi.org/10.1023/A:1010933404324",
"title": "Breiman (2001): Random Forests",
"type": "text/html",
"rel": "about"
}
]
}
53 changes: 53 additions & 0 deletions proposals/load_ml_model.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"id": "load_ml_model",
"summary": "Load a ML model",
"description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``fit_regr_random_forest()`` and ``save_ml_model()``.",
"categories": [
"machine learning",
"import"
],
"experimental": true,
"parameters": [
{
"name": "id",
"description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.",
"schema": [
{
"title": "URL",
"type": "string",
"format": "uri",
"subtype": "uri",
"pattern": "^https?://"
},
{
"title": "Batch Job ID",
"description": "Loading a model by batch job ID is possible only if a single model has been saved by the job. Otherwise, you have to load a specific model from a batch job by URL.",
"type": "string",
"subtype": "job-id",
"pattern": "^[\\w\\-\\.~]+$"
},
{
"title": "User-uploaded File",
"type": "string",
"subtype": "file-path",
"pattern": "^[^\r\n\\:'\"]+$"
}
]
}
],
"returns": {
"description": "A machine learning model to be used with machine learning processes such as ``predict_random_forest()``.",
"schema": {
"type": "object",
"subtype": "ml-model"
}
},
"links": [
{
"href": "https://github.com/stac-extensions/ml-model",
"title": "STAC ml-model extension",
"type": "text/html",
"rel": "about"
}
]
}
42 changes: 42 additions & 0 deletions proposals/predict_random_forest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"id": "predict_random_forest",
"summary": "Predict values based on a Random Forest model",
"description": "Applies a Random Forest machine learning model to an array and predicts a value for it.",
"categories": [
"machine learning",
"reducer"
],
"experimental": true,
"parameters": [
{
"name": "data",
"description": "An array of numbers.",
"schema": {
"type": "array",
"items": {
"type": [
"number",
"null"
]
}
}
},
{
"name": "model",
"description": "A model object that can be trained with the processes ``fit_regr_random_forest()`` (regression) and ``fit_class_random_forest()`` (classification).",
"schema": {
"type": "object",
"subtype": "ml-model"
}
}
],
"returns": {
"description": "The predicted value. Returns `null` if any of the given values in the array is a no-data value.",
"schema": {
"type": [
"number",
"null"
]
}
}
}
44 changes: 44 additions & 0 deletions proposals/save_ml_model.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"id": "save_ml_model",
"summary": "Save a ML model",
"description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).",
"categories": [
"machine learning",
"export"
],
"experimental": true,
"parameters": [
{
"name": "data",
"description": "The data to store as a machine learning model.",
"schema": {
"type": "object",
"subtype": "ml-model"
}
},
{
"name": "options",
"description": "Additional parameters to create the file(s).",
"schema": {
"type": "object",
"additionalParameters": false
},
"default": {},
"optional": true
}
],
"returns": {
"description": "Returns `false` if the process failed to store the model, `true` otherwise.",
"schema": {
"type": "boolean"
}
},
"links": [
{
"href": "https://github.com/stac-extensions/ml-model",
"title": "STAC ml-model extension",
"type": "text/html",
"rel": "about"
}
]
}

0 comments on commit 7e2d30e

Please sign in to comment.