From 0f09efc515d34c697cd9b7712f962b121922f610 Mon Sep 17 00:00:00 2001 From: Kevin Kho Date: Tue, 24 Oct 2023 15:46:39 -0500 Subject: [PATCH] MLFlow Tutorial (#676) * MLFlow tutorial * Cleaning notebook * Editing imports * Adding Mintlify --- .gitignore | 1 + nbs/docs/how-to-guides/MLFlow.ipynb | 674 ++++++++++++++++++++++++++++ nbs/mint.json | 1 + 3 files changed, 676 insertions(+) create mode 100644 nbs/docs/how-to-guides/MLFlow.ipynb diff --git a/.gitignore b/.gitignore index 92ef65b65..e54b2aa52 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ _proc/ nbs/.last_checked .venv .idea +mlruns/ .luarc.json diff --git a/nbs/docs/how-to-guides/MLFlow.ipynb b/nbs/docs/how-to-guides/MLFlow.ipynb new file mode 100644 index 000000000..f2c4119c9 --- /dev/null +++ b/nbs/docs/how-to-guides/MLFlow.ipynb @@ -0,0 +1,674 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "import warnings\n", + "import logging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "warnings.simplefilter('ignore')\n", + "logging.getLogger('statsforecast').setLevel(logging.ERROR)\n", + "logging.getLogger(\"mlflow\").setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MLFlow\n", + "\n", + "> Run Statsforecast with MLFlow.\n", + "\n", + "[MLFlow](https://github.com/mlflow/mlflow/) is an open source experiment tracking system that helps data scientists manage the model lifecycle from experimentation to production. An MLFlow integration for statsforecast is available in the [MLFlow](https://github.com/ml-toolkits/mlflavors) library that contains MLFlow support for popular machine learning libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from statsforecast.utils import generate_series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique_iddsystatic_0
002000-01-0112.07389743
102000-01-0259.73416643
202000-01-03101.26079443
302000-01-04143.98743043
402000-01-05185.32040643
\n", + "
" + ], + "text/plain": [ + " unique_id ds y static_0\n", + "0 0 2000-01-01 12.073897 43\n", + "1 0 2000-01-02 59.734166 43\n", + "2 0 2000-01-03 101.260794 43\n", + "3 0 2000-01-04 143.987430 43\n", + "4 0 2000-01-05 185.320406 43" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "series = generate_series(5, min_length=50, max_length=50, equal_ends=True, n_static_features=1)\n", + "series.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the next part, `mlflow` and `mlflavors` are needed. Install them with:\n", + "\n", + "```bash\n", + "pip install mlflow mlflavors\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Logging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import mlflow\n", + "from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error\n", + "from statsforecast import StatsForecast\n", + "from statsforecast.models import AutoARIMA\n", + "\n", + "import mlflavors\n", + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Metrics: \n", + "{'mae': 6.712853959225143, 'mape': 0.11719246764336884}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023/10/20 23:45:36 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /var/folders/w2/91_v34nx0xs2npnl3zsl9tmm0000gn/T/tmpt4686vpu/model/model.pkl, flavor: statsforecast), fall back to return ['statsforecast==1.6.0']. Set logging level to DEBUG to see the full traceback.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "MLflow run id:\n", + "0319bbd664424fcd88d6c532e3ecac77\n" + ] + } + ], + "source": [ + "ARTIFACT_PATH = \"model\"\n", + "DATA_PATH = \"./data\"\n", + "HORIZON = 7\n", + "LEVEL = [90]\n", + "\n", + "with mlflow.start_run() as run:\n", + " series = generate_series(5, min_length=50, max_length=50, equal_ends=True, n_static_features=1)\n", + " \n", + " train_df = series.groupby('unique_id').head(43)\n", + " test_df = series.groupby('unique_id').tail(7)\n", + " X_test = test_df.drop(columns=[\"y\"])\n", + " y_test = test_df[[\"y\"]]\n", + "\n", + " models = [AutoARIMA(season_length=7)]\n", + "\n", + " sf = StatsForecast(df=train_df, models=models, freq=\"D\", n_jobs=-1)\n", + "\n", + " sf.fit()\n", + "\n", + " # Evaluate model\n", + " y_pred = sf.predict(h=HORIZON, X_df=X_test, level=LEVEL)[\"AutoARIMA\"]\n", + "\n", + " metrics = {\n", + " \"mae\": mean_absolute_error(y_test, y_pred),\n", + " \"mape\": mean_absolute_percentage_error(y_test, y_pred),\n", + " }\n", + "\n", + " print(f\"Metrics: \\n{metrics}\")\n", + "\n", + " # Log metrics\n", + " mlflow.log_metrics(metrics)\n", + "\n", + " # Log model using pickle serialization (default).\n", + " mlflavors.statsforecast.log_model(\n", + " statsforecast_model=sf,\n", + " artifact_path=ARTIFACT_PATH,\n", + " serialization_format=\"pickle\",\n", + " )\n", + " model_uri = mlflow.get_artifact_uri(ARTIFACT_PATH)\n", + "\n", + "print(f\"\\nMLflow run id:\\n{run.info.run_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Viewing Experiment\n", + "\n", + "To view the newly created experiment and logged artifacts open the MLflow UI:\n", + "\n", + 
"```bash\n", + "mlflow ui\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading Statsforecast Model\n", + "\n", + "The `statsforecast` model can be loaded from the MLFlow registry using the `mlflow.statsforecast.load_model` function and used to generate predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dsAutoARIMAAutoARIMA-lo-90AutoARIMA-hi-90
unique_id
02000-02-1355.89443244.34388067.444984
02000-02-1497.81805486.267502109.368607
02000-02-15146.745422135.194870158.295975
02000-02-16188.888336177.337784200.438904
02000-02-17231.493637219.943085243.044189
\n", + "
" + ], + "text/plain": [ + " ds AutoARIMA AutoARIMA-lo-90 AutoARIMA-hi-90\n", + "unique_id \n", + "0 2000-02-13 55.894432 44.343880 67.444984\n", + "0 2000-02-14 97.818054 86.267502 109.368607\n", + "0 2000-02-15 146.745422 135.194870 158.295975\n", + "0 2000-02-16 188.888336 177.337784 200.438904\n", + "0 2000-02-17 231.493637 219.943085 243.044189" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loaded_model = mlflavors.statsforecast.load_model(model_uri=model_uri)\n", + "results = loaded_model.predict(h=HORIZON, X_df=X_test, level=LEVEL)\n", + "results.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading Model with pyfunc\n", + "\n", + "[Pyfunc](https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html) is another interface for MLFlow models that has utilities for loading and saving models. This code is equivalent in making predictions as above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dsAutoARIMAAutoARIMA-lo-90AutoARIMA-hi-90
unique_id
02000-02-1355.89443244.34388067.444984
02000-02-1497.81805486.267502109.368607
02000-02-15146.745422135.194870158.295975
02000-02-16188.888336177.337784200.438904
02000-02-17231.493637219.943085243.044189
\n", + "
" + ], + "text/plain": [ + " ds AutoARIMA AutoARIMA-lo-90 AutoARIMA-hi-90\n", + "unique_id \n", + "0 2000-02-13 55.894432 44.343880 67.444984\n", + "0 2000-02-14 97.818054 86.267502 109.368607\n", + "0 2000-02-15 146.745422 135.194870 158.295975\n", + "0 2000-02-16 188.888336 177.337784 200.438904\n", + "0 2000-02-17 231.493637 219.943085 243.044189" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loaded_pyfunc = mlflavors.statsforecast.pyfunc.load_model(model_uri=model_uri)\n", + "\n", + "# Convert test data to 2D numpy array so it can be passed to pyfunc predict using\n", + "# a single-row Pandas DataFrame configuration argument\n", + "X_test_array = X_test.to_numpy()\n", + "\n", + "# Create configuration DataFrame\n", + "predict_conf = pd.DataFrame(\n", + " [\n", + " {\n", + " \"X\": X_test_array,\n", + " \"X_cols\": X_test.columns,\n", + " \"X_dtypes\": list(X_test.dtypes),\n", + " \"h\": HORIZON,\n", + " \"level\": LEVEL,\n", + " }\n", + " ]\n", + ")\n", + "\n", + "\n", + "pyfunc_result = loaded_pyfunc.predict(predict_conf)\n", + "pyfunc_result.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Serving\n", + "\n", + "This section illustrates an example of serving the `pyfunc` flavor to a local REST API endpoint and subsequently requesting a prediction from the served model. To serve the model run the command below where you substitute the run id printed during execution training code.\n", + "\n", + "```bash\n", + "mlflow models serve -m runs://model --env-manager local --host 127.0.0.1\n", + "```\n", + "\n", + "After running this, the code below can be ran to send a request." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "HORIZON = 7\n", + "LEVEL = [90, 95]\n", + "\n", + "# Define local host and endpoint url\n", + "host = \"127.0.0.1\"\n", + "url = f\"http://{host}:5000/invocations\"\n", + "\n", + "# Convert DateTime to string for JSON serialization\n", + "X_test_pyfunc = X_test.copy()\n", + "X_test_pyfunc[\"ds\"] = X_test_pyfunc[\"ds\"].dt.strftime(date_format=\"%Y-%m-%d\")\n", + "\n", + "# Convert to list for JSON serialization\n", + "X_test_list = X_test_pyfunc.to_numpy().tolist()\n", + "\n", + "# Convert index to list of strings for JSON serialization\n", + "X_cols = list(X_test.columns)\n", + "\n", + "# Convert dtypes to string for JSON serialization\n", + "X_dtypes = [str(dtype) for dtype in list(X_test.dtypes)]\n", + "\n", + "predict_conf = pd.DataFrame(\n", + " [\n", + " {\n", + " \"X\": X_test_list,\n", + " \"X_cols\": X_cols,\n", + " \"X_dtypes\": X_dtypes,\n", + " \"h\": HORIZON,\n", + " \"level\": LEVEL,\n", + " }\n", + " ]\n", + ")\n", + "\n", + "# Create dictionary with pandas DataFrame in the split orientation\n", + "json_data = {\"dataframe_split\": predict_conf.to_dict(orient=\"split\")}\n", + "\n", + "# Score model\n", + "response = requests.post(url, json=json_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dsAutoARIMAAutoARIMA-lo-95AutoARIMA-lo-90AutoARIMA-hi-90AutoARIMA-hi-95
02000-02-13T00:00:0055.89443242.13110044.34388067.44498469.657768
12000-02-14T00:00:0097.81805484.05471886.267502109.368607111.581390
22000-02-15T00:00:00146.745422132.982086135.194870158.295975160.508759
32000-02-16T00:00:00188.888336175.125015177.337784200.438904202.651672
42000-02-17T00:00:00231.493637217.730301219.943085243.044189245.256973
\n", + "
" + ], + "text/plain": [ + " ds AutoARIMA AutoARIMA-lo-95 AutoARIMA-lo-90 \\\n", + "0 2000-02-13T00:00:00 55.894432 42.131100 44.343880 \n", + "1 2000-02-14T00:00:00 97.818054 84.054718 86.267502 \n", + "2 2000-02-15T00:00:00 146.745422 132.982086 135.194870 \n", + "3 2000-02-16T00:00:00 188.888336 175.125015 177.337784 \n", + "4 2000-02-17T00:00:00 231.493637 217.730301 219.943085 \n", + "\n", + " AutoARIMA-hi-90 AutoARIMA-hi-95 \n", + "0 67.444984 69.657768 \n", + "1 109.368607 111.581390 \n", + "2 158.295975 160.508759 \n", + "3 200.438904 202.651672 \n", + "4 243.044189 245.256973 " + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(response.json()['predictions']).head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/mint.json b/nbs/mint.json index 86817bc40..a91529e83 100644 --- a/nbs/mint.json +++ b/nbs/mint.json @@ -62,6 +62,7 @@ "docs/how-to-guides/exogenous.html", "docs/how-to-guides/getting_started_complete_polars.html", "docs/how-to-guides/migrating_R", + "docs/how-to-guides/MLFlow.html", "docs/how-to-guides/numba_cache.html", "docs/how-to-guides/prophet_spark_m5.html", "docs/how-to-guides/ray.html",