Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Ability to save and load StatsForecast #667

Merged
merged 21 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions nbs/src/core/core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,15 @@
"import logging\n",
"import reprlib\n",
"import warnings\n",
"import errno\n",
"import os\n",
"from pathlib import Path\n",
"from os import cpu_count\n",
"from typing import Any, List, Optional, Union, Dict\n",
"import pkg_resources\n",
"import pickle\n",
"import datetime as dt\n",
"import re\n",
"\n",
"from fugue.execution.factory import make_execution_engine\n",
"import numpy as np\n",
Expand Down Expand Up @@ -1965,6 +1971,109 @@
" palette='tab20b',\n",
" )\n",
" \n",
" def save(\n",
" self, \n",
" path: Union[Optional[Path], Optional[str]] = None, \n",
" max_size: Optional[str] = None,\n",
" trim: bool = False,\n",
" ):\n",
" \"\"\"Function that will save StatsForecast class with certain settings to make it \n",
" reproducible.\n",
" \n",
" Parameters\n",
" ----------\n",
" path : str or pathlib.Path, optional (default=None)\n",
" Path of the file to be saved. If `None` will create one in the current \n",
" directory using the current UTC timestamp.\n",
" max_size: str, (default = None)\n",
" StatsForecast class should not exceed this size, available byte naming:\n",
" ['B', 'KB', 'MB', 'GB']\n",
" If max_size is set, but not parsable then default 50MB will be set.\n",
" trim: bool, (default = False)\n",
" Delete any attributes not needed for inference.\n",
" \"\"\"\n",
" # Will be used to find the size of the fitted models\n",
" # Never expecting anything higher than GB (even that's a lot')\n",
" bytes_hmap = {\n",
" \"B\": 1,\n",
" \"KB\": 2**10,\n",
" \"MB\": 2**20,\n",
" \"GB\": 2**30,\n",
" }\n",
"\n",
" # Removing unnecessary attributes\n",
" # @jmoralez decide future implementation\n",
" trim_attr:list = [\"fcst_fitted_values_\", \"cv_fitted_values_\"]\n",
" if trim:\n",
" for attr in trim_attr:\n",
" # remove unnecessary attributes here\n",
" self.__dict__.pop(attr, None)\n",
"\n",
" sf_size = len(pickle.dumps(self))\n",
"\n",
" if max_size:\n",
" cap_size = self.__get_cap_size__(max_size, bytes_hmap)\n",
" if sf_size >= cap_size:\n",
" err_messg = \"StatsForecast is larger specified max_size\"\n",
" raise OSError(errno.EFBIG, err_messg) \n",
"\n",
" converted_size, sf_byte = None, None\n",
" for key in reversed(list(bytes_hmap.keys())):\n",
" x_byte = bytes_hmap[key]\n",
" if sf_size >= x_byte:\n",
" converted_size = sf_size / x_byte\n",
" sf_byte = key\n",
" break\n",
" \n",
" if converted_size is None or sf_byte is None:\n",
" err_messg = \"Internal Error, this shouldn't happen, please open an issue\"\n",
" raise NameError(err_messg)\n",
" \n",
" print(f\"Model(s) size: {converted_size:.2f}{sf_byte}\")\n",
" \n",
" print(\"Saving model(s)\")\n",
" \n",
" if not path:\n",
" datetime_record = dt.datetime.utcnow().strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
" path = f\"StatsForecast_{datetime_record}.pkl\"\n",
" \n",
" with open(path, \"wb\") as m_file:\n",
" pickle.dump(self, m_file)\n",
" print(\"Model(s) saved\")\n",
"\n",
" def __get_cap_size__(self, max_size, bytes_hmap):\n",
" max_size = max_size.replace(\" \", \"\")\n",
" match = re.match(r'(\\d+\\.\\d+|\\d+)(\\w+)', max_size)\n",
" if not match or match[2] not in bytes_hmap.keys():\n",
" warnings.warn(\"Couldn't parse your max_size, default 50MB will be set\")\n",
" # Keeping this as default, failure check\n",
" cap_size = 50.0 * bytes_hmap[\"MB\"]\n",
" else:\n",
" m_size = float(match[1])\n",
" key_ = match[2]\n",
" cap_size = m_size * bytes_hmap[key_]\n",
" return cap_size\n",
" \n",
" @staticmethod\n",
" def load(path:Union[Path, str]):\n",
" \"\"\"\n",
" Automatically loads the model into ready StatsForecast.\n",
"\n",
" Parameters\n",
" ----------\n",
" path: Union[str, Path]\n",
" Path to saved StatsForecast file.\n",
" \n",
" Returns\n",
" -------\n",
" sf: StatsForecast\n",
" Previously saved StatsForecast\n",
" \"\"\"\n",
" if not os.path.exists(path):\n",
" raise ValueError(\"Specified path does not exist, check again and retry.\")\n",
" with open(path, \"rb\") as f:\n",
" return pickle.load(f)\n",
" \n",
" def __repr__(self):\n",
" return f\"StatsForecast(models=[{','.join(map(repr, self.models))}])\""
]
Expand Down Expand Up @@ -2196,6 +2305,40 @@
"fcsts_df.groupby('unique_id').tail(4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9eaa52",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# Testing save and load \n",
"import tempfile\n",
"from polars.testing import assert_frame_equal\n",
"\n",
"with tempfile.TemporaryDirectory() as td:\n",
" f_path = os.path.join(td, \"sf_test.pickle\")\n",
" \n",
" test_df = pl.from_pandas(panel_df.astype({\"unique_id\": str}))\n",
" test_frcs = StatsForecast(\n",
" df=test_df,\n",
" models=models,\n",
" freq='D', \n",
" n_jobs=1, \n",
" verbose=True\n",
" )\n",
"\n",
" origin_df = test_frcs.forecast(h=4, fitted=True)\n",
"\n",
" test_frcs.save(f_path)\n",
"\n",
" sf_test = StatsForecast.load(f_path)\n",
" load_df = sf_test.forecast(h=4, fitted=True)\n",
" \n",
" assert_frame_equal(origin_df, load_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
6 changes: 6 additions & 0 deletions statsforecast/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@
'statsforecast.core.StatsForecast.forecast': ( 'src/core/core.html#statsforecast.forecast',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast': ('src/core/core.html#_statsforecast', 'statsforecast/core.py'),
'statsforecast.core._StatsForecast.__get_cap_size__': ( 'src/core/core.html#_statsforecast.__get_cap_size__',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.__init__': ( 'src/core/core.html#_statsforecast.__init__',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.__repr__': ( 'src/core/core.html#_statsforecast.__repr__',
Expand Down Expand Up @@ -188,10 +190,14 @@
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.forecast_fitted_values': ( 'src/core/core.html#_statsforecast.forecast_fitted_values',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.load': ( 'src/core/core.html#_statsforecast.load',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.plot': ( 'src/core/core.html#_statsforecast.plot',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.predict': ( 'src/core/core.html#_statsforecast.predict',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.save': ( 'src/core/core.html#_statsforecast.save',
'statsforecast/core.py'),
'statsforecast.core._cv_dates': ('src/core/core.html#_cv_dates', 'statsforecast/core.py'),
'statsforecast.core._get_n_jobs': ('src/core/core.html#_get_n_jobs', 'statsforecast/core.py'),
'statsforecast.core._parse_ds_type': ('src/core/core.html#_parse_ds_type', 'statsforecast/core.py'),
Expand Down
110 changes: 109 additions & 1 deletion statsforecast/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,15 @@
import logging
import reprlib
import warnings
import errno
import os
akmalsoliev marked this conversation as resolved.
Show resolved Hide resolved
from pathlib import Path
from os import cpu_count
from typing import Any, List, Optional, Union, Dict
import pkg_resources
import pickle
import datetime as dt
import re

from fugue.execution.factory import make_execution_engine
import numpy as np
Expand Down Expand Up @@ -453,7 +459,6 @@ def __init__(
sort_dataframe: bool,
validate: Optional[bool] = True,
):

self.dataframe = dataframe
self.sort_dataframe = sort_dataframe
self.validate = validate
Expand Down Expand Up @@ -1546,6 +1551,109 @@ def plot(
palette="tab20b",
)

def save(
    self,
    path: Optional[Union[Path, str]] = None,
    max_size: Optional[str] = None,
    trim: bool = False,
):
    """Save the StatsForecast object to disk (pickle) so it can be restored later.

    Parameters
    ----------
    path : str or pathlib.Path, optional (default=None)
        Path of the file to be saved. If `None` will create one in the current
        directory using the current UTC timestamp.
    max_size : str, optional (default=None)
        StatsForecast class should not exceed this size, available byte naming:
        ['B', 'KB', 'MB', 'GB']
        If max_size is set, but not parsable then default 50MB will be set.
    trim : bool (default=False)
        Delete any attributes not needed for inference.

    Raises
    ------
    OSError
        With ``errno.EFBIG`` when ``max_size`` is set and the pickled object
        exceeds it.
    """
    # Size units used both to enforce `max_size` and to pretty-print the
    # pickled size. Never expecting anything higher than GB (even that's a lot).
    bytes_hmap = {
        "B": 1,
        "KB": 2**10,
        "MB": 2**20,
        "GB": 2**30,
    }

    # Attributes only needed to inspect fitted values, not for inference.
    # @jmoralez decide future implementation
    trim_attr: List[str] = ["fcst_fitted_values_", "cv_fitted_values_"]
    if trim:
        for attr in trim_attr:
            # pop with default so a never-set attribute is not an error
            self.__dict__.pop(attr, None)

    # Measure the serialized size before touching the filesystem.
    sf_size = len(pickle.dumps(self))

    if max_size:
        cap_size = self.__get_cap_size__(max_size, bytes_hmap)
        if sf_size >= cap_size:
            err_messg = "StatsForecast is larger than the specified max_size"
            raise OSError(errno.EFBIG, err_messg)

    # Convert the raw byte count to the largest unit it fills, for display.
    converted_size, sf_byte = None, None
    for key in reversed(list(bytes_hmap.keys())):
        x_byte = bytes_hmap[key]
        if sf_size >= x_byte:
            converted_size = sf_size / x_byte
            sf_byte = key
            break

    # sf_size is always >= 1 byte, so this branch should be unreachable.
    if converted_size is None or sf_byte is None:
        err_messg = "Internal Error, this shouldn't happen, please open an issue"
        raise NameError(err_messg)

    print(f"Model(s) size: {converted_size:.2f}{sf_byte}")

    print("Saving model(s)")

    if not path:
        # Timestamped default filename keeps repeated saves from clobbering
        # each other.
        datetime_record = dt.datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")
        path = f"StatsForecast_{datetime_record}.pkl"

    with open(path, "wb") as m_file:
        pickle.dump(self, m_file)
    print("Model(s) saved")

def __get_cap_size__(self, max_size, bytes_hmap):
    """Parse a human-readable size such as ``"50MB"`` into a byte count.

    Parameters
    ----------
    max_size : str
        Size string: a number followed by a unit in ``bytes_hmap``
        (case-insensitive). Embedded spaces are ignored.
    bytes_hmap : dict
        Mapping of unit name ('B', 'KB', 'MB', 'GB') to its byte multiplier.

    Returns
    -------
    cap_size : float
        The cap in bytes; falls back to 50MB (with a warning) when the
        string cannot be parsed.
    """
    max_size = max_size.replace(" ", "")
    # fullmatch anchors both ends: trailing junk ("50MB!!!") is rejected
    # instead of being silently ignored as re.match would do.
    match = re.fullmatch(r"(\d+(?:\.\d+)?)([A-Za-z]+)", max_size)
    if not match or match[2].upper() not in bytes_hmap:
        warnings.warn("Couldn't parse your max_size, default 50MB will be set")
        # Keeping this as default, failure check
        cap_size = 50.0 * bytes_hmap["MB"]
    else:
        m_size = float(match[1])
        key_ = match[2].upper()  # accept "mb"/"Mb" as well as "MB"
        cap_size = m_size * bytes_hmap[key_]
    return cap_size

@staticmethod
def load(path: Union[Path, str]):
"""
Automatically loads the model into ready StatsForecast.

Parameters
----------
path: Union[str, Path]
Path to saved StatsForecast file.
akmalsoliev marked this conversation as resolved.
Show resolved Hide resolved

Returns
-------
sf: StatsForecast
Previously saved StatsForecast
"""
if not os.path.exists(path):
raise ValueError("Specified path does not exist, check again and retry.")
with open(path, "rb") as f:
akmalsoliev marked this conversation as resolved.
Show resolved Hide resolved
return pickle.load(f)

def __repr__(self):
    """Return a constructor-style summary listing the configured models."""
    models_repr = ",".join(repr(model) for model in self.models)
    return f"StatsForecast(models=[{models_repr}])"

Expand Down
Loading