Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Ability to save and load StatsForecast #667

Merged
merged 21 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions nbs/src/core/core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,15 @@
"import logging\n",
"import reprlib\n",
"import warnings\n",
"import errno\n",
"import os\n",
"from pathlib import Path\n",
"from os import cpu_count\n",
"from typing import Any, List, Optional, Union, Dict\n",
"import pkg_resources\n",
"import pickle\n",
"import datetime as dt\n",
"import re\n",
"\n",
"from fugue.execution.factory import make_execution_engine\n",
"import numpy as np\n",
Expand Down Expand Up @@ -1965,6 +1971,109 @@
" palette='tab20b',\n",
" )\n",
" \n",
" def save(\n",
" self, \n",
" path: Union[Optional[Path], Optional[str]] = None, \n",
" max_size: Optional[str] = None,\n",
" trim: bool = False,\n",
" ):\n",
" \"\"\"Function that will save StatsForecast class with certain settings to make it \n",
" reproducible.\n",
" \n",
" Parameters\n",
" ----------\n",
" path : str or pathlib.Path, optional (default=None)\n",
" Path of the file to be saved. If `None` will create one in the current \n",
" directory using the current UTC timestamp.\n",
" max_size: str, (default = None)\n",
" StatsForecast class should not exceed this size, available byte naming:\n",
" ['B', 'KB', 'MB', 'GB']\n",
" If max_size is set, but not parsable then default 50MB will be set.\n",
" trim: bool, (default = False)\n",
" Delete any attributes not needed for inference.\n",
" \"\"\"\n",
" # Will be used to find the size of the fitted models\n",
" # Never expecting anything higher than GB (even that's a lot')\n",
" bytes_hmap = {\n",
" \"B\": 1,\n",
" \"KB\": 2**10,\n",
" \"MB\": 2**20,\n",
" \"GB\": 2**30,\n",
" }\n",
"\n",
" # Removing unnecessary attributes\n",
" # @jmoralez decide future implementation\n",
" trim_attr:list = [\"fcst_fitted_values_\", \"cv_fitted_values_\"]\n",
" if trim:\n",
" for attr in trim_attr:\n",
" # remove unnecessary attributes here\n",
" self.__dict__.pop(attr, None)\n",
"\n",
" sf_size = len(pickle.dumps(self))\n",
"\n",
" if max_size:\n",
" cap_size = self.__get_cap_size__(max_size, bytes_hmap)\n",
" if sf_size >= cap_size:\n",
" err_messg = \"StatsForecast is larger specified max_size\"\n",
" raise OSError(errno.EFBIG, err_messg) \n",
"\n",
" converted_size, sf_byte = None, None\n",
" for key in reversed(list(bytes_hmap.keys())):\n",
" x_byte = bytes_hmap[key]\n",
" if sf_size >= x_byte:\n",
" converted_size = sf_size / x_byte\n",
" sf_byte = key\n",
" break\n",
" \n",
" if converted_size is None or sf_byte is None:\n",
" err_messg = \"Internal Error, this shouldn't happen, please open an issue\"\n",
" raise NameError(err_messg)\n",
" \n",
" print(f\"Model(s) size: {converted_size:.2f}{sf_byte}\")\n",
" \n",
" print(\"Saving model(s)\")\n",
" \n",
" if not path:\n",
" datetime_record = dt.datetime.utcnow().strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
" path = f\"StatsForecast_{datetime_record}.pkl\"\n",
" \n",
" with open(path, \"wb\") as m_file:\n",
" pickle.dump(self, m_file)\n",
" print(\"Model(s) saved\")\n",
"\n",
" def __get_cap_size__(self, max_size, bytes_hmap):\n",
" max_size = max_size.replace(\" \", \"\")\n",
" match = re.match(r'(\\d+\\.\\d+|\\d+)(\\w+)', max_size)\n",
" if not match or match[2] not in bytes_hmap.keys():\n",
" warnings.warn(\"Couldn't parse your max_size, default 50MB will be set\")\n",
" # Keeping this as default, failure check\n",
" cap_size = 50.0 * bytes_hmap[\"MB\"]\n",
" else:\n",
" m_size = float(match[1])\n",
" key_ = match[2]\n",
" cap_size = m_size * bytes_hmap[key_]\n",
" return cap_size\n",
" \n",
" @staticmethod\n",
" def load(path:Union[Path, str]):\n",
" \"\"\"\n",
" Automatically loads the model into ready StatsForecast.\n",
"\n",
" Parameters\n",
" ----------\n",
" path: Union[str, Path]\n",
" Path to saved StatsForecast file.\n",
" \n",
" Returns\n",
" -------\n",
" sf: StatsForecast\n",
" Previously saved StatsForecast\n",
" \"\"\"\n",
" if not os.path.exists(path):\n",
" raise ValueError(\"Specified path does not exist, check again and retry.\")\n",
" with open(path, \"rb\") as f:\n",
" return pickle.load(f)\n",
" \n",
" def __repr__(self):\n",
" return f\"StatsForecast(models=[{','.join(map(repr, self.models))}])\""
]
Expand Down Expand Up @@ -2196,6 +2305,40 @@
"fcsts_df.groupby('unique_id').tail(4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9eaa52",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"# Testing save and load \n",
"import tempfile\n",
"from polars.testing import assert_frame_equal\n",
"\n",
"with tempfile.TemporaryDirectory() as td:\n",
" f_path = os.path.join(td, \"sf_test.pickle\")\n",
" \n",
" test_df = pl.from_pandas(panel_df.astype({\"unique_id\": str}))\n",
" test_frcs = StatsForecast(\n",
" df=test_df,\n",
" models=models,\n",
" freq='D', \n",
" n_jobs=1, \n",
" verbose=True\n",
" )\n",
"\n",
" origin_df = test_frcs.forecast(h=4, fitted=True)\n",
"\n",
" test_frcs.save(f_path)\n",
"\n",
" sf_test = StatsForecast.load(f_path)\n",
" load_df = sf_test.forecast(h=4, fitted=True)\n",
" \n",
" assert_frame_equal(origin_df, load_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
6 changes: 6 additions & 0 deletions statsforecast/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@
'statsforecast.core.StatsForecast.forecast': ( 'src/core/core.html#statsforecast.forecast',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast': ('src/core/core.html#_statsforecast', 'statsforecast/core.py'),
'statsforecast.core._StatsForecast.__get_cap_size__': ( 'src/core/core.html#_statsforecast.__get_cap_size__',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.__init__': ( 'src/core/core.html#_statsforecast.__init__',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.__repr__': ( 'src/core/core.html#_statsforecast.__repr__',
Expand Down Expand Up @@ -188,10 +190,14 @@
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.forecast_fitted_values': ( 'src/core/core.html#_statsforecast.forecast_fitted_values',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.load': ( 'src/core/core.html#_statsforecast.load',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.plot': ( 'src/core/core.html#_statsforecast.plot',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.predict': ( 'src/core/core.html#_statsforecast.predict',
'statsforecast/core.py'),
'statsforecast.core._StatsForecast.save': ( 'src/core/core.html#_statsforecast.save',
'statsforecast/core.py'),
'statsforecast.core._cv_dates': ('src/core/core.html#_cv_dates', 'statsforecast/core.py'),
'statsforecast.core._get_n_jobs': ('src/core/core.html#_get_n_jobs', 'statsforecast/core.py'),
'statsforecast.core._parse_ds_type': ('src/core/core.html#_parse_ds_type', 'statsforecast/core.py'),
Expand Down
110 changes: 109 additions & 1 deletion statsforecast/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,15 @@
import logging
import reprlib
import warnings
import errno
import os
akmalsoliev marked this conversation as resolved.
Show resolved Hide resolved
from pathlib import Path
from os import cpu_count
from typing import Any, List, Optional, Union, Dict
import pkg_resources
import pickle
import datetime as dt
import re

from fugue.execution.factory import make_execution_engine
import numpy as np
Expand Down Expand Up @@ -453,7 +459,6 @@ def __init__(
sort_dataframe: bool,
validate: Optional[bool] = True,
):

self.dataframe = dataframe
self.sort_dataframe = sort_dataframe
self.validate = validate
Expand Down Expand Up @@ -1546,6 +1551,109 @@ def plot(
palette="tab20b",
)

def save(
    self,
    path: Optional[Union[Path, str]] = None,
    max_size: Optional[str] = None,
    trim: bool = False,
):
    """Save the StatsForecast object to disk (pickle) so it can be restored later.

    Parameters
    ----------
    path : str or pathlib.Path, optional (default=None)
        Path of the file to be saved. If `None` will create one in the current
        directory using the current UTC timestamp.
    max_size : str, optional (default=None)
        StatsForecast class should not exceed this size, available byte naming:
        ['B', 'KB', 'MB', 'GB']
        If max_size is set, but not parsable then default 50MB will be set.
    trim : bool (default=False)
        Delete any attributes not needed for inference.

    Raises
    ------
    OSError
        With ``errno.EFBIG`` when ``max_size`` is set and the pickled object
        exceeds it.
    """
    # Size units used both to enforce `max_size` and to pretty-print the
    # pickled size. Never expecting anything higher than GB (even that's a lot).
    bytes_hmap = {
        "B": 1,
        "KB": 2**10,
        "MB": 2**20,
        "GB": 2**30,
    }

    # Attributes only needed to inspect fitted values, not for inference.
    # @jmoralez decide future implementation
    trim_attr: List[str] = ["fcst_fitted_values_", "cv_fitted_values_"]
    if trim:
        for attr in trim_attr:
            # pop with default so a never-set attribute is not an error
            self.__dict__.pop(attr, None)

    # Measure the serialized size before touching the filesystem.
    sf_size = len(pickle.dumps(self))

    if max_size:
        cap_size = self.__get_cap_size__(max_size, bytes_hmap)
        if sf_size >= cap_size:
            err_messg = "StatsForecast is larger than the specified max_size"
            raise OSError(errno.EFBIG, err_messg)

    # Convert the raw byte count to the largest unit it fills, for display.
    converted_size, sf_byte = None, None
    for key in reversed(list(bytes_hmap.keys())):
        x_byte = bytes_hmap[key]
        if sf_size >= x_byte:
            converted_size = sf_size / x_byte
            sf_byte = key
            break

    # sf_size is always >= 1 byte, so this branch should be unreachable.
    if converted_size is None or sf_byte is None:
        err_messg = "Internal Error, this shouldn't happen, please open an issue"
        raise NameError(err_messg)

    print(f"Model(s) size: {converted_size:.2f}{sf_byte}")

    print("Saving model(s)")

    if not path:
        # Timestamped default filename keeps repeated saves from clobbering
        # each other.
        datetime_record = dt.datetime.utcnow().strftime("%Y-%m-%d_%H-%M-%S")
        path = f"StatsForecast_{datetime_record}.pkl"

    with open(path, "wb") as m_file:
        pickle.dump(self, m_file)
    print("Model(s) saved")

def __get_cap_size__(self, max_size, bytes_hmap):
    """Parse a human-readable size such as ``"50MB"`` into a byte count.

    Parameters
    ----------
    max_size : str
        Size string: a number followed by a unit in ``bytes_hmap``
        (case-insensitive). Embedded spaces are ignored.
    bytes_hmap : dict
        Mapping of unit name ('B', 'KB', 'MB', 'GB') to its byte multiplier.

    Returns
    -------
    cap_size : float
        The cap in bytes; falls back to 50MB (with a warning) when the
        string cannot be parsed.
    """
    max_size = max_size.replace(" ", "")
    # fullmatch anchors both ends: trailing junk ("50MB!!!") is rejected
    # instead of being silently ignored as re.match would do.
    match = re.fullmatch(r"(\d+(?:\.\d+)?)([A-Za-z]+)", max_size)
    if not match or match[2].upper() not in bytes_hmap:
        warnings.warn("Couldn't parse your max_size, default 50MB will be set")
        # Keeping this as default, failure check
        cap_size = 50.0 * bytes_hmap["MB"]
    else:
        m_size = float(match[1])
        key_ = match[2].upper()  # accept "mb"/"Mb" as well as "MB"
        cap_size = m_size * bytes_hmap[key_]
    return cap_size

@staticmethod
def load(path: Union[Path, str]):
"""
Automatically loads the model into ready StatsForecast.

Parameters
----------
path: Union[str, Path]
Path to saved StatsForecast file.
akmalsoliev marked this conversation as resolved.
Show resolved Hide resolved

Returns
-------
sf: StatsForecast
Previously saved StatsForecast
"""
if not os.path.exists(path):
raise ValueError("Specified path does not exist, check again and retry.")
with open(path, "rb") as f:
akmalsoliev marked this conversation as resolved.
Show resolved Hide resolved
return pickle.load(f)

def __repr__(self):
    """Return a constructor-style summary listing the configured models."""
    models_repr = ",".join(repr(model) for model in self.models)
    return f"StatsForecast(models=[{models_repr}])"

Expand Down
Loading