Add some misc features. (microsoft#1816)
* Normal mod

* Black linting

* Linting
you-n-g authored Jun 26, 2024
1 parent cde8020 commit 5190332
Showing 15 changed files with 289 additions and 75 deletions.
9 changes: 7 additions & 2 deletions examples/benchmarks_dynamic/DDG-DA/workflow.py
@@ -1,5 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+import os
 from pathlib import Path
 from typing import Union

@@ -35,6 +36,10 @@ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwa


 if __name__ == "__main__":
-    GetData().qlib_data(exists_skip=True)
-    auto_init()
+    kwargs = {}
+    if os.environ.get("PROVIDER_URI", "") == "":
+        GetData().qlib_data(exists_skip=True)
+    else:
+        kwargs["provider_uri"] = os.environ["PROVIDER_URI"]
+    auto_init(**kwargs)
     fire.Fire(DDGDABench)
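
Editor's note: both benchmark entry points (this file and rolling_benchmark.py below) now honor a PROVIDER_URI environment variable: if it is set, qlib is initialized against that data directory; otherwise the default bundle is downloaded as before. A hedged usage sketch — the data path is illustrative, not part of the commit:

    import os
    import subprocess

    # Run the benchmark against a pre-built local data directory
    # instead of downloading the default Qlib bundle.
    env = dict(os.environ, PROVIDER_URI="/data/qlib/cn_data")  # illustrative path
    subprocess.run(["python", "workflow.py", "run"], env=env, check=True)
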
9 changes: 7 additions & 2 deletions examples/benchmarks_dynamic/baseline/rolling_benchmark.py
@@ -1,5 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+import os
 from pathlib import Path
 from typing import Union

@@ -31,6 +32,10 @@ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwa


 if __name__ == "__main__":
-    GetData().qlib_data(exists_skip=True)
-    auto_init()
+    kwargs = {}
+    if os.environ.get("PROVIDER_URI", "") == "":
+        GetData().qlib_data(exists_skip=True)
+    else:
+        kwargs["provider_uri"] = os.environ["PROVIDER_URI"]
+    auto_init(**kwargs)
     fire.Fire(RollingBenchmark)
34 changes: 29 additions & 5 deletions qlib/contrib/meta/data_selection/dataset.py
@@ -243,7 +243,7 @@ def __init__(
         trunc_days: int = None,
         rolling_ext_days: int = 0,
         exp_name: Union[str, InternalData],
-        segments: Union[Dict[Text, Tuple], float],
+        segments: Union[Dict[Text, Tuple], float, str],
         hist_step_n: int = 10,
         task_mode: str = MetaTask.PROC_MODE_FULL,
         fill_method: str = "max",
@@ -271,12 +271,16 @@ def __init__(
             - str: the name of the experiment to store the performance of data
             - InternalData: a prepared internal data
         segments: Union[Dict[Text, Tuple], float]
             the segments to divide data
-            both left and right
+            if the segment is a Dict
+                the segments to divide data
+                both left and right are included
             if segments is a float:
                 the float represents the percentage of data for training
+            if segments is a string:
+                it will try its best to put its data in training and ensure that the date `segments` is in the test set
         hist_step_n: int
-            length of historical steps for the meta infomation
+            Number of steps of the data similarity information
         task_mode : str
             Please refer to the docs of MetaTask
         """
@@ -383,10 +387,30 @@ def _prepare_seg(self, segment: Text) -> List[MetaTask]:
         if isinstance(self.segments, float):
             train_task_n = int(len(self.meta_task_l) * self.segments)
             if segment == "train":
-                return self.meta_task_l[:train_task_n]
+                train_tasks = self.meta_task_l[:train_task_n]
+                get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}")
+                return train_tasks
             elif segment == "test":
-                return self.meta_task_l[train_task_n:]
+                test_tasks = self.meta_task_l[train_task_n:]
+                get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}")
+                return test_tasks
             else:
                 raise NotImplementedError(f"This type of input is not supported")
+        elif isinstance(self.segments, str):
+            train_tasks = []
+            test_tasks = []
+            for t in self.meta_task_l:
+                test_end = t.task["dataset"]["kwargs"]["segments"]["test"][1]
+                if test_end is None or pd.Timestamp(test_end) < pd.Timestamp(self.segments):
+                    train_tasks.append(t)
+                else:
+                    test_tasks.append(t)
+            get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}")
+            get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}")
+            if segment == "train":
+                return train_tasks
+            elif segment == "test":
+                return test_tasks
+            raise NotImplementedError(f"This type of input is not supported")
         else:
             raise NotImplementedError(f"This type of input is not supported")
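
Editor's note: the new string form of `segments` splits meta tasks by date. A self-contained sketch of the same rule, using simplified dict stand-ins rather than real MetaTask objects:

    import pandas as pd

    # Each stand-in records the end of its test range.
    tasks = [{"test_end": "2014-12-31"}, {"test_end": "2016-12-31"}, {"test_end": None}]

    split_date = "2016-01-01"  # the string passed as `segments`
    train, test = [], []
    for t in tasks:
        end = t["test_end"]
        # A task whose test range ends before the split date (or is open-ended)
        # goes to training; everything else goes to test.
        if end is None or pd.Timestamp(end) < pd.Timestamp(split_date):
            train.append(t)
        else:
            test.append(t)
    # train: the 2014-12-31 task and the open-ended one; test: the 2016-12-31 task
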
12 changes: 10 additions & 2 deletions qlib/contrib/meta/data_selection/model.py
@@ -53,7 +53,12 @@ def __init__(
         max_epoch=100,
         seed=43,
         alpha=0.0,
+        loss_skip_thresh=50,
     ):
+        """
+        loss_skip_thresh: int
+            If the number of samples on a day is below this threshold, the loss calculation for that day is skipped.
+        """
         self.step = step
         self.hist_step_n = hist_step_n
         self.clip_method = clip_method
@@ -63,6 +68,7 @@ def __init__(
         self.max_epoch = max_epoch
         self.fitted = False
         self.alpha = alpha
+        self.loss_skip_thresh = loss_skip_thresh
         torch.manual_seed(seed)

     def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
@@ -88,12 +94,14 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
                 criterion = nn.MSELoss()
                 loss = criterion(pred, meta_input["y_test"])
             elif self.criterion == "ic_loss":
-                criterion = ICLoss()
+                criterion = ICLoss(self.loss_skip_thresh)
                 try:
-                    loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"], skip_size=50)
+                    loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"])
                 except ValueError as e:
                     get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss")
                     continue
+            else:
+                raise ValueError(f"Unknown criterion: {self.criterion}")

             assert not np.isnan(loss.detach().item()), "NaN loss!"

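
Editor's note: the skip threshold that used to be hard-coded as skip_size=50 at the call site is now a constructor argument of the meta model. A hedged configuration sketch — every keyword except loss_skip_thresh is an assumption abbreviated from the real signature:

    from qlib.contrib.meta.data_selection.model import MetaModelDS

    # Days with fewer test samples than loss_skip_thresh are skipped
    # when the "ic_loss" criterion is evaluated.
    meta_model = MetaModelDS(
        step=20,              # illustrative
        criterion="ic_loss",  # the criterion that uses the threshold
        loss_skip_thresh=50,  # new keyword introduced by this commit
    )
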
9 changes: 7 additions & 2 deletions qlib/contrib/meta/data_selection/utils.py
@@ -10,7 +10,11 @@


 class ICLoss(nn.Module):
-    def forward(self, pred, y, idx, skip_size=50):
+    def __init__(self, skip_size=50):
+        super().__init__()
+        self.skip_size = skip_size
+
+    def forward(self, pred, y, idx):
         """forward.
         FIXME:
         - Sometimes it will be slightly different from the result of `pandas.corr()`
@@ -33,7 +37,7 @@ def forward(self, pred, y, idx, skip_size=50):
         skip_n = 0
         for start_i, end_i in zip(diff_point, diff_point[1:]):
             pred_focus = pred[start_i:end_i]  # TODO: just for fake
-            if pred_focus.shape[0] < skip_size:
+            if pred_focus.shape[0] < self.skip_size:
                 # skip some days which have very small amount of stock.
                 skip_n += 1
                 continue
@@ -50,6 +54,7 @@ def forward(self, pred, y, idx):
             )
             ic_all += ic_day
         if len(diff_point) - 1 - skip_n <= 0:
+            __import__("ipdb").set_trace()
             raise ValueError("No enough data for calculating IC")
         if skip_n > 0:
             get_module_logger("ICLoss").info(
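
Editor's note: with the threshold moved into the constructor, callers no longer pass skip_size per call. A sketch of the before/after calling convention — pred, y_test, and test_idx stand for the tensors and index used in run_epoch and are not constructed here:

    from qlib.contrib.meta.data_selection.utils import ICLoss

    # Before: loss = ICLoss()(pred, y_test, test_idx, skip_size=50)
    criterion = ICLoss(skip_size=50)
    # After:  loss = criterion(pred, y_test, test_idx)
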
1 change: 1 addition & 0 deletions qlib/contrib/model/linear.py
@@ -63,6 +63,7 @@ def fit(self, dataset: DatasetH, reweighter: Reweighter = None):
             df_train = pd.concat([df_train, df_valid])
         except KeyError:
             get_module_logger("LinearModel").info("include_valid=True, but valid does not exist")
+        df_train = df_train.dropna()
         if df_train.empty:
             raise ValueError("Empty data from dataset, please check your dataset config.")
         if reweighter is not None:
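
Editor's note: dropping NaNs before the emptiness check matters, since a frame holding only NaN rows is not "empty"; after dropna() it is, so the error fires instead of fitting on NaNs. A quick self-contained illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"feature": [np.nan, np.nan]})
    print(df.empty)           # False: NaN rows still count as data
    print(df.dropna().empty)  # True: nothing left to fit on
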
84 changes: 55 additions & 29 deletions qlib/contrib/model/pytorch_gru.py
@@ -1,25 +1,25 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.


 from __future__ import division
 from __future__ import print_function
+import copy
+from typing import Text, Union

 import numpy as np
 import pandas as pd
-from typing import Text, Union
-import copy
-from ...utils import get_or_create_path
-from ...log import get_module_logger

 import torch
 import torch.nn as nn
 import torch.optim as optim

-from .pytorch_utils import count_parameters
-from ...model.base import Model
+from qlib.workflow import R

 from ...data.dataset import DatasetH
 from ...data.dataset.handler import DataHandlerLP
+from ...log import get_module_logger
+from ...model.base import Model
+from ...utils import get_or_create_path
+from .pytorch_utils import count_parameters


 class GRU(Model):
@@ -212,16 +212,31 @@ def fit(
         evals_result=dict(),
         save_path=None,
     ):
-        df_train, df_valid, df_test = dataset.prepare(
-            ["train", "valid", "test"],
-            col_set=["feature", "label"],
-            data_key=DataHandlerLP.DK_L,
-        )
-        if df_train.empty or df_valid.empty:
-            raise ValueError("Empty data from dataset, please check your dataset config.")
+        # prepare training and validation data
+        dfs = {
+            k: dataset.prepare(
+                k,
+                col_set=["feature", "label"],
+                data_key=DataHandlerLP.DK_L,
+            )
+            for k in ["train", "valid"]
+            if k in dataset.segments
+        }
+        df_train, df_valid = dfs.get("train", pd.DataFrame()), dfs.get("valid", pd.DataFrame())

+        # check if training data is empty
+        if df_train.empty:
+            raise ValueError("Empty training data from dataset, please check your dataset config.")

+        df_train = df_train.dropna()
         x_train, y_train = df_train["feature"], df_train["label"]
-        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        # check if validation data is provided
+        if not df_valid.empty:
+            df_valid = df_valid.dropna()
+            x_valid, y_valid = df_valid["feature"], df_valid["label"]
+        else:
+            x_valid, y_valid = None, None

         save_path = get_or_create_path(save_path)
         stop_steps = 0
@@ -235,32 +250,42 @@
         self.logger.info("training...")
         self.fitted = True

+        best_param = copy.deepcopy(self.gru_model.state_dict())
         for step in range(self.n_epochs):
             self.logger.info("Epoch%d:", step)
             self.logger.info("training...")
             self.train_epoch(x_train, y_train)
             self.logger.info("evaluating...")
             train_loss, train_score = self.test_epoch(x_train, y_train)
-            val_loss, val_score = self.test_epoch(x_valid, y_valid)
-            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
             evals_result["train"].append(train_score)
-            evals_result["valid"].append(val_score)

-            if val_score > best_score:
-                best_score = val_score
-                stop_steps = 0
-                best_epoch = step
-                best_param = copy.deepcopy(self.gru_model.state_dict())
-            else:
-                stop_steps += 1
-                if stop_steps >= self.early_stop:
-                    self.logger.info("early stop")
-                    break
+            # evaluate on validation data if provided
+            if x_valid is not None and y_valid is not None:
+                val_loss, val_score = self.test_epoch(x_valid, y_valid)
+                self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+                evals_result["valid"].append(val_score)
+
+                if val_score > best_score:
+                    best_score = val_score
+                    stop_steps = 0
+                    best_epoch = step
+                    best_param = copy.deepcopy(self.gru_model.state_dict())
+                else:
+                    stop_steps += 1
+                    if stop_steps >= self.early_stop:
+                        self.logger.info("early stop")
+                        break

         self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
         self.gru_model.load_state_dict(best_param)
         torch.save(best_param, save_path)

+        # Logging
+        rec = R.get_recorder()
+        for k, v_l in evals_result.items():
+            for i, v in enumerate(v_l):
+                rec.log_metrics(step=i, **{k: v})
+
         if self.use_gpu:
             torch.cuda.empty_cache()

@@ -292,6 +317,7 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):


 class GRUModel(nn.Module):
+
     def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
         super().__init__()

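
Editor's note: after this refactor, GRU.fit() only requires a "train" segment; when no "valid" segment exists, validation scoring and early stopping are skipped, and per-epoch scores are logged to the active recorder. A hedged sketch — the handler and segment dates are placeholders, not from the commit:

    from qlib.contrib.model.pytorch_gru import GRU
    from qlib.data.dataset import DatasetH

    model = GRU(d_feat=6, hidden_size=64, num_layers=2)
    # dataset = DatasetH(handler=my_handler, segments={"train": ("2008-01-01", "2014-12-31")})
    # model.fit(dataset)  # no "valid" key needed any more
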
16 changes: 16 additions & 0 deletions qlib/contrib/report/data/ana.py
@@ -1,5 +1,17 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+"""
+Here we have a comprehensive set of analysis classes.
+
+Here is an example.
+
+.. code-block:: python
+
+    from qlib.contrib.report.data.ana import FeaMeanStd
+    fa = FeaMeanStd(ret_df)
+    fa.plot_all(wspace=0.3, sub_figsize=(12, 3), col_n=5)
+
+"""
 import pandas as pd
 import numpy as np
 from qlib.contrib.report.data.base import FeaAnalyser
@@ -152,6 +164,7 @@ def plot_single(self, col, ax):
         self._kurt[col].plot(ax=right_ax, label="kurt", color="green")
         right_ax.set_xlabel("")
         right_ax.set_ylabel("kurt")
+        right_ax.grid(None)  # set the grid to None to avoid two layers of grid

         h1, l1 = ax.get_legend_handles_labels()
         h2, l2 = right_ax.get_legend_handles_labels()
@@ -171,12 +184,15 @@ def plot_single(self, col, ax):
         ax.set_xlabel("")
         ax.set_ylabel("mean")
         ax.legend()
+        ax.tick_params(axis="x", rotation=90)

         right_ax = ax.twinx()

         self._std[col].plot(ax=right_ax, label="std", color="green")
         right_ax.set_xlabel("")
         right_ax.set_ylabel("std")
+        right_ax.tick_params(axis="x", rotation=90)
+        right_ax.grid(None)  # set the grid to None to avoid two layers of grid

         h1, l1 = ax.get_legend_handles_labels()
         h2, l2 = right_ax.get_legend_handles_labels()
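
Editor's note: a minimal reproduction of the double-grid problem these lines address, using synthetic data with no qlib dependency. The rcParams setting is illustrative, standing in for a plot style that enables grids by default:

    import matplotlib.pyplot as plt

    plt.rcParams["axes.grid"] = True  # grids on by default, as in styled reports

    fig, ax = plt.subplots()
    ax.plot([1, 2, 3], label="mean")

    right_ax = ax.twinx()
    right_ax.plot([3, 2, 1], color="green", label="std")
    right_ax.grid(None)  # toggle the twin axis grid off so only one grid layer shows

    plt.show()
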
18 changes: 18 additions & 0 deletions qlib/contrib/report/data/base.py
@@ -14,6 +14,24 @@

 class FeaAnalyser:
     def __init__(self, dataset: pd.DataFrame):
+        """
+        Parameters
+        ----------
+        dataset : pd.DataFrame
+            We often have multiple columns for the dataset. Each column corresponds to one sub-figure.
+            There will be a datetime column in the index levels.
+            Aggregation will be used for more summarized metrics over time.
+            Here is an example of data:
+
+            .. code-block::
+
+                                          return
+                datetime   instrument
+                2007-02-06 equity_tpx   0.010087
+                           equity_spx   0.000786
+
+        """
         self._dataset = dataset
         with TimeInspector.logt("calc_stat_values"):
             self.calc_stat_values()
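
Editor's note: a hedged sketch of building the kind of frame FeaAnalyser expects — one column per feature, with (datetime, instrument) index levels matching the docstring example:

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [
            (pd.Timestamp("2007-02-06"), "equity_tpx"),
            (pd.Timestamp("2007-02-06"), "equity_spx"),
        ],
        names=["datetime", "instrument"],
    )
    ret_df = pd.DataFrame({"return": [0.010087, 0.000786]}, index=idx)
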
[Diffs for the remaining 6 changed files are not shown here.]