Add some misc features. (microsoft#1816)
* Normal mod

* Black linting

* Linting
you-n-g authored Jun 26, 2024
1 parent cde8020 commit 5190332
Showing 15 changed files with 289 additions and 75 deletions.
9 changes: 7 additions & 2 deletions examples/benchmarks_dynamic/DDG-DA/workflow.py
@@ -1,5 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+import os
 from pathlib import Path
 from typing import Union

@@ -35,6 +36,10 @@ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwa


 if __name__ == "__main__":
-    GetData().qlib_data(exists_skip=True)
-    auto_init()
+    kwargs = {}
+    if os.environ.get("PROVIDER_URI", "") == "":
+        GetData().qlib_data(exists_skip=True)
+    else:
+        kwargs["provider_uri"] = os.environ["PROVIDER_URI"]
+    auto_init(**kwargs)
     fire.Fire(DDGDABench)
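
Editor's note: both benchmark entry points (this file and rolling_benchmark.py below) now honor a PROVIDER_URI environment variable: if it is set, qlib is initialized against that data directory; otherwise the default bundle is downloaded as before. A hedged usage sketch — the data path is illustrative, not part of the commit:

    import os
    import subprocess

    # Run the benchmark against a pre-built local data directory
    # instead of downloading the default Qlib bundle.
    env = dict(os.environ, PROVIDER_URI="/data/qlib/cn_data")  # illustrative path
    subprocess.run(["python", "workflow.py", "run"], env=env, check=True)
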
9 changes: 7 additions & 2 deletions examples/benchmarks_dynamic/baseline/rolling_benchmark.py
@@ -1,5 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+import os
 from pathlib import Path
 from typing import Union

@@ -31,6 +32,10 @@ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwa


 if __name__ == "__main__":
-    GetData().qlib_data(exists_skip=True)
-    auto_init()
+    kwargs = {}
+    if os.environ.get("PROVIDER_URI", "") == "":
+        GetData().qlib_data(exists_skip=True)
+    else:
+        kwargs["provider_uri"] = os.environ["PROVIDER_URI"]
+    auto_init(**kwargs)
     fire.Fire(RollingBenchmark)
34 changes: 29 additions & 5 deletions qlib/contrib/meta/data_selection/dataset.py
@@ -243,7 +243,7 @@ def __init__(
         trunc_days: int = None,
         rolling_ext_days: int = 0,
         exp_name: Union[str, InternalData],
-        segments: Union[Dict[Text, Tuple], float],
+        segments: Union[Dict[Text, Tuple], float, str],
         hist_step_n: int = 10,
         task_mode: str = MetaTask.PROC_MODE_FULL,
         fill_method: str = "max",
@@ -271,12 +271,16 @@ def __init__(
             - str: the name of the experiment to store the performance of data
             - InternalData: a prepared internal data
         segments: Union[Dict[Text, Tuple], float]
             the segments to divide data
-            both left and right
+            if the segment is a Dict
+                the segments to divide data
+                both left and right are included
             if segments is a float:
                 the float represents the percentage of data for training
+            if segments is a string:
+                it will try its best to put its data in training and ensure that the date `segments` is in the test set
         hist_step_n: int
-            length of historical steps for the meta infomation
+            Number of steps of the data similarity information
         task_mode : str
             Please refer to the docs of MetaTask
         """
@@ -383,10 +387,30 @@ def _prepare_seg(self, segment: Text) -> List[MetaTask]:
         if isinstance(self.segments, float):
             train_task_n = int(len(self.meta_task_l) * self.segments)
             if segment == "train":
-                return self.meta_task_l[:train_task_n]
+                train_tasks = self.meta_task_l[:train_task_n]
+                get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}")
+                return train_tasks
             elif segment == "test":
-                return self.meta_task_l[train_task_n:]
+                test_tasks = self.meta_task_l[train_task_n:]
+                get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}")
+                return test_tasks
             else:
                 raise NotImplementedError(f"This type of input is not supported")
+        elif isinstance(self.segments, str):
+            train_tasks = []
+            test_tasks = []
+            for t in self.meta_task_l:
+                test_end = t.task["dataset"]["kwargs"]["segments"]["test"][1]
+                if test_end is None or pd.Timestamp(test_end) < pd.Timestamp(self.segments):
+                    train_tasks.append(t)
+                else:
+                    test_tasks.append(t)
+            get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}")
+            get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}")
+            if segment == "train":
+                return train_tasks
+            elif segment == "test":
+                return test_tasks
+            raise NotImplementedError(f"This type of input is not supported")
         else:
             raise NotImplementedError(f"This type of input is not supported")
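
Editor's note: the new string form of `segments` splits meta tasks by date. A self-contained sketch of the same rule, using simplified dict stand-ins rather than real MetaTask objects:

    import pandas as pd

    # Each stand-in records the end of its test range.
    tasks = [{"test_end": "2014-12-31"}, {"test_end": "2016-12-31"}, {"test_end": None}]

    split_date = "2016-01-01"  # the string passed as `segments`
    train, test = [], []
    for t in tasks:
        end = t["test_end"]
        # A task whose test range ends before the split date (or is open-ended)
        # goes to training; everything else goes to test.
        if end is None or pd.Timestamp(end) < pd.Timestamp(split_date):
            train.append(t)
        else:
            test.append(t)
    # train: the 2014-12-31 task and the open-ended one; test: the 2016-12-31 task
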
12 changes: 10 additions & 2 deletions qlib/contrib/meta/data_selection/model.py
@@ -53,7 +53,12 @@ def __init__(
         max_epoch=100,
         seed=43,
         alpha=0.0,
+        loss_skip_thresh=50,
     ):
+        """
+        loss_skip_thresh: int
+            If the number of samples on a day is below this threshold, the loss calculation for that day is skipped.
+        """
         self.step = step
         self.hist_step_n = hist_step_n
         self.clip_method = clip_method
@@ -63,6 +68,7 @@ def __init__(
         self.max_epoch = max_epoch
         self.fitted = False
         self.alpha = alpha
+        self.loss_skip_thresh = loss_skip_thresh
         torch.manual_seed(seed)

     def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
@@ -88,12 +94,14 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
                 criterion = nn.MSELoss()
                 loss = criterion(pred, meta_input["y_test"])
             elif self.criterion == "ic_loss":
-                criterion = ICLoss()
+                criterion = ICLoss(self.loss_skip_thresh)
                 try:
-                    loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"], skip_size=50)
+                    loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"])
                 except ValueError as e:
                     get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss")
                     continue
+            else:
+                raise ValueError(f"Unknown criterion: {self.criterion}")

             assert not np.isnan(loss.detach().item()), "NaN loss!"

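
Editor's note: the skip threshold that used to be hard-coded as skip_size=50 at the call site is now a constructor argument of the meta model. A hedged configuration sketch — every keyword except loss_skip_thresh is an assumption abbreviated from the real signature:

    from qlib.contrib.meta.data_selection.model import MetaModelDS

    # Days with fewer test samples than loss_skip_thresh are skipped
    # when the "ic_loss" criterion is evaluated.
    meta_model = MetaModelDS(
        step=20,              # illustrative
        criterion="ic_loss",  # the criterion that uses the threshold
        loss_skip_thresh=50,  # new keyword introduced by this commit
    )
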
9 changes: 7 additions & 2 deletions qlib/contrib/meta/data_selection/utils.py
@@ -10,7 +10,11 @@


 class ICLoss(nn.Module):
-    def forward(self, pred, y, idx, skip_size=50):
+    def __init__(self, skip_size=50):
+        super().__init__()
+        self.skip_size = skip_size
+
+    def forward(self, pred, y, idx):
         """forward.
         FIXME:
         - Sometimes it will be slightly different from the result of `pandas.corr()`
@@ -33,7 +37,7 @@ def forward(self, pred, y, idx, skip_size=50):
         skip_n = 0
         for start_i, end_i in zip(diff_point, diff_point[1:]):
             pred_focus = pred[start_i:end_i]  # TODO: just for fake
-            if pred_focus.shape[0] < skip_size:
+            if pred_focus.shape[0] < self.skip_size:
                 # skip some days which have very small amount of stock.
                 skip_n += 1
                 continue
@@ -50,6 +54,7 @@ def forward(self, pred, y, idx):
             )
             ic_all += ic_day
         if len(diff_point) - 1 - skip_n <= 0:
+            __import__("ipdb").set_trace()
             raise ValueError("No enough data for calculating IC")
         if skip_n > 0:
             get_module_logger("ICLoss").info(
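
Editor's note: with the threshold moved into the constructor, callers no longer pass skip_size per call. A sketch of the before/after calling convention — pred, y_test, and test_idx stand for the tensors and index used in run_epoch and are not constructed here:

    from qlib.contrib.meta.data_selection.utils import ICLoss

    # Before: loss = ICLoss()(pred, y_test, test_idx, skip_size=50)
    criterion = ICLoss(skip_size=50)
    # After:  loss = criterion(pred, y_test, test_idx)
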
1 change: 1 addition & 0 deletions qlib/contrib/model/linear.py
@@ -63,6 +63,7 @@ def fit(self, dataset: DatasetH, reweighter: Reweighter = None):
             df_train = pd.concat([df_train, df_valid])
         except KeyError:
             get_module_logger("LinearModel").info("include_valid=True, but valid does not exist")
+        df_train = df_train.dropna()
         if df_train.empty:
             raise ValueError("Empty data from dataset, please check your dataset config.")
         if reweighter is not None:
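
Editor's note: dropping NaNs before the emptiness check matters, since a frame holding only NaN rows is not "empty"; after dropna() it is, so the error fires instead of fitting on NaNs. A quick self-contained illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"feature": [np.nan, np.nan]})
    print(df.empty)           # False: NaN rows still count as data
    print(df.dropna().empty)  # True: nothing left to fit on
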
84 changes: 55 additions & 29 deletions qlib/contrib/model/pytorch_gru.py
@@ -1,25 +1,25 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.


 from __future__ import division
 from __future__ import print_function
+import copy
+from typing import Text, Union

 import numpy as np
 import pandas as pd
-from typing import Text, Union
-import copy
-from ...utils import get_or_create_path
-from ...log import get_module_logger

 import torch
 import torch.nn as nn
 import torch.optim as optim

-from .pytorch_utils import count_parameters
-from ...model.base import Model
+from qlib.workflow import R

 from ...data.dataset import DatasetH
 from ...data.dataset.handler import DataHandlerLP
+from ...log import get_module_logger
+from ...model.base import Model
+from ...utils import get_or_create_path
+from .pytorch_utils import count_parameters


 class GRU(Model):
@@ -212,16 +212,31 @@ def fit(
         evals_result=dict(),
         save_path=None,
     ):
-        df_train, df_valid, df_test = dataset.prepare(
-            ["train", "valid", "test"],
-            col_set=["feature", "label"],
-            data_key=DataHandlerLP.DK_L,
-        )
-        if df_train.empty or df_valid.empty:
-            raise ValueError("Empty data from dataset, please check your dataset config.")
+        # prepare training and validation data
+        dfs = {
+            k: dataset.prepare(
+                k,
+                col_set=["feature", "label"],
+                data_key=DataHandlerLP.DK_L,
+            )
+            for k in ["train", "valid"]
+            if k in dataset.segments
+        }
+        df_train, df_valid = dfs.get("train", pd.DataFrame()), dfs.get("valid", pd.DataFrame())

+        # check if training data is empty
+        if df_train.empty:
+            raise ValueError("Empty training data from dataset, please check your dataset config.")

+        df_train = df_train.dropna()
         x_train, y_train = df_train["feature"], df_train["label"]
-        x_valid, y_valid = df_valid["feature"], df_valid["label"]
+
+        # check if validation data is provided
+        if not df_valid.empty:
+            df_valid = df_valid.dropna()
+            x_valid, y_valid = df_valid["feature"], df_valid["label"]
+        else:
+            x_valid, y_valid = None, None

         save_path = get_or_create_path(save_path)
         stop_steps = 0
@@ -235,32 +250,42 @@
         self.logger.info("training...")
         self.fitted = True

+        best_param = copy.deepcopy(self.gru_model.state_dict())
         for step in range(self.n_epochs):
             self.logger.info("Epoch%d:", step)
             self.logger.info("training...")
             self.train_epoch(x_train, y_train)
             self.logger.info("evaluating...")
             train_loss, train_score = self.test_epoch(x_train, y_train)
-            val_loss, val_score = self.test_epoch(x_valid, y_valid)
-            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
             evals_result["train"].append(train_score)
-            evals_result["valid"].append(val_score)

-            if val_score > best_score:
-                best_score = val_score
-                stop_steps = 0
-                best_epoch = step
-                best_param = copy.deepcopy(self.gru_model.state_dict())
-            else:
-                stop_steps += 1
-                if stop_steps >= self.early_stop:
-                    self.logger.info("early stop")
-                    break
+            # evaluate on validation data if provided
+            if x_valid is not None and y_valid is not None:
+                val_loss, val_score = self.test_epoch(x_valid, y_valid)
+                self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+                evals_result["valid"].append(val_score)
+
+                if val_score > best_score:
+                    best_score = val_score
+                    stop_steps = 0
+                    best_epoch = step
+                    best_param = copy.deepcopy(self.gru_model.state_dict())
+                else:
+                    stop_steps += 1
+                    if stop_steps >= self.early_stop:
+                        self.logger.info("early stop")
+                        break

         self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
         self.gru_model.load_state_dict(best_param)
         torch.save(best_param, save_path)

+        # Logging
+        rec = R.get_recorder()
+        for k, v_l in evals_result.items():
+            for i, v in enumerate(v_l):
+                rec.log_metrics(step=i, **{k: v})
+
         if self.use_gpu:
             torch.cuda.empty_cache()

@@ -292,6 +317,7 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):


 class GRUModel(nn.Module):
+
     def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
         super().__init__()

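
Editor's note: after this refactor, GRU.fit() only requires a "train" segment; when no "valid" segment exists, validation scoring and early stopping are skipped, and per-epoch scores are logged to the active recorder. A hedged sketch — the handler and segment dates are placeholders, not from the commit:

    from qlib.contrib.model.pytorch_gru import GRU
    from qlib.data.dataset import DatasetH

    model = GRU(d_feat=6, hidden_size=64, num_layers=2)
    # dataset = DatasetH(handler=my_handler, segments={"train": ("2008-01-01", "2014-12-31")})
    # model.fit(dataset)  # no "valid" key needed any more
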
16 changes: 16 additions & 0 deletions qlib/contrib/report/data/ana.py
@@ -1,5 +1,17 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+"""
+Here we have a comprehensive set of analysis classes.
+
+Here is an example.
+
+.. code-block:: python
+
+    from qlib.contrib.report.data.ana import FeaMeanStd
+    fa = FeaMeanStd(ret_df)
+    fa.plot_all(wspace=0.3, sub_figsize=(12, 3), col_n=5)
+
+"""
 import pandas as pd
 import numpy as np
 from qlib.contrib.report.data.base import FeaAnalyser
@@ -152,6 +164,7 @@ def plot_single(self, col, ax):
         self._kurt[col].plot(ax=right_ax, label="kurt", color="green")
         right_ax.set_xlabel("")
         right_ax.set_ylabel("kurt")
+        right_ax.grid(None)  # set the grid to None to avoid two layers of grid

         h1, l1 = ax.get_legend_handles_labels()
         h2, l2 = right_ax.get_legend_handles_labels()
@@ -171,12 +184,15 @@ def plot_single(self, col, ax):
         ax.set_xlabel("")
         ax.set_ylabel("mean")
         ax.legend()
+        ax.tick_params(axis="x", rotation=90)

         right_ax = ax.twinx()

         self._std[col].plot(ax=right_ax, label="std", color="green")
         right_ax.set_xlabel("")
         right_ax.set_ylabel("std")
+        right_ax.tick_params(axis="x", rotation=90)
+        right_ax.grid(None)  # set the grid to None to avoid two layers of grid

         h1, l1 = ax.get_legend_handles_labels()
         h2, l2 = right_ax.get_legend_handles_labels()
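
Editor's note: a minimal reproduction of the double-grid problem these lines address, using synthetic data with no qlib dependency. The rcParams setting is illustrative, standing in for a plot style that enables grids by default:

    import matplotlib.pyplot as plt

    plt.rcParams["axes.grid"] = True  # grids on by default, as in styled reports

    fig, ax = plt.subplots()
    ax.plot([1, 2, 3], label="mean")

    right_ax = ax.twinx()
    right_ax.plot([3, 2, 1], color="green", label="std")
    right_ax.grid(None)  # toggle the twin axis grid off so only one grid layer shows

    plt.show()
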
18 changes: 18 additions & 0 deletions qlib/contrib/report/data/base.py
@@ -14,6 +14,24 @@

 class FeaAnalyser:
     def __init__(self, dataset: pd.DataFrame):
+        """
+        Parameters
+        ----------
+        dataset : pd.DataFrame
+            We often have multiple columns for the dataset. Each column corresponds to one sub-figure.
+            There will be a datetime column in the index levels.
+            Aggregation will be used for more summarized metrics over time.
+            Here is an example of data:
+
+            .. code-block::
+
+                                          return
+                datetime   instrument
+                2007-02-06 equity_tpx   0.010087
+                           equity_spx   0.000786
+
+        """
         self._dataset = dataset
         with TimeInspector.logt("calc_stat_values"):
             self.calc_stat_values()
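
Editor's note: a hedged sketch of building the kind of frame FeaAnalyser expects — one column per feature, with (datetime, instrument) index levels matching the docstring example:

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [
            (pd.Timestamp("2007-02-06"), "equity_tpx"),
            (pd.Timestamp("2007-02-06"), "equity_spx"),
        ],
        names=["datetime", "instrument"],
    )
    ret_df = pd.DataFrame({"return": [0.010087, 0.000786]}, index=idx)
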
[Diffs for the remaining 6 changed files are not shown here.]