From 95f121e584e66eb21cd320ab0ca4122083714a25 Mon Sep 17 00:00:00 2001
From: Dennis Bader <dennis.bader@gmx.ch>
Date: Tue, 16 Apr 2024 15:03:57 +0200
Subject: [PATCH] Fix/historical forecasts torch models (#2329)

* simplify hist fc tests part 1

* refactor torch hist fc auto start

* future cov hist fcs tests

* fix rnn model historical forecasts

* fix failing unit tests

* update changelog

* fix discrepancies in test comments

* fix failing unit tests
---
 CHANGELOG.md                                  |   8 +-
 darts/models/forecasting/ensemble_model.py    |   3 +-
 darts/models/forecasting/forecasting_model.py |  30 +-
 .../forecasting/global_baseline_models.py     |   3 -
 .../forecasting/regression_ensemble_model.py  |   5 +-
 darts/models/forecasting/regression_model.py  |   2 +
 darts/models/forecasting/rnn_model.py         |  37 +-
 .../forecasting/torch_forecasting_model.py    |  54 +-
 darts/tests/models/forecasting/test_RNN.py    |  19 +
 .../forecasting/test_ensemble_models.py       |  17 +-
 .../test_global_forecasting_models.py         |   3 +-
 .../forecasting/test_historical_forecasts.py  | 693 +++++++++++-------
 .../test_regression_ensemble_model.py         |  34 +-
 .../test_torch_forecasting_model.py           |   2 +-
 darts/utils/historical_forecasts/utils.py     |  17 +-
 15 files changed, 577 insertions(+), 350 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44e6f13281..06e5ed062b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -90,14 +90,18 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
 - Improvements to `RegressionModel`: [#2320](https://github.com/unit8co/darts/pull/2320) by [Felix Divo](https://github.com/felixdivo).
   - Added a progress bar when performing optimized historical forecasts (`retrain=False` and no autoregression) to display the series-level progress.
 - Improvements to `DataTransformer`: [#2267](https://github.com/unit8co/darts/pull/2267) by [Alicja Krzeminska-Sciga](https://github.com/alicjakrzeminska).
-  - `InvertibleDataTransformer` now supports parallelized inverse transformation for `series` being a list of lists of `TimeSeries` (`Sequence[Sequence[TimeSeries]]`). This `series` type represents for example the output from `historical_forecasts()` when using multiple series. 
+  - `InvertibleDataTransformer` now supports parallelized inverse transformation for `series` being a list of lists of `TimeSeries` (`Sequence[Sequence[TimeSeries]]`). This `series` type represents for example the output from `historical_forecasts()` when using multiple series.
+- Improvements to `RNNModel`: [#2329](https://github.com/unit8co/darts/pull/2329) by [Dennis Bader](https://github.com/dennisbader).
+  - 🔴 Enforce `training_length>=input_chunk_length` since otherwise, during training the model is never run for as many iterations as it will during prediction.
+  - Historical forecasts now correctly infer all possible prediction start points for untrained and pre-trained `RNNModel`.
 
 **Fixed**
 - Fixed a bug in `quantile_loss`, where the loss was computed on all samples rather than only on the predicted quantiles. [#2284](https://github.com/unit8co/darts/pull/2284) by [Dennis Bader](https://github.com/dennisbader).
 - Fixed type hint warning "Unexpected argument" when calling `historical_forecasts()` caused by the `_with_sanity_checks` decorator. The type hinting is now properly configured to expect any input arguments and return the output type of the method for which the sanity checks are performed for. [#2286](https://github.com/unit8co/darts/pull/2286) by [Dennis Bader](https://github.com/dennisbader).
 - Fixed the order of the features when using component-wise lags so that they are grouped by values, then by components (before, were grouped by components, then by values). [#2272](https://github.com/unit8co/darts/pull/2272) by [Antoine Madrona](https://github.com/madtoinou).
 - Fixed a segmentation fault that some users were facing when importing a `LightGBMModel`. [#2304](https://github.com/unit8co/darts/pull/2304) by [Dennis Bader](https://github.com/dennisbader).
-- Fixed a bug when using a dropout with a `TorchForecasting` and pytorch lightning versions >= 2.2.0, where the dropout was not properly activated during training. [#2312](https://github.com/unit8co/darts/pull/2312) by [Dennis Bader](https://github.com/dennisbader).
+- Fixed a bug when using a dropout with a `TorchForecastingModel` and pytorch lightning versions >= 2.2.0, where the dropout was not properly activated during training. [#2312](https://github.com/unit8co/darts/pull/2312) by [Dennis Bader](https://github.com/dennisbader).
+- Fixed a bug when performing historical forecasts with an untrained `TorchForecastingModel` and using covariates, where the historical forecastable time index generation did not take the covariates into account. [#2329](https://github.com/unit8co/darts/pull/2329) by [Dennis Bader](https://github.com/dennisbader).
 
 **Dependencies**
 
diff --git a/darts/models/forecasting/ensemble_model.py b/darts/models/forecasting/ensemble_model.py
index 98ba7293c3..2a1f6627e3 100644
--- a/darts/models/forecasting/ensemble_model.py
+++ b/darts/models/forecasting/ensemble_model.py
@@ -402,6 +402,7 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         def find_max_lag_or_none(lag_id, aggregator) -> Optional[int]:
             max_lag = None
@@ -413,7 +414,7 @@ def find_max_lag_or_none(lag_id, aggregator) -> Optional[int]:
                     max_lag = aggregator(max_lag, curr_lag)
             return max_lag
 
-        lag_aggregators = (min, max, min, max, min, max, max)
+        lag_aggregators = (min, max, min, max, min, max, max, max)
         return tuple(
             find_max_lag_or_none(i, agg) for i, agg in enumerate(lag_aggregators)
         )
diff --git a/darts/models/forecasting/forecasting_model.py b/darts/models/forecasting/forecasting_model.py
index 7b327cd8fd..6c58b3d44a 100644
--- a/darts/models/forecasting/forecasting_model.py
+++ b/darts/models/forecasting/forecasting_model.py
@@ -446,12 +446,13 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         """
-        A 7-tuple containing in order:
+        An 8-tuple containing in order:
         (min target lag, max target lag, min past covariate lag, max past covariate lag, min future covariate
-        lag, max future covariate lag, output shift). If 0 is the index of the first prediction, then all lags are
-        relative to this index.
+        lag, max future covariate lag, output shift, max target lag train (only for RNNModel)). If 0 is the index of the
+        first prediction, then all lags are relative to this index.
 
         See examples below.
 
@@ -474,27 +475,27 @@ def extreme_lags(
         >>> model = LinearRegressionModel(lags=3, output_chunk_length=2)
         >>> model.fit(train_series)
         >>> model.extreme_lags
-        (-3, 1, None, None, None, None, 0)
+        (-3, 1, None, None, None, None, 0, None)
         >>> model = LinearRegressionModel(lags=3, output_chunk_length=2, output_chunk_shift=2)
         >>> model.fit(train_series)
         >>> model.extreme_lags
-        (-3, 1, None, None, None, None, 2)
+        (-3, 1, None, None, None, None, 2, None)
         >>> model = LinearRegressionModel(lags=[-3, -5], lags_past_covariates = 4, output_chunk_length=7)
         >>> model.fit(train_series, past_covariates=past_covariates)
         >>> model.extreme_lags
-        (-5, 6, -4, -1,  None, None, 0)
+        (-5, 6, -4, -1,  None, None, 0, None)
         >>> model = LinearRegressionModel(lags=[3, 5], lags_future_covariates = [4, 6], output_chunk_length=7)
         >>> model.fit(train_series, future_covariates=future_covariates)
         >>> model.extreme_lags
-        (-5, 6, None, None, 4, 6, 0)
+        (-5, 6, None, None, 4, 6, 0, None)
         >>> model = NBEATSModel(input_chunk_length=10, output_chunk_length=7)
         >>> model.fit(train_series)
         >>> model.extreme_lags
-        (-10, 6, None, None, None, None, 0)
+        (-10, 6, None, None, None, None, 0, None)
         >>> model = NBEATSModel(input_chunk_length=10, output_chunk_length=7, lags_future_covariates=[4, 6])
         >>> model.fit(train_series, future_covariates)
         >>> model.extreme_lags
-        (-10, 6, None, None, 4, 6, 0)
+        (-10, 6, None, None, 4, 6, 0, None)
         """
 
     @property
@@ -510,10 +511,13 @@ def _training_sample_time_index_length(self) -> int:
             min_future_cov_lag,
             max_future_cov_lag,
             output_chunk_shift,
+            max_target_lag_train,
         ) = self.extreme_lags
 
+        # some models can have different output chunks for training and prediction (e.g. `RNNModel`)
+        output_lag = max_target_lag_train or max_target_lag
         return max(
-            max_target_lag + 1,
+            output_lag + 1,
             max_future_cov_lag + 1 if max_future_cov_lag else 0,
         ) - min(
             min_target_lag if min_target_lag else 0,
@@ -2452,12 +2456,13 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         # TODO: LocalForecastingModels do not yet handle extreme lags properly. Especially
         #  TransferableFutureCovariatesLocalForecastingModel, where there is a difference between fit and predict mode)
         #  do not yet. In general, Local models train on the entire series (input=output), different to Global models
         #  that use an input to predict an output.
-        return -self.min_train_series_length, -1, None, None, None, None, 0
+        return -self.min_train_series_length, -1, None, None, None, None, 0, None
 
     @property
     def supports_transferrable_series_prediction(self) -> bool:
@@ -2927,12 +2932,13 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         # TODO: LocalForecastingModels do not yet handle extreme lags properly. Especially
         #  TransferableFutureCovariatesLocalForecastingModel, where there is a difference between fit and predict mode)
         #  do not yet. In general, Local models train on the entire series (input=output), different to Global models
         #  that use an input to predict an output.
-        return -self.min_train_series_length, -1, None, None, 0, 0, 0
+        return -self.min_train_series_length, -1, None, None, 0, 0, 0, None
 
 
 class TransferableFutureCovariatesLocalForecastingModel(
diff --git a/darts/models/forecasting/global_baseline_models.py b/darts/models/forecasting/global_baseline_models.py
index 860e44609c..1da914872a 100644
--- a/darts/models/forecasting/global_baseline_models.py
+++ b/darts/models/forecasting/global_baseline_models.py
@@ -229,9 +229,6 @@ def _verify_predict_sample(self, predict_sample: Tuple):
         # have to match the training sample
         pass
 
-    def min_train_series_length(self) -> int:
-        return self.input_chunk_length
-
     def supports_likelihood_parameter_prediction(self) -> bool:
         return False
 
diff --git a/darts/models/forecasting/regression_ensemble_model.py b/darts/models/forecasting/regression_ensemble_model.py
index 835afbe883..a76bb1a2e9 100644
--- a/darts/models/forecasting/regression_ensemble_model.py
+++ b/darts/models/forecasting/regression_ensemble_model.py
@@ -316,9 +316,9 @@ def fit(
             # shift by the forecasting models' largest input length
             all_shifts = []
             # when it's not clearly defined, extreme_lags returns
-            # min_train_serie_length for the LocalForecastingModels
+            # `min_train_series_length` for the LocalForecastingModels
             for model in self.forecasting_models:
-                min_target_lag, _, _, _, _, _, _ = model.extreme_lags
+                min_target_lag, _, _, _, _, _, _, _ = model.extreme_lags
                 if min_target_lag is not None:
                     all_shifts.append(-min_target_lag)
 
@@ -459,6 +459,7 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         extreme_lags_ = super().extreme_lags
         # shift min_target_lag in the past to account for the regression model training set
diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py
index 3bfd45b439..ab01088a19 100644
--- a/darts/models/forecasting/regression_model.py
+++ b/darts/models/forecasting/regression_model.py
@@ -449,6 +449,7 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         min_target_lag = self.lags["target"][0] if "target" in self.lags else None
         max_target_lag = self.output_chunk_length - 1 + self.output_chunk_shift
@@ -464,6 +465,7 @@ def extreme_lags(
             min_future_cov_lag,
             max_future_cov_lag,
             self.output_chunk_shift,
+            None,
         )
 
     @property
diff --git a/darts/models/forecasting/rnn_model.py b/darts/models/forecasting/rnn_model.py
index 01621d9909..8ccda3712a 100644
--- a/darts/models/forecasting/rnn_model.py
+++ b/darts/models/forecasting/rnn_model.py
@@ -321,9 +321,9 @@ def __init__(
             Fraction of neurons afected by Dropout.
         training_length
             The length of both input (target and covariates) and output (target) time series used during
-            training. Generally speaking, `training_length` should have a higher value than `input_chunk_length`
-            because otherwise during training the RNN is never run for as many iterations as it will during
-            inference. For more information on this parameter, please see `darts.utils.data.ShiftedDataset`
+            training. Must be at least `input_chunk_length`, because otherwise during training
+            the RNN is never run for as many iterations as it will during inference. For more information on
+            this parameter, please see `darts.utils.data.ShiftedDataset`.
         **kwargs
             Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and
             Darts' :class:`TorchForecastingModel`.
@@ -485,6 +485,13 @@ def encode_year(idx):
             `RNN example notebook <https://unit8co.github.io/darts/examples/04-RNN-examples.html>`_ presents techniques
             that can be used to improve the forecasts quality compared to this simple usage example.
         """
+        if training_length < input_chunk_length:
+            raise_log(
+                ValueError(
+                    f"`training_length` ({training_length}) must be `>=input_chunk_length` ({input_chunk_length})."
+                ),
+                logger=logger,
+            )
         # create copy of model parameters
         model_kwargs = {key: val for key, val in self.model_params.items()}
 
@@ -585,3 +592,27 @@ def supports_multivariate(self) -> bool:
     @property
     def min_train_series_length(self) -> int:
         return self.training_length + 1
+
+    @property
+    def extreme_lags(
+        self,
+    ) -> Tuple[
+        Optional[int],
+        Optional[int],
+        Optional[int],
+        Optional[int],
+        Optional[int],
+        Optional[int],
+        int,
+        Optional[int],
+    ]:
+        return (
+            -self.input_chunk_length,
+            self.output_chunk_length - 1,
+            None,
+            None,
+            -self.input_chunk_length,
+            self.output_chunk_length - 1,
+            self.output_chunk_shift,
+            self.training_length - self.input_chunk_length,
+        )
diff --git a/darts/models/forecasting/torch_forecasting_model.py b/darts/models/forecasting/torch_forecasting_model.py
index f3c877c24c..1bde798b6c 100644
--- a/darts/models/forecasting/torch_forecasting_model.py
+++ b/darts/models/forecasting/torch_forecasting_model.py
@@ -2494,15 +2494,17 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         return (
             -self.input_chunk_length,
             self.output_chunk_length - 1 + self.output_chunk_shift,
-            -self.input_chunk_length if self.uses_past_covariates else None,
-            -1 if self.uses_past_covariates else None,
+            -self.input_chunk_length,
+            -1,
             None,
             None,
             self.output_chunk_shift,
+            None,
         )
 
 
@@ -2583,19 +2585,17 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         return (
             -self.input_chunk_length,
             self.output_chunk_length - 1 + self.output_chunk_shift,
             None,
             None,
-            self.output_chunk_shift if self.uses_future_covariates else None,
-            (
-                self.output_chunk_length - 1 + self.output_chunk_shift
-                if self.uses_future_covariates
-                else None
-            ),
             self.output_chunk_shift,
+            self.output_chunk_length - 1 + self.output_chunk_shift,
+            self.output_chunk_shift,
+            None,
         )
 
 
@@ -2677,19 +2677,17 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         return (
             -self.input_chunk_length,
             self.output_chunk_length - 1 + self.output_chunk_shift,
             None,
             None,
-            -self.input_chunk_length if self.uses_future_covariates else None,
-            (
-                self.output_chunk_length - 1 + self.output_chunk_shift
-                if self.uses_future_covariates
-                else None
-            ),
+            -self.input_chunk_length,
+            self.output_chunk_length - 1 + self.output_chunk_shift,
             self.output_chunk_shift,
+            None,
         )
 
 
@@ -2771,19 +2769,17 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         return (
             -self.input_chunk_length,
             self.output_chunk_length - 1 + self.output_chunk_shift,
-            -self.input_chunk_length if self.uses_past_covariates else None,
-            -1 if self.uses_past_covariates else None,
-            -self.input_chunk_length if self.uses_future_covariates else None,
-            (
-                self.output_chunk_length - 1 + self.output_chunk_shift
-                if self.uses_future_covariates
-                else None
-            ),
+            -self.input_chunk_length,
+            -1,
+            -self.input_chunk_length,
+            self.output_chunk_length - 1 + self.output_chunk_shift,
             self.output_chunk_shift,
+            None,
         )
 
     def predict(
@@ -2922,17 +2918,15 @@ def extreme_lags(
         Optional[int],
         Optional[int],
         int,
+        Optional[int],
     ]:
         return (
             -self.input_chunk_length,
             self.output_chunk_length - 1 + self.output_chunk_shift,
-            -self.input_chunk_length if self.uses_past_covariates else None,
-            -1 if self.uses_past_covariates else None,
-            self.output_chunk_shift if self.uses_future_covariates else None,
-            (
-                self.output_chunk_length - 1 + self.output_chunk_shift
-                if self.uses_future_covariates
-                else None
-            ),
+            -self.input_chunk_length,
+            -1,
             self.output_chunk_shift,
+            self.output_chunk_length - 1 + self.output_chunk_shift,
+            self.output_chunk_shift,
+            None,
         )
diff --git a/darts/tests/models/forecasting/test_RNN.py b/darts/tests/models/forecasting/test_RNN.py
index 8fe711a6d3..30c58cfeec 100644
--- a/darts/tests/models/forecasting/test_RNN.py
+++ b/darts/tests/models/forecasting/test_RNN.py
@@ -55,6 +55,25 @@ class TestRNNModel:
         dropout=0,
     )
 
+    def test_training_length_input(self):
+        # too small training length
+        with pytest.raises(ValueError) as msg:
+            RNNModel(input_chunk_length=2, training_length=1)
+        assert (
+            str(msg.value)
+            == "`training_length` (1) must be `>=input_chunk_length` (2)."
+        )
+
+        # training_length >= input_chunk_length works
+        model = RNNModel(
+            input_chunk_length=2,
+            training_length=2,
+            n_epochs=1,
+            random_state=42,
+            **tfm_kwargs,
+        )
+        model.fit(self.series[:3])
+
     def test_creation(self):
         # cannot choose any string
         with pytest.raises(ValueError) as msg:
diff --git a/darts/tests/models/forecasting/test_ensemble_models.py b/darts/tests/models/forecasting/test_ensemble_models.py
index 79d3f5d762..42a8534afd 100644
--- a/darts/tests/models/forecasting/test_ensemble_models.py
+++ b/darts/tests/models/forecasting/test_ensemble_models.py
@@ -111,6 +111,7 @@ def test_extreme_lag_inference(self):
             None,
             None,
             0,
+            None,
         )  # test if default is okay
 
         model1 = LinearRegressionModel(
@@ -123,7 +124,19 @@ def test_extreme_lag_inference(self):
         ensemble = NaiveEnsembleModel(
             [model1, model2]
         )  # test if infers extreme lags is okay
-        expected = (-5, 0, -6, -1, 6, 9, 0)
+        expected = (-5, 0, -6, -1, 6, 9, 0, None)
+        assert expected == ensemble.extreme_lags
+
+    @pytest.mark.skipif(not TORCH_AVAILABLE, reason="requires torch")
+    def test_extreme_lags_rnn(self):
+        # RNNModel has the 8th element in `extreme_lags` for the `max_target_lag_train`.
+        # it is given by `training_length - input_chunk_length`.
+        # for the ensemble model we want the max lag of all forecasting models.
+        model1 = RNNModel(input_chunk_length=14, training_length=24)
+        model2 = RNNModel(input_chunk_length=12, training_length=37)
+
+        ensemble = NaiveEnsembleModel([model1, model2])
+        expected = (-14, 0, None, None, -14, 0, 0, 37 - 12)
         assert expected == ensemble.extreme_lags
 
     def test_input_models_local_models(self):
@@ -152,7 +165,7 @@ def test_call_predict_local_models(self):
     def test_call_backtest_naive_ensemble_local_models(self):
         ensemble = NaiveEnsembleModel([NaiveSeasonal(5), Theta(2, 5)])
         ensemble.fit(self.series1)
-        assert ensemble.extreme_lags == (-10, -1, None, None, None, None, 0)
+        assert ensemble.extreme_lags == (-10, -1, None, None, None, None, 0, None)
         ensemble.backtest(self.series1)
 
     def test_predict_univariate_ensemble_local_models(self):
diff --git a/darts/tests/models/forecasting/test_global_forecasting_models.py b/darts/tests/models/forecasting/test_global_forecasting_models.py
index 59d278f756..bdd04ae030 100644
--- a/darts/tests/models/forecasting/test_global_forecasting_models.py
+++ b/darts/tests/models/forecasting/test_global_forecasting_models.py
@@ -67,6 +67,7 @@
         RNNModel,
         {
             "model": "RNN",
+            "training_length": IN_LEN + OUT_LEN,
             "hidden_dim": 10,
             "batch_size": 32,
             "n_epochs": 10,
@@ -77,7 +78,7 @@
     (
         RNNModel,
         {
-            "training_length": 12,
+            "training_length": IN_LEN + OUT_LEN,
             "n_epochs": 10,
             "likelihood": GaussianLikelihood(),
             "pl_trainer_kwargs": tfm_kwargs["pl_trainer_kwargs"],
diff --git a/darts/tests/models/forecasting/test_historical_forecasts.py b/darts/tests/models/forecasting/test_historical_forecasts.py
index e92eedffdc..738b7ef3f0 100644
--- a/darts/tests/models/forecasting/test_historical_forecasts.py
+++ b/darts/tests/models/forecasting/test_historical_forecasts.py
@@ -133,6 +133,7 @@
             RNNModel,
             {
                 "input_chunk_length": IN_LEN,
+                "training_length": IN_LEN + OUT_LEN - 1,
                 "model": "RNN",
                 "hidden_dim": 10,
                 "batch_size": 32,
@@ -147,7 +148,7 @@
             RNNModel,
             {
                 "input_chunk_length": IN_LEN,
-                "training_length": 12,
+                "training_length": IN_LEN + OUT_LEN - 1,
                 "n_epochs": NB_EPOCH,
                 "likelihood": GaussianLikelihood(),
                 **tfm_kwargs,
@@ -1378,137 +1379,6 @@ def f_encoder(idx):
                     hfc.all_values(), ohfc.all_values()
                 )
 
-    @pytest.mark.slow
-    @pytest.mark.skipif(not TORCH_AVAILABLE, reason="requires torch")
-    @pytest.mark.parametrize("model_config", models_torch_cls_kwargs)
-    def test_torch_auto_start_multiple_no_cov(self, model_config):
-        forecast_hrz = 10
-        model_cls, kwargs, bounds, _ = model_config
-        model = model_cls(
-            random_state=0,
-            **kwargs,
-        )
-        model.fit(self.ts_pass_train)
-
-        # check historical forecasts for several time series,
-        # retrain True and overlap_end False
-        forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
-            forecast_horizon=forecast_hrz,
-            stride=1,
-            retrain=True,
-            overlap_end=False,
-        )
-        assert (
-            len(forecasts) == 2
-        ), f"Model {model_cls} did not return a list of historical forecasts"
-        # If retrain=True and overlap_end=False, as ts has 72 values, we can only forecast
-        # (target length)-(training length=input_chunk_length+output_chunk_length) - (horizon - 1)
-        # indeed we start to predict after the first trainable point (input_chunk_length+output_chunk_length)
-        # and we stop in this case (overlap_end=False) at the end_time:
-        # target.end_time() - (horizon - 1) * target.freq
-
-        # explanation:
-        # (bounds): train sample length
-        # (horizon - 1): with overlap_end=False, if entire horizon is available (overlap_end=False),
-        # we can predict 1
-        theorical_forecast_length = (
-            self.ts_val_length - (bounds[0] + bounds[1]) - (forecast_hrz - 1)
-        )
-        assert len(forecasts[0]) == len(forecasts[1]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in the case of "
-            f"retrain=True and overlap_end=False. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[0])} and {len(forecasts[1])}"
-        )
-
-        model = model_cls(
-            random_state=0,
-            **kwargs,
-        )
-
-        model.fit(self.ts_pass_train)
-        # check historical forecasts for several time series,
-        # retrain True and overlap_end True
-        forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
-            forecast_horizon=forecast_hrz,
-            stride=1,
-            retrain=True,
-            overlap_end=True,
-        )
-
-        assert (
-            len(forecasts) == 2
-        ), f"Model {model_cls} did not return a list of historical forecasts"
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (bounds[0] + bounds[1])  # train sample length
-            + 1  # with overlap_end=True, we are not restricted by the end of the series or horizon
-        )
-        assert len(forecasts[0]) == len(forecasts[1]) == theorical_forecast_length
-
-        model = model_cls(
-            random_state=0,
-            **kwargs,
-        )
-        model.fit(self.ts_pass_train)
-        # check historical forecasts for several time series,
-        # retrain False and overlap_end False
-        forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
-            forecast_horizon=forecast_hrz,
-            stride=1,
-            retrain=False,
-            overlap_end=False,
-        )
-
-        assert (
-            len(forecasts) == 2
-        ), f"Model {model_cls} did not return a list of historical forecasts"
-        theorical_forecast_length = (
-            self.ts_val_length
-            - bounds[0]  # prediction input sample length
-            - (
-                forecast_hrz - 1
-            )  # overlap_end=False -> if entire horizon is available, we can predict 1
-        )
-        assert len(forecasts[0]) == len(forecasts[1]) == theorical_forecast_length
-        assert (
-            forecasts[0].end_time()
-            == forecasts[1].end_time()
-            == self.ts_pass_val.end_time()
-        )
-
-        model = model_cls(
-            random_state=0,
-            **kwargs,
-        )
-        model.fit(self.ts_pass_train)
-        # check historical forecasts for several time series,
-        # retrain False and overlap_end True
-        forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
-            forecast_horizon=forecast_hrz,
-            stride=1,
-            retrain=False,
-            overlap_end=True,
-        )
-
-        assert (
-            len(forecasts) == 2
-        ), f"Model {model_cls} did not return a list of historical forecasts"
-        theorical_forecast_length = (
-            self.ts_val_length
-            - bounds[0]  # prediction input sample length
-            + 1  # overlap_end=True -> last possible prediction start is one step after end of target
-        )
-        assert len(forecasts[0]) == len(forecasts[1]) == theorical_forecast_length
-        assert (
-            forecasts[0].end_time()
-            == forecasts[1].end_time()
-            == self.ts_pass_val.end_time() + forecast_hrz * self.ts_pass_val.freq
-        )
-
     def test_hist_fc_end_exact_with_covs(self):
         model = LinearRegressionModel(
             lags=2,
@@ -1591,6 +1461,7 @@ def test_regression_auto_start_multiple_with_cov_retrain(self, model_config):
             min_future_cov_lag,
             max_future_cov_lag,
             output_chunk_shift,
+            _,
         ) = model.extreme_lags
 
         past_lag = min(
@@ -1703,6 +1574,7 @@ def test_regression_auto_start_multiple_with_cov_no_retrain(self, model_config):
             min_future_cov_lag,
             max_future_cov_lag,
             output_chunk_shift,
+            _,
         ) = model.extreme_lags
 
         past_lag = min(
@@ -1731,10 +1603,86 @@ def test_regression_auto_start_multiple_with_cov_no_retrain(self, model_config):
 
     @pytest.mark.slow
     @pytest.mark.skipif(not TORCH_AVAILABLE, reason="requires torch")
-    @pytest.mark.parametrize("model_config", models_torch_cls_kwargs)
-    def test_torch_auto_start_with_past_cov(self, model_config):
+    @pytest.mark.parametrize(
+        "model_config,retrain",
+        itertools.product(models_torch_cls_kwargs, [True, False]),
+    )
+    def test_torch_auto_start_multiple_no_cov(self, model_config, retrain):
+        n_fcs = 3
         forecast_hrz = 10
-        # Past covariates only
+        model_cls, kwargs, bounds, _ = model_config
+        model = model_cls(
+            random_state=0,
+            **kwargs,
+        )
+
+        # we expect first predicted point after `min_train_series_length`
+        # model is expected to generate `n_fcs` historical forecasts with `n=forecast_hrz` and
+        # `series` of length `length_series_history`
+        length_series_history = model.min_train_series_length + forecast_hrz + n_fcs - 1
+        series = self.ts_pass_train[:length_series_history]
+        if not retrain:
+            model.fit(series)
+
+        # check historical forecasts for several time series,
+        # `retrain` as parametrized and overlap_end False
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=False,
+        )
+        assert (
+            len(forecasts) == 2
+        ), f"Model {model_cls} did not return a list of historical forecasts"
+
+        # with the required time spans we expect to get `n_fcs` forecasts
+        if not retrain:
+            # with retrain=False, we can start `output_chunk_length` steps earlier for non-RNNModels
+            # and `training_length - input_chunk_length` steps for RNNModels
+            if not isinstance(model, RNNModel):
+                add_fcs = model.extreme_lags[1] + 1
+            else:
+                add_fcs = model.extreme_lags[7] + 1
+        else:
+            add_fcs = 0
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        assert forecasts[0].end_time() == forecasts[1].end_time() == series.end_time()
+
+        # check historical forecasts for several time series,
+        # `retrain` as parametrized and overlap_end True
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=True,
+        )
+
+        assert (
+            len(forecasts) == 2
+        ), f"Model {model_cls} did not return a list of historical forecasts"
+        # with overlap_end=True, we can generate additional `forecast_hrz`
+        # forecasts after the end of `series`
+        # with retrain=False, we can start `add_fcs` steps earlier
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + forecast_hrz + add_fcs
+        assert (
+            forecasts[0].end_time()
+            == forecasts[1].end_time()
+            == series.end_time() + forecast_hrz * series.freq
+        )
+
+    @pytest.mark.slow
+    @pytest.mark.skipif(not TORCH_AVAILABLE, reason="requires torch")
+    @pytest.mark.parametrize(
+        "model_config,retrain",
+        itertools.product(models_torch_cls_kwargs, [True, False]),
+    )
+    def test_torch_auto_start_with_past_cov(self, model_config, retrain):
+        n_fcs = 3
+        forecast_hrz = 10
+        # past covariates only
         model_cls, kwargs, bounds, cov_type = model_config
 
         model = model_cls(
@@ -1752,231 +1700,410 @@ def test_torch_auto_start_with_past_cov(self, model_config):
             )
             return
 
-        model.fit(self.ts_pass_train, self.ts_past_cov_train)
+        # we expect first predicted point after `min_train_series_length`
+        # model is expected to generate `n_fcs` historical forecasts with `n=forecast_hrz`,
+        # `series` of length `length_series_history`, and covariates that cover the required time range
+        length_series_history = model.min_train_series_length + forecast_hrz + n_fcs - 1
+        series = self.ts_pass_train[:length_series_history]
+
+        # for historical forecasts, minimum required past covariates should end
+        # `forecast_hrz` before the end of `series`
+        pc = series[:-forecast_hrz]
+
+        if not retrain:
+            model.fit(series, past_covariates=pc)
 
-        # same start
+        # same start, overlap_end=False
         forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
-            past_covariates=[
-                self.ts_past_cov_valid_same_start,
-                self.ts_past_cov_valid_same_start,
-            ],
+            series=[series] * 2,
+            past_covariates=[pc] * 2,
             forecast_horizon=forecast_hrz,
             stride=1,
-            retrain=True,
+            retrain=retrain,
             overlap_end=False,
         )
-
         assert (
             len(forecasts) == 2
         ), f"Model {model_cls} did not return a list of historical forecasts"
 
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (bounds[0] + bounds[1])  # train sample length
-            - (forecast_hrz - 1)  # if entire horizon is available, we can predict 1
-            - 0  # past covs have same start as target -> no shift
-            - 0  # we don't have future covs in output chunk -> no shift
-        )
-        assert len(forecasts[0]) == len(forecasts[1]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in case "
-            f"of retrain=True and overlap_end=False and past_covariates with same start. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[0])} and {len(forecasts[1])}"
+        # with the required time spans we expect to get `n_fcs` forecasts
+        if not retrain:
+            # with retrain=False, we can start `output_chunk_length` steps earlier for non-RNNModels
+            # and `training_length - input_chunk_length` steps for RNNModels
+            add_fcs = model.extreme_lags[1] + 1
+        else:
+            add_fcs = 0
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        assert forecasts[0].end_time() == forecasts[1].end_time() == series.end_time()
+
+        # check the same for `overlap_end=True`
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            past_covariates=[pc] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=True,
         )
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        assert forecasts[0].end_time() == forecasts[1].end_time() == series.end_time()
 
-        model = model_cls(
-            random_state=0,
-            **kwargs,
+        # same time index, `overlap_end=True`
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            past_covariates=[series] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=True,
         )
-        model.fit(self.ts_pass_train, past_covariates=self.ts_past_cov_train)
+        assert (
+            len(forecasts) == 2
+        ), f"Model {model_cls} did not return a list of historical forecasts"
+        # with overlap_end=True, we can generate additional `forecast_hrz`
+        # forecasts after the end of `series`
+        # with retrain=False, we can start `add_fcs` steps earlier
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + forecast_hrz + add_fcs
+        assert (
+            forecasts[0].end_time()
+            == forecasts[1].end_time()
+            == series.end_time() + forecast_hrz * series.freq
+        )
+
+        # `pc_longer` has more than required length
+        pc_longer = pc.prepend_values([0.0]).append_values([0.0])
+        # `pc_longer_start` starts one step before `pc` and covers the required times
+        pc_longer_start = pc.prepend_values([0.0])
+        # `pc_start_after` has required length but starts one step after `pc`
+        pc_start_after = pc[1:].append_values([0.0])
+        # `pc_end_before` has required length but ends one step before `pc`
+        pc_end_before = pc[:-1].prepend_values([0.0])
 
-        # start before, after
+        # checks for long enough and shorter covariates
         forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
+            series=[series] * 4,
             past_covariates=[
-                self.ts_past_cov_valid_5_aft_start,
-                self.ts_past_cov_valid_10_bef_start,
+                pc_longer,
+                pc_longer_start,
+                pc_start_after,
+                pc_end_before,
             ],
             forecast_horizon=forecast_hrz,
             stride=1,
-            retrain=True,
+            retrain=retrain,
             overlap_end=False,
         )
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (bounds[0] + bounds[1])  # train sample length
-            - (forecast_hrz - 1)  # if entire horizon is available, we can predict 1
-            - 5  # past covs start 5 later -> shift
-            - 0  # we don't have future covs in output chunk -> no shift
-        )
-        assert len(forecasts[0]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in case "
-            f"of retrain=True and overlap_end=False and past_covariates starting after. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[0])}"
-        )
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (bounds[0] + bounds[1])  # train sample length
-            - (forecast_hrz - 1)  # if entire horizon is available, we can predict 1
-            - 0  # past covs have same start as target -> no shift
-            - 0  # we don't have future covs in output chunk -> no shift
-        )
-        assert len(forecasts[1]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in case "
-            f"of retrain=True and overlap_end=False and past_covariates starting before. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[1])}"
-        )
+
+        # for long enough past covariates (but too short for overlapping after the end), we expect `n_fcs` forecast
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        # `pc_start_after` and `pc_end_before` are one step too short for all `n_fcs`
+        assert len(forecasts[2]) == len(forecasts[3]) == n_fcs + add_fcs - 1
+        assert all([fc.end_time() == series.end_time() for fc in forecasts[:3]])
+        assert forecasts[3].end_time() == series.end_time() - series.freq
 
     @pytest.mark.slow
     @pytest.mark.skipif(not TORCH_AVAILABLE, reason="requires torch")
-    @pytest.mark.parametrize("model_config", models_torch_cls_kwargs)
-    def test_torch_auto_start_with_past_future_cov(self, model_config):
+    @pytest.mark.parametrize(
+        "model_config,retrain",
+        list(itertools.product(models_torch_cls_kwargs, [True, False]))[2:],
+    )
+    def test_torch_auto_start_with_future_cov(self, model_config, retrain):
+        n_fcs = 3
         forecast_hrz = 10
-        # Past and future covariates
+        # future covariates only
         model_cls, kwargs, bounds, cov_type = model_config
 
         model = model_cls(
             random_state=0,
             **kwargs,
         )
-        if not (model.supports_past_covariates and model.supports_future_covariates):
+        if not model.supports_future_covariates:
             with pytest.raises(ValueError) as err:
                 model.fit(
-                    self.ts_pass_train,
-                    past_covariates=self.ts_past_cov_train,
-                    future_covariates=self.ts_fut_cov_train,
+                    series=self.ts_pass_train, future_covariates=self.ts_fut_cov_train
                 )
-            invalid_covs = []
-            if not model.supports_past_covariates:
-                invalid_covs.append("`past_covariates`")
-            if not model.supports_future_covariates:
-                invalid_covs.append("`future_covariates`")
             assert str(err.value).startswith(
-                f"The model does not support {', '.join(invalid_covs)}"
+                "The model does not support `future_covariates`."
             )
             return
 
-        model.fit(
-            self.ts_pass_train,
-            past_covariates=self.ts_past_cov_train,
-            future_covariates=self.ts_fut_cov_train,
-        )
+        # we expect first predicted point after `min_train_series_length`
+        # model is expected to generate `n_fcs` historical forecasts with `n=forecast_hrz`,
+        # `series` of length `length_series_history`, and covariates that cover the required time range
+        length_series_history = model.min_train_series_length + forecast_hrz + n_fcs - 1
+        series = self.ts_pass_train[:length_series_history]
+
+        # to generate `n_fcs` historical forecasts when `output_chunk_length > forecast_horizon`,
+        # we need additional `output_chunk_length - forecast_horizon` future covariates steps
+        add_n = max(model.extreme_lags[1] + 1 - forecast_hrz, 0)
+        fc = series.append_values([0.0] * add_n) if add_n else series
+
+        if not retrain:
+            model.fit(series, future_covariates=fc)
 
+        # same start, overlap_end=False
         forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
-            past_covariates=[
-                self.ts_past_cov_valid_5_aft_start,
-                self.ts_past_cov_valid_same_start,
-            ],
-            future_covariates=[
-                self.ts_fut_cov_valid_7_aft_start,
-                self.ts_fut_cov_valid_16_bef_start,
-            ],
+            series=[series] * 2,
+            future_covariates=[fc] * 2,
             forecast_horizon=forecast_hrz,
             stride=1,
-            retrain=True,
+            retrain=retrain,
             overlap_end=False,
         )
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (bounds[0] + bounds[1])  # train sample length
-            - (forecast_hrz - 1)  # if entire horizon is available, we can predict 1
-            - 7  # future covs start 7 after target (more than past covs) -> shift
-            - 2  # future covs in output chunk -> difference between horizon=10 and output_chunk_length=12
+        assert (
+            len(forecasts) == 2
+        ), f"Model {model_cls} did not return a list of historical forecasts"
+
+        # with the required time spans we expect to get `n_fcs` forecasts
+        if not retrain:
+            # with retrain=False, we can start `output_chunk_length` steps earlier for non-RNNModels
+            # and `training_length - input_chunk_length` steps for RNNModels
+            if not isinstance(model, RNNModel):
+                add_fcs = model.extreme_lags[1] + 1
+            else:
+                add_fcs = model.extreme_lags[7] + 1
+        else:
+            add_fcs = 0
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        assert forecasts[0].end_time() == forecasts[1].end_time() == series.end_time()
+
+        # check the same for `overlap_end=True`
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            future_covariates=[fc] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=True,
         )
-        assert len(forecasts[0]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in case "
-            f"of retrain=True and overlap_end=False and past_covariates and future_covariates with "
-            f"different start. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[0])}"
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        assert forecasts[0].end_time() == forecasts[1].end_time() == series.end_time()
+
+        # `overlap_end=True`, with long enough future covariates
+        if not isinstance(model, RNNModel):
+            add_n = model.output_chunk_length
+        else:
+            # RNNModel is a special case with always `output_chunk_length=1`
+            add_n = forecast_hrz
+        fc_long = fc.append_values([0.0] * add_n)
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            future_covariates=[fc_long] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=True,
         )
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (bounds[0] + bounds[1])  # train sample length
-            - (forecast_hrz - 1)  # if entire horizon is available, we can predict 1,
-            - 0  # all covs start at the same time as target -> no shift,
-            - 2  # future covs in output chunk -> difference between horizon=10 and output_chunk_length=12
+        assert (
+            len(forecasts) == 2
+        ), f"Model {model_cls} did not return a list of historical forecasts"
+        # with overlap_end=True, we can generate additional `forecast_hrz`
+        # forecasts after the end of `series`
+        # with retrain=False, we can start `add_fcs` steps earlier
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + forecast_hrz + add_fcs
+        assert (
+            forecasts[0].end_time()
+            == forecasts[1].end_time()
+            == series.end_time() + forecast_hrz * series.freq
         )
-        assert len(forecasts[1]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in case "
-            f"of retrain=True and overlap_end=False and past_covariates with different start. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[1])}"
+
+        # `fc_longer` has more than required length
+        fc_longer = fc.prepend_values([0.0]).append_values([0.0])
+        # `fc_longer_start` starts one step before `fc` and covers the required times
+        fc_longer_start = fc.prepend_values([0.0])
+        # `fc_start_after` has required length but starts one step after `fc`
+        fc_start_after = fc[1:].append_values([0.0])
+        # `fc_end_before` has required length but ends one step before `fc`
+        fc_end_before = fc[:-1].prepend_values([0.0])
+
+        # checks for long enough and shorter covariates
+        forecasts = model.historical_forecasts(
+            series=[series] * 4,
+            future_covariates=[
+                fc_longer,
+                fc_longer_start,
+                fc_start_after,
+                fc_end_before,
+            ],
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=False,
         )
 
+        # for long enough future covariates (but too short for overlapping after the end), we expect `n_fcs` forecast
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        # `fc_start_after` and `fc_end_before` are one step too short for all `n_fcs`
+        assert len(forecasts[2]) == len(forecasts[3]) == n_fcs + add_fcs - 1
+        assert all([fc.end_time() == series.end_time() for fc in forecasts[:3]])
+        assert forecasts[3].end_time() == series.end_time() - series.freq
+
     @pytest.mark.slow
     @pytest.mark.skipif(not TORCH_AVAILABLE, reason="requires torch")
-    @pytest.mark.parametrize("model_config", models_torch_cls_kwargs)
-    def test_torch_auto_start_with_future_cov(self, model_config):
+    @pytest.mark.parametrize(
+        "model_config,retrain",
+        itertools.product(models_torch_cls_kwargs, [True, False]),
+    )
+    def test_torch_auto_start_with_past_and_future_cov(self, model_config, retrain):
+        n_fcs = 3
         forecast_hrz = 10
-        # Future covariates only
+        # past and future covariates
         model_cls, kwargs, bounds, cov_type = model_config
 
         model = model_cls(
             random_state=0,
             **kwargs,
         )
-
-        if not model.supports_future_covariates:
+        if not (model.supports_past_covariates and model.supports_future_covariates):
             with pytest.raises(ValueError) as err:
-                model.fit(self.ts_pass_train, future_covariates=self.ts_fut_cov_train)
+                model.fit(
+                    self.ts_pass_train,
+                    past_covariates=self.ts_past_cov_train,
+                    future_covariates=self.ts_fut_cov_train,
+                )
+            invalid_covs = []
+            if not model.supports_past_covariates:
+                invalid_covs.append("`past_covariates`")
+            if not model.supports_future_covariates:
+                invalid_covs.append("`future_covariates`")
             assert str(err.value).startswith(
-                "The model does not support `future_covariates`"
+                f"The model does not support {', '.join(invalid_covs)}"
             )
             return
 
-        model.fit(self.ts_pass_train, future_covariates=self.ts_fut_cov_train)
+        # we expect first predicted point after `min_train_series_length`
+        # model is expected to generate `n_fcs` historical forecasts with `n=forecast_hrz`,
+        # `series` of length `length_series_history`, and covariates that cover the required time range
+        length_series_history = model.min_train_series_length + forecast_hrz + n_fcs - 1
+        series = self.ts_pass_train[:length_series_history]
+
+        # for historical forecasts, minimum required past covariates should end
+        # `forecast_hrz` before the end of `series`
+        pc = series[:-forecast_hrz]
 
-        # Only fut covariate
+        # to generate `n_fcs` historical forecasts when `output_chunk_length > forecast_horizon`,
+        # we need additional `output_chunk_length - forecast_horizon` future covariates steps
+        add_n = max(model.extreme_lags[1] + 1 - forecast_hrz, 0)
+        fc = series.append_values([0.0] * add_n) if add_n else series
+
+        if not retrain:
+            model.fit(series, past_covariates=pc, future_covariates=fc)
+
+        # same start, overlap_end=False
         forecasts = model.historical_forecasts(
-            series=[self.ts_pass_val, self.ts_pass_val],
-            future_covariates=[
-                self.ts_fut_cov_valid_7_aft_start,
-                self.ts_fut_cov_valid_16_bef_start,
-            ],
+            series=[series] * 2,
+            past_covariates=[pc] * 2,
+            future_covariates=[fc] * 2,
             forecast_horizon=forecast_hrz,
             stride=1,
-            retrain=True,
+            retrain=retrain,
             overlap_end=False,
         )
-
         assert (
             len(forecasts) == 2
         ), f"Model {model_cls} did not return a list of historical forecasts"
 
-        icl, ocl = bounds
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (icl + ocl)  # train sample length
-            - (
-                forecast_hrz - 1
-            )  # (horizon - 1): if entire horizon is available, we can predict 1,
-            - 7  # future covs start 7 after target (more than past covs) -> shift
-            - max(
-                ocl - forecast_hrz, 0
-            )  # future covs in output chunk -> difference between hrz=10 and ocl=12
-        )
-        assert len(forecasts[0]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in case "
-            f"of retrain=True and overlap_end=False and no past_covariates and future_covariates "
-            f"with different start. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[0])}"
+        # with the required time spans we expect to get `n_fcs` forecasts
+        if not retrain:
+            # with retrain=False, we can start `output_chunk_length` steps earlier for non-RNNModels
+            # and `training_length - input_chunk_length` steps for RNNModels
+            if not isinstance(model, RNNModel):
+                add_fcs = model.extreme_lags[1] + 1
+            else:
+                add_fcs = model.extreme_lags[7] + 1
+        else:
+            add_fcs = 0
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        assert forecasts[0].end_time() == forecasts[1].end_time() == series.end_time()
+
+        # check the same for `overlap_end=True`
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            past_covariates=[pc] * 2,
+            future_covariates=[fc] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=True,
         )
-        theorical_forecast_length = (
-            self.ts_val_length
-            - (icl + ocl)  # train sample length
-            - (forecast_hrz - 1)  # if entire horizon is available, we can predict 1
-            - 0  # all covs start at the same time as target -> no shift
-            - max(
-                ocl - forecast_hrz, 0
-            )  # future covs in output chunk -> difference between hrz=10 and ocl=12
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        assert forecasts[0].end_time() == forecasts[1].end_time() == series.end_time()
+
+        # `overlap_end=True`, with long enough past and future covariates
+        if not isinstance(model, RNNModel):
+            add_n = model.output_chunk_length
+        else:
+            # RNNModel is a special case with always `output_chunk_length=1`
+            add_n = forecast_hrz
+        fc_long = fc.append_values([0.0] * add_n)
+        forecasts = model.historical_forecasts(
+            series=[series] * 2,
+            past_covariates=[series] * 2,
+            future_covariates=[fc_long] * 2,
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=True,
         )
-        assert len(forecasts[1]) == theorical_forecast_length, (
-            f"Model {model_cls} does not return the right number of historical forecasts in case "
-            f"of retrain=True and overlap_end=False and no past_covariates and future_covariates "
-            f"with different start. "
-            f"Expected {theorical_forecast_length}, got {len(forecasts[1])}"
+        assert (
+            len(forecasts) == 2
+        ), f"Model {model_cls} did not return a list of historical forecasts"
+        # with overlap_end=True, we can generate additional `forecast_hrz`
+        # forecasts after the end of `series`
+        # with retrain=False, we can start `add_fcs` steps earlier
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + forecast_hrz + add_fcs
+        assert (
+            forecasts[0].end_time()
+            == forecasts[1].end_time()
+            == series.end_time() + forecast_hrz * series.freq
+        )
+
+        # `pc_longer` has more than required length
+        pc_longer = pc.prepend_values([0.0]).append_values([0.0])
+        # `pc_longer_start` starts one step before `pc` and covers the required times
+        pc_longer_start = pc.prepend_values([0.0])
+        # `pc_start_after` has required length but starts one step after `pc`
+        pc_start_after = pc[1:].append_values([0.0])
+        # `pc_end_before` has required length but ends one step before `pc`
+        pc_end_before = pc[:-1].prepend_values([0.0])
+
+        # `fc_longer` has more than required length
+        fc_longer = fc.prepend_values([0.0]).append_values([0.0])
+        # `fc_longer_start` starts one step before `fc` and covers the required times
+        fc_longer_start = fc.prepend_values([0.0])
+        # `fc_start_after` has required length but starts one step after `fc`
+        fc_start_after = fc[1:].append_values([0.0])
+        # `fc_end_before` has required length but ends one step before `fc`
+        fc_end_before = fc[:-1].prepend_values([0.0])
+
+        # checks for long enough and shorter covariates
+        forecasts = model.historical_forecasts(
+            series=[series] * 4,
+            past_covariates=[
+                pc_longer,
+                pc_longer_start,
+                pc_start_after,
+                pc_end_before,
+            ],
+            future_covariates=[
+                fc_longer,
+                fc_longer_start,
+                fc_start_after,
+                fc_end_before,
+            ],
+            forecast_horizon=forecast_hrz,
+            stride=1,
+            retrain=retrain,
+            overlap_end=False,
         )
 
+        # for long enough covariates (but too short for overlapping after the end), we expect `n_fcs` forecasts
+        assert len(forecasts[0]) == len(forecasts[1]) == n_fcs + add_fcs
+        # `*_start_after` and `*_end_before` are one step too short for all `n_fcs`
+        assert len(forecasts[2]) == len(forecasts[3]) == n_fcs + add_fcs - 1
+        assert all([fc.end_time() == series.end_time() for fc in forecasts[:3]])
+        assert forecasts[3].end_time() == series.end_time() - series.freq
+
     def test_retrain(self):
         """test historical_forecasts for an untrained model with different retrain values."""
 
diff --git a/darts/tests/models/forecasting/test_regression_ensemble_model.py b/darts/tests/models/forecasting/test_regression_ensemble_model.py
index 5b4530b52f..96a569277a 100644
--- a/darts/tests/models/forecasting/test_regression_ensemble_model.py
+++ b/darts/tests/models/forecasting/test_regression_ensemble_model.py
@@ -70,17 +70,19 @@ def get_local_models(self):
         return [NaiveDrift(), NaiveSeasonal(5), NaiveSeasonal(10)]
 
     @pytest.mark.skipif(not TORCH_AVAILABLE, reason="requires torch")
-    def get_global_models(self, output_chunk_length=5):
+    def get_global_models(
+        self, output_chunk_length=5, input_chunk_length=20, training_length=24
+    ):
         return [
             RNNModel(
-                input_chunk_length=20,
-                output_chunk_length=output_chunk_length,
+                input_chunk_length=input_chunk_length,
+                training_length=training_length,
                 n_epochs=1,
                 random_state=42,
                 **tfm_kwargs,
             ),
             BlockRNNModel(
-                input_chunk_length=20,
+                input_chunk_length=input_chunk_length,
                 output_chunk_length=output_chunk_length,
                 n_epochs=1,
                 random_state=42,
@@ -559,6 +561,7 @@ def test_call_backtest_regression_ensemble_local_models(self):
             None,
             None,
             0,
+            None,
         )
         ensemble.backtest(self.sine_series)
 
@@ -574,7 +577,7 @@ def test_extreme_lags(self):
             regression_train_n_points=train_n_points,
         )
 
-        assert model.extreme_lags == (-train_n_points, 0, -3, -1, 0, 0, 0)
+        assert model.extreme_lags == (-train_n_points, 0, -3, -1, 0, 0, 0, None)
 
         # mix of all the lags
         model3 = RandomForest(
@@ -586,7 +589,26 @@ def test_extreme_lags(self):
             regression_train_n_points=train_n_points,
         )
 
-        assert model.extreme_lags == (-7 - train_n_points, 0, -3, -1, -2, 5, 0)
+        assert model.extreme_lags == (-7 - train_n_points, 0, -3, -1, -2, 5, 0, None)
+
+        # test the RNN case, which has an 8th `extreme_lags` element (`max_target_lag_train`)
+        icl = 20
+        ocl = 5
+        training_length = 24
+        model = RegressionEnsembleModel(
+            forecasting_models=self.get_global_models(ocl, icl, training_length),
+            regression_train_n_points=train_n_points,
+        )
+        assert model.extreme_lags == (
+            -icl - train_n_points,
+            ocl - 1,
+            -icl,  # past covs from BlockRNN
+            -1,  # past covs from BlockRNN
+            -icl,  # future covs from RNN
+            0,  # future covs from RNN
+            0,
+            training_length - icl,  # training length from RNN
+        )
 
     def test_stochastic_regression_ensemble_model(self):
         quantiles = [0.25, 0.5, 0.75]
diff --git a/darts/tests/models/forecasting/test_torch_forecasting_model.py b/darts/tests/models/forecasting/test_torch_forecasting_model.py
index e08b2396d0..e962d35012 100644
--- a/darts/tests/models/forecasting/test_torch_forecasting_model.py
+++ b/darts/tests/models/forecasting/test_torch_forecasting_model.py
@@ -64,7 +64,7 @@
         (NBEATSModel, kwargs),
         (NHiTSModel, kwargs),
         (NLinearModel, kwargs),
-        (RNNModel, {"training_length": 2, **kwargs}),
+        (RNNModel, {"training_length": 10, **kwargs}),
         (TCNModel, kwargs),
         (TFTModel, {"add_relative_index": 2, **kwargs}),
         (TiDEModel, kwargs),
diff --git a/darts/utils/historical_forecasts/utils.py b/darts/utils/historical_forecasts/utils.py
index cab00882a6..d5a3d343a4 100644
--- a/darts/utils/historical_forecasts/utils.py
+++ b/darts/utils/historical_forecasts/utils.py
@@ -403,6 +403,7 @@ def _get_historical_forecastable_time_index(
         min_future_cov_lag,
         max_future_cov_lag,
         output_chunk_shift,
+        max_target_lag_train,
     ) = model.extreme_lags
 
     # max_target_lag < 0 are local models which can predict for n (horizon) -> infinity (no auto-regression)
@@ -414,11 +415,17 @@ def _get_historical_forecastable_time_index(
     if min_target_lag is None:
         min_target_lag = 0
 
+    if is_training and max_target_lag_train is not None:
+        # the output lag/window can be different for train and predict modes
+        output_lag = max_target_lag_train
+    else:
+        output_lag = max_target_lag
+
     # longest possible time index for target
     if is_training:
         start = (
             series.start_time()
-            + (max_target_lag - output_chunk_shift - min_target_lag + 1) * series.freq
+            + (output_lag - output_chunk_shift - min_target_lag + 1) * series.freq
         )
     else:
         start = series.start_time() - min_target_lag * series.freq
@@ -431,7 +438,7 @@ def _get_historical_forecastable_time_index(
         if is_training:
             start_pc = (
                 past_covariates.start_time()
-                + (max_target_lag - output_chunk_shift - min_past_cov_lag + 1)
+                + (output_lag - output_chunk_shift - min_past_cov_lag + 1)
                 * past_covariates.freq
             )
         else:
@@ -455,7 +462,7 @@ def _get_historical_forecastable_time_index(
         if is_training:
             start_fc = (
                 future_covariates.start_time()
-                + (max_target_lag - output_chunk_shift - min_future_cov_lag + 1)
+                + (output_lag - output_chunk_shift - min_future_cov_lag + 1)
                 * future_covariates.freq
             )
         else:
@@ -475,7 +482,7 @@ def _get_historical_forecastable_time_index(
             min([intersect_[1], end_fc]),
         )
 
-    # overlap_end = True -> predictions must not go beyond end of target series
+    # overlap_end = False -> predictions must not go beyond end of target series
     if (
         not overlap_end
         and intersect_[1] + (forecast_horizon + output_chunk_shift - 1) * series.freq
@@ -723,6 +730,7 @@ def _get_historical_forecast_boundaries(
     )
 
     # re-adjust the slicing indexes to account for the lags
+    # `max_target_lag_train` is redundant: optimized historical forecasts run in predict mode only
     (
         min_target_lag,
         _,
@@ -731,6 +739,7 @@ def _get_historical_forecast_boundaries(
         min_future_cov_lag,
         max_future_cov_lag,
         output_chunk_shift,
+        max_target_lag_train,
     ) = model.extreme_lags
 
     # target lags are <= 0