Merge branch 'master' into LHCB_Z_13TEV_DIMUON_2022
achiefa committed Jun 5, 2024
2 parents e61c220 + 7caa1ef commit 3b3cfc2
Showing 13 changed files with 163 additions and 98 deletions.
13 changes: 12 additions & 1 deletion doc/sphinx/source/n3fit/hyperopt.rst
@@ -401,8 +401,9 @@ In NNPDF, this hyperoptimisation metric is selected via the following generic runcard:
kfold:
loss_type: chi2
replica_statistic: average
replica_statistic: average_best
fold_statistic: average
penalties_in_loss: False
partitions:
- datasets:
...
@@ -411,6 +412,16 @@ In NNPDF, this hyperoptimisation metric is selected via the following generic runcard:
parallel_models: true
The key ``replica_statistic`` defines how to combine all replicas when performing a multireplica hyperopt.
With ``average`` a simple average is taken over all replicas, while ``average_best`` averages only the best 90% of replicas,
mimicking what is done in a real post-fit selection.
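
A minimal sketch of the ``average_best`` reduction (it mirrors the
``_average_best`` helper added to ``rewards.py`` in this same commit; the
numbers below are illustrative):

.. code-block:: python

    import numpy as np

    def average_best(fold_losses: np.ndarray, proportion: float = 0.9) -> float:
        """Average the best ``proportion`` of the per-replica losses (rounded up)."""
        num_best = int(np.ceil(proportion * len(fold_losses)))
        return float(np.mean(np.sort(fold_losses)[:num_best]))

    # With ten replicas, ceil(0.9 * 10) = 9, so the single worst replica is dropped
    average_best(np.array([2.0] * 9 + [100.0]))  # -> 2.0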

The ``fold_statistic`` key instead defines how to combine the losses of the different folds.
While the values of the ``penalties`` are always saved during the hyperopt run, by default they are not
considered by the hyperoptimization algorithm.
If they are to be considered, the key ``penalties_in_loss`` needs to be set to ``True``.
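
A sketch of how these keys combine into the final figure of merit (a
simplification of ``HyperLoss.compute_loss`` shown later in this diff; all
variable names and numbers are illustrative):

.. code-block:: python

    import numpy as np

    # hypothetical per-fold chi2 arrays (one entry per replica) and penalties
    chi2_folds = [np.array([2.1, 2.3, 9.0]), np.array([2.0, 2.2, 2.4])]
    penalty_folds = [np.array([0.0, 0.1, 5.0]), np.array([0.0, 0.0, 0.2])]
    penalties_in_loss = True

    per_fold_losses = []
    for chi2, penalty in zip(chi2_folds, penalty_folds):
        if penalties_in_loss:
            chi2 = chi2 + penalty  # penalties enter the loss only on request
        per_fold_losses.append(np.mean(chi2))  # replica_statistic (plain average here)

    final_loss = np.mean(per_fold_losses)  # fold_statistic: average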

By combining the ``average``, ``best_worst``, and ``std`` figures of merit discussed in :ref:`hyperkfolding-label`,
several alternatives may arise. For example, one approach could involve minimizing
the maximum value of the set of averaged-over-replicas :math:`\chi^2`,
10 changes: 3 additions & 7 deletions doc/sphinx/source/n3fit/runcard_detailed.rst
@@ -319,10 +319,9 @@ Running in parallel can be quite hard on memory and it is only advantageous when
fitting on a GPU, where one can find a speed up equal to the number of models run
in parallel (each model being a different replica).

Running in parallel leverages the fact that the only difference between two replicas
is the output data the prediction is compared to.
In order to ensure this is indeed the case it is necessary to also
use the `same_trvl_per_replica` flag in the runcard.
When running in parallel it might be advantageous (e.g., for debugging)
to set the training/validation split to be equal for all replicas;
this can be done with the `same_trvl_per_replica: true` runcard flag.
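
Schematically, the flag controls whether every replica sees the same
training/validation mask (a hypothetical sketch, not the actual n3fit
implementation):

.. code-block:: python

    import numpy as np

    def trvl_masks(n_data, n_replicas, same_trvl_per_replica=False, frac=0.75, seed=0):
        """One boolean training mask per replica; identical masks when requested."""
        masks = []
        for r in range(n_replicas):
            rng = np.random.default_rng(seed if same_trvl_per_replica else seed + r)
            masks.append(rng.random(n_data) < frac)
        return masks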

In other words, in order to run several replicas in parallel in a machine
(be it a big CPU or, most likely, a GPU)
@@ -332,7 +331,6 @@ top-level options:
.. code-block:: yaml
parallel_models: true
same_trvl_per_replica: true
And then run ``n3fit`` with a replica range to be parallelized
@@ -348,8 +346,6 @@ should run by setting the environment variable ``CUDA_VISIBLE_DEVICES``
to the right index (usually ``0, 1, 2``) or leaving it explicitly empty
to avoid running on GPU: ``export CUDA_VISIBLE_DEVICES=""``
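
The same selection can be made from Python, provided the environment variable
is set before the ML backend is imported (hypothetical snippet):

.. code-block:: python

    import os

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"   # pin the run to the first GPU
    # os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all GPUs, forcing CPU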

Note that at present it cannot be used together with the ``hyperopt`` module.


.. _otheroptions-label:

3 changes: 2 additions & 1 deletion n3fit/runcards/hyperopt_studies/renew_hyperopt.yml
@@ -161,8 +161,9 @@ hyperscan_config:

kfold:
loss_type: chi2
replica_statistic: average
replica_statistic: average_best
fold_statistic: average
penalties_in_loss: True
penalties:
- saturation
- patience
@@ -129,8 +129,9 @@ hyperscan_config:

kfold:
loss_type: chi2
replica_statistic: average
replica_statistic: average_best
fold_statistic: average
penalties_in_loss: True
penalties:
- saturation
- patience
2 changes: 1 addition & 1 deletion n3fit/src/n3fit/backends/keras_backend/multi_dense.py
@@ -187,7 +187,7 @@ def __call__(self, shape, dtype=None, **kwargs):
per_replica_weights = []
for replica_seed in self.replica_seeds:
if "seed" in self.initializer_config:
self.initializer_config["seed"] = self.base_seed + replica_seed
self.initializer_config["seed"] = int(self.base_seed + replica_seed)
single_initializer = self.initializer_class.from_config(self.initializer_config)

per_replica_weights.append(single_initializer(shape, dtype, **kwargs))
102 changes: 77 additions & 25 deletions n3fit/src/n3fit/hyper_optimization/rewards.py
@@ -44,6 +44,34 @@
log = logging.getLogger(__name__)


def _average_best(fold_losses: np.ndarray, proportion: float = 0.9, axis: int = 0) -> float:
"""
Compute the average of the input array along the specified axis, among the best `proportion`
of replicas.
Parameters
----------
fold_losses: np.ndarray
Per replica losses for a single fold.
proportion: float
The proportion of best replicas to take into account (rounded up).
axis: int, optional
Axis along which the mean is computed. Default is 0.
Returns
-------
float: The average along the specified axis.
"""
# TODO: use directly `validphys.fitveto.determine_vetoes`
num_best = int(np.ceil(proportion * len(fold_losses)))

if np.isnan(fold_losses).any():
log.warning(f"{np.isnan(fold_losses).sum()} replicas have NaNs losses")
sorted_losses = np.sort(fold_losses, axis=axis)
best_losses = sorted_losses[:num_best]
return _average(best_losses, axis=axis)


def _average(fold_losses: np.ndarray, axis: int = 0) -> float:
"""
Compute the average of the input array along the specified axis.
@@ -98,7 +126,12 @@ def _std(fold_losses: np.ndarray, axis: int = 0) -> float:
return np.std(fold_losses, axis=axis).item()


IMPLEMENTED_STATS = {"average": _average, "best_worst": _best_worst, "std": _std}
IMPLEMENTED_STATS = {
"average": _average,
"average_best": _average_best,
"best_worst": _best_worst,
"std": _std,
}
IMPLEMENTED_LOSSES = ["chi2", "phi2"]


@@ -114,6 +147,13 @@ class HyperLoss:
Computes the statistic over the replicas and then over the folds, both
statistics default to the average.
The ``compute_loss`` method saves intermediate metrics such as the
chi2 of the folds or the phi regardless of the loss type that has been selected.
These metrics are saved in the properties
``phi_vector``: list of phi per fold
``chi2_matrix``: list of chi2 per fold, per replica
Parameters
----------
loss_type: str
@@ -125,17 +165,27 @@ class HyperLoss:
fold_statistic: str
the statistic over the folds to use.
Options are "average", "best_worst", and "std".
penalties_in_loss: bool
whether the penalties should be included in the output of ``compute_loss``
"""

def __init__(
self, loss_type: str = None, replica_statistic: str = None, fold_statistic: str = None
self,
loss_type: str = None,
replica_statistic: str = None,
fold_statistic: str = None,
penalties_in_loss: bool = False,
):
self._default_statistic = "average"
self._default_loss = "chi2"
self._penalties_in_loss = penalties_in_loss

self.loss_type = self._parse_loss(loss_type)
self.reduce_over_replicas = self._parse_statistic(replica_statistic, "replica_statistic")
self.reduce_over_folds = self._parse_statistic(fold_statistic, "fold_statistic")
self.reduce_over_replicas = self._parse_statistic(
replica_statistic, "replica_statistic", default="average_best"
)
self.reduce_over_folds = self._parse_statistic(
fold_statistic, "fold_statistic", default="average"
)

self.phi_vector = []
self.chi2_matrix = []
@@ -153,6 +203,9 @@ def compute_loss(
"""
Compute the loss, including added penalties, for a single fold.
Save the phi of the ensemble and the chi2 of the separate replicas,
and the penalties into the ``phi_vector``, ``chi2_matrix`` and ``penalties`` attributes.
Parameters
----------
penalties: Dict[str, NDArray(replicas)]
@@ -167,6 +220,8 @@ def compute_loss(
List of tuples containing `validphys.core.DataGroupSpec` instances for each group data set
fold_idx: int
k-fold index. Defaults to 0.
include_penalties: bool
Whether to include the penalties in the returned loss value
Returns
-------
@@ -195,17 +250,18 @@
# these are saved in the phi_vector and chi2_matrix attributes, excluding penalties
self._save_hyperopt_metrics(phi_per_fold, experimental_loss, penalties, fold_idx)

# include penalties to experimental loss
# this allows introduction of statistics also to penalties
experimental_loss_w_penalties = experimental_loss + sum(penalties.values())
# Prepare the output loss, including penalties if necessary
if self._penalties_in_loss:
# include penalties to experimental loss
experimental_loss += sum(penalties.values())

# add penalties to phi in the form of a sum of per-replicas averages
phi_per_fold += sum(np.mean(penalty) for penalty in penalties.values())
# add penalties to phi in the form of a sum of per-replicas averages
phi_per_fold += sum(np.mean(penalty) for penalty in penalties.values())

# define loss for hyperopt according to the chosen loss_type
if self.loss_type == "chi2":
# calculate statistics of chi2 over replicas for a given k-fold
loss = self.reduce_over_replicas(experimental_loss_w_penalties)
loss = self.reduce_over_replicas(experimental_loss)
elif self.loss_type == "phi2":
loss = phi_per_fold**2

@@ -269,18 +325,16 @@ def _parse_loss(self, loss_type: str) -> str:
if loss_type is None:
loss_type = self._default_loss
log.warning(f"No loss_type selected in HyperLoss, defaulting to {loss_type}")
else:
if loss_type not in IMPLEMENTED_LOSSES:
valid_options = ", ".join(IMPLEMENTED_LOSSES)
raise ValueError(
f"Invalid loss type '{loss_type}'. Valid options are: {valid_options}"
)

if loss_type not in IMPLEMENTED_LOSSES:
valid_options = ", ".join(IMPLEMENTED_LOSSES)
raise ValueError(f"Invalid loss type '{loss_type}'. Valid options are: {valid_options}")

log.info(f"Setting '{loss_type}' as the loss type for hyperoptimization")

return loss_type

def _parse_statistic(self, statistic: str, name: str) -> Callable:
def _parse_statistic(self, statistic: str, name: str, default: str) -> Callable:
"""
Parse the statistic and return the default if None.
@@ -304,14 +358,12 @@ def _parse_statistic(self, statistic: str, name: str) -> Callable:
For loss type equal to phi2, the applied fold statistic is always the reciprocal of the selected statistic.
"""
if statistic is None:
statistic = self._default_statistic
statistic = default
log.warning(f"No {name} selected in HyperLoss, defaulting to {statistic}")
else:
if statistic not in IMPLEMENTED_STATS:
valid_options = ", ".join(IMPLEMENTED_STATS.keys())
raise ValueError(
f"Invalid {name} '{statistic}'. Valid options are: {valid_options}"
)

if statistic not in IMPLEMENTED_STATS:
valid_options = ", ".join(IMPLEMENTED_STATS.keys())
raise ValueError(f"Invalid {name} '{statistic}'. Valid options are: {valid_options}")

log.info(f"Using '{statistic}' as the {name} for hyperoptimization")

20 changes: 8 additions & 12 deletions n3fit/src/n3fit/model_trainer.py
@@ -192,6 +192,7 @@ def __init__(
loss_type=loss_type,
replica_statistic=replica_statistic,
fold_statistic=fold_statistic,
penalties_in_loss=kfold_parameters.get("penalties_in_loss", False),
)

# Initialize the dictionaries which contain all fitting information
@@ -750,11 +751,6 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool:
callbacks=self.callbacks + [callback_st, callback_pos, callback_integ],
)

# TODO: in order to use multireplica in hyperopt it is necessary to define what "passing" means
# for now consider the run as good if any replica passed
fit_has_passed = any(bool(i) for i in stopping_object.e_best_chi2)
return fit_has_passed

def _hyperopt_override(self, params):
"""Unrolls complicated hyperopt structures into very simple dictionaries"""
# If the input contains all parameters, then that's your dictionary of hyperparameters
@@ -984,13 +980,9 @@ def hyperparametrizable(self, params):
for model in models.values():
model.compile(**params["optimizer"])

passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs)
self._train_and_fit(models["training"], stopping_object, epochs=epochs)

if self.mode_hyperopt:
if not passed:
log.info("Hyperparameter combination fail to find a good fit, breaking")
break

validation_loss = stopping_object.vl_chi2

# number of active points in this fold
@@ -1024,8 +1016,6 @@ def hyperparametrizable(self, params):
fold_idx=k,
)

log.info("Fold %d finished, loss=%.1f, pass=%s", k + 1, hyper_loss, passed)

# Create another list of `validphys.core.DataGroupSpec`
# containing now exp datasets that are included in the training/validation dataset
trvl_partitions = list(self.kpartitions)
@@ -1054,7 +1044,11 @@ def hyperparametrizable(self, params):
# Apply a penalty proportional to the number of folds not computed
pen_mul = len(self.kpartitions) - k
l_hyper = [i * pen_mul for i in l_hyper]
passed = False
break
else:
passed = True
log.info("Fold %d finished, loss=%.1f, pass=%s", k + 1, hyper_loss, passed)

# endfor

@@ -1097,5 +1091,7 @@ def hyperparametrizable(self, params):
# In a normal run, the only information we need to output is the stopping object
# (which contains metadata about the stopping)
# and the pdf model (which are used to generate the PDF grids and compute arclengths)
if not self.mode_hyperopt:
passed = any(bool(i) for i in stopping_object.e_best_chi2)
dict_out = {"status": passed, "stopping_object": stopping_object, "pdf_model": pdf_model}
return dict_out
2 changes: 1 addition & 1 deletion n3fit/src/n3fit/tests/regressions/hyper-quickcard.yml
@@ -62,7 +62,7 @@ kfold:
- saturation
- patience
- integrability
threshold: 2.0
threshold: 1e3
partitions:
- datasets:
- NMC_NC_NOTFIXED_P_EM-SIGMARED
@@ -138,7 +138,7 @@ implemented_observables:
data_central: data_d2Sig_dmttBar_dyttBar.yaml
data_uncertainties:
- uncertainties_d2Sig_dmttBar_dyttBar.yaml
kinematic_coverage: [y_ttBar, m_ttBar, sqrts]
kinematic_coverage: [y_ttBar, m_ttBar, m_t2]
plotting:
dataset_label: 'ATLAS 13 TeV top quark pair in hadronic channel: $\frac{d^2\sigma}{dm_{t\bar{t}}d|y_{t\bar{t}}|}$'
kinematics_override: identity
@@ -167,7 +167,7 @@ implemented_observables:
data_central: data_d2Sig_dmttBar_dyttBar_norm.yaml
data_uncertainties:
- uncertainties_d2Sig_dmttBar_dyttBar_norm.yaml
kinematic_coverage: [y_ttBar, m_ttBar, sqrts]
kinematic_coverage: [y_ttBar, m_ttBar, m_t2]
plotting:
dataset_label: 'ATLAS 13 TeV top quark pair in hadronic channel: $\frac{1}{\sigma}\frac{d^2\sigma}{dm_{t\bar{t}}d|y_{t\bar{t}}|}$'
kinematics_override: identity
@@ -140,7 +140,7 @@ implemented_observables:
data_central: data_d2Sig_dyttBar_dmttBar.yaml
data_uncertainties:
- uncertainties_d2Sig_dyttBar_dmttBar.yaml
kinematic_coverage: [y_ttBar, m_ttBar, sqrts]
kinematic_coverage: [y_ttBar, m_ttBar, m_t2]
plotting:
dataset_label: 'CMS 13 TeV top quark pair l+j channel: $\frac{d^2\sigma}{dm_{t\bar{t}}d|y_{t\bar{t}}|}$'
kinematics_override: identity
@@ -170,7 +170,7 @@ implemented_observables:
data_central: data_d2Sig_dyttBar_dmttBar_norm.yaml
data_uncertainties:
- uncertainties_d2Sig_dyttBar_dmttBar_norm.yaml
kinematic_coverage: [y_ttBar, m_ttBar, sqrts]
kinematic_coverage: [y_ttBar, m_ttBar, m_t2]
plotting:
dataset_label: 'CMS 13 TeV top quark pair l+j channel: $\frac{1}{\sigma}\frac{d^2\sigma}{dm_{t\bar{t}}d|y_{t\bar{t}}|}$'
kinematics_override: identity