feat: specify features and target when creating a TaggedTable (#114)

Closes #27.

### Summary of Changes

A user can now optionally specify the `features` of a `TaggedTable` explicitly. If the features are not specified, all columns except the target are considered features.

The `predict` method of classifiers/regressors uses only the feature columns for prediction. The remaining columns are still included in the output, however. This is useful, for example, for carrying an ID column through to the table created by `predict`.

---------

Co-authored-by: lars-reimann <lars-reimann@users.noreply.github.com>
Safe-DS · Mar 29, 2023 · 95e1fc7 · 95e1fc7
1 parent b18a06d
commit 95e1fc7
Show file tree

Hide file tree

Showing 86 changed files with 590 additions and 354 deletions.
diff --git a/docs/tutorials/machine_learning.ipynb b/docs/tutorials/machine_learning.ipynb
@@ -10,8 +10,7 @@
     "## Create a `TaggedTable`\n",
     "\n",
     "First, we need to create a `TaggedTable` from the training data. `TaggedTable`s are used to train supervised machine learning models, because they keep track of the target\n",
-    "column. A `TaggedTable` can be created from a `Table` by\n",
-    "specifying the target column in the `Table`."
+    "column. A `TaggedTable` can be created from a `Table` by calling the `tag_columns` method:"
    ],
    "metadata": {
     "collapsed": false
@@ -32,8 +31,7 @@
     "    \"result\": [6, 7, 10, 13, 9]\n",
     "})\n",
     "\n",
-    "tagged_table = TaggedTable(\n",
-    "    training_set,\n",
+    "tagged_table = training_set.tag_columns(\n",
     "    target_name=\"result\"\n",
     ")"
    ],

diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py
@@ -2,18 +2,15 @@
 
 import functools
 import os.path
-import typing
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Iterable, Optional, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 from IPython.core.display_functions import DisplayHandle, display
 from pandas import DataFrame, Series
-from safeds.data.tabular.containers._column import Column
-from safeds.data.tabular.containers._row import Row
 from safeds.data.tabular.typing import ColumnType, TableSchema
 from safeds.exceptions import (
     ColumnLengthMismatchError,
@@ -28,6 +25,12 @@
 )
 from scipy import stats
 
+from ._column import Column
+from ._row import Row
+
+if TYPE_CHECKING:
+    from ._tagged_table import TaggedTable
+
 
 # noinspection PyProtectedMember
 class Table:
@@ -188,7 +191,7 @@ def from_rows(rows: list[Row]) -> Table:
     # Dunder methods
     # ------------------------------------------------------------------------------------------------------------------
 
-    def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None):
+    def __init__(self, data: Iterable, schema: Optional[TableSchema] = None):
         self._data: pd.Dataframe = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
         if schema is None:
             if self.count_columns() == 0:
@@ -202,7 +205,7 @@ def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None):
         self._data = self._data.reset_index(drop=True)
         self._data.columns = list(range(self.count_columns()))
 
-    def __eq__(self, other: typing.Any) -> bool:
+    def __eq__(self, other: Any) -> bool:
         if not isinstance(other, Table):
             return NotImplemented
         if self is other:
@@ -782,8 +785,8 @@ def shuffle(self) -> Table:
 
     def slice(
         self,
-        start: typing.Optional[int] = None,
-        end: typing.Optional[int] = None,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
         step: int = 1,
     ) -> Table:
         """
@@ -878,7 +881,7 @@ def sort_rows(self, comparator: Callable[[Row, Row], int]) -> Table:
         rows.sort(key=functools.cmp_to_key(comparator))
         return Table.from_rows(rows)
 
-    def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
+    def split(self, percentage_in_first: float) -> tuple[Table, Table]:
         """
         Split the table into two new tables.
 
@@ -902,7 +905,28 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
             self.slice(round(percentage_in_first * self.count_rows())),
         )
 
-    def transform_column(self, name: str, transformer: Callable[[Row], typing.Any]) -> Table:
+    def tag_columns(self, target_name: str, feature_names: Optional[list[str]] = None) -> TaggedTable:
+        """
+        Mark the columns of the table as target column or feature columns. The original table is not modified.
+
+        Parameters
+        ----------
+        target_name : str
+            Name of the target column.
+        feature_names : Optional[list[str]]
+            Names of the feature columns. If None, all columns except the target column are used.
+
+        Returns
+        -------
+        tagged_table : TaggedTable
+            A new tagged table with the given target and feature names.
+        """
+        # pylint: disable=import-outside-toplevel
+        from ._tagged_table import TaggedTable
+
+        return TaggedTable(self._data, target_name, feature_names, self._schema)
+
+    def transform_column(self, name: str, transformer: Callable[[Row], Any]) -> Table:
         """
         Transform provided column by calling provided transformer.
 

diff --git a/src/safeds/data/tabular/containers/_tagged_table.py b/src/safeds/data/tabular/containers/_tagged_table.py
@@ -1,7 +1,8 @@
-from IPython.core.display_functions import DisplayHandle
+from typing import Iterable, Optional
 
-from ._column import Column
-from ._table import Table
+from IPython.core.display_functions import DisplayHandle
+from safeds.data.tabular.containers import Column, Table
+from safeds.data.tabular.typing import TableSchema
 
 
 class TaggedTable(Table):
@@ -10,34 +11,56 @@ class TaggedTable(Table):
 
     Parameters
     ----------
-    table : Table
-        The table used to derive the features and target.
+    data : Iterable
+        The data.
     target_name : str
         Name of the target column.
+    feature_names : Optional[list[str]]
+        Names of the feature columns. If None, all columns except the target column are used.
+    schema : Optional[TableSchema]
+        The schema of the table. If not specified, the schema will be inferred from the data.
     """
 
-    def __init__(self, table: Table, target_name: str):
-        super().__init__(table._data)
+    def __init__(
+        self,
+        data: Iterable,
+        target_name: str,
+        feature_names: Optional[list[str]] = None,
+        schema: Optional[TableSchema] = None,
+    ):
+        super().__init__(data, schema)
+
+        # If no feature names are specified, use all columns except the target column
+        if feature_names is None:
+            feature_names = self.get_column_names()
+            if target_name in feature_names:
+                feature_names.remove(target_name)
+
+        # Validate inputs
+        if target_name in feature_names:
+            raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
+        if len(feature_names) == 0:
+            raise ValueError("At least one feature column must be specified.")
 
-        self._y: Column = table.get_column(target_name)
-        self._X: Table = table.drop_columns([target_name])
+        self._features: Table = self.keep_only_columns(feature_names)
+        self._target: Column = self.get_column(target_name)
 
     @property
     def features(self) -> Table:
-        return self._X
+        return self._features
 
     @property
     def target(self) -> Column:
-        return self._y
+        return self._target
 
     def __repr__(self) -> str:
-        tmp = self._X.add_column(self._y)
-        header_info = "Target Column is '" + self._y.name + "'\n"
+        tmp = self._features.add_column(self._target)
+        header_info = "Target Column is '" + self._target.name + "'\n"
         return header_info + tmp.__repr__()
 
     def __str__(self) -> str:
-        tmp = self._X.add_column(self._y)
-        header_info = "Target Column is '" + self._y.name + "'\n"
+        tmp = self._features.add_column(self._target)
+        header_info = "Target Column is '" + self._target.name + "'\n"
         return header_info + tmp.__str__()
 
     def _ipython_display_(self) -> DisplayHandle:
@@ -49,7 +72,7 @@ def _ipython_display_(self) -> DisplayHandle:
         output : DisplayHandle
             Output object.
         """
-        tmp = self._X.add_column(self._y)
-        header_info = "Target Column is '" + self._y.name + "'\n"
+        tmp = self._features.add_column(self._target)
+        header_info = "Target Column is '" + self._target.name + "'\n"
         print(header_info)
         return tmp._ipython_display_()
diff --git a/src/safeds/data/tabular/transformation/_table_transformer.py b/src/safeds/data/tabular/transformation/_table_transformer.py
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
-from safeds.data.tabular.containers import Table
+if TYPE_CHECKING:
+    from safeds.data.tabular.containers import Table
 
 
 class TableTransformer(ABC):

diff --git a/src/safeds/ml/_util_sklearn.py b/src/safeds/ml/_util_sklearn.py
@@ -29,12 +29,10 @@ def fit(model: Any, tagged_table: TaggedTable) -> None:
         )
     except ValueError as exception:
         raise LearningError(str(exception)) from exception
-    except Exception as exception:
-        raise LearningError(None) from exception
 
 
 # noinspection PyProtectedMember
-def predict(model: Any, dataset: Table, target_name: Optional[str]) -> TaggedTable:
+def predict(model: Any, dataset: Table, feature_names: Optional[list[str]], target_name: Optional[str]) -> TaggedTable:
     """
     Predict a target vector using a dataset containing feature vectors. The model has to be trained first.
 
@@ -44,8 +42,10 @@ def predict(model: Any, dataset: Table, target_name: Optional[str]) -> TaggedTab
         Classifier or regressor from scikit-learn.
     dataset : Table
         The dataset containing the features.
-    target_name : str
+    target_name : Optional[str]
         The name of the target column.
+    feature_names : Optional[list[str]]
+        The names of the feature columns.
 
     Returns
     -------
@@ -58,23 +58,20 @@ def predict(model: Any, dataset: Table, target_name: Optional[str]) -> TaggedTab
         If predicting with the given dataset failed.
     """
 
-    if model is None or target_name is None:
+    if model is None or target_name is None or feature_names is None:
         raise PredictionError("The model was not trained")
 
-    dataset_df = dataset._data
-    dataset_df.columns = dataset.schema.get_column_names()
+    dataset_df = dataset.keep_only_columns(feature_names)._data
+    dataset_df.columns = feature_names
     try:
         predicted_target_vector = model.predict(dataset_df.values)
-        result_set = dataset_df.copy(deep=True)
+        result_set = dataset._data.copy(deep=True)
+        result_set.columns = dataset.get_column_names()
         if target_name in result_set.columns:
-            raise ValueError(
-                f"Dataset already contains '{target_name}' column. Please rename this column"
-            )
+            raise ValueError(f"Dataset already contains '{target_name}' column. Please rename this column")
         result_set[target_name] = predicted_target_vector
-        return TaggedTable(Table(result_set), target_name=target_name)
+        return Table(result_set).tag_columns(target_name=target_name, feature_names=feature_names)
     except NotFittedError as exception:
         raise PredictionError("The model was not trained") from exception
     except ValueError as exception:
         raise PredictionError(str(exception)) from exception
-    except Exception as exception:
-        raise PredictionError(None) from exception
diff --git a/src/safeds/ml/classification/_ada_boost.py b/src/safeds/ml/classification/_ada_boost.py
@@ -17,6 +17,7 @@ class AdaBoost(Classifier):
 
     def __init__(self) -> None:
         self._wrapped_classifier: Optional[sk_AdaBoostClassifier] = None
+        self._feature_names: Optional[list[str]] = None
         self._target_name: Optional[str] = None
 
     def fit(self, training_set: TaggedTable) -> AdaBoost:
@@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> AdaBoost:
 
         result = AdaBoost()
         result._wrapped_classifier = wrapped_classifier
+        result._feature_names = training_set.features.get_column_names()
         result._target_name = training_set.target.name
 
         return result
@@ -68,4 +70,4 @@ def predict(self, dataset: Table) -> TaggedTable:
         PredictionError
             If prediction with the given dataset failed.
         """
-        return predict(self._wrapped_classifier, dataset, self._target_name)
+        return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
diff --git a/src/safeds/ml/classification/_decision_tree.py b/src/safeds/ml/classification/_decision_tree.py
@@ -17,6 +17,7 @@ class DecisionTree(Classifier):
 
     def __init__(self) -> None:
         self._wrapped_classifier: Optional[sk_DecisionTreeClassifier] = None
+        self._feature_names: Optional[list[str]] = None
         self._target_name: Optional[str] = None
 
     def fit(self, training_set: TaggedTable) -> DecisionTree:
@@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> DecisionTree:
 
         result = DecisionTree()
         result._wrapped_classifier = wrapped_classifier
+        result._feature_names = training_set.features.get_column_names()
         result._target_name = training_set.target.name
 
         return result
@@ -68,4 +70,4 @@ def predict(self, dataset: Table) -> TaggedTable:
         PredictionError
             If prediction with the given dataset failed.
         """
-        return predict(self._wrapped_classifier, dataset, self._target_name)
+        return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
diff --git a/src/safeds/ml/classification/_gradient_boosting_classification.py b/src/safeds/ml/classification/_gradient_boosting_classification.py
@@ -17,6 +17,7 @@ class GradientBoosting(Classifier):
 
     def __init__(self) -> None:
         self._wrapped_classifier: Optional[sk_GradientBoostingClassifier] = None
+        self._feature_names: Optional[list[str]] = None
         self._target_name: Optional[str] = None
 
     def fit(self, training_set: TaggedTable) -> GradientBoosting:
@@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> GradientBoosting:
 
         result = GradientBoosting()
         result._wrapped_classifier = wrapped_classifier
+        result._feature_names = training_set.features.get_column_names()
         result._target_name = training_set.target.name
 
         return result
@@ -69,4 +71,4 @@ def predict(self, dataset: Table) -> TaggedTable:
         PredictionError
             If prediction with the given dataset failed.
         """
-        return predict(self._wrapped_classifier, dataset, self._target_name)
+        return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
diff --git a/src/safeds/ml/classification/_k_nearest_neighbors.py b/src/safeds/ml/classification/_k_nearest_neighbors.py
@@ -23,6 +23,7 @@ def __init__(self, n_neighbors: int) -> None:
         self._n_neighbors = n_neighbors
 
         self._wrapped_classifier: Optional[sk_KNeighborsClassifier] = None
+        self._feature_names: Optional[list[str]] = None
         self._target_name: Optional[str] = None
 
     def fit(self, training_set: TaggedTable) -> KNearestNeighbors:
@@ -50,6 +51,7 @@ def fit(self, training_set: TaggedTable) -> KNearestNeighbors:
 
         result = KNearestNeighbors(self._n_neighbors)
         result._wrapped_classifier = wrapped_classifier
+        result._feature_names = training_set.features.get_column_names()
         result._target_name = training_set.target.name
 
         return result
@@ -73,8 +75,4 @@ def predict(self, dataset: Table) -> TaggedTable:
         PredictionError
             If prediction with the given dataset failed.
         """
-        return predict(
-            self._wrapped_classifier,
-            dataset,
-            self._target_name,
-        )
+        return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
diff --git a/src/safeds/ml/classification/_logistic_regression.py b/src/safeds/ml/classification/_logistic_regression.py
@@ -17,6 +17,7 @@ class LogisticRegression(Classifier):
 
     def __init__(self) -> None:
         self._wrapped_classifier: Optional[sk_LogisticRegression] = None
+        self._feature_names: Optional[list[str]] = None
         self._target_name: Optional[str] = None
 
     def fit(self, training_set: TaggedTable) -> LogisticRegression:
@@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> LogisticRegression:
 
         result = LogisticRegression()
         result._wrapped_classifier = wrapped_classifier
+        result._feature_names = training_set.features.get_column_names()
         result._target_name = training_set.target.name
 
         return result
@@ -68,4 +70,4 @@ def predict(self, dataset: Table) -> TaggedTable:
         PredictionError
             If prediction with the given dataset failed.
         """
-        return predict(self._wrapped_classifier, dataset, self._target_name)
+        return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)