Skip to content

Commit

Permalink
feat: specify features and target when creating a TaggedTable (#114)
Browse files Browse the repository at this point in the history
Closes #27.

### Summary of Changes

A user can now optionally specify the `features` of a `TaggedTable`
explicitly. If the features are not specified, all columns except the
target are considered features.

The `predict` method of classifiers/regressors only uses the features
for prediction. The other columns are still included in the output,
however. This is, for example, useful to include an ID column in the
table created by `predict`.

---------

Co-authored-by: lars-reimann <lars-reimann@users.noreply.github.com>
  • Loading branch information
lars-reimann and lars-reimann committed Mar 29, 2023
1 parent b18a06d commit 95e1fc7
Show file tree
Hide file tree
Showing 86 changed files with 590 additions and 354 deletions.
6 changes: 2 additions & 4 deletions docs/tutorials/machine_learning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
"## Create a `TaggedTable`\n",
"\n",
"First, we need to create a `TaggedTable` from the training data. `TaggedTable`s are used to train supervised machine learning models, because they keep track of the target\n",
"column. A `TaggedTable` can be created from a `Table` by\n",
"specifying the target column in the `Table`."
"column. A `TaggedTable` can be created from a `Table` by calling the `tag_columns` method:"
],
"metadata": {
"collapsed": false
Expand All @@ -32,8 +31,7 @@
" \"result\": [6, 7, 10, 13, 9]\n",
"})\n",
"\n",
"tagged_table = TaggedTable(\n",
" training_set,\n",
"tagged_table = training_set.tag_columns(\n",
" target_name=\"result\"\n",
")"
],
Expand Down
44 changes: 34 additions & 10 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,15 @@

import functools
import os.path
import typing
from pathlib import Path
from typing import Callable, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, Iterable, Optional, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.display_functions import DisplayHandle, display
from pandas import DataFrame, Series
from safeds.data.tabular.containers._column import Column
from safeds.data.tabular.containers._row import Row
from safeds.data.tabular.typing import ColumnType, TableSchema
from safeds.exceptions import (
ColumnLengthMismatchError,
Expand All @@ -28,6 +25,12 @@
)
from scipy import stats

from ._column import Column
from ._row import Row

if TYPE_CHECKING:
from ._tagged_table import TaggedTable


# noinspection PyProtectedMember
class Table:
Expand Down Expand Up @@ -188,7 +191,7 @@ def from_rows(rows: list[Row]) -> Table:
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None):
def __init__(self, data: Iterable, schema: Optional[TableSchema] = None):
self._data: pd.Dataframe = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
if schema is None:
if self.count_columns() == 0:
Expand All @@ -202,7 +205,7 @@ def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None):
self._data = self._data.reset_index(drop=True)
self._data.columns = list(range(self.count_columns()))

def __eq__(self, other: typing.Any) -> bool:
def __eq__(self, other: Any) -> bool:
if not isinstance(other, Table):
return NotImplemented
if self is other:
Expand Down Expand Up @@ -782,8 +785,8 @@ def shuffle(self) -> Table:

def slice(
self,
start: typing.Optional[int] = None,
end: typing.Optional[int] = None,
start: Optional[int] = None,
end: Optional[int] = None,
step: int = 1,
) -> Table:
"""
Expand Down Expand Up @@ -878,7 +881,7 @@ def sort_rows(self, comparator: Callable[[Row, Row], int]) -> Table:
rows.sort(key=functools.cmp_to_key(comparator))
return Table.from_rows(rows)

def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
def split(self, percentage_in_first: float) -> tuple[Table, Table]:
"""
Split the table into two new tables.
Expand All @@ -902,7 +905,28 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
self.slice(round(percentage_in_first * self.count_rows())),
)

def transform_column(self, name: str, transformer: Callable[[Row], typing.Any]) -> Table:
def tag_columns(self, target_name: str, feature_names: Optional[list[str]] = None) -> TaggedTable:
    """
    Build a `TaggedTable` from this table by designating its target and feature columns.

    The original table is not modified.

    Parameters
    ----------
    target_name : str
        Name of the target column.
    feature_names : Optional[list[str]]
        Names of the feature columns. If None, all columns except the target column are used.

    Returns
    -------
    tagged_table : TaggedTable
        A new tagged table with the given target and feature names.
    """
    # Imported here rather than at module level: `_tagged_table` itself imports `Table`,
    # so a top-level import would presumably be circular (it is only imported under
    # TYPE_CHECKING at the top of this module).
    # pylint: disable=import-outside-toplevel
    from ._tagged_table import TaggedTable

    return TaggedTable(
        data=self._data,
        target_name=target_name,
        feature_names=feature_names,
        schema=self._schema,
    )

def transform_column(self, name: str, transformer: Callable[[Row], Any]) -> Table:
"""
Transform provided column by calling provided transformer.
Expand Down
57 changes: 40 additions & 17 deletions src/safeds/data/tabular/containers/_tagged_table.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from IPython.core.display_functions import DisplayHandle
from typing import Iterable, Optional

from ._column import Column
from ._table import Table
from IPython.core.display_functions import DisplayHandle
from safeds.data.tabular.containers import Column, Table
from safeds.data.tabular.typing import TableSchema


class TaggedTable(Table):
Expand All @@ -10,34 +11,56 @@ class TaggedTable(Table):
Parameters
----------
table : Table
The table used to derive the features and target.
data : Iterable
The data.
target_name : str
Name of the target column.
feature_names : Optional[list[str]]
Names of the feature columns. If None, all columns except the target column are used.
schema : Optional[TableSchema]
The schema of the table. If not specified, the schema will be inferred from the data.
"""

def __init__(self, table: Table, target_name: str):
super().__init__(table._data)
def __init__(
self,
data: Iterable,
target_name: str,
feature_names: Optional[list[str]] = None,
schema: Optional[TableSchema] = None,
):
super().__init__(data, schema)

# If no feature names are specified, use all columns except the target column
if feature_names is None:
feature_names = self.get_column_names()
if target_name in feature_names:
feature_names.remove(target_name)

# Validate inputs
if target_name in feature_names:
raise ValueError(f"Column '{target_name}' cannot be both feature and target.")
if len(feature_names) == 0:
raise ValueError("At least one feature column must be specified.")

self._y: Column = table.get_column(target_name)
self._X: Table = table.drop_columns([target_name])
self._features: Table = self.keep_only_columns(feature_names)
self._target: Column = self.get_column(target_name)

@property
def features(self) -> Table:
return self._X
return self._features

@property
def target(self) -> Column:
return self._y
return self._target

def __repr__(self) -> str:
tmp = self._X.add_column(self._y)
header_info = "Target Column is '" + self._y.name + "'\n"
tmp = self._features.add_column(self._target)
header_info = "Target Column is '" + self._target.name + "'\n"
return header_info + tmp.__repr__()

def __str__(self) -> str:
tmp = self._X.add_column(self._y)
header_info = "Target Column is '" + self._y.name + "'\n"
tmp = self._features.add_column(self._target)
header_info = "Target Column is '" + self._target.name + "'\n"
return header_info + tmp.__str__()

def _ipython_display_(self) -> DisplayHandle:
Expand All @@ -49,7 +72,7 @@ def _ipython_display_(self) -> DisplayHandle:
output : DisplayHandle
Output object.
"""
tmp = self._X.add_column(self._y)
header_info = "Target Column is '" + self._y.name + "'\n"
tmp = self._features.add_column(self._target)
header_info = "Target Column is '" + self._target.name + "'\n"
print(header_info)
return tmp._ipython_display_()
5 changes: 3 additions & 2 deletions src/safeds/data/tabular/transformation/_table_transformer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Optional
from typing import TYPE_CHECKING, Optional

from safeds.data.tabular.containers import Table
if TYPE_CHECKING:
from safeds.data.tabular.containers import Table


class TableTransformer(ABC):
Expand Down
25 changes: 11 additions & 14 deletions src/safeds/ml/_util_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,10 @@ def fit(model: Any, tagged_table: TaggedTable) -> None:
)
except ValueError as exception:
raise LearningError(str(exception)) from exception
except Exception as exception:
raise LearningError(None) from exception


# noinspection PyProtectedMember
def predict(model: Any, dataset: Table, target_name: Optional[str]) -> TaggedTable:
def predict(model: Any, dataset: Table, feature_names: Optional[list[str]], target_name: Optional[str]) -> TaggedTable:
"""
Predict a target vector using a dataset containing feature vectors. The model has to be trained first.
Expand All @@ -44,8 +42,10 @@ def predict(model: Any, dataset: Table, target_name: Optional[str]) -> TaggedTab
Classifier or regressor from scikit-learn.
dataset : Table
The dataset containing the features.
target_name : str
target_name : Optional[str]
The name of the target column.
feature_names : Optional[list[str]]
The names of the feature columns.
Returns
-------
Expand All @@ -58,23 +58,20 @@ def predict(model: Any, dataset: Table, target_name: Optional[str]) -> TaggedTab
If predicting with the given dataset failed.
"""

if model is None or target_name is None:
if model is None or target_name is None or feature_names is None:
raise PredictionError("The model was not trained")

dataset_df = dataset._data
dataset_df.columns = dataset.schema.get_column_names()
dataset_df = dataset.keep_only_columns(feature_names)._data
dataset_df.columns = feature_names
try:
predicted_target_vector = model.predict(dataset_df.values)
result_set = dataset_df.copy(deep=True)
result_set = dataset._data.copy(deep=True)
result_set.columns = dataset.get_column_names()
if target_name in result_set.columns:
raise ValueError(
f"Dataset already contains '{target_name}' column. Please rename this column"
)
raise ValueError(f"Dataset already contains '{target_name}' column. Please rename this column")
result_set[target_name] = predicted_target_vector
return TaggedTable(Table(result_set), target_name=target_name)
return Table(result_set).tag_columns(target_name=target_name, feature_names=feature_names)
except NotFittedError as exception:
raise PredictionError("The model was not trained") from exception
except ValueError as exception:
raise PredictionError(str(exception)) from exception
except Exception as exception:
raise PredictionError(None) from exception
4 changes: 3 additions & 1 deletion src/safeds/ml/classification/_ada_boost.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class AdaBoost(Classifier):

def __init__(self) -> None:
self._wrapped_classifier: Optional[sk_AdaBoostClassifier] = None
self._feature_names: Optional[list[str]] = None
self._target_name: Optional[str] = None

def fit(self, training_set: TaggedTable) -> AdaBoost:
Expand Down Expand Up @@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> AdaBoost:

result = AdaBoost()
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.get_column_names()
result._target_name = training_set.target.name

return result
Expand All @@ -68,4 +70,4 @@ def predict(self, dataset: Table) -> TaggedTable:
PredictionError
If prediction with the given dataset failed.
"""
return predict(self._wrapped_classifier, dataset, self._target_name)
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
4 changes: 3 additions & 1 deletion src/safeds/ml/classification/_decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class DecisionTree(Classifier):

def __init__(self) -> None:
self._wrapped_classifier: Optional[sk_DecisionTreeClassifier] = None
self._feature_names: Optional[list[str]] = None
self._target_name: Optional[str] = None

def fit(self, training_set: TaggedTable) -> DecisionTree:
Expand Down Expand Up @@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> DecisionTree:

result = DecisionTree()
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.get_column_names()
result._target_name = training_set.target.name

return result
Expand All @@ -68,4 +70,4 @@ def predict(self, dataset: Table) -> TaggedTable:
PredictionError
If prediction with the given dataset failed.
"""
return predict(self._wrapped_classifier, dataset, self._target_name)
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class GradientBoosting(Classifier):

def __init__(self) -> None:
self._wrapped_classifier: Optional[sk_GradientBoostingClassifier] = None
self._feature_names: Optional[list[str]] = None
self._target_name: Optional[str] = None

def fit(self, training_set: TaggedTable) -> GradientBoosting:
Expand Down Expand Up @@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> GradientBoosting:

result = GradientBoosting()
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.get_column_names()
result._target_name = training_set.target.name

return result
Expand All @@ -69,4 +71,4 @@ def predict(self, dataset: Table) -> TaggedTable:
PredictionError
If prediction with the given dataset failed.
"""
return predict(self._wrapped_classifier, dataset, self._target_name)
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
8 changes: 3 additions & 5 deletions src/safeds/ml/classification/_k_nearest_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(self, n_neighbors: int) -> None:
self._n_neighbors = n_neighbors

self._wrapped_classifier: Optional[sk_KNeighborsClassifier] = None
self._feature_names: Optional[list[str]] = None
self._target_name: Optional[str] = None

def fit(self, training_set: TaggedTable) -> KNearestNeighbors:
Expand Down Expand Up @@ -50,6 +51,7 @@ def fit(self, training_set: TaggedTable) -> KNearestNeighbors:

result = KNearestNeighbors(self._n_neighbors)
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.get_column_names()
result._target_name = training_set.target.name

return result
Expand All @@ -73,8 +75,4 @@ def predict(self, dataset: Table) -> TaggedTable:
PredictionError
If prediction with the given dataset failed.
"""
return predict(
self._wrapped_classifier,
dataset,
self._target_name,
)
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
4 changes: 3 additions & 1 deletion src/safeds/ml/classification/_logistic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class LogisticRegression(Classifier):

def __init__(self) -> None:
self._wrapped_classifier: Optional[sk_LogisticRegression] = None
self._feature_names: Optional[list[str]] = None
self._target_name: Optional[str] = None

def fit(self, training_set: TaggedTable) -> LogisticRegression:
Expand Down Expand Up @@ -45,6 +46,7 @@ def fit(self, training_set: TaggedTable) -> LogisticRegression:

result = LogisticRegression()
result._wrapped_classifier = wrapped_classifier
result._feature_names = training_set.features.get_column_names()
result._target_name = training_set.target.name

return result
Expand All @@ -68,4 +70,4 @@ def predict(self, dataset: Table) -> TaggedTable:
PredictionError
If prediction with the given dataset failed.
"""
return predict(self._wrapped_classifier, dataset, self._target_name)
return predict(self._wrapped_classifier, dataset, self._feature_names, self._target_name)
Loading

0 comments on commit 95e1fc7

Please sign in to comment.