RFC, WIP: demo: use dataframe api in datetimeencoder #786

Status: Closed · wants to merge 10 commits
setup.cfg (3 changes: 2 additions & 1 deletion)

@@ -28,7 +28,8 @@ install_requires =
     scikit-learn>=1.2.1
     numpy>=1.23.5
     scipy>=1.9.3
-    pandas>=1.5.3
+    pandas>=2.1.0
+    dataframe-api-compat>=0.1.28
     packaging>=23.1
 python_requires = >=3.10
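The new dataframe-api-compat pin is what provides the consortium-standard entry points used throughout the diff below. A minimal sketch of what it enables (assuming pandas>=2.1.0 and dataframe-api-compat are installed; the data is hypothetical):

    import pandas as pd

    df = pd.DataFrame({"ts": pd.to_datetime(["2023-01-01", "2023-06-15"])})
    # pandas 2.1+ exposes this hook; it requires dataframe-api-compat at runtime
    df_std = df.__dataframe_consortium_standard__(api_version="2023.11-beta")
    year = df_std.col("ts").year().rename("year")  # a standard Column expression
    # .dataframe gives back the underlying pandas frame, now with a "year" column
    print(df_std.assign(year).dataframe)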

skrub/_datetime_encoder.py (91 changes: 52 additions & 39 deletions)
@@ -1,4 +1,6 @@
-from typing import Literal
+from __future__ import annotations
+
+from typing import Literal, TYPE_CHECKING

 import numpy as np
 import pandas as pd
@@ -7,6 +9,8 @@
 from sklearn.utils.validation import check_is_fitted

 from skrub._utils import check_input
+if TYPE_CHECKING:
+    from dataframe_api import Column

 WORD_TO_ALIAS: dict[str, str] = {
     "year": "Y",
@@ -130,37 +134,34 @@ def _validate_keywords(self):
             )

     @staticmethod
-    def _extract_from_date(date_series: pd.Series, feature: str):
+    def _extract_from_date(date_series: Column, feature: str):
         if feature == "year":
-            return pd.DatetimeIndex(date_series).year.to_numpy()
+            return date_series.year()
         elif feature == "month":
-            return pd.DatetimeIndex(date_series).month.to_numpy()
+            return date_series.month()
         elif feature == "day":
-            return pd.DatetimeIndex(date_series).day.to_numpy()
+            return date_series.day()
         elif feature == "hour":
-            return pd.DatetimeIndex(date_series).hour.to_numpy()
+            return date_series.hour()
         elif feature == "minute":
-            return pd.DatetimeIndex(date_series).minute.to_numpy()
+            return date_series.minute()
         elif feature == "second":
-            return pd.DatetimeIndex(date_series).second.to_numpy()
+            return date_series.second()
         elif feature == "microsecond":
-            return pd.DatetimeIndex(date_series).microsecond.to_numpy()
+            return date_series.microsecond()
         elif feature == "nanosecond":
-            return pd.DatetimeIndex(date_series).nanosecond.to_numpy()
+            if hasattr(date_series, 'nanosecond'):
+                return date_series.nanosecond()
+            else:
+                raise AttributeError(
+                    "`nanosecond` is not part of the DataFrame API, so support "
+                    "is not guaranteed across all libraries. In particular, it "
+                    f"is not supported for {date_series.__class__.__name__}."
+                )
Comment on lines +153 to +159

MarcoGorelli (Author): Technically, `nanosecond` isn't part of the API standard (yet?). But I think you're primarily interested in pandas-Polars compatibility anyway (please correct me if I'm wrong).

Member: Could you move the nanosecond handling into the dataframe API? It would make more sense to have it there and would help your users, IMHO.

ogrisel (Oct 30, 2023): What do you mean by "into the dataframe API"? Into the standard? Or the dataframe-api-compat package? It cannot be done without having the stakeholders of the specification process agree on that first.

To move it into the dataframe-api-compat package would be helpful, but maybe that package does not want to expose extensions to the standard in its public API. So it would have to wait for this new method to become part of the standard first.

I think keeping this code here is pragmatic for the time being (to avoid introducing a regression w.r.t. what is already done in the current code base).

MarcoGorelli (Author, Oct 30, 2023): dataframe-api-compat currently only handles implementations of the dataframe API for pandas and Polars. Some other libraries, like cuDF, plan to expose the dataframe API methods from their main namespace directly, so they wouldn't have a separate package.

I've added nanosecond to dataframe-api-compat, but unless nanosecond is added to https://data-apis.org/dataframe-api/draft/index.html, there's no guarantee that other libraries (e.g. cuDF) would add it.

I'll try to get nanoseconds added to the spec though; that would make things cleaner.

Member: Thank you for the clarification @MarcoGorelli
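The `hasattr` fallback in the hunk above is plain feature detection: libraries whose Column exposes a `nanosecond` accessor keep working, and everything else gets a clear error. The same pattern illustrated with plain pandas (hypothetical data):

    import pandas as pd

    s = pd.Series(pd.to_datetime(["2023-01-01 00:00:00.000000001"]))
    idx = pd.DatetimeIndex(s)
    # pandas exposes a nanosecond accessor, so this branch is taken
    if hasattr(idx, "nanosecond"):
        print(idx.nanosecond.to_numpy())  # [1]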

         elif feature == "dayofweek":
-            return pd.DatetimeIndex(date_series).dayofweek.to_numpy()
+            return date_series.iso_weekday() - 1
         elif feature == "total_time":
-            tz = pd.DatetimeIndex(date_series).tz
-            # Compute the time in seconds from the epoch time UTC
-            if tz is None:
-                return (
-                    pd.to_datetime(date_series) - pd.Timestamp("1970-01-01")
-                ) // pd.Timedelta("1s")
-            else:
-                return (
-                    pd.DatetimeIndex(date_series).tz_convert("utc")
-                    - pd.Timestamp("1970-01-01", tz="utc")
-                ) // pd.Timedelta("1s")
+            return date_series.unix_timestamp()  # type: ignore

     def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
         """Fit the instance to ``X``.
@@ -181,23 +182,26 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
             Fitted DatetimeEncoder instance (self).
         """
         self._validate_keywords()
-        if isinstance(X, pd.DataFrame):
-            self.col_names_ = X.columns.to_list()
-        else:
-            self.col_names_ = None
-        X = check_input(X)
+        if not hasattr(X, "__dataframe_consortium_standard__"):
+            X = check_input(X)
+            X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
Comment on lines +185 to +187

MarcoGorelli (Author): This requires some discussion too. Currently, you're allowing numpy arrays as inputs, but you're doing operations which numpy doesn't support (e.g. dayofweek). If the input is a numpy array, is it OK to just pick a dataframe library and construct the input using that? Here I'm just picking pandas.

Member: That's a good point. We're refactoring the DatetimeEncoder in #784, and we don't plan to have efficient support of Polars in this PR since we're converting the input to numpy right away.

So, to answer your question, you won't need to call a dataframe constructor. However, we will revisit this so that Polars and pandas can be used without numpy conversions.

Comment: I am wondering: do you expect many users to use skrub with non-dataframe inputs? If not, maybe it's time to stop accepting numpy arrays as input and only accept dataframes. That might make the code simpler and easier to maintain.
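For reference, the fallback being discussed boils down to this (plain numpy/pandas sketch; the check_input validation step is omitted, and column names become stringified indices):

    import numpy as np
    import pandas as pd

    X = np.array([["2023-01-01"], ["2023-06-15"]], dtype="datetime64[s]")
    if not hasattr(X, "__dataframe_consortium_standard__"):
        # numpy arrays don't implement the standard, so wrap them in a
        # pandas DataFrame before entering the dataframe-API code path
        X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    print(X.columns.tolist())  # ['0']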

+        X = X.__dataframe_consortium_standard__(api_version='2023.11-beta')
+        X = X.persist()
Member: What is the idea behind persist()?

MarcoGorelli (Author): It makes sure that computations prior to this step won't be repeated (https://data-apis.org/dataframe-api/draft/API_specification/dataframe_object.html#dataframe_api.DataFrame.persist). In the end, the consortium preferred this to collect; it does pretty much the same thing anyway (in Polars it does .collect().lazy()).

Below, `if df.col(col_name).std() > 0` will trigger computation because it calls `(df.col(col_name).std() > 0).__bool__()`. That call produces a Python scalar, so it's necessary to trigger compute here (as opposed to keeping things lazy).

By putting persist here, we ensure that the compute which gets triggered only happens from this point until the `__bool__` call.

Just saw you've already done a lot of work here to support Polars - well done! I'll rebase then.
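A sketch of the laziness issue with a Polars LazyFrame (assuming a dataframe-api-compat version that registers the standard entry point on Polars objects):

    import polars as pl

    lf = pl.LazyFrame({"a": [1.0, 2.0, 3.0]})
    df = lf.__dataframe_consortium_standard__(api_version="2023.11-beta")

    # Roughly .collect().lazy() in Polars: compute once, stay lazy afterwards.
    df = df.persist()

    # Comparisons like this produce a Python bool, which forces evaluation;
    # without the persist() above, each such check could re-run the whole
    # upstream query.
    if float(df.col("a").std()) > 0:
        print("non-constant column")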

+        n_columns = len(X.column_names)
+        self.col_names_ = X.column_names
         # Features to extract for each column, after removing constant features
         self.features_per_column_ = {}
-        for i in range(X.shape[1]):
+        for i in range(n_columns):
             self.features_per_column_[i] = []
         # Check which columns are constant
-        for i in range(X.shape[1]):
+        for i in range(n_columns):
+            column = X.col(X.column_names[i])
             if self.extract_until is None:
-                if np.nanstd(self._extract_from_date(X[:, i], "total_time")) > 0:
+                if self._extract_from_date(column, "total_time").std() > 0:
                     self.features_per_column_[i].append("total_time")
             else:
                 for feature in TIME_LEVELS:
-                    if np.nanstd(self._extract_from_date(X[:, i], feature)) > 0:
+                    if self._extract_from_date(column, feature).std() > 0:
                         if TIME_LEVELS.index(feature) <= TIME_LEVELS.index(
                             self.extract_until
                         ):
@@ -213,11 +217,11 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder":
             # Add day of the week feature if needed
             if (
                 self.add_day_of_the_week
-                and np.nanstd(self._extract_from_date(X[:, i], "dayofweek")) > 0
+                and float(self._extract_from_date(column, "dayofweek").std()) > 0
             ):
                 self.features_per_column_[i].append("dayofweek")

-        self.n_features_in_ = X.shape[1]
+        self.n_features_in_ = n_columns
         self.n_features_out_ = len(
             np.concatenate(list(self.features_per_column_.values()))
         )
@@ -240,26 +244,35 @@ def transform(self, X: ArrayLike, y=None) -> NDArray:
         ndarray, shape (``n_samples``, ``n_features_out_``)
             Transformed input.
         """
+        if not hasattr(X, "__dataframe_consortium_standard__"):
+            X = check_input(X)
+            X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
+        X = X.__dataframe_consortium_standard__(api_version='2023.11-beta')
+        n_columns = len(X.column_names)
         check_is_fitted(
             self,
             attributes=["n_features_in_", "n_features_out_", "features_per_column_"],
         )
-        X = check_input(X)
-        if X.shape[1] != self.n_features_in_:
+        if n_columns != self.n_features_in_:
             raise ValueError(
-                f"The number of features in the input data ({X.shape[1]}) "
+                f"The number of features in the input data ({n_columns}) "
                 "does not match the number of features "
                 f"seen during fit ({self.n_features_in_}). "
             )
         # Create a new array with the extracted features,
         # choosing only features that weren't constant during fit
-        X_ = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64)
+        features_to_select = []
         idx = 0
-        for i in range(X.shape[1]):
+        for i in range(n_columns):
+            column = X.col(X.column_names[i])
             for j, feature in enumerate(self.features_per_column_[i]):
-                X_[:, idx + j] = self._extract_from_date(X[:, i], feature)
+                features_to_select.append(
+                    self._extract_from_date(column, feature).rename(f"{feature}_{i}")
+                )
             idx += len(self.features_per_column_[i])
-        return X_
+        X = X.assign(*features_to_select).select(
+            *(feature.name for feature in features_to_select)
+        )
+        X = X.persist()
+        return X.to_array("float64")

     def get_feature_names_out(self, input_features=None) -> list[str]:
         """Return clean feature names.
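End-to-end, the intended behaviour after this change is sketched below (hypothetical data; DatetimeEncoder is skrub's existing public transformer and fit_transform comes from the usual scikit-learn mixin):

    import pandas as pd
    from skrub import DatetimeEncoder

    X = pd.DataFrame(
        {"login": pd.to_datetime(["2023-01-01 10:00", "2023-06-15 14:30"])}
    )
    enc = DatetimeEncoder(add_day_of_the_week=True)
    out = enc.fit_transform(X)  # float64 numpy array, one column per extracted feature
    print(enc.get_feature_names_out())
    print(out.shape)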