Match datetime to string conversions with pandas (#14)
Fixes: #14152

This PR adds sub-second data introspection when converting a datetime64 column to a string column so that the output matches pandas. Because this introspection is not free, it is enabled only in pandas compatibility mode.
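
As a rough illustration of the change (a sketch, not part of this diff; the sample values and the printed strings are assumptions about typical output rather than captured results):

import pandas as pd
import cudf

ps = pd.Series(pd.to_datetime(["2000-01-01 00:00:00.010", "2000-01-01 00:00:00.020"]))
gs = cudf.from_pandas(ps)

# pandas trims the string form to the resolution actually present (milliseconds here).
print(ps.astype("string").tolist())
# e.g. ['2000-01-01 00:00:00.010', '2000-01-01 00:00:00.020']

# With this change, cuDF is expected to produce matching strings when pandas
# compatibility mode is enabled; outside that mode the existing fixed-width
# formatting is unchanged.
with cudf.option_context("mode.pandas_compatible", True):
    print(gs.astype("str").to_pandas(nullable=True).tolist())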
galipremsagar authored Oct 2, 2023
1 parent a324a48 commit 24a8365
Showing 2 changed files with 190 additions and 64 deletions.
169 changes: 105 additions & 64 deletions python/cudf/cudf/core/column/datetime.py
@@ -94,6 +94,107 @@
]


def infer_format(element: str, **kwargs) -> str:
"""
    Infers datetime format from a string, also takes care of `ms` and `ns`
"""
fmt = _guess_datetime_format(element, **kwargs)

if fmt is not None:
if "%z" in fmt or "%Z" in fmt:
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
return fmt

element_parts = element.split(".")
if len(element_parts) != 2:
raise ValueError("Given date string not likely a datetime.")

    # There is a possibility that the element is of the following format:
    # '00:00:03.333333 2016-01-01'
second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1)
subsecond_fmt = ".%" + str(len(second_parts[0])) + "f"

first_part = _guess_datetime_format(element_parts[0], **kwargs)
# For the case where first_part is '00:00:03'
if first_part is None:
tmp = "1970-01-01 " + element_parts[0]
first_part = _guess_datetime_format(tmp, **kwargs).split(" ", 1)[1]
if first_part is None:
raise ValueError("Unable to infer the timestamp format from the data")

if len(second_parts) > 1:
# We may have a non-digit, timezone-like component
# like Z, UTC-3, +01:00
if any(re.search(r"\D", part) for part in second_parts):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
second_part = "".join(second_parts[1:])

if len(second_part) > 1:
# Only infer if second_parts is not an empty string.
second_part = _guess_datetime_format(second_part, **kwargs)
else:
second_part = ""

try:
fmt = first_part + subsecond_fmt + second_part
except Exception:
raise ValueError("Unable to infer the timestamp format from the data")

return fmt


def _resolve_mixed_dtypes(
lhs: ColumnBinaryOperand, rhs: ColumnBinaryOperand, base_type: str
) -> Dtype:
units = ["s", "ms", "us", "ns"]
lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
lhs_unit = units.index(lhs_time_unit)
rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
rhs_unit = units.index(rhs_time_unit)
return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]")


def _get_datetime_format(col, dtype, time_unit):
format = _dtype_to_format_conversion.get(dtype.name, "%Y-%m-%d %H:%M:%S")
if format.endswith("f"):
sub_second_res_len = 3
else:
sub_second_res_len = 0

has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any()
has_micros = (
time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any()
)
has_millis = (
time_unit in {"ns", "us", "ms"}
and col.get_dt_field("millisecond").any()
)
has_seconds = col.get_dt_field("second").any()
has_minutes = col.get_dt_field("minute").any()
has_hours = col.get_dt_field("hour").any()
if sub_second_res_len:
if has_nanos:
# format should be intact and rest of the
# following conditions shouldn't execute.
pass
elif has_micros:
format = format[:-sub_second_res_len] + "%6f"
elif has_millis:
format = format[:-sub_second_res_len] + "%3f"
elif has_seconds or has_minutes or has_hours:
format = format[:-4]
else:
format = format.split(" ")[0]
else:
if not (has_seconds or has_minutes or has_hours):
format = format.split(" ")[0]
return format


class DatetimeColumn(column.ColumnBase):
"""
A Column implementation for Date-time types.
@@ -346,6 +447,10 @@ def as_string_column(
format = _dtype_to_format_conversion.get(
self.dtype.name, "%Y-%m-%d %H:%M:%S"
)
if cudf.get_option("mode.pandas_compatible"):
format = _get_datetime_format(
self, dtype=self.dtype, time_unit=self.time_unit
)
if format in _DATETIME_SPECIAL_FORMATS:
names = as_column(_DATETIME_NAMES)
else:
@@ -622,67 +727,3 @@ def __repr__(self):
f"{arr.to_string()}\n"
f"dtype: {self.dtype}"
)


def infer_format(element: str, **kwargs) -> str:
"""
    Infers datetime format from a string, also takes care of `ms` and `ns`
"""
fmt = _guess_datetime_format(element, **kwargs)

if fmt is not None:
if "%z" in fmt or "%Z" in fmt:
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
return fmt

element_parts = element.split(".")
if len(element_parts) != 2:
raise ValueError("Given date string not likely a datetime.")

    # There is a possibility that the element is of the following format:
    # '00:00:03.333333 2016-01-01'
second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1)
subsecond_fmt = ".%" + str(len(second_parts[0])) + "f"

first_part = _guess_datetime_format(element_parts[0], **kwargs)
# For the case where first_part is '00:00:03'
if first_part is None:
tmp = "1970-01-01 " + element_parts[0]
first_part = _guess_datetime_format(tmp, **kwargs).split(" ", 1)[1]
if first_part is None:
raise ValueError("Unable to infer the timestamp format from the data")

if len(second_parts) > 1:
# We may have a non-digit, timezone-like component
# like Z, UTC-3, +01:00
if any(re.search(r"\D", part) for part in second_parts):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
second_part = "".join(second_parts[1:])

if len(second_part) > 1:
# Only infer if second_parts is not an empty string.
second_part = _guess_datetime_format(second_part, **kwargs)
else:
second_part = ""

try:
fmt = first_part + subsecond_fmt + second_part
except Exception:
raise ValueError("Unable to infer the timestamp format from the data")

return fmt


def _resolve_mixed_dtypes(
lhs: ColumnBinaryOperand, rhs: ColumnBinaryOperand, base_type: str
) -> Dtype:
units = ["s", "ms", "us", "ns"]
lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs)
lhs_unit = units.index(lhs_time_unit)
rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs)
rhs_unit = units.index(rhs_time_unit)
return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]")
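
The hunk above is easier to follow with a minimal, self-contained sketch of the resolution-trimming rule that the new _get_datetime_format helper encodes. The sketch uses plain NumPy modular arithmetic instead of cuDF column internals, and the base format string is an assumption rather than the value actually stored in _dtype_to_format_conversion:

import numpy as np

def trim_format(values, base="%Y-%m-%d %H:%M:%S.%9f"):
    # Narrow the sub-second width of `base` to the finest resolution present
    # in `values`, and drop the time portion entirely for pure dates.
    ns = np.asarray(values, dtype="datetime64[ns]").astype("int64")
    if (ns % 1_000).any():                  # non-zero nanosecond component
        return base
    if (ns % 1_000_000).any():              # non-zero microsecond component
        return base.replace(".%9f", ".%6f")
    if (ns % 1_000_000_000).any():          # non-zero millisecond component
        return base.replace(".%9f", ".%3f")
    if (ns % 86_400_000_000_000).any():     # non-zero time of day
        return base.replace(".%9f", "")     # keep "%Y-%m-%d %H:%M:%S"
    return base.split(" ")[0]               # all values are midnight: date only

# Millisecond-only data narrows to a three-digit sub-second format.
print(trim_format(["2000-01-01T00:00:00.010", "2000-01-01T00:00:00.020"]))
# "%Y-%m-%d %H:%M:%S.%3f"

The real helper queries the column's datetime fields on the GPU rather than raw integers, but the branch order and, under the assumed base format, the resulting format strings follow the same idea.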
85 changes: 85 additions & 0 deletions python/cudf/cudf/tests/test_datetime.py
@@ -2187,3 +2187,88 @@ def test_no_format_timezone_not_implemented(tz):
def test_args_not_datetime_typerror(arg):
with pytest.raises(TypeError):
cudf.to_datetime([arg])


@pytest.mark.parametrize(
"data",
[
[
"2000-01-01 00:00:00.000000000",
"2000-01-01 00:00:00.000000000",
"2000-01-01 00:00:00.000000000",
],
[
"2000-01-01 00:00:00.000000000",
None,
"2000-01-01 00:00:00.000000000",
],
[
"2000-01-01 00:00:00.001000000",
"2000-01-01 00:00:00.000000000",
"2000-01-01 00:00:00.000000000",
],
[
"2000-01-01 00:00:00.010000000",
"2000-01-01 00:00:00.020000000",
"2000-01-01 00:00:00.030000000",
],
[
"2000-01-01 00:00:00.010000000",
"2000-01-01 00:00:00.020000000",
None,
],
[
"2000-01-01 00:00:00.000001000",
"2000-01-01 00:00:00.000000000",
"2000-01-01 00:00:00.000004000",
],
[
None,
"2000-01-01 00:00:00.000000000",
"2000-01-01 00:00:00.000004000",
],
[
"2000-01-01 00:00:00.000000010",
"2000-01-01 00:00:00.000000002",
"2000-01-01 00:00:00.000000000",
],
[
"2000-01-01 00:00:00.000000010",
None,
"2000-01-01 00:00:00.000000000",
],
[
"2000-01-01 00:00:01.000000000",
"2000-01-01 00:00:40.000000000",
"2000-01-01 00:00:59.000000000",
],
[
"2000-01-01 00:10:00.000000000",
"2000-01-01 00:30:40.000000000",
"2000-01-01 00:59:00.000000000",
],
[
"2000-01-01 07:00:00.000000000",
"2000-01-01 08:00:00.000000000",
None,
],
[None, None, None],
[],
[
"2000-01-01 00:10:00.123456789",
"2000-01-01 00:30:40.123123456",
"2000-01-01 00:59:00.675347634",
],
],
)
@pytest.mark.parametrize("dtype", DATETIME_TYPES)
def test_datetime_to_str(data, dtype):
gs = cudf.Series(data, dtype=dtype)
ps = gs.to_pandas()

with cudf.option_context("mode.pandas_compatible", True):
actual = gs.astype("str")

expected = ps.astype("string")

assert_eq(actual.to_pandas(nullable=True), expected)
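
For intuition about what these parametrized cases pin down, a hedged pandas-only illustration (the exact strings come from pandas' own array formatting and are stated here as expectations, not captured output): all-midnight inputs collapse to date-only strings, while values with distinct nanosecond digits keep the full nine-digit sub-second width.

import pandas as pd

print(pd.Series(pd.to_datetime(["2000-01-01", "2000-01-02"])).astype("string").tolist())
# e.g. ['2000-01-01', '2000-01-02']

print(
    pd.Series(
        pd.to_datetime(
            ["2000-01-01 00:10:00.123456789", "2000-01-01 00:30:40.123123456"]
        )
    ).astype("string").tolist()
)
# e.g. ['2000-01-01 00:10:00.123456789', '2000-01-01 00:30:40.123123456']

Note that the test converts the cuDF result with to_pandas(nullable=True) before comparing, so nulls on the cuDF side arrive as pandas' NA when checked against pandas' nullable string dtype.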
