Skip to content

Commit

Permalink
Implement replace in pylibcudf (#15005)
Browse files Browse the repository at this point in the history
Contributes to #13921

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #15005
  • Loading branch information
vyasr authored Feb 8, 2024
1 parent c3cf7c6 commit 7294280
Show file tree
Hide file tree
Showing 11 changed files with 304 additions and 117 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ This page provides API documentation for pylibcudf.
reduce
rolling
scalar
replace
table
types
unary
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
replace
=======

.. automodule:: cudf._lib.pylibcudf.replace
:members:
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# the License.
# =============================================================================

set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pxd types.pyx unary.pyx)
set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd types.pyx
unary.pyx
)

set(linked_libraries cudf::cudf)

Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/cpp/replace.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar

cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:

cdef enum class replace_policy(bool):
cpdef enum class replace_policy(bool):
PRECEDING
FOLLOWING

Expand Down Expand Up @@ -42,7 +42,6 @@ cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:
column_view source_column,
scalar lo, scalar hi) except +

cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:
cdef unique_ptr[column] normalize_nans_and_zeros(
column_view source_column) except +

Expand Down
Empty file.
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

set(cython_sources
aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx groupby.pyx interop.pyx
join.pyx reduce.pyx rolling.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
join.pyx reduce.pyx replace.pyx rolling.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ from . cimport (
interop,
join,
reduce,
replace,
rolling,
types,
unary,
Expand All @@ -35,6 +36,7 @@ __all__ = [
"join",
"unary",
"reduce",
"replace",
"rolling",
"types",
]
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
interop,
join,
reduce,
replace,
rolling,
types,
unary,
Expand All @@ -33,6 +34,7 @@
"join",
"unary",
"reduce",
"replace",
"rolling",
"types",
]
36 changes: 36 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/replace.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from libcpp cimport bool

from cudf._lib.cpp.replace cimport replace_policy

from .column cimport Column
from .scalar cimport Scalar

# Fused type of the `replacement` argument of replace_nulls (declared below).
# Each member gets its own specialization of that function: a Column of
# replacement values, a single Scalar, or a replace_policy enum member.
ctypedef fused ReplacementType:
Column
Scalar
replace_policy
# Allowing object is a workaround for
# https://github.com/cython/cython/issues/5984. See the implementation of
# replace_nulls for details.
object


# Declarations of the functions implemented in replace.pyx. In a .pxd file an
# optional argument is written as `=*`; the actual default value is given on
# the definition in the .pyx file.

# `replacement` is a fused type, so one specialization exists per member of
# ReplacementType.
cpdef Column replace_nulls(Column source_column, ReplacementType replacement)

cpdef Column find_and_replace_all(
Column source_column,
Column values_to_replace,
Column replacement_values,
)

# lo_replace and hi_replace are optional (both default to None in the .pyx).
cpdef Column clamp(
Column source_column,
Scalar lo,
Scalar hi,
Scalar lo_replace=*,
Scalar hi_replace=*,
)

# inplace is optional (defaults to False in the .pyx).
cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=*)
208 changes: 208 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/replace.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.


from cython.operator import dereference

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.cpp cimport replace as cpp_replace
from cudf._lib.cpp.column.column cimport column

from cudf._lib.cpp.replace import \
replace_policy as ReplacePolicy # no-cython-lint

from .column cimport Column
from .scalar cimport Scalar


cpdef Column replace_nulls(Column source_column, ReplacementType replacement):
    """Replace nulls in source_column.

    The values used to replace nulls depend on the type of replacement:

    - If replacement is a Column, the corresponding value from replacement
      is used.
    - If replacement is a Scalar, the same value is used for all nulls.
    - If replacement is a replace_policy, the policy is used to determine
      the replacement value:

        - PRECEDING: The first non-null value that precedes the null is
          used.
        - FOLLOWING: The first non-null value that follows the null is
          used.

    For more details, see :cpp:func:`replace_nulls`.

    Parameters
    ----------
    source_column : Column
        The column in which to replace nulls.
    replacement : Union[Column, Scalar, replace_policy]
        If a Column, the values to use as replacements. If a Scalar, the value
        to use as a replacement. If a replace_policy, the policy to use to
        determine the replacement value.

    Returns
    -------
    Column
        A copy of source_column with nulls replaced as described by
        replacement.

    Raises
    ------
    TypeError
        If replacement is dispatched as a generic Python object and is not a
        replace_policy member.
    """
    cdef unique_ptr[column] c_result
    cdef replace_policy policy
    # Due to https://github.com/cython/cython/issues/5984, if this function is
    # called as a Python function (i.e. without typed inputs, which is always
    # true in pure Python files), the type of `replacement` will be `object`
    # instead of `replace_policy`. This is a workaround to handle that case.
    # The whole block only exists in the `object` specialization; it is
    # removed at compile time from the other specializations.
    if ReplacementType is object:
        if not isinstance(replacement, ReplacePolicy):
            raise TypeError(
                "replacement must be a Column, Scalar, or replace_policy"
            )
        # Convert the Python-level enum member to the C++ enum while the GIL
        # is still held, so the call below can run without it.
        policy = replacement
        with nogil:
            c_result = move(
                cpp_replace.replace_nulls(source_column.view(), policy)
            )
        return Column.from_libcudf(move(c_result))

    # Exactly one of these branches survives in each typed specialization;
    # the comparisons against the fused type are resolved at compile time.
    with nogil:
        if ReplacementType is Column:
            c_result = move(
                cpp_replace.replace_nulls(
                    source_column.view(), replacement.view()
                )
            )
        elif ReplacementType is Scalar:
            c_result = move(
                cpp_replace.replace_nulls(
                    source_column.view(), dereference(replacement.c_obj)
                )
            )
        elif ReplacementType is replace_policy:
            c_result = move(
                cpp_replace.replace_nulls(source_column.view(), replacement)
            )
        else:
            assert False, "Internal error. Please contact pylibcudf developers"
    return Column.from_libcudf(move(c_result))


cpdef Column find_and_replace_all(
    Column source_column,
    Column values_to_replace,
    Column replacement_values,
):
    """Substitute values in source_column using a lookup of old/new values.

    Every occurrence of an entry of values_to_replace found in source_column
    is replaced using replacement_values.

    For details, see :cpp:func:`find_and_replace_all`.

    Parameters
    ----------
    source_column : Column
        The column in which to replace values.
    values_to_replace : Column
        The column containing values to replace.
    replacement_values : Column
        The column containing replacement values.

    Returns
    -------
    Column
        A copy of source_column with all occurrences of values_to_replace
        replaced by replacement_values.
    """
    cdef unique_ptr[column] c_replaced

    # The libcudf call only touches C++ objects, so it runs without the GIL.
    with nogil:
        c_replaced = move(cpp_replace.find_and_replace_all(
            source_column.view(),
            values_to_replace.view(),
            replacement_values.view(),
        ))

    return Column.from_libcudf(move(c_replaced))


cpdef Column clamp(
    Column source_column,
    Scalar lo,
    Scalar hi,
    Scalar lo_replace=None,
    Scalar hi_replace=None,
):
    """Clamp the values in source_column to the range [lo, hi].

    For details, see :cpp:func:`clamp`.

    Parameters
    ----------
    source_column : Column
        The column to clamp.
    lo : Scalar
        The lower bound of the clamp range.
    hi : Scalar
        The upper bound of the clamp range.
    lo_replace : Scalar, optional
        The value to use for elements that are less than lo. If not specified,
        the value of lo is used.
    hi_replace : Scalar, optional
        The value to use for elements that are greater than hi. If not
        specified, the value of hi is used.

    Returns
    -------
    Column
        A copy of source_column with values clamped to the range [lo, hi].

    Raises
    ------
    ValueError
        If only one of lo_replace and hi_replace is provided.
    """
    cdef unique_ptr[column] c_clamped
    # Resolve the optional arguments to plain C flags up front so that no
    # Python-level comparison is needed inside the nogil section.
    cdef bint has_lo_replace = lo_replace is not None
    cdef bint has_hi_replace = hi_replace is not None

    if has_lo_replace != has_hi_replace:
        raise ValueError("lo_replace and hi_replace must be specified together")

    with nogil:
        if has_lo_replace:
            # Both replacements were given: use the five-argument overload.
            c_clamped = move(
                cpp_replace.clamp(
                    source_column.view(),
                    dereference(lo.c_obj),
                    dereference(hi.c_obj),
                    dereference(lo_replace.c_obj),
                    dereference(hi_replace.c_obj),
                )
            )
        else:
            # Neither replacement was given: use the three-argument overload.
            c_clamped = move(
                cpp_replace.clamp(
                    source_column.view(),
                    dereference(lo.c_obj),
                    dereference(hi.c_obj),
                )
            )
    return Column.from_libcudf(move(c_clamped))


cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False):
    """Normalize NaNs and zeros in source_column.

    For details, see :cpp:func:`normalize_nans_and_zeros`.

    Parameters
    ----------
    source_column : Column
        The column to normalize.
    inplace : bool, optional
        If True, normalize source_column in place. If False (the default),
        return a new column with the normalized values.

    Returns
    -------
    Column or None
        If inplace is False, a copy of source_column with NaNs and zeros
        normalized. If inplace is True, None; the result is found in
        source_column itself.
    """
    cdef unique_ptr[column] c_result

    if inplace:
        # The in-place overload works through a mutable view of
        # source_column and yields no result column, so there is nothing to
        # wrap and return.
        with nogil:
            cpp_replace.normalize_nans_and_zeros(source_column.mutable_view())
        return None

    with nogil:
        c_result = move(
            cpp_replace.normalize_nans_and_zeros(source_column.view())
        )
    return Column.from_libcudf(move(c_result))
Loading

0 comments on commit 7294280

Please sign in to comment.