Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string padding and side_type APIs to pylibcudf #16833

Merged
merged 10 commits into from
Oct 2, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ strings
find
find_multiple
findall
padding
regex_flags
regex_program
repeat
replace
side_type
slice
split
strip
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
padding
=======

.. automodule:: pylibcudf.strings.padding
:members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=========
side_type
=========

.. automodule:: pylibcudf.strings.side_type
:members:
9 changes: 1 addition & 8 deletions python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,7 @@
from cudf._lib.strings.find_multiple import find_multiple
from cudf._lib.strings.findall import findall
from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object
from cudf._lib.strings.padding import (
SideType,
center,
ljust,
pad,
rjust,
zfill,
)
from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill
from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence
from cudf._lib.strings.replace import (
insert,
Expand Down
112 changes: 16 additions & 96 deletions python/cudf/cudf/_lib/strings/padding.pyx
Original file line number Diff line number Diff line change
@@ -1,64 +1,31 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column

from enum import IntEnum

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.strings.padding cimport (
pad as cpp_pad,
zfill as cpp_zfill,
)
from pylibcudf.libcudf.strings.side_type cimport (
side_type,
underlying_type_t_side_type,
)


class SideType(IntEnum):
LEFT = <underlying_type_t_side_type> side_type.LEFT
RIGHT = <underlying_type_t_side_type> side_type.RIGHT
BOTH = <underlying_type_t_side_type> side_type.BOTH
import pylibcudf as plc


@acquire_spill_lock()
def pad(Column source_strings,
size_type width,
fill_char,
side=SideType.LEFT):
side=plc.strings.side_type.SideType.LEFT):
"""
Returns a Column by padding strings in `source_strings`
up to the given `width`. Direction of padding is to be specified by `side`.
The additional characters being filled can be changed by specifying
`fill_char`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string f_char = <string>str(fill_char).encode()

cdef side_type pad_direction = <side_type>(
<underlying_type_t_side_type> side
plc_result = plc.strings.padding.pad(
source_strings.to_pylibcudf(mode="read"),
width,
side,
fill_char,
)

with nogil:
c_result = move(cpp_pad(
source_view,
width,
pad_direction,
f_char
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_result)


@acquire_spill_lock()
Expand All @@ -68,19 +35,13 @@ def zfill(Column source_strings,
Returns a Column by prepending strings in `source_strings`
with '0' characters up to the given `width`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_zfill(
source_view,
width
))

return Column.from_unique_ptr(move(c_result))
plc_result = plc.strings.padding.zfill(
source_strings.to_pylibcudf(mode="read"),
width
)
return Column.from_pylibcudf(plc_result)


@acquire_spill_lock()
def center(Column source_strings,
size_type width,
fill_char):
Expand All @@ -89,65 +50,24 @@ def center(Column source_strings,
in `source_strings` with additional character, `fill_char`
up to the given `width`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string f_char = <string>str(fill_char).encode()

with nogil:
c_result = move(cpp_pad(
source_view,
width,
side_type.BOTH,
f_char
))
return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH)

return Column.from_unique_ptr(move(c_result))


@acquire_spill_lock()
def ljust(Column source_strings,
size_type width,
fill_char):
"""
Returns a Column by filling the right side of strings in `source_strings`
with an additional character, `fill_char`, up to the given `width`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string f_char = <string>str(fill_char).encode()
return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT)

with nogil:
c_result = move(cpp_pad(
source_view,
width,
side_type.RIGHT,
f_char
))

return Column.from_unique_ptr(move(c_result))


@acquire_spill_lock()
def rjust(Column source_strings,
size_type width,
fill_char):
"""
Returns a Column by filling the left side of strings in `source_strings`
with an additional character, `fill_char`, up to the given `width`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string f_char = <string>str(fill_char).encode()

with nogil:
c_result = move(cpp_pad(
source_view,
width,
side_type.LEFT,
f_char
))

return Column.from_unique_ptr(move(c_result))
return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT)
67 changes: 15 additions & 52 deletions python/cudf/cudf/_lib/strings/strip.pyx
Original file line number Diff line number Diff line change
@@ -1,18 +1,8 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.side_type cimport side_type
from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
import pylibcudf as plc


Expand All @@ -24,15 +14,12 @@ def strip(Column source_strings,
The set of characters to be stripped from the left and right sides
can be specified by `py_repl`.
"""

cdef DeviceScalar repl = py_repl.device_value
return Column.from_pylibcudf(
plc.strings.strip.strip(
source_strings.to_pylibcudf(mode="read"),
plc.strings.SideType.BOTH,
repl.c_value
)
plc_result = plc.strings.strip.strip(
source_strings.to_pylibcudf(mode="read"),
plc.strings.side_type.SideType.BOTH,
py_repl.device_value.c_value,
)
return Column.from_pylibcudf(plc_result)


@acquire_spill_lock()
Expand All @@ -43,24 +30,12 @@ def lstrip(Column source_strings,
The set of characters to be stripped from the left side can
be specified by `py_repl`.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_str = <const string_scalar*>(
repl.get_raw_ptr()
plc_result = plc.strings.strip.strip(
source_strings.to_pylibcudf(mode="read"),
plc.strings.side_type.SideType.LEFT,
py_repl.device_value.c_value,
)

with nogil:
c_result = move(cpp_strip(
source_view,
side_type.LEFT,
scalar_str[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_result)


@acquire_spill_lock()
Expand All @@ -71,21 +46,9 @@ def rstrip(Column source_strings,
The set of characters to be stripped from the right side can
be specified by `py_repl`.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_str = <const string_scalar*>(
repl.get_raw_ptr()
plc_result = plc.strings.strip.strip(
source_strings.to_pylibcudf(mode="read"),
plc.strings.side_type.SideType.RIGHT,
py_repl.device_value.c_value,
)

with nogil:
c_result = move(cpp_strip(
source_view,
side_type.RIGHT,
scalar_str[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_result)
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import pandas as pd
import pyarrow as pa

import pylibcudf as plc

import cudf
import cudf.api.types
from cudf import _lib as libcudf
Expand Down Expand Up @@ -2966,7 +2968,7 @@ def pad(
raise TypeError(msg)

try:
side = libstrings.SideType[side.upper()]
side = plc.strings.side_type.SideType[side.upper()]
except KeyError:
raise ValueError(
"side has to be either one of {'left', 'right', 'both'}"
Expand Down
4 changes: 2 additions & 2 deletions python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ from pylibcudf.libcudf.types cimport size_type
cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil:

cdef unique_ptr[column] pad(
column_view source_strings,
column_view input,
size_type width,
side_type side,
string fill_char) except +

cdef unique_ptr[column] zfill(
column_view source_strings,
column_view input,
size_type width) except +
12 changes: 5 additions & 7 deletions python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
from libc.stdint cimport int32_t
from libcpp cimport int


cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil:

cpdef enum class side_type(int32_t):
LEFT 'cudf::strings::side_type::LEFT'
RIGHT 'cudf::strings::side_type::RIGHT'
BOTH 'cudf::strings::side_type::BOTH'

ctypedef int32_t underlying_type_t_side_type
cpdef enum class side_type(int):
LEFT
RIGHT
BOTH
4 changes: 2 additions & 2 deletions python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ from pylibcudf.libcudf.strings.side_type cimport side_type
cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil:

cdef unique_ptr[column] strip(
column_view source_strings,
side_type stype,
column_view input,
side_type side,
string_scalar to_strip) except +
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ set(cython_sources
find.pyx
find_multiple.pyx
findall.pyx
padding.pyx
regex_flags.pyx
regex_program.pyx
repeat.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ from . cimport (
find,
find_multiple,
findall,
padding,
regex_flags,
regex_program,
replace,
side_type,
slice,
split,
strip,
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
find,
find_multiple,
findall,
padding,
regex_flags,
regex_program,
repeat,
replace,
side_type,
slice,
split,
strip,
Expand Down
Loading
Loading