Skip to content

Commit

Permalink
[FEA] Migrate nvtext/edit_distance APIs to pylibcudf (#16957)
Browse files Browse the repository at this point in the history
Part of #15162. This PR migrates `edit_distance.pxd` to pylibcudf.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)

URL: #16957
  • Loading branch information
Matt711 authored Oct 4, 2024
1 parent d15bbfd commit 04c17de
Show file tree
Hide file tree
Showing 14 changed files with 171 additions and 25 deletions.
2 changes: 1 addition & 1 deletion cpp/include/nvtext/edit_distance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext {
* @param targets Strings to compute edit distance against `input`
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of with replaced strings
* @return New lists column of edit distance values
*/
std::unique_ptr<cudf::column> edit_distance(
cudf::strings_column_view const& input,
Expand Down
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,4 @@ This page provides API documentation for pylibcudf.

io/index.rst
strings/index.rst
nvtext/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=============
edit_distance
=============

.. automodule:: pylibcudf.nvtext.edit_distance
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
nvtext
======

.. toctree::
    :maxdepth: 1

    edit_distance
34 changes: 10 additions & 24 deletions python/cudf/cudf/_lib/nvtext/edit_distance.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,23 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.edit_distance cimport (
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix,
)
from pylibcudf cimport nvtext

from cudf._lib.column cimport Column


# NOTE(review): this hunk appears to show the removed and the added lines of
# the diff together, without +/- markers. Read it as two alternative bodies,
# not as one function -- confirm against the actual diff.
@acquire_spill_lock()
def edit_distance(Column strings, Column targets):
# Presumably the removed implementation: take libcudf views of both columns
# and call cpp_edit_distance directly inside a nogil block.
cdef column_view c_strings = strings.view()
cdef column_view c_targets = targets.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_edit_distance(c_strings, c_targets))

return Column.from_unique_ptr(move(c_result))
# Presumably the added implementation: convert both columns with
# to_pylibcudf(mode="read"), delegate to pylibcudf's nvtext.edit_distance,
# and wrap the result with Column.from_pylibcudf.
result = nvtext.edit_distance.edit_distance(
strings.to_pylibcudf(mode="read"),
targets.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(result)


# NOTE(review): this hunk appears to show the removed and the added lines of
# the diff together, without +/- markers. Read it as two alternative bodies,
# not as one function -- confirm against the actual diff.
@acquire_spill_lock()
def edit_distance_matrix(Column strings):
# Presumably the removed implementation: take a libcudf view of the column
# and call cpp_edit_distance_matrix directly inside a nogil block.
cdef column_view c_strings = strings.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(cpp_edit_distance_matrix(c_strings))

return Column.from_unique_ptr(move(c_result))
# Presumably the added implementation: convert the column with
# to_pylibcudf(mode="read"), delegate to pylibcudf's
# nvtext.edit_distance.edit_distance_matrix, and wrap the result with
# Column.from_pylibcudf.
result = nvtext.edit_distance.edit_distance_matrix(
strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(result)
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow)
add_subdirectory(libcudf)
add_subdirectory(strings)
add_subdirectory(io)
add_subdirectory(nvtext)
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from . cimport (
lists,
merge,
null_mask,
nvtext,
partitioning,
quantiles,
reduce,
Expand Down Expand Up @@ -78,4 +79,5 @@ __all__ = [
"transpose",
"types",
"unary",
"nvtext",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
lists,
merge,
null_mask,
nvtext,
partitioning,
quantiles,
reduce,
Expand Down Expand Up @@ -92,4 +93,5 @@
"transpose",
"types",
"unary",
"nvtext",
]
22 changes: 22 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Cython source files in this directory that are passed to
# rapids_cython_create_modules below.
set(cython_sources edit_distance.pyx)

# Libraries passed as LINKED_LIBRARIES for the generated modules.
set(linked_libraries cudf::cudf)
# Create the C++ (CXX) Cython modules from the sources above; MODULE_PREFIX is
# set to pylibcudf_nvtext_ and the modules are associated with the cudf target.
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf
)
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance

# Submodules listed as the public interface of the nvtext package.
__all__ = ["edit_distance"]
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance

# Submodules listed as the public interface of the nvtext package.
__all__ = ["edit_distance"]
8 changes: 8 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column


# Edit distance between individual strings in two strings columns
# (`input` against `targets`).
cpdef Column edit_distance(Column input, Column targets)

# Edit distance between all strings in the `input` strings column.
cpdef Column edit_distance_matrix(Column input)
63 changes: 63 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.edit_distance cimport (
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix,
)


cpdef Column edit_distance(Column input, Column targets):
    """
    Compute the edit distance between individual strings in two
    strings columns.

    For details, see :cpp:func:`edit_distance`

    Parameters
    ----------
    input : Column
        Input strings
    targets : Column
        Strings to compute edit distance against

    Returns
    -------
    Column
        New column of edit distance values
    """
    cdef column_view input_view = input.view()
    cdef column_view targets_view = targets.view()
    cdef unique_ptr[column] owned_result

    # Run the libcudf call with the GIL released.
    with nogil:
        owned_result = move(cpp_edit_distance(input_view, targets_view))

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(owned_result))


cpdef Column edit_distance_matrix(Column input):
    """
    Compute the edit distance between all strings in the input
    strings column.

    For details, see :cpp:func:`edit_distance_matrix`

    Parameters
    ----------
    input : Column
        Input strings

    Returns
    -------
    Column
        New column of edit distance values
    """
    cdef column_view input_view = input.view()
    cdef unique_ptr[column] owned_result

    # Run the libcudf call with the GIL released.
    with nogil:
        owned_result = move(cpp_edit_distance_matrix(input_view))

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(owned_result))
34 changes: 34 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def edit_distance_data():
    """Return the (input, targets) pyarrow string arrays used by the tests."""
    inputs = pa.array(["hallo", "goodbye", "world"])
    targets = pa.array(["hello", "", "world"])
    return inputs, targets


def test_edit_distance(edit_distance_data):
    """Compare edit_distance on the fixture arrays with known int32 values."""
    arrow_input, arrow_targets = edit_distance_data
    plc_input = plc.interop.from_arrow(arrow_input)
    plc_targets = plc.interop.from_arrow(arrow_targets)
    got = plc.nvtext.edit_distance.edit_distance(plc_input, plc_targets)
    expect = pa.array([1, 7, 0], type=pa.int32())
    assert_column_eq(got, expect)


def test_edit_distance_matrix(edit_distance_data):
    """Compare edit_distance_matrix on the fixture input with known values."""
    arrow_input, _ = edit_distance_data
    plc_input = plc.interop.from_arrow(arrow_input)
    got = plc.nvtext.edit_distance.edit_distance_matrix(plc_input)
    distances = [[0, 7, 4], [7, 0, 6], [4, 6, 0]]
    expect = pa.array(distances, type=pa.list_(pa.int32()))
    assert_column_eq(expect, got)

0 comments on commit 04c17de

Please sign in to comment.