Skip to content

Commit

Permalink
Migrate column factories to pylibcudf (#15257)
Browse files Browse the repository at this point in the history
This PR exposes the libcudf column factories from `column_factories.hpp` through `pylibcudf` and migrates the cuDF Cython code to use them. cc @vyasr

Authors:
  - https://github.com/brandon-b-miller
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15257
  • Loading branch information
brandon-b-miller authored Jun 4, 2024
1 parent 382de32 commit eb46016
Show file tree
Hide file tree
Showing 17 changed files with 767 additions and 29 deletions.
17 changes: 11 additions & 6 deletions cpp/src/column/column_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ std::size_t size_of(data_type element_type)
std::unique_ptr<column> make_empty_column(data_type type)
{
CUDF_EXPECTS(type.id() == type_id::EMPTY || !cudf::is_nested(type),
"make_empty_column is invalid to call on nested types");
"make_empty_column is invalid to call on nested types",
cudf::data_type_error);
return std::make_unique<column>(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
}

Expand All @@ -80,7 +81,9 @@ std::unique_ptr<column> make_numeric_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
CUDF_EXPECTS(type.id() != type_id::EMPTY && is_numeric(type),
"Invalid, non-numeric type.",
cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -100,7 +103,7 @@ std::unique_ptr<column> make_fixed_point_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.", cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -120,7 +123,7 @@ std::unique_ptr<column> make_timestamp_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.", cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -140,7 +143,7 @@ std::unique_ptr<column> make_duration_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.", cudf::data_type_error);
CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

return std::make_unique<column>(
Expand All @@ -160,7 +163,9 @@ std::unique_ptr<column> make_fixed_width_column(data_type type,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type.");
CUDF_EXPECTS(type.id() != type_id::EMPTY && is_fixed_width(type),
"Invalid, non-fixed-width type.",
cudf::data_type_error);

// clang-format off
if (is_timestamp (type)) return make_timestamp_column (type, size, state, stream, mr);
Expand Down
4 changes: 2 additions & 2 deletions cpp/tests/column/factories_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ TEST_P(NonNumericFactoryTest, NonNumericThrow)
auto column = cudf::make_numeric_column(
cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED);
};
EXPECT_THROW(construct(), cudf::logic_error);
EXPECT_THROW(construct(), cudf::data_type_error);
}

INSTANTIATE_TEST_CASE_P(NonNumeric,
Expand Down Expand Up @@ -307,7 +307,7 @@ TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow)
auto column = cudf::make_fixed_width_column(
cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED);
};
EXPECT_THROW(construct(), cudf::logic_error);
EXPECT_THROW(construct(), cudf::data_type_error);
}

INSTANTIATE_TEST_CASE_P(NonFixedWidth,
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/fixed_point/fixed_point_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper)
TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID)
{
EXPECT_THROW(cudf::make_fixed_point_column(cudf::data_type{cudf::type_id::INT32}, 0),
cudf::logic_error);
cudf::data_type_error);
}

TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
================
column_factories
================

.. automodule:: cudf._lib.pylibcudf.column_factories
    :members:
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ This page provides API documentation for pylibcudf.
aggregation
binaryop
column
column_factories
concatenate
copying
filling
Expand Down
21 changes: 6 additions & 15 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,10 @@ from cudf._lib.types cimport (
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf._lib.types import dtype_from_pylibcudf_column

# TODO: We currently need this for "casting" empty pylibcudf columns in
# from_pylibcudf by instead creating an empty numeric column. We will be able
# to remove this once column factories are exposed to pylibcudf.

cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying
cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary
from cudf._lib.pylibcudf cimport Column as plc_Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents
from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
make_column_from_scalar as cpp_make_column_from_scalar,
Expand Down Expand Up @@ -623,22 +619,17 @@ cdef class Column:
pylibcudf.Column
A new pylibcudf.Column referencing the same data.
"""
cdef libcudf_types.data_type new_dtype
if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS:
col = pylibcudf.unary.cast(
col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS)
)
elif col.type().id() == pylibcudf.TypeId.EMPTY:
new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8)
# TODO: This function call is what requires cimporting pylibcudf.
# We can remove the cimport once we can directly do
# pylibcudf.column_factories.make_numeric_column or equivalent.
col = plc_Column.from_libcudf(
move(
make_numeric_column(
new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL
)
)
new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8)

col = pylibcudf.column_factories.make_numeric_column(
new_dtype,
col.size(),
pylibcudf.column_factories.MaskState.ALL_NULL
)

dtype = dtype_from_pylibcudf_column(col)
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(cython_sources
aggregation.pyx
binaryop.pyx
column.pyx
column_factories.pyx
concatenate.pyx
copying.pyx
filling.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from . cimport (
aggregation,
binaryop,
column_factories,
concatenate,
copying,
filling,
Expand Down Expand Up @@ -40,6 +41,7 @@ __all__ = [
"binaryop",
"concatenate",
"copying",
"column_factories",
"filling",
"gpumemoryview",
"groupby",
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from . import (
aggregation,
binaryop,
column_factories,
concatenate,
copying,
filling,
Expand All @@ -27,7 +28,7 @@
from .gpumemoryview import gpumemoryview
from .scalar import Scalar
from .table import Table
from .types import DataType, TypeId
from .types import DataType, MaskState, TypeId

__all__ = [
"Column",
Expand All @@ -39,6 +40,7 @@
"binaryop",
"concatenate",
"copying",
"column_factories",
"filling",
"gpumemoryview",
"groupby",
Expand Down
52 changes: 52 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/column_factories.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type

from .column cimport Column
from .types cimport DataType, size_type, type_id

# Fused type for the single argument of make_empty_column: a DataType,
# a type_id, or any other Python object.
# NOTE(review): how the generic `object` specialization is handled is not
# visible here -- confirm in column_factories.pyx.
ctypedef fused MakeEmptyColumnOperand:
DataType
type_id
object

# Fused type for the `mask` argument of the sized column factories declared
# in this file: either a mask_state value or a generic Python object.
# NOTE(review): presumably the `object` specialization exists so Python
# callers can pass a Python-level MaskState member -- confirm in
# column_factories.pyx.
ctypedef fused MaskArg:
mask_state
object


# Declaration of make_empty_column: takes one argument of the fused type
# MakeEmptyColumnOperand (DataType, type_id, or object) and returns a Column.
# NOTE(review): presumably wraps libcudf's make_empty_column, which rejects
# nested types with cudf::data_type_error -- confirm in column_factories.pyx.
cpdef Column make_empty_column(
MakeEmptyColumnOperand type_or_id
)

# Declaration of make_numeric_column: takes a DataType, a size_type size and
# a mask argument (MaskArg: mask_state or object) and returns a Column.
# NOTE(review): presumably forwards to libcudf's make_numeric_column, which
# requires a numeric, non-EMPTY type (cudf::data_type_error otherwise) and a
# non-negative size -- confirm in column_factories.pyx.
cpdef Column make_numeric_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_fixed_point_column: takes a DataType, a size_type size
# and a mask argument (MaskArg: mask_state or object) and returns a Column.
# NOTE(review): presumably forwards to libcudf's make_fixed_point_column,
# which raises cudf::data_type_error for non-fixed-point types and requires
# a non-negative size -- confirm in column_factories.pyx.
cpdef Column make_fixed_point_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_timestamp_column: takes a DataType, a size_type size
# and a mask argument (MaskArg: mask_state or object) and returns a Column.
# NOTE(review): presumably forwards to libcudf's make_timestamp_column,
# which raises cudf::data_type_error for non-timestamp types and requires a
# non-negative size -- confirm in column_factories.pyx.
cpdef Column make_timestamp_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_duration_column: takes a DataType, a size_type size
# and a mask argument (MaskArg: mask_state or object) and returns a Column.
# NOTE(review): presumably forwards to libcudf's make_duration_column, which
# raises cudf::data_type_error for non-duration types and requires a
# non-negative size -- confirm in column_factories.pyx.
cpdef Column make_duration_column(
DataType type_,
size_type size,
MaskArg mask,
)

# Declaration of make_fixed_width_column: takes a DataType, a size_type size
# and a mask argument (MaskArg: mask_state or object) and returns a Column.
# NOTE(review): presumably forwards to libcudf's make_fixed_width_column,
# which raises cudf::data_type_error for EMPTY or non-fixed-width types and
# dispatches to the type-specific factories -- confirm in
# column_factories.pyx.
cpdef Column make_fixed_width_column(
DataType type_,
size_type size,
MaskArg mask,
)
Loading

0 comments on commit eb46016

Please sign in to comment.