Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement groupby in pylibcudf #14945

Merged
merged 35 commits into branch-24.04 from feat/pylibcudf_groupby
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
acbf97f
Move to enum classes
vyasr Jan 24, 2024
2757384
Some more minor improvements to cpp exports
vyasr Jan 24, 2024
0c60be2
Add scaffolding for aggregation classes
vyasr Jan 25, 2024
46d412b
Implement new strategy via multiple inheritance
vyasr Jan 26, 2024
bdc92e8
Implement all aggregations
vyasr Jan 26, 2024
cdadd4a
Consolidate logic into some simple helpers
vyasr Jan 26, 2024
daf971c
Add some notes
vyasr Jan 26, 2024
48c3a85
Move all aggregations into the same class.
vyasr Jan 28, 2024
e57a755
Implement helper for generating groupby aggregations
vyasr Jan 28, 2024
e1d4f4c
Reduce manual ownership handling and use f-string
vyasr Jan 28, 2024
fe5014c
Implement groupby
vyasr Jan 28, 2024
366c092
Add to init
vyasr Jan 28, 2024
0b4a334
Fix some bugs
vyasr Jan 28, 2024
f60a316
Make all factories cpdef free functions
vyasr Jan 29, 2024
acbb5b6
Simplify and unify logic for factories
vyasr Jan 29, 2024
1bef3f1
Rename method
vyasr Jan 29, 2024
8985733
Lots of cleanup
vyasr Jan 29, 2024
b25442e
Add declarations to pxd
vyasr Jan 29, 2024
3b2e75c
Return list of tables
vyasr Jan 29, 2024
31b8d4c
Expose enums
vyasr Jan 29, 2024
bc2cefb
Implement udf aggregation
vyasr Jan 29, 2024
391f074
Add docs and fix a few function defaults
vyasr Jan 29, 2024
eb49091
Add to Sphinx and fix some issues
vyasr Jan 30, 2024
d9de564
Enable scans
vyasr Jan 30, 2024
f79f340
Some minor cleanup
vyasr Jan 30, 2024
5c8c44f
Add some missing imports to the package
vyasr Jan 30, 2024
28c455e
Add some more missing imports and fix constructor of GroupByRequest
vyasr Jan 31, 2024
bd2ab53
Move _as_vector to utils
vyasr Jan 31, 2024
6b14aba
Implement shift in pylibcudf
vyasr Jan 31, 2024
527a516
Implement replace_nulls in pylibcudf
vyasr Jan 31, 2024
c44cc9f
Fix argument name in cpp interface
vyasr Jan 31, 2024
54f0a67
Implement get_groups in pylibcudf
vyasr Jan 31, 2024
f16e13f
Merge branch 'branch-24.04' into feat/pylibcudf_groupby
vyasr Feb 1, 2024
f4e5acf
Address reviews
vyasr Feb 1, 2024
35bec86
Add some missing docs
vyasr Feb 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
===========
aggregation
===========

.. automodule:: cudf._lib.pylibcudf.aggregation
    :members:
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
groupby
=======

.. automodule:: cudf._lib.pylibcudf.groupby
    :members:
2 changes: 2 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ This page provides API documentation for pylibcudf.
:maxdepth: 1
:caption: API Documentation

aggregation
binaryop
column
copying
gpumemoryview
groupby
scalar
table
types
24 changes: 17 additions & 7 deletions python/cudf/cudf/_lib/aggregation.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from enum import Enum, IntEnum

Expand Down Expand Up @@ -51,7 +51,7 @@ class AggregationKind(Enum):
NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE
NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT
RANK = libcudf_aggregation.aggregation.Kind.RANK
COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT
COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT_LIST
UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET
PTX = libcudf_aggregation.aggregation.Kind.PTX
CUDA = libcudf_aggregation.aggregation.Kind.CUDA
Expand Down Expand Up @@ -191,7 +191,7 @@ cdef class RollingAggregation:
cdef RollingAggregation agg = cls()
agg.c_obj = move(
libcudf_aggregation.make_collect_list_aggregation[
rolling_aggregation]())
rolling_aggregation](libcudf_types.null_policy.INCLUDE))
return agg

@classmethod
Expand Down Expand Up @@ -335,15 +335,19 @@ cdef class GroupbyAggregation:
cdef GroupbyAggregation agg = cls()
agg.c_obj = move(
libcudf_aggregation.
make_collect_list_aggregation[groupby_aggregation]())
make_collect_list_aggregation[groupby_aggregation](
libcudf_types.null_policy.INCLUDE
))
return agg

@classmethod
def nunique(cls):
cdef GroupbyAggregation agg = cls()
agg.c_obj = move(
libcudf_aggregation.
make_nunique_aggregation[groupby_aggregation]())
make_nunique_aggregation[groupby_aggregation](
libcudf_types.null_policy.EXCLUDE
))
return agg

@classmethod
Expand Down Expand Up @@ -422,7 +426,11 @@ cdef class GroupbyAggregation:
cdef GroupbyAggregation agg = cls()
agg.c_obj = move(
libcudf_aggregation.
make_collect_set_aggregation[groupby_aggregation]())
make_collect_set_aggregation[groupby_aggregation](
libcudf_types.null_policy.INCLUDE,
libcudf_types.null_equality.EQUAL,
libcudf_types.nan_equality.ALL_EQUAL,
))
return agg

@classmethod
Expand Down Expand Up @@ -724,7 +732,9 @@ cdef class ReduceAggregation:
def nunique(cls):
cdef ReduceAggregation agg = cls()
agg.c_obj = move(
libcudf_aggregation.make_nunique_aggregation[reduce_aggregation]())
libcudf_aggregation.make_nunique_aggregation[reduce_aggregation](
libcudf_types.null_policy.EXCLUDE
))
return agg

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources binaryop.pyx copying.pyx types.pyx)
set(cython_sources aggregation.pyx binaryop.pyx copying.pyx types.pyx)

set(linked_libraries cudf::cudf)

Expand Down
131 changes: 67 additions & 64 deletions python/cudf/cudf/_lib/cpp/aggregation.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from libc.stdint cimport int32_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -8,6 +8,8 @@ from libcpp.vector cimport vector
from cudf._lib.cpp.types cimport (
data_type,
interpolation,
nan_equality,
null_equality,
null_order,
null_policy,
order,
Expand All @@ -19,71 +21,74 @@ ctypedef int32_t underlying_type_t_rank_method

cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:

cdef cppclass aggregation:
ctypedef enum Kind:
SUM 'cudf::aggregation::SUM'
PRODUCT 'cudf::aggregation::PRODUCT'
MIN 'cudf::aggregation::MIN'
MAX 'cudf::aggregation::MAX'
COUNT_VALID 'cudf::aggregation::COUNT_VALID'
COUNT_ALL 'cudf::aggregation::COUNT_ALL'
ANY 'cudf::aggregation::ANY'
ALL 'cudf::aggregation::ALL'
SUM_OF_SQUARES 'cudf::aggregation::SUM_OF_SQUARES'
MEAN 'cudf::aggregation::MEAN'
VARIANCE 'cudf::aggregation::VARIANCE'
STD 'cudf::aggregation::STD'
MEDIAN 'cudf::aggregation::MEDIAN'
QUANTILE 'cudf::aggregation::QUANTILE'
ARGMAX 'cudf::aggregation::ARGMAX'
ARGMIN 'cudf::aggregation::ARGMIN'
NUNIQUE 'cudf::aggregation::NUNIQUE'
NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT'
RANK 'cudf::aggregation::RANK'
COLLECT 'cudf::aggregation::COLLECT_LIST'
COLLECT_SET 'cudf::aggregation::COLLECT_SET'
PTX 'cudf::aggregation::PTX'
CUDA 'cudf::aggregation::CUDA'
CORRELATION 'cudf::aggregation::CORRELATION'
COVARIANCE 'cudf::aggregation::COVARIANCE'
# Cython doesn't appear to support an enum class nested inside a class, so
# we have to namespace it manually.
cpdef enum class Kind "cudf::aggregation::Kind":
SUM
PRODUCT
MIN
MAX
COUNT_VALID
COUNT_ALL
ANY
ALL
SUM_OF_SQUARES
MEAN
VARIANCE
STD
MEDIAN
QUANTILE
ARGMAX
ARGMIN
NUNIQUE
NTH_ELEMENT
RANK
COLLECT_LIST
COLLECT_SET
PTX
CUDA
CORRELATION
COVARIANCE

cdef cppclass aggregation:
Kind kind
unique_ptr[aggregation] clone()

cdef cppclass rolling_aggregation:
aggregation.Kind kind
cdef cppclass rolling_aggregation(aggregation):
pass

cdef cppclass groupby_aggregation:
aggregation.Kind kind
cdef cppclass groupby_aggregation(aggregation):
pass

cdef cppclass groupby_scan_aggregation:
aggregation.Kind kind
cdef cppclass groupby_scan_aggregation(aggregation):
pass

cdef cppclass reduce_aggregation:
aggregation.Kind kind
cdef cppclass reduce_aggregation(aggregation):
pass

cdef cppclass scan_aggregation:
aggregation.Kind kind
cdef cppclass scan_aggregation(aggregation):
pass

ctypedef enum udf_type:
CUDA 'cudf::udf_type::CUDA'
PTX 'cudf::udf_type::PTX'
cpdef enum class udf_type(bool):
CUDA
PTX

ctypedef enum correlation_type:
PEARSON 'cudf::correlation_type::PEARSON'
KENDALL 'cudf::correlation_type::KENDALL'
SPEARMAN 'cudf::correlation_type::SPEARMAN'
cpdef enum class correlation_type(int32_t):
PEARSON
KENDALL
SPEARMAN

ctypedef enum rank_method:
FIRST "cudf::rank_method::FIRST"
AVERAGE "cudf::rank_method::AVERAGE"
MIN "cudf::rank_method::MIN"
MAX "cudf::rank_method::MAX"
DENSE "cudf::rank_method::DENSE"
cpdef enum class rank_method(int32_t):
FIRST
AVERAGE
MIN
MAX
DENSE

ctypedef enum rank_percentage:
NONE "cudf::rank_percentage::NONE"
ZERO_NORMALIZED "cudf::rank_percentage::ZERO_NORMALIZED"
ONE_NORMALIZED "cudf::rank_percentage::ONE_NORMALIZED"
cpdef enum class rank_percentage(int32_t):
NONE
ZERO_NORMALIZED
ONE_NORMALIZED

cdef unique_ptr[T] make_sum_aggregation[T]() except +

Expand All @@ -93,8 +98,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:

cdef unique_ptr[T] make_max_aggregation[T]() except +

cdef unique_ptr[T] make_count_aggregation[T]() except +

cdef unique_ptr[T] make_count_aggregation[T](null_policy) except +

cdef unique_ptr[T] make_any_aggregation[T]() except +
Expand All @@ -119,20 +122,20 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:

cdef unique_ptr[T] make_argmin_aggregation[T]() except +

cdef unique_ptr[T] make_nunique_aggregation[T]() except +

cdef unique_ptr[T] make_nth_element_aggregation[T](
size_type n
) except +
cdef unique_ptr[T] make_nunique_aggregation[T](null_policy null_handling) except +

cdef unique_ptr[T] make_nth_element_aggregation[T](
size_type n,
null_policy null_handling
) except +

cdef unique_ptr[T] make_collect_list_aggregation[T]() except +
cdef unique_ptr[T] make_collect_list_aggregation[T](
null_policy null_handling
) except +

cdef unique_ptr[T] make_collect_set_aggregation[T]() except +
cdef unique_ptr[T] make_collect_set_aggregation[T](
null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal
) except +

cdef unique_ptr[T] make_udf_aggregation[T](
udf_type type,
Expand Down
Empty file.
18 changes: 12 additions & 6 deletions python/cudf/cudf/_lib/cpp/groupby.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.functional cimport reference_wrapper
Expand All @@ -16,7 +16,13 @@ from cudf._lib.cpp.replace cimport replace_policy
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport null_order, null_policy, order, size_type
from cudf._lib.cpp.types cimport (
null_order,
null_policy,
order,
size_type,
sorted,
)
from cudf._lib.cpp.utilities.host_span cimport host_span

# workaround for https://github.com/cython/cython/issues/3885
Expand Down Expand Up @@ -55,20 +61,20 @@ cdef extern from "cudf/groupby.hpp" \
groupby(
const table_view& keys,
null_policy include_null_keys,
bool keys_are_sorted,
sorted keys_are_sorted,
) except +

groupby(
const table_view& keys,
null_policy include_null_keys,
bool keys_are_sorted,
sorted keys_are_sorted,
const vector[order]& column_order,
) except +

groupby(
const table_view& keys,
null_policy include_null_keys,
bool keys_are_sorted,
sorted keys_are_sorted,
const vector[order]& column_order,
const vector[null_order]& null_precedence
) except +
Expand Down Expand Up @@ -100,6 +106,6 @@ cdef extern from "cudf/groupby.hpp" \
groups get_groups(table_view values) except +

pair[unique_ptr[table], unique_ptr[table]] replace_nulls(
const table_view& value,
const table_view& values,
const vector[replace_policy] replace_policy
) except +
9 changes: 5 additions & 4 deletions python/cudf/cudf/_lib/cpp/replace.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr

from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
Expand All @@ -11,9 +12,9 @@ from cudf._lib.cpp.scalar.scalar cimport scalar

cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:

ctypedef enum replace_policy:
PRECEDING 'cudf::replace_policy::PRECEDING',
FOLLOWING 'cudf::replace_policy::FOLLOWING'
cdef enum class replace_policy(bool):
PRECEDING
FOLLOWING

cdef unique_ptr[column] replace_nulls(
column_view source_column,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# the License.
# =============================================================================

set(cython_sources binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx
table.pyx types.pyx utils.pyx
set(cython_sources aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx
groupby.pyx interop.pyx scalar.pyx table.pyx types.pyx utils.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
7 changes: 5 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

# TODO: Verify consistent usage of relative/absolute imports in pylibcudf.
from . cimport binaryop, copying, interop
from . cimport aggregation, binaryop, copying, groupby, interop
from .column cimport Column
from .gpumemoryview cimport gpumemoryview
from .scalar cimport Scalar
from .table cimport Table
# TODO: cimport type_id once
# https://github.com/cython/cython/issues/5609 is resolved
from .types cimport DataType
from .types cimport DataType, type_id

__all__ = [
"Column",
"DataType",
"Scalar",
"Table",
"aggregation",
"binaryop",
"copying",
"gpumemoryview",
"groupby",
"interop",
"types",
]
Loading
Loading