Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Adding support for is_null and is_not_null #2962

Merged
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
7c57700
function and test cases
rgsl888prabhu Oct 3, 2019
4370fc8
change log
rgsl888prabhu Oct 3, 2019
d6e8cbc
review changes
rgsl888prabhu Oct 4, 2019
6ee6722
Merge pull request #20 from rapidsai/branch-0.11
rgsl888prabhu Oct 4, 2019
7641cdc
Merge branch '2143_is_null_and_is_not_null' of https://github.com/rgs…
rgsl888prabhu Oct 4, 2019
2afe671
cython support
rgsl888prabhu Oct 4, 2019
fac1144
review changes and updating isna and notna definition for series
rgsl888prabhu Oct 7, 2019
ee0a03d
Merge pull request #24 from rapidsai/branch-0.11
rgsl888prabhu Oct 7, 2019
2fd920e
Addressing reviews
rgsl888prabhu Oct 8, 2019
01224f3
Merge branch '2143_is_null_and_is_not_null' of https://github.com/rgs…
rgsl888prabhu Oct 8, 2019
612b763
pleasing black
rgsl888prabhu Oct 8, 2019
6563647
Delete global.lock
rgsl888prabhu Oct 8, 2019
7a180ba
Delete purge.lock
rgsl888prabhu Oct 8, 2019
382fd5b
Update CHANGELOG.md
rgsl888prabhu Oct 10, 2019
840547f
Merge branch 'branch-0.11' into 2143_is_null_and_is_not_null
rgsl888prabhu Oct 10, 2019
c71aa4b
Merge branch 'branch-0.11' into 2143_is_null_and_is_not_null
rgsl888prabhu Oct 10, 2019
8423499
removing gpu_isnull gpu_notna notna_mask
rgsl888prabhu Oct 11, 2019
0074137
Merge pull request #32 from rapidsai/branch-0.11
rgsl888prabhu Oct 15, 2019
787d917
merging with branch 0.11
rgsl888prabhu Oct 16, 2019
c6ad279
Merge branch 'branch-0.11' into 2143_is_null_and_is_not_null
Oct 17, 2019
166416f
review changes
rgsl888prabhu Oct 17, 2019
da0f7a9
flake
rgsl888prabhu Oct 17, 2019
d534d60
Merge branch 'branch-0.11' into 2143_is_null_and_is_not_null
rgsl888prabhu Oct 17, 2019
4d05511
Merge branch 'branch-0.11' into 2143_is_null_and_is_not_null
harrism Oct 17, 2019
6d85384
review changes
rgsl888prabhu Oct 18, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
- PR #2987 Add `inplace` arg to `DataFrame.reset_index` and `Series`
- PR #3129 Add strings column factory from `std::vector`s
- PR #3054 Add parquet reader support for decimal data types
- PR #2962 Add isnull(), notnull() and related functions
- PR #3094 Adding `any` and `all` support from libcudf

rgsl888prabhu marked this conversation as resolved.
Show resolved Hide resolved

## Improvements

- PR #2904 Move gpu decompressors to cudf::io namespace
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ add_library(cudf
src/merge/merge.cu
src/unary/math_ops.cu
src/unary/cast_ops.cu
src/unary/null_ops.cu
src/io/cuio_common.cpp
src/io/io_functions.cpp
src/io/convert/csr/cudf_to_csr.cu
Expand Down
22 changes: 22 additions & 0 deletions cpp/include/cudf/unary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,26 @@ gdf_column cast(gdf_column const& input, gdf_dtype out_type,
gdf_dtype_extra_info out_info = gdf_dtype_extra_info{});


/**
 * @brief Builds a boolean column that flags the `null` rows of `input`.
 *
 * The result has the same number of rows as `input`. Row `i` of the result is
 * `true` when row `i` of `input` is `null`, and `false` otherwise. An `input`
 * that is not nullable yields a column filled with `false`.
 *
 * @param input The column whose rows are inspected
 *
 * @returns gdf_column A column of type GDF_BOOL8 with `true` marking the `null` rows of `input`.
 */
gdf_column is_null(gdf_column const& input);

/**
 * @brief Builds a boolean column that flags the non-`null` rows of `input`.
 *
 * The result has the same number of rows as `input`. Row `i` of the result is
 * `false` when row `i` of `input` is `null`, and `true` otherwise. An `input`
 * that is not nullable yields a column filled with `true`.
 *
 * @param input The column whose rows are inspected
 *
 * @returns gdf_column A column of type GDF_BOOL8 with `false` marking the `null` rows of `input`.
 */
gdf_column is_not_null(gdf_column const& input);

} // namespace cudf
64 changes: 64 additions & 0 deletions cpp/src/unary/null_ops.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/cudf.h>
#include <cudf/types.hpp>
#include <cudf/utilities/legacy/type_dispatcher.hpp>
#include <utilities/cuda_utils.hpp>
#include <utilities/column_utils.hpp>
#include <bitmask/legacy/bit_mask.cuh>
#include <cudf/filling.hpp>

using bit_mask::bit_mask_t;

namespace cudf {

namespace detail {

/**
 * @brief Produces a GDF_BOOL8 column describing, row by row, whether `input`
 * is valid.
 *
 * @param input Column whose validity is inspected
 * @param nulls_are_false When `true`, valid rows map to `true` and null rows
 * to `false`; when `false`, the mapping is inverted
 * @param stream CUDA stream forwarded to the output allocation and to the
 * thrust execution policy
 *
 * @returns gdf_column A newly allocated GDF_BOOL8 column with `input.size` rows
 */
gdf_column null_op(gdf_column const& input, bool nulls_are_false = true, cudaStream_t stream = 0) {
  auto result = cudf::allocate_column(GDF_BOOL8, input.size, false,
                                      gdf_dtype_extra_info{}, stream);

  if (cudf::is_nullable(input)) {
    const bit_mask_t* __restrict__ validity = reinterpret_cast<bit_mask_t*>(input.valid);
    auto first_row = thrust::make_counting_iterator(static_cast<gdf_size_type>(0));
    auto last_row = thrust::make_counting_iterator(static_cast<gdf_size_type>(input.size));
    bool* flags = static_cast<bool*>(result.data);

    // A row's flag is `true` exactly when its validity bit agrees with
    // `nulls_are_false`.
    thrust::transform(rmm::exec_policy(stream)->on(stream),
                      first_row,
                      last_row,
                      flags,
                      [=]__device__(auto row){
                        return (bit_mask::is_valid(validity, row) ==
                                nulls_are_false);
                      });
  } else {
    // Input reported as not nullable: every row receives the same flag.
    gdf_scalar uniform_flag {nulls_are_false, GDF_BOOL8, true};
    cudf::fill(&result, uniform_flag, 0, result.size);
  }

  return result;
}
}// detail

/**
 * @brief Returns a GDF_BOOL8 column in which `true` marks the null rows of
 * `input`.
 *
 * Runs `detail::null_op` on stream 0 with the inverted mapping.
 */
gdf_column is_null(gdf_column const& input) {
  // Null rows must come out as `true`, so they are *not* mapped to `false`.
  constexpr bool nulls_are_false = false;
  return detail::null_op(input, nulls_are_false, 0);
}

/**
 * @brief Returns a GDF_BOOL8 column in which `false` marks the null rows of
 * `input`.
 *
 * Runs `detail::null_op` on stream 0 with the direct mapping.
 */
gdf_column is_not_null(gdf_column const& input) {
  // Null rows must come out as `false`.
  constexpr bool nulls_are_false = true;
  return detail::null_op(input, nulls_are_false, 0);
}

}// cudf
167 changes: 166 additions & 1 deletion cpp/tests/unary/unary_ops_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2410,4 +2410,169 @@ TYPED_TEST(gdf_logical_test, LogicalNot) {
auto outputCol = cudf::test::column_wrapper<cudf::bool8>(output);

EXPECT_EQ(expectCol, outputCol);
}
}

// Shorthand for the test-utility column wrapper used by the tests below.
template <typename T>
using column_wrapper = cudf::test::column_wrapper<T>;

// Typed-test fixture for cudf::is_null.
template <typename T>
struct IsNull : public GdfTest{};

// Element types the IsNull tests are instantiated with.
using test_types =
::testing::Types<int64_t>;

TYPED_TEST_CASE(IsNull, test_types);

// Mixed input: the second initializer returns `true` only for rows whose index
// is a multiple of 7 (presumably the per-row validity predicate), so is_null
// is expected to report `true` for every other row.
TYPED_TEST(IsNull, sample)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 5000;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return row % 7 == 0; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{row % 7 != 0}; },
                                       false);

  gdf_column got = cudf::is_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}

// The second initializer returns `true` for every row (presumably: all rows
// valid), so is_null is expected to report `false` everywhere.
TYPED_TEST(IsNull, all_valid)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 50;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return true; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{false}; },
                                       false);

  gdf_column got = cudf::is_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}

// The second initializer returns `false` for every row (presumably: all rows
// null), so is_null is expected to report `true` everywhere.
TYPED_TEST(IsNull, all_invalid)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 50;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return false; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{true}; },
                                       false);

  gdf_column got = cudf::is_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}

// Zero-row input: is_null is expected to return a column equal to a zero-row
// bool8 column.
TYPED_TEST(IsNull, empty_column)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 0;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return true; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{false}; },
                                       false);

  gdf_column got = cudf::is_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}


// Typed-test fixture for cudf::is_not_null.
template <typename T>
struct IsNotNull : public GdfTest{};

// Element types the IsNotNull tests are instantiated with.
using test_types =
::testing::Types<int64_t>;

TYPED_TEST_CASE(IsNotNull, test_types);

// Mixed input: the second initializer returns `true` only for rows whose index
// is a multiple of 7 (presumably the per-row validity predicate), so
// is_not_null is expected to report `true` for exactly those rows.
TYPED_TEST(IsNotNull, sample)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 5000;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return row % 7 == 0; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{row % 7 == 0}; },
                                       false);

  gdf_column got = cudf::is_not_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}

// The second initializer returns `true` for every row (presumably: all rows
// valid), so is_not_null is expected to report `true` everywhere.
TYPED_TEST(IsNotNull, all_valid)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 50;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return true; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{true}; },
                                       false);

  gdf_column got = cudf::is_not_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}

// The second initializer returns `false` for every row (presumably: all rows
// null), so is_not_null is expected to report `false` everywhere.
TYPED_TEST(IsNotNull, all_invalid)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 50;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return false; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{false}; },
                                       false);

  gdf_column got = cudf::is_not_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}

// Zero-row input: is_not_null is expected to return a column equal to a
// zero-row bool8 column.
TYPED_TEST(IsNotNull, empty_column)
{
  using T = TypeParam;

  gdf_index_type const num_rows = 0;

  column_wrapper<T> input(num_rows,
                          [](auto row) { return row; },
                          [](auto row) { return true; });
  column_wrapper<cudf::bool8> expected(num_rows,
                                       [](auto row) { return cudf::bool8{false}; },
                                       false);

  gdf_column got = cudf::is_not_null(*input.get());

  EXPECT_TRUE(gdf_equal_columns(got, *expected.get()));
}

8 changes: 8 additions & 0 deletions python/cudf/cudf/_lib/includes/unaryops.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,11 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
gdf_dtype out_type,
gdf_dtype_extra_info out_info
) except +

# cudf::is_null from "cudf/unary.hpp": takes a column by const reference and
# returns a new gdf_column by value. `except +` lets Cython translate a C++
# exception raised by the call into a Python exception.
cdef gdf_column is_null(
const gdf_column& input
) except +

# cudf::is_not_null from "cudf/unary.hpp": takes a column by const reference
# and returns a new gdf_column by value. `except +` lets Cython translate a
# C++ exception raised by the call into a Python exception.
cdef gdf_column is_not_null(
const gdf_column& input
) except +
24 changes: 24 additions & 0 deletions python/cudf/cudf/_lib/unaryops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,27 @@ def nans_to_nulls(py_col):
)

return mask


def is_null(col):
    """
    Return a column flagging the null entries of ``col``.

    ``col`` is first converted with ``as_column`` when it is not already a
    ``Column``; the result of the libcudf ``is_null`` call is then wrapped
    with ``gdf_column_to_column``.
    """
    from cudf.core.column import as_column, Column

    if isinstance(col, Column):
        source = col
    else:
        source = as_column(col)

    # NOTE(review): no release of `c_source` is visible in this function -
    # confirm whether column_view_from_column allocates and needs freeing.
    cdef gdf_column* c_source = column_view_from_column(source)
    cdef gdf_column c_result = cpp_unaryops.is_null(c_source[0])

    return gdf_column_to_column(&c_result)


def is_not_null(col):
    """
    Return a column flagging the non-null entries of ``col``.

    ``col`` is first converted with ``as_column`` when it is not already a
    ``Column``; the result of the libcudf ``is_not_null`` call is then wrapped
    with ``gdf_column_to_column``.
    """
    from cudf.core.column import as_column, Column

    if isinstance(col, Column):
        source = col
    else:
        source = as_column(col)

    # NOTE(review): no release of `c_source` is visible in this function -
    # confirm whether column_view_from_column allocates and needs freeing.
    cdef gdf_column* c_source = column_view_from_column(source)
    cdef gdf_column c_result = cpp_unaryops.is_not_null(c_source[0])

    return gdf_column_to_column(&c_result)
20 changes: 20 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,26 @@ def fillna(self, value):
)
return self.replace(data=Buffer(out), mask=None, null_count=0)

def isnull(self):
    """
    Identify missing values in a Column.

    Forwards this column to ``libcudf.unaryops.is_null`` and returns the
    result unchanged.
    """
    null_flags = libcudf.unaryops.is_null(self)
    return null_flags

def isna(self):
    """
    Identify missing values in a Column.

    Alias for ``isnull``: forwards the call and returns its result unchanged.
    """
    null_flags = self.isnull()
    return null_flags

def notna(self):
    """
    Identify non-missing values in a Column.

    Forwards this column to ``libcudf.unaryops.is_not_null`` and returns the
    result unchanged.
    """
    valid_flags = libcudf.unaryops.is_not_null(self)
    return valid_flags

def notnull(self):
    """
    Identify non-missing values in a Column.

    Alias for ``notna``: forwards the call and returns its result unchanged.
    """
    valid_flags = self.notna()
    return valid_flags

def to_dense_buffer(self, fillna=None):
"""Get dense (no null values) ``Buffer`` of the data.

Expand Down
24 changes: 24 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,30 @@ def to_series(self):

return Series(self._values)

def isnull(self):
    """
    Identify missing values in an Index.

    Returns a ``Series`` built from ``self.as_column().isnull()`` that
    carries this index's ``name``.
    """
    from cudf.core.series import Series

    null_flags = self.as_column().isnull()
    return Series(null_flags, name=self.name)

def isna(self):
    """
    Identify missing values in an Index.

    Alias for ``isnull``: forwards the call and returns its result unchanged.
    """
    null_flags = self.isnull()
    return null_flags

def notna(self):
    """
    Identify non-missing values in an Index.

    Returns a ``Series`` built from ``self.as_column().notna()`` that
    carries this index's ``name``.
    """
    from cudf.core.series import Series

    valid_flags = self.as_column().notna()
    return Series(valid_flags, name=self.name)

def notnull(self):
    """
    Identify non-missing values in an Index.

    Alias for ``notna``: forwards the call and returns its result unchanged.
    """
    valid_flags = self.notna()
    return valid_flags

@property
@property
def is_unique(self):
Expand Down
Loading