Skip to content

Commit

Permalink
Merge pull request #2962 from rgsl888prabhu/2143_is_null_and_is_not_null
Browse files Browse the repository at this point in the history
[REVIEW] Adding support for is_null and is_not_null
  • Loading branch information
shwina authored Oct 21, 2019
2 parents b390881 + 6d85384 commit df9917c
Show file tree
Hide file tree
Showing 12 changed files with 350 additions and 67 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
- PR #2987 Add `inplace` arg to `DataFrame.reset_index` and `Series`
- PR #3129 Add strings column factory from `std::vector`s
- PR #3054 Add parquet reader support for decimal data types
- PR #2962 Add isnull(), notnull() and related functions
- PR #3025 Move search files to legacy
- PR #3094 Adding `any` and `all` support from libcudf
- PR #3130 Define and implement new `column_wrapper`


## Improvements

- PR #2904 Move gpu decompressors to cudf::io namespace
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ add_library(cudf
src/merge/merge.cu
src/unary/math_ops.cu
src/unary/cast_ops.cu
src/unary/null_ops.cu
src/io/legacy/cuio_common.cpp
src/io/legacy/io_functions.cpp
src/io/convert/csr/legacy/cudf_to_csr.cu
Expand Down
22 changes: 22 additions & 0 deletions cpp/include/cudf/unary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,26 @@ gdf_column cast(gdf_column const& input, gdf_dtype out_type,
gdf_dtype_extra_info out_info = gdf_dtype_extra_info{});


/**
* @brief Builds a boolean column that flags the null rows of `input`.
*
* The result has the same number of rows as `input`. A row is `true` when the
* corresponding row of `input` is null and `false` otherwise. When `input` is
* not nullable (as reported by `cudf::is_nullable`), every row of the result
* is `false`.
*
* @param input The column whose rows are checked for null values
*
* @returns gdf_column A column of type GDF_BOOL8 with `true` representing `null` values.
* NOTE(review): the result is allocated by this call; presumably the caller
* must release it (e.g. with `gdf_column_free`) — confirm.
*/
gdf_column is_null(gdf_column const& input);

/**
* @brief Builds a boolean column that flags the non-null rows of `input`.
*
* The result has the same number of rows as `input`. A row is `true` when the
* corresponding row of `input` is valid (not null) and `false` otherwise. When
* `input` is not nullable (as reported by `cudf::is_nullable`), every row of
* the result is `true`.
*
* @param input The column whose rows are checked for null values
*
* @returns gdf_column A column of type GDF_BOOL8 with `false` representing `null` values.
* NOTE(review): the result is allocated by this call; presumably the caller
* must release it (e.g. with `gdf_column_free`) — confirm.
*/
gdf_column is_not_null(gdf_column const& input);

} // namespace cudf
64 changes: 64 additions & 0 deletions cpp/src/unary/null_ops.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/cudf.h>
#include <cudf/types.hpp>
#include <cudf/utilities/legacy/type_dispatcher.hpp>
#include <utilities/cuda_utils.hpp>
#include <utilities/column_utils.hpp>
#include <bitmask/legacy/bit_mask.cuh>
#include <cudf/filling.hpp>

using bit_mask::bit_mask_t;

namespace cudf {

namespace detail {

/**
 * @brief Shared implementation of `is_null` and `is_not_null`.
 *
 * Allocates a GDF_BOOL8 column with `input.size` rows and writes, for every
 * row, whether its validity equals `nulls_are_false`.
 *
 * @param input           Column whose validity is inspected
 * @param nulls_are_false `true` writes `true` for valid rows and `false` for
 *                        null rows (is_not_null); `false` writes the inverse
 *                        (is_null)
 * @param stream          Stream used for the allocation and the transform
 *
 * @returns gdf_column The newly allocated GDF_BOOL8 column
 */
gdf_column null_op(gdf_column const& input, bool nulls_are_false = true, cudaStream_t stream = 0) {
  auto output = cudf::allocate_column(GDF_BOOL8, input.size, false,
                                      gdf_dtype_extra_info{}, stream);

  // A non-nullable input has no null rows, so the answer is one constant for
  // all rows.
  if (not cudf::is_nullable(input)) {
    gdf_scalar value {nulls_are_false, GDF_BOOL8, true};
    cudf::fill(&output, value, 0, output.size);
    return output;
  }

  const bit_mask_t* __restrict__ typed_input_valid = reinterpret_cast<bit_mask_t*>(input.valid);

  // The execution policy is built inside the call expression on purpose: the
  // object returned by rmm::exec_policy() owns what `->on(stream)` refers to,
  // so it has to stay alive for the whole algorithm call instead of being
  // destroyed right after a local copy of the policy was taken.
  thrust::transform(rmm::exec_policy(stream)->on(stream),
                    thrust::make_counting_iterator(static_cast<gdf_size_type>(0)),
                    thrust::make_counting_iterator(static_cast<gdf_size_type>(input.size)),
                    static_cast<bool*>(output.data),
                    [=]__device__(auto index){
                      return (nulls_are_false ==
                              bit_mask::is_valid(typed_input_valid, index));
                    });

  return output;
}
}// detail

// Public entry point: `true` for null rows, `false` for valid rows.
// Runs on the default stream.
gdf_column is_null(gdf_column const& input) {
  constexpr bool nulls_are_false = false;
  cudaStream_t const default_stream = 0;
  return detail::null_op(input, nulls_are_false, default_stream);
}

// Public entry point: `true` for valid rows, `false` for null rows.
// Runs on the default stream.
gdf_column is_not_null(gdf_column const& input) {
  constexpr bool nulls_are_false = true;
  cudaStream_t const default_stream = 0;
  return detail::null_op(input, nulls_are_false, default_stream);
}

}// cudf
167 changes: 166 additions & 1 deletion cpp/tests/unary/unary_ops_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2410,4 +2410,169 @@ TYPED_TEST(gdf_logical_test, LogicalNot) {
auto outputCol = cudf::test::column_wrapper<cudf::bool8>(output);

EXPECT_EQ(expectCol, outputCol);
}
}

// Shorthand for the test-utility column wrapper used by the null-check tests.
template <typename T>
using column_wrapper = cudf::test::column_wrapper<T>;

// Typed fixture for the cudf::is_null tests.
template <typename T>
struct IsNull : public GdfTest{};

// Element types the typed null-check tests are instantiated for.
using test_types =
::testing::Types<int64_t>;

TYPED_TEST_CASE(IsNull, test_types);

// is_null on a column where only every 7th row is marked valid: the result is
// expected to be `false` on exactly those rows and `true` everywhere else.
TYPED_TEST(IsNull, sample)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 5000;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return row % 7 == 0; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{row % 7 != 0}; },
                                        false);

  gdf_column got = cudf::is_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result buffer.
  gdf_column_free(&got);
}

// is_null on a column whose rows are all marked valid: every result row is
// expected to be `false`.
TYPED_TEST(IsNull, all_valid)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 50;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return true; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{false}; },
                                        false);

  gdf_column got = cudf::is_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result buffer.
  gdf_column_free(&got);
}

// is_null on a column whose rows are all marked invalid: every result row is
// expected to be `true`.
TYPED_TEST(IsNull, all_invalid)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 50;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return false; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{true}; },
                                        false);

  gdf_column got = cudf::is_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result buffer.
  gdf_column_free(&got);
}

// is_null on a zero-row column: the result is expected to equal an empty
// bool8 column.
TYPED_TEST(IsNull, empty_column)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 0;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return true; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{false}; },
                                        false);

  gdf_column got = cudf::is_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result.
  gdf_column_free(&got);
}


// Typed fixture for the cudf::is_not_null tests.
template <typename T>
struct IsNotNull : public GdfTest{};

// Element types the typed null-check tests are instantiated for.
using test_types =
::testing::Types<int64_t>;

TYPED_TEST_CASE(IsNotNull, test_types);

// is_not_null on a column where only every 7th row is marked valid: the result
// is expected to be `true` on exactly those rows and `false` everywhere else.
TYPED_TEST(IsNotNull, sample)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 5000;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return row % 7 == 0; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{row % 7 == 0}; },
                                        false);

  gdf_column got = cudf::is_not_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result buffer.
  gdf_column_free(&got);
}

// is_not_null on a column whose rows are all marked valid: every result row is
// expected to be `true`.
TYPED_TEST(IsNotNull, all_valid)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 50;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return true; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{true}; },
                                        false);

  gdf_column got = cudf::is_not_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result buffer.
  gdf_column_free(&got);
}

// is_not_null on a column whose rows are all marked invalid: every result row
// is expected to be `false`.
TYPED_TEST(IsNotNull, all_invalid)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 50;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return false; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{false}; },
                                        false);

  gdf_column got = cudf::is_not_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result buffer.
  gdf_column_free(&got);
}

// is_not_null on a zero-row column: the result is expected to equal an empty
// bool8 column.
TYPED_TEST(IsNotNull, empty_column)
{
  using T = TypeParam;

  cudf::size_type NUM_ELEM = 0;

  column_wrapper <T> col(NUM_ELEM,
                         [](auto row) { return row; },
                         [](auto row) { return true; });
  column_wrapper <cudf::bool8> expected(NUM_ELEM,
                                        [](auto row) { return cudf::bool8{false}; },
                                        false);

  gdf_column got = cudf::is_not_null(*col.get());

  EXPECT_EQ(gdf_equal_columns(got, *expected.get()), true);

  // The returned gdf_column is not released automatically; free it so the
  // test does not leak the result.
  gdf_column_free(&got);
}

8 changes: 8 additions & 0 deletions python/cudf/cudf/_lib/includes/unaryops.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,11 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
gdf_dtype out_type,
gdf_dtype_extra_info out_info
) except +

# Binding for cudf::is_null declared in cudf/unary.hpp. `except +` makes
# Cython translate a C++ exception thrown by the call into a Python exception.
cdef gdf_column is_null(
const gdf_column& input
) except +

# Binding for cudf::is_not_null declared in cudf/unary.hpp. `except +` makes
# Cython translate a C++ exception thrown by the call into a Python exception.
cdef gdf_column is_not_null(
const gdf_column& input
) except +
28 changes: 28 additions & 0 deletions python/cudf/cudf/_lib/unaryops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,32 @@ def nans_to_nulls(py_col):
finalizer=rmm._make_finalizer(mask_ptr, 0)
)

free_column(c_col)

return mask


def is_null(col):
    """Run libcudf's ``is_null`` on ``col``.

    ``col`` is converted with ``as_column`` first when it is not already a
    ``Column``. The libcudf result is handed to ``gdf_column_to_column`` and
    that function's return value is returned.
    """
    from cudf.core.column import as_column, Column

    if (not isinstance(col, Column)):
        col = as_column(col)
    cdef gdf_column* c_col = column_view_from_column(col)
    cdef gdf_column result

    # The libcudf call is declared `except +` and may raise; release the
    # temporary column view on that path as well.
    try:
        result = cpp_unaryops.is_null(c_col[0])
    finally:
        free_column(c_col)

    return gdf_column_to_column(&result)


def is_not_null(col):
    """Run libcudf's ``is_not_null`` on ``col``.

    ``col`` is converted with ``as_column`` first when it is not already a
    ``Column``. The libcudf result is handed to ``gdf_column_to_column`` and
    that function's return value is returned.
    """
    from cudf.core.column import as_column, Column

    if (not isinstance(col, Column)):
        col = as_column(col)
    cdef gdf_column* c_col = column_view_from_column(col)
    cdef gdf_column result

    # The libcudf call is declared `except +` and may raise; release the
    # temporary column view on that path as well.
    try:
        result = cpp_unaryops.is_not_null(c_col[0])
    finally:
        free_column(c_col)

    return gdf_column_to_column(&result)
20 changes: 20 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,26 @@ def fillna(self, value):
)
return self.replace(data=Buffer(out), mask=None, null_count=0)

def isnull(self):
    """Flag the missing entries of this Column.

    Delegates to ``libcudf.unaryops.is_null`` and returns its result.
    """
    null_flags = libcudf.unaryops.is_null(self)
    return null_flags

def isna(self):
    """Flag the missing entries of this Column.

    Alias for ``isnull``; simply forwards to it.
    """
    null_flags = self.isnull()
    return null_flags

def notna(self):
    """Flag the non-missing entries of this Column.

    Delegates to ``libcudf.unaryops.is_not_null`` and returns its result.
    """
    valid_flags = libcudf.unaryops.is_not_null(self)
    return valid_flags

def notnull(self):
    """Flag the non-missing entries of this Column.

    Alias for ``notna``; simply forwards to it.
    """
    valid_flags = self.notna()
    return valid_flags

def to_dense_buffer(self, fillna=None):
"""Get dense (no null values) ``Buffer`` of the data.
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,26 @@ def to_series(self):

return Series(self._values)

def isnull(self):
    """Flag the missing entries of this Index.

    The flags are computed on the underlying column and wrapped with
    ``as_index``, keeping this Index's name.
    """
    null_flags = self.as_column().isnull()
    return as_index(null_flags, name=self.name)

def isna(self):
    """Flag the missing entries of this Index.

    Alias for ``isnull``; simply forwards to it.
    """
    null_flags = self.isnull()
    return null_flags

def notna(self):
    """Flag the non-missing entries of this Index.

    The flags are computed on the underlying column and wrapped with
    ``as_index``, keeping this Index's name.
    """
    valid_flags = self.as_column().notna()
    return as_index(valid_flags, name=self.name)

def notnull(self):
    """Flag the non-missing entries of this Index.

    Alias for ``notna``; simply forwards to it.
    """
    valid_flags = self.notna()
    return valid_flags

@property
@property
def is_unique(self):
Expand Down
Loading

0 comments on commit df9917c

Please sign in to comment.