Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix bug in dask-cudf caused by inconsistency with Pandas empty groupby result #1639

Closed
wants to merge 11 commits into from
Closed
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@
- PR #1607 Revert change of `column.to_dense_buffer` always return by copy for performance concerns
- PR #1618 ORC reader: fix assert & data output when nrows/skiprows isn't aligned to stripe boundaries
- PR #1631 Fix failure of TYPES_TEST on some gcc-7 based systems.
- PR #1639 Fix bug caused by empty groupbys throwing an exception
- PR #1641 CSV Reader: Fix skip_blank_lines behavior with Windows line terminators (\r\n)


Expand Down
8 changes: 7 additions & 1 deletion python/cudf/dataframe/buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,13 @@ def __init__(self, mem, size=None, capacity=None, categorical=False):
if categorical:
size = len(mem)
else:
size = mem.size
if isinstance(mem, slice):
if mem.stop is not 0:
size = (mem.start-mem.stop)/mem.step
else:
size = 0
else:
size = mem.size
if capacity is None:
capacity = size
self.mem = cudautils.to_device(mem)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,7 @@ def index(self):
@index.setter
def index(self, _index):
if isinstance(_index, cudf.dataframe.multiindex.MultiIndex):
if len(_index) != len(self[self.columns[0]]):
if len(_index) != len(self):
msg = f"Length mismatch: Expected axis has "\
"%d elements, new values "\
"have %d elements"\
Expand Down
9 changes: 5 additions & 4 deletions python/cudf/dataframe/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(self, levels, codes=None, labels=None, names=None):
else:
column_names = names
if len(codes) == 0:
raise ValueError('MultiIndex codes can not be empty.')
codes = [[]]
import cudf
if not isinstance(codes, cudf.dataframe.dataframe.DataFrame) and\
not isinstance(codes[0], (Sequence,
Expand All @@ -52,9 +52,10 @@ def __init__(self, levels, codes=None, labels=None, names=None):
if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
self.codes = cudf.dataframe.dataframe.DataFrame()
for idx, code in enumerate(codes):
code = np.array(code)
self.codes.add_column(column_names[idx],
columnops.as_column(code))
if len(code) != 0:
code = np.array(code)
self.codes.add_column(column_names[idx],
columnops.as_column(code))
else:
self.codes = codes
self.levels = levels
Expand Down
25 changes: 24 additions & 1 deletion python/cudf/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,30 @@ def _apply_basic_agg(self, agg_type, sort_results=False):

def apply_multiindex_or_single_index(self, result):
if len(result) == 0:
raise ValueError('Groupby result is empty!')
final_result = DataFrame()
for col in result.columns:
if col not in self._by:
final_result[col] = result[col]
if len(self._by) == 1 or len(final_result.columns) == 0:
dtype = 'float64' if len(self._by) == 1 else 'object'
name = self._by[0] if len(self._by) == 1 else None
from cudf.dataframe.index import GenericIndex
index = GenericIndex(Series([], dtype=dtype))
index.name = name
final_result.index = index
else:
levels = []
codes = []
names = []
for by in self._by:
levels.append([])
codes.append([])
names.append(by)
from cudf import MultiIndex
mi = MultiIndex(levels, codes)
mi.names = names
final_result.index = mi
return final_result
if len(self._by) == 1:
from cudf.dataframe import index
idx = index.as_index(result[self._by[0]])
Expand Down
30 changes: 27 additions & 3 deletions python/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,9 +383,7 @@ def test_advanced_groupby_levels():
gdh = gdg.groupby(level=1).sum()
assert_eq(pdh, gdh)
pdg = pdf.groupby(['x', 'y', 'z']).sum()
with pytest.raises(ValueError) as raises:
gdg = gdf.groupby(['x', 'y', 'z']).sum()
raises.match("Groupby result is empty!")
gdg = gdf.groupby(['x', 'y', 'z']).sum()
pdg = pdf.groupby(['z']).sum()
gdg = gdf.groupby(['z']).sum()
assert_eq(pdg, gdg)
Expand Down Expand Up @@ -432,3 +430,29 @@ def test_list_of_series():
pdg = pdf.groupby([pdf.x, pdf.y]).y.sum()
gdg = gdf.groupby([gdf.x, gdf.y]).y.sum()
assert_eq(pdg, gdg)


def test_empty_groupby():
pdf = pd.DataFrame({'x': [], 'y': [], 'z': []})
gdf = cudf.from_pandas(pdf)
pdg = pdf.groupby(['x', 'y', 'z']).sum()
gdg = gdf.groupby(['x', 'y', 'z']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x', 'y']).sum()
gdg = gdf.groupby(['x', 'y']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x', 'y']).agg('sum')
gdg = gdf.groupby(['x', 'y']).agg('sum')
assert_eq(pdg, gdg)
pdg = pdf.groupby(['y']).sum()
gdg = gdf.groupby(['y']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['y']).agg('sum')
gdg = gdf.groupby(['y']).agg('sum')
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x']).sum()
gdg = gdf.groupby(['x']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x']).agg('sum')
gdg = gdf.groupby(['x']).agg('sum')
assert_eq(pdg, gdg)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the future, I tend to parametrize these as follows:

@pytest.mark.parametrize('func', [
    lambda df: df.groupby(['x', 'y']).sum(),
    ...
])
def test_empty_groupby(func):
    pdf = pd.DataFrame({'x': [], 'y': []})
    gdf = cudf.from_pandas(pdf)
    assert_eq(func(pdf), func(gdf))

Just a style thing though. Nothing that should slow down this PR.