Skip to content

Commit

Permalink
Merge pull request rapidsai#1 from quasiben/fea/cudf-empty-groubpy-cont
Browse files Browse the repository at this point in the history
Fea/cudf empty groubpy cont
  • Loading branch information
thomcom authored May 8, 2019
2 parents 0578541 + f5884bf commit 60a6bf9
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 36 deletions.
6 changes: 5 additions & 1 deletion python/cudf/dataframe/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,11 @@ def equals(self, other):
if len(self) != len(other):
return False
elif len(self) == 1:
return self[0] == other[0]
val = self[0] == other[0]
# when self is a MultiIndex we need to check all elements
if isinstance(val, np.ndarray):
return val.all()
return bool(val)
else:
result = (self == other)
if isinstance(result, bool):
Expand Down
14 changes: 8 additions & 6 deletions python/cudf/dataframe/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(self, levels, codes=None, labels=None, names=None):
else:
column_names = names
elif names is None:
column_names = list(range(len(codes)))
column_names = list(range(len(levels)))
else:
column_names = names
if len(codes) == 0:
Expand All @@ -52,13 +52,15 @@ def __init__(self, levels, codes=None, labels=None, names=None):
if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
self.codes = cudf.dataframe.dataframe.DataFrame()
for idx, code in enumerate(codes):
if len(code) != 0:
code = np.array(code)
self.codes.add_column(column_names[idx],
columnops.as_column(code))
code = np.array(code)
self.codes.add_column(column_names[idx],
columnops.as_column(code))
else:
self.codes = codes
self.levels = levels

# converting levels to a numpy array will produce a Float64Index
# (on empty levels), mimicking the behavior of Pandas
self.levels = np.array(levels)
self._validate_levels_and_codes(self.levels, self.codes)
self.name = None
self.names = names
Expand Down
29 changes: 22 additions & 7 deletions python/cudf/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from cudf.dataframe.dataframe import DataFrame
from cudf.dataframe.series import Series
from cudf import MultiIndex

from cudf.bindings.groupby import (
agg as cpp_agg,
_apply_basic_agg as _cpp_apply_basic_agg
Expand Down Expand Up @@ -181,7 +183,6 @@ def apply_multiindex_or_single_index(self, result):
levels.append([])
codes.append([])
names.append(by)
from cudf import MultiIndex
mi = MultiIndex(levels, codes)
mi.names = names
final_result.index = mi
Expand Down Expand Up @@ -212,7 +213,6 @@ def apply_multiindex_or_single_index(self, result):
levels.append(level)
codes[by] = code
names.append(by)
from cudf import MultiIndex
multi_index = MultiIndex(levels=levels,
codes=codes,
names=names)
Expand All @@ -227,10 +227,26 @@ def apply_multicolumn(self, result, aggs):
codes = []
levels.append(self._val_columns)
levels.append(aggs)
codes.append(list(np.zeros(len(aggs), dtype='int64')))
codes.append(list(range(len(aggs))))
from cudf import MultiIndex
result.columns = MultiIndex(levels, codes)

# if the values columns have length == 1, codes is a nested list of
# zeros equal to the size of aggs (sum, min, mean, etc.)
# if the values columns are length>1, codes will monotonically
# increase by 1 for every n values where n is the number of aggs
# [['x', 'z'], ['sum', 'min']]
# codes == [[0, 1], [0, 1]]
code_size = max(len(aggs), len(self._val_columns))
codes.append(list(np.zeros(code_size, dtype='int64')))
codes.append(list(range(code_size)))

if len(aggs) == 1:
# unprefix columns
new_cols = []
for c in result.columns:
new_col = c.split('_')[1] # sum_z-> (sum, z)
new_cols.append(new_col)
result.columns = new_cols
else:
result.columns = MultiIndex(levels, codes)
return result

def apply_multicolumn_mapped(self, result, aggs):
Expand All @@ -242,7 +258,6 @@ def apply_multicolumn_mapped(self, result, aggs):
for k in aggs.keys():
for v in aggs[k]:
tuples.append((k, v))
from cudf import MultiIndex
multiindex = MultiIndex.from_tuples(tuples)
result.columns = multiindex
return result
Expand Down
34 changes: 12 additions & 22 deletions python/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,27 +432,17 @@ def test_list_of_series():
assert_eq(pdg, gdg)


def test_empty_groupby():
@pytest.mark.parametrize('func', [
lambda df: df.groupby(['x', 'y', 'z']).sum(),
lambda df: df.groupby(['x', 'y']).sum(),
lambda df: df.groupby(['x', 'y']).agg('sum'),
lambda df: df.groupby(['y']).sum(),
lambda df: df.groupby(['y']).agg('sum'),
lambda df: df.groupby(['x']).sum(),
lambda df: df.groupby(['x']).agg('sum'),
lambda df: df.groupby(['x', 'y']).z.sum(),
])
def test_empty_groupby(func):
pdf = pd.DataFrame({'x': [], 'y': [], 'z': []})
gdf = cudf.from_pandas(pdf)
pdg = pdf.groupby(['x', 'y', 'z']).sum()
gdg = gdf.groupby(['x', 'y', 'z']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x', 'y']).sum()
gdg = gdf.groupby(['x', 'y']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x', 'y']).agg('sum')
gdg = gdf.groupby(['x', 'y']).agg('sum')
assert_eq(pdg, gdg)
pdg = pdf.groupby(['y']).sum()
gdg = gdf.groupby(['y']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['y']).agg('sum')
gdg = gdf.groupby(['y']).agg('sum')
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x']).sum()
gdg = gdf.groupby(['x']).sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby(['x']).agg('sum')
gdg = gdf.groupby(['x']).agg('sum')
assert_eq(pdg, gdg)
assert_eq(func(pdf), func(gdf))

0 comments on commit 60a6bf9

Please sign in to comment.