From 08be6a5174fb9ef94a78403b250f3b607939c9d1 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 7 May 2019 19:15:57 -0700 Subject: [PATCH 1/4] rewrite test harness for empty groupby --- python/cudf/tests/test_groupby.py | 34 +++++++++++-------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/python/cudf/tests/test_groupby.py b/python/cudf/tests/test_groupby.py index 8293a737242..2d641b0058e 100644 --- a/python/cudf/tests/test_groupby.py +++ b/python/cudf/tests/test_groupby.py @@ -432,27 +432,17 @@ def test_list_of_series(): assert_eq(pdg, gdg) -def test_empty_groupby(): +@pytest.mark.parametrize('func', [ + lambda df: df.groupby(['x', 'y', 'z']).sum(), + lambda df: df.groupby(['x', 'y']).sum(), + lambda df: df.groupby(['x', 'y']).agg('sum'), + lambda df: df.groupby(['y']).sum(), + lambda df: df.groupby(['y']).agg('sum'), + lambda df: df.groupby(['x']).sum(), + lambda df: df.groupby(['x']).agg('sum'), + lambda df: df.groupby(['x', 'y']).z.sum(), +]) +def test_empty_groupby(func): pdf = pd.DataFrame({'x': [], 'y': [], 'z': []}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(['x', 'y', 'z']).sum() - gdg = gdf.groupby(['x', 'y', 'z']).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(['x', 'y']).sum() - gdg = gdf.groupby(['x', 'y']).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(['x', 'y']).agg('sum') - gdg = gdf.groupby(['x', 'y']).agg('sum') - assert_eq(pdg, gdg) - pdg = pdf.groupby(['y']).sum() - gdg = gdf.groupby(['y']).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(['y']).agg('sum') - gdg = gdf.groupby(['y']).agg('sum') - assert_eq(pdg, gdg) - pdg = pdf.groupby(['x']).sum() - gdg = gdf.groupby(['x']).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(['x']).agg('sum') - gdg = gdf.groupby(['x']).agg('sum') - assert_eq(pdg, gdg) + assert_eq(func(pdf), func(gdf)) From 16a20301256eae05754818933d1413324f15e105 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 7 May 2019 19:17:05 -0700 Subject: [PATCH 2/4] handle index check 
for equality of multi-index -- probably should move to multiindex.py --- python/cudf/dataframe/index.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/dataframe/index.py b/python/cudf/dataframe/index.py index 2d12e122393..67982176a71 100644 --- a/python/cudf/dataframe/index.py +++ b/python/cudf/dataframe/index.py @@ -208,7 +208,11 @@ def equals(self, other): if len(self) != len(other): return False elif len(self) == 1: - return self[0] == other[0] + val = self[0] == other[0] + # when self is a multiindex the comparison is an array: check all + if isinstance(val, np.ndarray): + return val.all() + return bool(val) else: result = (self == other) if isinstance(result, bool): From a4397bec9ba7822ec21897afdd966ef9420e22a8 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 7 May 2019 19:17:53 -0700 Subject: [PATCH 3/4] clean up code/level assignment --- python/cudf/dataframe/multiindex.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cudf/dataframe/multiindex.py b/python/cudf/dataframe/multiindex.py index 9a957063df2..49c054acce1 100644 --- a/python/cudf/dataframe/multiindex.py +++ b/python/cudf/dataframe/multiindex.py @@ -39,7 +39,7 @@ def __init__(self, levels, codes=None, labels=None, names=None): else: column_names = names elif names is None: - column_names = list(range(len(codes))) + column_names = list(range(len(levels))) else: column_names = names if len(codes) == 0: @@ -52,13 +52,15 @@ def __init__(self, levels, codes=None, labels=None, names=None): if not isinstance(codes, cudf.dataframe.dataframe.DataFrame): self.codes = cudf.dataframe.dataframe.DataFrame() for idx, code in enumerate(codes): - if len(code) != 0: - code = np.array(code) - self.codes.add_column(column_names[idx], - columnops.as_column(code)) + code = np.array(code) + self.codes.add_column(column_names[idx], + columnops.as_column(code)) else: self.codes = codes - self.levels = levels + + # converting levels to numpy array will 
produce a Float64Index + # (on empty levels) for levels, mimicking the behavior of Pandas + self.levels = np.array(levels) self._validate_levels_and_codes(self.levels, self.codes) self.name = None self.names = names From f5884bff0ce149d001040d650d6e6ab41d1bdffa Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 7 May 2019 19:19:14 -0700 Subject: [PATCH 4/4] when agg is length 1 unprefix columns instead of building multiindex --- python/cudf/groupby/groupby.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index 9d0a634b335..b9ea9cd2dbc 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -6,6 +6,8 @@ from cudf.dataframe.dataframe import DataFrame from cudf.dataframe.series import Series +from cudf import MultiIndex + from cudf.bindings.groupby import ( agg as cpp_agg, _apply_basic_agg as _cpp_apply_basic_agg @@ -181,7 +183,6 @@ def apply_multiindex_or_single_index(self, result): levels.append([]) codes.append([]) names.append(by) - from cudf import MultiIndex mi = MultiIndex(levels, codes) mi.names = names final_result.index = mi @@ -212,7 +213,6 @@ def apply_multiindex_or_single_index(self, result): levels.append(level) codes[by] = code names.append(by) - from cudf import MultiIndex multi_index = MultiIndex(levels=levels, codes=codes, names=names) @@ -227,10 +227,26 @@ def apply_multicolumn(self, result, aggs): codes = [] levels.append(self._val_columns) levels.append(aggs) - codes.append(list(np.zeros(len(aggs), dtype='int64'))) - codes.append(list(range(len(aggs)))) - from cudf import MultiIndex - result.columns = MultiIndex(levels, codes) + + # if the values columns have length == 1, codes is a nested list of + # zeros equal to the size of aggs (sum, min, mean, etc.) 
+ # if the values columns are length>1, codes will monotonically + # increase by 1 for every n values where n is the number of aggs + # e.g. levels [['x', 'z'], ['sum', 'min']] + # codes == [[0, 1], [0, 1]] (NOTE(review): the code below builds [[0, 0], [0, 1]] for this input -- confirm intent) + code_size = max(len(aggs), len(self._val_columns)) + codes.append(list(np.zeros(code_size, dtype='int64'))) + codes.append(list(range(code_size))) + + if len(aggs) == 1: + # unprefix columns + new_cols = [] + for c in result.columns: + new_col = c.split('_')[1] # 'sum_z' -> 'z'; NOTE(review): a column name containing '_' is truncated here -- consider c.split('_', 1)[1] + new_cols.append(new_col) + result.columns = new_cols + else: + result.columns = MultiIndex(levels, codes) return result def apply_multicolumn_mapped(self, result, aggs): @@ -242,7 +258,6 @@ def apply_multicolumn_mapped(self, result, aggs): for k in aggs.keys(): for v in aggs[k]: tuples.append((k, v)) - from cudf import MultiIndex multiindex = MultiIndex.from_tuples(tuples) result.columns = multiindex return result