From bb442a8a7d7453b2891b80e68f49d2ae97bd678f Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Tue, 7 May 2019 11:48:37 -0700 Subject: [PATCH 1/9] Two changes: copy the dataframe whenever it is getattrd, and drop non numeric columns from groupby's when they are not the by value. --- python/cudf/groupby/groupby.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index ba2c048139c..3ce57f2d13e 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -158,7 +158,14 @@ def _apply_basic_agg(self, agg_type, sort_results=False): agg_type : str The aggregation function to run. """ - return _cpp_apply_basic_agg(self, agg_type, sort_results=sort_results) + agg_groupby = self.copy() + agg_groupby._df = agg_groupby._df._get_numeric_data() + agg_groupby._val_columns = [] + for val in self._val_columns: + if val in agg_groupby._df.columns: + agg_groupby._val_columns.append(val) + return _cpp_apply_basic_agg(agg_groupby, agg_type, + sort_results=sort_results) def apply_multiindex_or_single_index(self, result): if len(result) == 0: @@ -232,8 +239,16 @@ def __getitem__(self, arg): for val in arg: if val not in self._val_columns: raise KeyError("Column not found: " + str(val)) - result = self + result = self.copy() + result._df = DataFrame() + for by in self._by: + result._df[by] = self._df[by] result._val_columns = arg + if isinstance(arg, str): + result._df[arg] = self._df[arg] + else: + for a in arg: + result._df[a] = self._df[a] return result def copy(self, deep=True): @@ -292,4 +307,10 @@ def agg(self, args): Since multi-indexes aren't supported aggregation results are returned in columns using the naming scheme of `aggregation_columnname`. """ - return cpp_agg(self, args) + agg_groupby = self.copy() + agg_groupby._df = agg_groupby._df._get_numeric_data() + agg_groupby._val_columns = [] + for val in self._val_columns: + if val in agg_groupby._df.columns: + agg_groupby._val_columns.append(val) + return cpp_agg(agg_groupby, args) From ec30edfb5ab2354345d06a6b35b2a11a8609d385 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Tue, 7 May 2019 12:45:09 -0700 Subject: [PATCH 2/9] Add appropriate test --- python/cudf/groupby/groupby.py | 2 -- python/cudf/tests/test_groupby.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index 3ce57f2d13e..fc712c1ef16 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -168,8 +168,6 @@ def _apply_basic_agg(self, agg_type, sort_results=False): sort_results=sort_results) def apply_multiindex_or_single_index(self, result): - if len(result) == 0: - raise ValueError('Groupby result is empty!') if len(self._by) == 1: from cudf.dataframe import index idx = index.as_index(result[self._by[0]]) diff --git a/python/cudf/tests/test_groupby.py b/python/cudf/tests/test_groupby.py index 83ca6745435..925f0f133cd 100644 --- a/python/cudf/tests/test_groupby.py +++ b/python/cudf/tests/test_groupby.py @@ -432,3 +432,14 @@ def test_list_of_series(): pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() assert_eq(pdg, gdg) + + +def test_groupby_unsupported_columns(): + pdf = pd.DataFrame({'x': [1, 2, 3], + 'y': ['a', 'b', 'c'], + 'z': ['d', 'e', 'f'], + 'a': [3, 4, 5]}) + gdf = cudf.from_pandas(pdf) + pdg = pdf.groupby('x').sum() + gdg = gdf.groupby('x').sum() + assert_eq(pdg, gdg) From 7cead7ea87c6c7e5f0aa3bad3ce9f98d9ce45717 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Tue, 7 May 2019 14:20:22 -0700 Subject: [PATCH 3/9] One Series creation that breaks a lot of tests. --- python/cudf/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index fc712c1ef16..4def4d9d036 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -242,7 +242,7 @@ def __getitem__(self, arg): for by in self._by: result._df[by] = self._df[by] result._val_columns = arg - if isinstance(arg, str): + if isinstance(arg, (str, Number)): result._df[arg] = self._df[arg] else: for a in arg: From 80bf1c58801471ccec821e2af271963fa5a8d3e6 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 05:42:55 -0700 Subject: [PATCH 4/9] Get tests passing again. --- python/cudf/dataframe/dataframe.py | 8 ++--- python/cudf/groupby/groupby.py | 47 ++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/python/cudf/dataframe/dataframe.py b/python/cudf/dataframe/dataframe.py index 54bd29d9fb5..cc70cea0dd3 100644 --- a/python/cudf/dataframe/dataframe.py +++ b/python/cudf/dataframe/dataframe.py @@ -228,7 +228,7 @@ def __getitem__(self, arg): if isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex) and\ isinstance(arg, tuple): return self.columns._get_column_major(self, arg) - if isinstance(arg, str) or isinstance(arg, numbers.Integral) or \ + if isinstance(arg, (str, numbers.Number)) or isinstance(arg, numbers.Integral) or \ isinstance(arg, tuple): s = self._cols[arg] s.name = arg @@ -317,9 +317,8 @@ def empty(self): def _get_numeric_data(self): """ Return a dataframe with only numeric data types """ - columns = [c for c, dt in self.dtypes.items() - if dt != object and - not pd.api.types.is_categorical_dtype(dt)] + columns = [c for c, dt in self.dtypes.items() if dt != object] # and + # not pd.api.types.is_categorical_dtype(dt)] return self[columns] def assign(self, **kwargs): @@ -1806,7 +1805,6 @@ def groupby(self, by=None, sort=False, as_index=True, method="hash", - Since we don't support multiindex, the *by* columns are stored as regular columns. """ - if by is None and level is None: raise TypeError('groupby() requires either by or level to be' 'specified.') diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index 4def4d9d036..f4d7a5cc43a 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -17,25 +17,36 @@ class SeriesGroupBy(object): """ def __init__(self, source_series, group_series, level=None, sort=False): self.source_series = source_series + self.source_name = '_x' + if self.source_series is not None and\ + self.source_series.name is not None: + self.source_name = source_series.name self.group_series = group_series + self.group_name = '_y' + if self.group_series is not None and\ + self.group_series.name is not None: + self.group_name = group_series.name self.level = level self.sort = sort def __getattr__(self, attr): df = DataFrame() - df['x'] = self.source_series + df[self.source_name] = self.source_series if self.level is not None: - df['y'] = self.source_series.index + df[self.group_name] = self.source_series.index else: - df['y'] = self.group_series - groupby = df.groupby('y', level=self.level, sort=self.sort) + df[self.group_name] = self.group_series + groupby = df.groupby(self.group_name, + level=self.level, + sort=self.sort) result_df = getattr(groupby, attr)() def get_result(): - result_series = result_df['x'] - result_series.name = None + result_series = result_df[self.source_name] + result_series.name = self.source_name if self.source_name !=\ + '_x' else None idx = result_df.index - idx.name = None + idx.name = self.group_name if self.group_name != '_y' else None result_series.set_index(idx) return result_series return get_result @@ -239,14 +250,32 @@ def __getitem__(self, arg): raise KeyError("Column not found: " + str(val)) result = self.copy() result._df = DataFrame() - for by in self._by: - result._df[by] = self._df[by] + if isinstance(self._by, (str, Number)): + result._df[self._by] = self._df[self._by] + else: + for by in self._by: + result._df[by] = self._df[by] result._val_columns = arg if isinstance(arg, (str, Number)): result._df[arg] = self._df[arg] else: for a in arg: result._df[a] = self._df[a] + if len(result._by) == 1 and isinstance(result._val_columns, + (str, Number)): + new_by = [result._by] if isinstance(result._by, (str, Number))\ + else list(result._by) + new_val_columns = [result._val_columns] if\ + isinstance(result._val_columns, (str, Number))\ + else list(result._val_columns) + new_val_series = result._df[new_val_columns[0]] + new_val_series.name = new_val_columns[0] + new_by_series = result._df[new_by[0]] + new_by_series.name = new_by[0] + if new_by[0] == self._LEVEL_0_INDEX_NAME: + new_by_series.name = None + return SeriesGroupBy(new_val_series, + new_by_series) return result def copy(self, deep=True): From 1ab8d27582727b05a34f9289b60572857bec94f6 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 11:24:32 -0700 Subject: [PATCH 5/9] Manipulate between Series and DataFrame results to better match pandas, all if clauses --- python/cudf/groupby/groupby.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index 931c19b51e5..e784d8a93c9 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -172,11 +172,12 @@ def _apply_basic_agg(self, agg_type, sort_results=False): The aggregation function to run. """ agg_groupby = self.copy() - agg_groupby._df = agg_groupby._df._get_numeric_data() - agg_groupby._val_columns = [] - for val in self._val_columns: - if val in agg_groupby._df.columns: - agg_groupby._val_columns.append(val) + if agg_type in ['mean', 'sum']: + agg_groupby._df = agg_groupby._df._get_numeric_data() + agg_groupby._val_columns = [] + for val in self._val_columns: + if val in agg_groupby._df.columns: + agg_groupby._val_columns.append(val) return _cpp_apply_basic_agg(agg_groupby, agg_type, sort_results=sort_results) @@ -204,11 +205,7 @@ def apply_multiindex_or_single_index(self, result): mi = MultiIndex(levels, codes) mi.names = names final_result.index = mi - print(final_result.columns) - print(hasattr(self, '_gotattr')) - print(self) - if len(final_result.columns) == 1 and hasattr(self, - "_gotattr"): + if len(final_result.columns) == 1 and hasattr(self, "_gotattr"): final_series = Series([], name=final_result.columns[0]) final_series.index = final_result.index return final_series @@ -246,6 +243,11 @@ def apply_multiindex_or_single_index(self, result): for col in result.columns: if col not in self._by: final_result[col] = result[col] + if len(final_result.columns) == 1 and hasattr(self, "_gotattr"): + final_series = Series(final_result[final_result.columns[0]]) + final_series.name = final_result.columns[0] + final_series.index = multi_index + return final_series return final_result.set_index(multi_index) def apply_multicolumn(self, result, aggs): @@ -299,8 +301,6 @@ def __getitem__(self, arg): result = self.copy() result._df = DataFrame() setattr(result, "_gotattr", True) - print('getitem') - print(arg) if isinstance(self._by, (str, Number)): result._df[self._by] = self._df[self._by] else: @@ -388,9 +388,4 @@ def agg(self, args): in columns using the naming scheme of `aggregation_columnname`. """ agg_groupby = self.copy() - agg_groupby._df = agg_groupby._df._get_numeric_data() - agg_groupby._val_columns = [] - for val in self._val_columns: - if val in agg_groupby._df.columns: - agg_groupby._val_columns.append(val) return cpp_agg(agg_groupby, args) From 5f6af352130372b38e03556969ba01a4acc55cbd Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 12:01:48 -0700 Subject: [PATCH 6/9] Add categorical column at @mrocklin's behest. --- python/cudf/dataframe/dataframe.py | 4 +++- python/cudf/tests/test_groupby.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/cudf/dataframe/dataframe.py b/python/cudf/dataframe/dataframe.py index 7b40821333d..9bf3b2fe6a8 100644 --- a/python/cudf/dataframe/dataframe.py +++ b/python/cudf/dataframe/dataframe.py @@ -316,7 +316,9 @@ def empty(self): def _get_numeric_data(self): """ Return a dataframe with only numeric data types """ - columns = [c for c, dt in self.dtypes.items() if dt != object] + columns = [c for c, dt in self.dtypes.items() + if dt != object and + not pd.api.types.is_categorical_dtype(dt)] return self[columns] def assign(self, **kwargs): diff --git a/python/cudf/tests/test_groupby.py b/python/cudf/tests/test_groupby.py index ebf654b0652..84829df5590 100644 --- a/python/cudf/tests/test_groupby.py +++ b/python/cudf/tests/test_groupby.py @@ -462,10 +462,19 @@ def test_empty_groupby(func): def test_groupby_unsupported_columns(): + np.random.seed(12) + pd_cat = pd.Categorical( + pd.Series( + np.random.choice(['a', 'b', 1], 3), + dtype='category' + ) + ) + pdf = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c'], 'z': ['d', 'e', 'f'], 'a': [3, 4, 5]}) + pdf['b'] = pd_cat gdf = cudf.from_pandas(pdf) pdg = pdf.groupby('x').sum() gdg = gdf.groupby('x').sum() From 733632b768b7f664706fde06b2190ba7d72cd748 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 12:07:39 -0700 Subject: [PATCH 7/9] CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 582df0f0758..2b9a989c382 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -140,7 +140,7 @@ - PR #1648 ORC reader: fix non-deterministic output when skiprows is non-zero - PR #1676 Fix groupby `as_index` behaviour with `MultiIndex` - PR #1659 Fix bug caused by empty groupbys and multiindex slicing throwing exceptions - +- PR #1656 Correct Groupby failure in dask when un-aggregable columns are left in dataframe. # cuDF 0.6.1 (25 Mar 2019) From e76ad89a206383f8e98f293790f93bbcc70f24b5 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 12:56:58 -0700 Subject: [PATCH 8/9] Allow groupby by categorical columns --- python/cudf/groupby/groupby.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index e784d8a93c9..603b5cffee5 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -178,6 +178,12 @@ def _apply_basic_agg(self, agg_type, sort_results=False): for val in self._val_columns: if val in agg_groupby._df.columns: agg_groupby._val_columns.append(val) + # _get_numeric_data might have removed the by column + if isinstance(self._by, (str, Number)): + agg_groupby._df[self._by] = self._df[self._by] + else: + for by in self._by: + agg_groupby._df[by] = self._df[by] return _cpp_apply_basic_agg(agg_groupby, agg_type, sort_results=sort_results) From 365cf884f929714e3cb4a2924f839aabb3933e99 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 21:03:26 -0700 Subject: [PATCH 9/9] Replace a bunch of deep copies with shallow copies at the request of @kkraus14 --- python/cudf/groupby/groupby.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index 603b5cffee5..0e7389ed652 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -90,7 +90,7 @@ def __init__(self, df, by, method="hash", as_index=True, level=None): self.level = None self._original_index_name = None self._val_columns = [] - self._df = df.copy(deep=True) + self._df = df.copy(deep=False) self._as_index = as_index if isinstance(by, Series): if len(by) != len(self._df.index): @@ -171,7 +171,7 @@ def _apply_basic_agg(self, agg_type, sort_results=False): agg_type : str The aggregation function to run. """ - agg_groupby = self.copy() + agg_groupby = self.copy(deep=False) if agg_type in ['mean', 'sum']: agg_groupby._df = agg_groupby._df._get_numeric_data() agg_groupby._val_columns = [] @@ -304,7 +304,7 @@ def __getitem__(self, arg): for val in arg: if val not in self._val_columns: raise KeyError("Column not found: " + str(val)) - result = self.copy() + result = self.copy(deep=False) result._df = DataFrame() setattr(result, "_gotattr", True) if isinstance(self._by, (str, Number)): @@ -393,5 +393,5 @@ def agg(self, args): Since multi-indexes aren't supported aggregation results are returned in columns using the naming scheme of `aggregation_columnname`. """ - agg_groupby = self.copy() + agg_groupby = self.copy(deep=False) return cpp_agg(agg_groupby, args)