Skip to content

Commit

Permalink
Merge pull request #1656 from thomcom/bug-ext-groupby-getattr-incorrect
Browse files Browse the repository at this point in the history
[REVIEW] Drop object columns automatically when running numeric groupby aggregations.
  • Loading branch information
raydouglass authored May 9, 2019
2 parents 17ef8f5 + 365cf88 commit 2db0fc9
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 26 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@
- PR #1648 ORC reader: fix non-deterministic output when skiprows is non-zero
- PR #1676 Fix groupby `as_index` behaviour with `MultiIndex`
- PR #1659 Fix bug caused by empty groupbys and multiindex slicing throwing exceptions

- PR #1656 Correct Groupby failure in dask when un-aggregable columns are left in dataframe.


# cuDF 0.6.1 (25 Mar 2019)
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,7 @@ def __getitem__(self, arg):
if isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex) and\
isinstance(arg, tuple):
return self.columns._get_column_major(self, arg)
if isinstance(arg, str) or isinstance(arg, numbers.Integral) or \
isinstance(arg, tuple):
if isinstance(arg, (str, numbers.Number)) or isinstance(arg, tuple):
s = self._cols[arg]
s.name = arg
s.index = self.index
Expand Down Expand Up @@ -1810,7 +1809,6 @@ def groupby(self, by=None, sort=False, as_index=True, method="hash",
raise NotImplementedError(
"The group_keys keyword is not yet implemented"
)

if by is None and level is None:
raise TypeError('groupby() requires either by or level to be'
'specified.')
Expand Down
86 changes: 75 additions & 11 deletions python/cudf/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,36 @@ class SeriesGroupBy(object):
"""
def __init__(self, source_series, group_series, level=None, sort=False):
self.source_series = source_series
self.source_name = '_x'
if self.source_series is not None and\
self.source_series.name is not None:
self.source_name = source_series.name
self.group_series = group_series
self.group_name = '_y'
if self.group_series is not None and\
self.group_series.name is not None:
self.group_name = group_series.name
self.level = level
self.sort = sort

def __getattr__(self, attr):
df = DataFrame()
df['x'] = self.source_series
df[self.source_name] = self.source_series
if self.level is not None:
df['y'] = self.source_series.index
df[self.group_name] = self.source_series.index
else:
df['y'] = self.group_series
groupby = df.groupby('y', level=self.level, sort=self.sort)
df[self.group_name] = self.group_series
groupby = df.groupby(self.group_name,
level=self.level,
sort=self.sort)
result_df = getattr(groupby, attr)()

def get_result():
result_series = result_df['x']
result_series.name = None
result_series = result_df[self.source_name]
result_series.name = self.source_name if self.source_name !=\
'_x' else None
idx = result_df.index
idx.name = None
idx.name = self.group_name if self.group_name != '_y' else None
result_series.set_index(idx)
return result_series
return get_result
Expand Down Expand Up @@ -79,7 +90,7 @@ def __init__(self, df, by, method="hash", as_index=True, level=None):
self.level = None
self._original_index_name = None
self._val_columns = []
self._df = df.copy(deep=True)
self._df = df.copy(deep=False)
self._as_index = as_index
if isinstance(by, Series):
if len(by) != len(self._df.index):
Expand Down Expand Up @@ -160,7 +171,21 @@ def _apply_basic_agg(self, agg_type, sort_results=False):
agg_type : str
The aggregation function to run.
"""
return _cpp_apply_basic_agg(self, agg_type, sort_results=sort_results)
agg_groupby = self.copy(deep=False)
if agg_type in ['mean', 'sum']:
agg_groupby._df = agg_groupby._df._get_numeric_data()
agg_groupby._val_columns = []
for val in self._val_columns:
if val in agg_groupby._df.columns:
agg_groupby._val_columns.append(val)
# _get_numeric_data might have removed the by column
if isinstance(self._by, (str, Number)):
agg_groupby._df[self._by] = self._df[self._by]
else:
for by in self._by:
agg_groupby._df[by] = self._df[by]
return _cpp_apply_basic_agg(agg_groupby, agg_type,
sort_results=sort_results)

def apply_multiindex_or_single_index(self, result):
if len(result) == 0:
Expand All @@ -186,6 +211,10 @@ def apply_multiindex_or_single_index(self, result):
mi = MultiIndex(levels, codes)
mi.names = names
final_result.index = mi
if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
final_series = Series([], name=final_result.columns[0])
final_series.index = final_result.index
return final_series
return final_result
if len(self._by) == 1:
from cudf.dataframe import index
Expand Down Expand Up @@ -220,6 +249,11 @@ def apply_multiindex_or_single_index(self, result):
for col in result.columns:
if col not in self._by:
final_result[col] = result[col]
if len(final_result.columns) == 1 and hasattr(self, "_gotattr"):
final_series = Series(final_result[final_result.columns[0]])
final_series.name = final_result.columns[0]
final_series.index = multi_index
return final_series
return final_result.set_index(multi_index)

def apply_multicolumn(self, result, aggs):
Expand Down Expand Up @@ -270,8 +304,35 @@ def __getitem__(self, arg):
for val in arg:
if val not in self._val_columns:
raise KeyError("Column not found: " + str(val))
result = self
result = self.copy(deep=False)
result._df = DataFrame()
setattr(result, "_gotattr", True)
if isinstance(self._by, (str, Number)):
result._df[self._by] = self._df[self._by]
else:
for by in self._by:
result._df[by] = self._df[by]
result._val_columns = arg
if isinstance(arg, (str, Number)):
result._df[arg] = self._df[arg]
else:
for a in arg:
result._df[a] = self._df[a]
if len(result._by) == 1 and isinstance(result._val_columns,
(str, Number)):
new_by = [result._by] if isinstance(result._by, (str, Number))\
else list(result._by)
new_val_columns = [result._val_columns] if\
isinstance(result._val_columns, (str, Number))\
else list(result._val_columns)
new_val_series = result._df[new_val_columns[0]]
new_val_series.name = new_val_columns[0]
new_by_series = result._df[new_by[0]]
new_by_series.name = new_by[0]
if new_by[0] == self._LEVEL_0_INDEX_NAME:
new_by_series.name = None
return SeriesGroupBy(new_val_series,
new_by_series)
return result

def copy(self, deep=True):
Expand All @@ -282,6 +343,8 @@ def copy(self, deep=True):
as_index=self._as_index,
level=self.level)
result._original_index_name = self._original_index_name
if hasattr(self, "_gotattr"):
setattr(result, "_gotattr", True)
return result

def deepcopy(self):
Expand Down Expand Up @@ -330,4 +393,5 @@ def agg(self, args):
Since multi-indexes aren't supported aggregation results are returned
in columns using the naming scheme of `aggregation_columnname`.
"""
return cpp_agg(self, args)
agg_groupby = self.copy(deep=False)
return cpp_agg(agg_groupby, args)
43 changes: 32 additions & 11 deletions python/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,17 +445,6 @@ def test_advanced_groupby_levels():
assert_eq(pdh, gdh)


def test_list_of_series():
pdf = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 1]})
gdf = cudf.from_pandas(pdf)
pdg = pdf.groupby([pdf.x]).y.sum()
gdg = gdf.groupby([gdf.x]).y.sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby([pdf.x, pdf.y]).y.sum()
gdg = gdf.groupby([gdf.x, gdf.y]).y.sum()
assert_eq(pdg, gdg)


@pytest.mark.parametrize('func', [
lambda df: df.groupby(['x', 'y', 'z']).sum(),
lambda df: df.groupby(['x', 'y']).sum(),
Expand All @@ -470,3 +459,35 @@ def test_empty_groupby(func):
pdf = pd.DataFrame({'x': [], 'y': [], 'z': []})
gdf = cudf.from_pandas(pdf)
assert_eq(func(pdf), func(gdf))


def test_groupby_unsupported_columns():
np.random.seed(12)
pd_cat = pd.Categorical(
pd.Series(
np.random.choice(['a', 'b', 1], 3),
dtype='category'
)
)

pdf = pd.DataFrame({'x': [1, 2, 3],
'y': ['a', 'b', 'c'],
'z': ['d', 'e', 'f'],
'a': [3, 4, 5]})
pdf['b'] = pd_cat
gdf = cudf.from_pandas(pdf)
pdg = pdf.groupby('x').sum()
gdg = gdf.groupby('x').sum()
assert_eq(pdg, gdg)


def test_list_of_series():
pdf = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 1]})
gdf = cudf.from_pandas(pdf)
pdg = pdf.groupby([pdf.x]).y.sum()
gdg = gdf.groupby([gdf.x]).y.sum()
assert_eq(pdg, gdg)
pdg = pdf.groupby([pdf.x, pdf.y]).y.sum()
gdg = gdf.groupby([gdf.x, gdf.y]).y.sum()
pytest.skip()
assert_eq(pdg, gdg)

0 comments on commit 2db0fc9

Please sign in to comment.