Skip to content

Commit

Permalink
Merge pull request #1689 from thomcom/bug-ext-0.7-performance-regression
Browse files Browse the repository at this point in the history
[REVIEW] Correct performance regression with 0.7 groupby.
  • Loading branch information
raydouglass authored May 9, 2019
2 parents 2db0fc9 + c6690dd commit 862b187
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@
- PR #1676 Fix groupby `as_index` behaviour with `MultiIndex`
- PR #1659 Fix bug caused by empty groupbys and multiindex slicing throwing exceptions
- PR #1656 Correct Groupby failure in dask when un-aggregable columns are left in dataframe.
- PR #1689 Fix groupby performance regression


# cuDF 0.6.1 (25 Mar 2019)
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/dataframe/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from collections.abc import Sequence

from cudf.dataframe import columnops
from cudf.dataframe.series import Series
from cudf.comm.serialize import register_distributed_serializer
from cudf.dataframe.index import Index, StringIndex
from cudf.utils import utils
Expand Down Expand Up @@ -63,7 +64,7 @@ def __init__(self, levels, codes=None, labels=None, names=None):

# converting levels to a numpy array will produce a Float64Index
# (on empty levels) for levels, mimicking the behavior of Pandas
self.levels = np.array(levels)
self.levels = np.array([Series(level).to_array() for level in levels])
self._validate_levels_and_codes(self.levels, self.codes)
self.name = None
self.names = names
Expand Down
7 changes: 2 additions & 5 deletions python/cudf/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,9 @@ def apply_multiindex_or_single_index(self, result):
# time the groupby is calculated.
for by in self._by:
level = result[by].unique()
code = result[by]
for idx, value in enumerate(level):
level_mask = code == value
code = code.masked_assign(idx, level_mask)
replaced = result[by].replace(level, range(len(level)))
levels.append(level)
codes[by] = code
codes[by] = Series(replaced, dtype="int32")
names.append(by)
multi_index = MultiIndex(levels=levels,
codes=codes,
Expand Down

0 comments on commit 862b187

Please sign in to comment.