From d81f3cbaebf23f0c86779a0aa270da8bb62c68ee Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 20:15:18 -0700 Subject: [PATCH 1/2] Improve performance --- python/cudf/dataframe/multiindex.py | 3 ++- python/cudf/groupby/groupby.py | 7 ++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cudf/dataframe/multiindex.py b/python/cudf/dataframe/multiindex.py index 26a42af2fa2..cd49f4992b8 100644 --- a/python/cudf/dataframe/multiindex.py +++ b/python/cudf/dataframe/multiindex.py @@ -7,6 +7,7 @@ from collections.abc import Sequence from cudf.dataframe import columnops +from cudf.dataframe.series import Series from cudf.comm.serialize import register_distributed_serializer from cudf.dataframe.index import Index, StringIndex from cudf.utils import utils @@ -63,7 +64,7 @@ def __init__(self, levels, codes=None, labels=None, names=None): # converting levels to numpy array will produce a Float64Index # (on empty levels)for levels mimicking the behavior of Pandas - self.levels = np.array(levels) + self.levels = np.array([Series(level).to_array() for level in levels]) self._validate_levels_and_codes(self.levels, self.codes) self.name = None self.names = names diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index b9ea9cd2dbc..730e02450e8 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -206,12 +206,9 @@ def apply_multiindex_or_single_index(self, result): # time the groupby is calculated. for by in self._by: level = result[by].unique() - code = result[by] - for idx, value in enumerate(level): - level_mask = code == value - code = code.masked_assign(idx, level_mask) + replaced = result[by].replace(level, range(len(level))) levels.append(level) - codes[by] = code + codes[by] = Series(replaced, dtype="int32") names.append(by) multi_index = MultiIndex(levels=levels, codes=codes, From 0f4bc527b57dbc18c588f94d25594d225242fa85 Mon Sep 17 00:00:00 2001 From: "H. Thomson Comer" Date: Wed, 8 May 2019 20:20:10 -0700 Subject: [PATCH 2/2] CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 582df0f0758..c57cf647160 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -140,7 +140,7 @@ - PR #1648 ORC reader: fix non-deterministic output when skiprows is non-zero - PR #1676 Fix groupby `as_index` behaviour with `MultiIndex` - PR #1659 Fix bug caused by empty groupbys and multiindex slicing throwing exceptions - +- PR #1689 Fix groupby performance regression # cuDF 0.6.1 (25 Mar 2019)