From 2b547dc70c7f42b671cdc3e75946b123301779f0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Jun 2024 03:11:01 -1000 Subject: [PATCH] Add ensure_index to not unnecessarily shallow copy cudf.Index (#16117) The `cudf.Index` constructor will shallow copy a `cudf.Index` input. Sometimes we only need to make sure an input is a `cudf.Index`, so this change adds `ensure_index` (pandas has a similar helper), which returns an existing index as-is and avoids shallow copying these inputs unnecessarily. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16117 --- python/cudf/cudf/core/_base_index.py | 6 ++++- python/cudf/cudf/core/algorithms.py | 4 ++-- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 29 ++++++++++++++---------- python/cudf/cudf/core/index.py | 13 ++++++++++- python/cudf/cudf/core/indexed_frame.py | 11 ++++----- python/cudf/cudf/core/multiindex.py | 3 ++- python/cudf/cudf/core/series.py | 12 ++++------ python/cudf/cudf/tests/test_dataframe.py | 24 ++++++++++++++++++++ 9 files changed, 73 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index caf07b286cd..e160fa697ee 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1104,7 +1104,11 @@ def difference(self, other, sort=None): f"of [None, False, True]; {sort} was passed." 
) - other = cudf.Index(other, name=getattr(other, "name", self.name)) + if not isinstance(other, BaseIndex): + other = cudf.Index( + other, + name=getattr(other, "name", self.name), + ) if not len(other): res = self._get_reconciled_name_object(other).unique() diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 51a32e29886..e8b82ff60c2 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -6,7 +6,7 @@ from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import Index, RangeIndex +from cudf.core.index import RangeIndex, ensure_index from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else Index(cats) + return labels, cats.values if return_cupy_array else ensure_index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 54c5e829e8a..d9f62f51f92 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -292,7 +292,7 @@ def cut( ) # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.Index(col) + categorical_index = cudf.CategoricalIndex._from_data({None: col}) if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3fc29582c4c..4dfeb68b7ba 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -58,7 +58,12 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask from cudf.core.groupby.groupby import 
DataFrameGroupBy, groupby_doc_template -from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index +from cudf.core.index import ( + BaseIndex, + RangeIndex, + _index_from_data, + ensure_index, +) from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, @@ -338,7 +343,7 @@ def _getitem_tuple_arg(self, arg): range(len(tmp_arg[0])) ) }, - index=as_index(tmp_arg[0]), + index=cudf.Index(tmp_arg[0]), ) columns_df[cantor_name] = column.as_column( range(len(columns_df)) @@ -702,7 +707,7 @@ def __init__( data = data.reindex(index) index = data.index else: - index = cudf.Index(index) + index = ensure_index(index) else: index = data.index @@ -751,7 +756,7 @@ def __init__( if index is None: self._index = RangeIndex(0) else: - self._index = cudf.Index(index) + self._index = ensure_index(index) if columns is not None: rangeindex = isinstance( columns, (range, pd.RangeIndex, cudf.RangeIndex) @@ -909,7 +914,7 @@ def _init_from_series_list(self, data, columns, index): f"not match length of index ({index_length})" ) - final_index = cudf.Index(index) + final_index = ensure_index(index) series_lengths = list(map(len, data)) data = numeric_normalize_types(*data) @@ -977,9 +982,9 @@ def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) else: - index = cudf.Index(index) + index = ensure_index(index) - self._index = cudf.Index(index) + self._index = index # list-of-dicts case if len(data) > 0 and isinstance(data[0], dict): data = DataFrame.from_pandas(pd.DataFrame(data)) @@ -1085,7 +1090,7 @@ def _init_from_dict_like( self._index = RangeIndex(0, num_rows) else: - self._index = cudf.Index(index) + self._index = ensure_index(index) if len(data): self._data.multiindex = True @@ -1491,7 +1496,7 @@ def memory_usage(self, index=True, deep=False): names.append("Index") return Series._from_data( data={None: as_column(mem_usage)}, - index=as_index(names), + index=cudf.Index(names), ) 
@_performance_tracking @@ -4033,7 +4038,7 @@ def transpose(self): # Set the old column names as the new index result = self.__class__._from_data( ColumnAccessor(dict(enumerate(result_columns)), verify=False), - index=as_index(index), + index=cudf.Index(index), ) # Set the old index as the new column names result.columns = columns @@ -5657,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): } if not is_scalar(index): - new_index = cudf.Index(index) + new_index = ensure_index(index) else: new_index = None @@ -5741,7 +5746,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): } if index is not None: - index = cudf.Index(index) + index = ensure_index(index) if isinstance(columns, (pd.Index, cudf.Index)): level_names = tuple(columns.names) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e069f8d0ea6..b398ee2343e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -65,6 +65,17 @@ from collections.abc import Generator, Iterable +def ensure_index(index_like: Any) -> BaseIndex: + """ + Ensure an Index is returned. + + Avoids a shallow copy compared to calling cudf.Index(...) 
+ """ + if not isinstance(index_like, BaseIndex): + return cudf.Index(index_like) + return index_like + + class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" @@ -1569,7 +1580,7 @@ def append(self, other): to_concat.append(obj) else: this = self - other = cudf.Index(other) + other = ensure_index(other) if len(this) == 0 or len(other) == 0: # we'll filter out empties later in ._concat diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 72bd3c45fa6..ff10051c52d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -33,7 +33,6 @@ is_list_like, is_scalar, ) -from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column @@ -42,7 +41,7 @@ from cudf.core.dtypes import ListDtype from cudf.core.frame import Frame from cudf.core.groupby.groupby import GroupBy -from cudf.core.index import Index, RangeIndex, _index_from_data +from cudf.core.index import RangeIndex, _index_from_data, ensure_index from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.resample import _Resampler @@ -66,6 +65,8 @@ Dtype, NotImplementedType, ) + from cudf.core._base_index import BaseIndex + doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. 
@@ -627,9 +628,7 @@ def index(self, value): f"new values have {len(value)} elements" ) # avoid unnecessary cast to Index - if not isinstance(value, BaseIndex): - value = Index(value) - + value = ensure_index(value) self._index = value @_performance_tracking @@ -3595,7 +3594,7 @@ def _align_to_index( sort: bool = True, allow_non_unique: bool = False, ) -> Self: - index = cudf.Index(index) + index = ensure_index(index) if self.index.equals(index): return self diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 7657fa9e234..9cbe863142b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -29,6 +29,7 @@ BaseIndex, _get_indexer_basic, _lexsorted_equal_range, + ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like @@ -173,7 +174,7 @@ def __init__( "codes and is inconsistent!" ) - levels = [cudf.Index(level) for level in levels] + levels = [ensure_index(level) for level in levels] if len(levels) != len(codes._data): raise ValueError( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9acf5294b72..97b6bbec2d4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -48,7 +48,7 @@ from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template -from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, as_index +from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, @@ -588,10 +588,8 @@ def __init__( data = data.copy(deep=True) name_from_data = data.name column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) - if isinstance(data, pd.Series): - index_from_data = cudf.Index(data.index) - elif isinstance(data, Series): - index_from_data = data.index + if 
isinstance(data, (pd.Series, Series)): + index_from_data = ensure_index(data.index) elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " @@ -642,7 +640,7 @@ def __init__( name = name_from_data if index is not None: - index = cudf.Index(index) + index = ensure_index(index) if index_from_data is not None: first_index = index_from_data @@ -3191,7 +3189,7 @@ def quantile( return Series._from_data( data={self.name: result}, - index=as_index(np_array_q) if quant_index else None, + index=cudf.Index(np_array_q) if quant_index else None, ) @docutils.doc_describe() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fc7fd87d4c5..f40106a30f4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11078,3 +11078,27 @@ def test_dataframe_loc_int_float(dtype1, dtype2): expected = pdf.loc[pidx] assert_eq(actual, expected, check_index_type=True, check_dtype=True) + + +@pytest.mark.parametrize( + "data", + [ + cudf.DataFrame(range(2)), + None, + [cudf.Series(range(2))], + [[0], [1]], + {1: range(2)}, + cupy.arange(2), + ], +) +def test_init_with_index_no_shallow_copy(data): + idx = cudf.RangeIndex(2) + df = cudf.DataFrame(data, index=idx) + assert df.index is idx + + +def test_from_records_with_index_no_shallow_copy(): + idx = cudf.RangeIndex(2) + data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "