Skip to content

Commit

Permalink
Backport PR pandas-dev#57061: REGR: non-unique, masked dtype index ra…
Browse files Browse the repository at this point in the history
…ising IndexError
  • Loading branch information
lukemanley authored and meeseeksmachine committed Jan 30, 2024
1 parent 10b5873 commit 4591549
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 32 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Fixed regressions
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where the result has more than 10,000 rows (:issue:`57027`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`)
Expand Down
63 changes: 31 additions & 32 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
return indexer.view(bool)


cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length):
    """
    Grow ``values`` so that position ``loc`` is in bounds, if it is not already.

    The length is doubled until it exceeds ``loc`` and is then capped at
    ``max_length``.

    Parameters
    ----------
    values : ndarray
        Buffer that is about to be written at position ``loc``.
    loc : Py_ssize_t
        Highest position the caller intends to write.
    max_length : Py_ssize_t
        Upper bound on the length of the returned array.

    Returns
    -------
    ndarray
        ``values`` itself when ``loc < len(values)``; otherwise a new array
        produced by ``np.resize``.

    Notes
    -----
    Because the new length is capped at ``max_length``, ``loc`` is only
    guaranteed to be in bounds of the result when ``loc < max_length``.
    """
    cdef:
        Py_ssize_t n = len(values)

    if loc < n:
        # Already large enough; hand back the same object untouched.
        return values

    # Doubling a length of zero never makes progress, which would spin
    # forever below; seed the growth with a single slot instead.
    if n == 0:
        n = 1

    while loc >= n:
        n *= 2

    return np.resize(values, min(n, max_length))


# Don't populate hash tables in monotonic indexes larger than this
_SIZE_CUTOFF = 1_000_000

Expand Down Expand Up @@ -450,27 +464,18 @@ cdef class IndexEngine:
# found
if val in d:
key = val

result = _maybe_resize_array(
result,
count + len(d[key]) - 1,
max_alloc
)
for j in d[key]:

# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)

result[count] = j
count += 1

# value not found
else:

if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)
result = _maybe_resize_array(result, count, max_alloc)
result[count] = -1
count += 1
missing[count_missing] = i
Expand Down Expand Up @@ -1193,37 +1198,31 @@ cdef class MaskedIndexEngine(IndexEngine):

if PySequence_GetItem(target_mask, i):
if na_pos:
result = _maybe_resize_array(
result,
count + len(na_pos) - 1,
max_alloc,
)
for na_idx in na_pos:
# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = na_idx
count += 1
continue

elif val in d:
# found
key = val

result = _maybe_resize_array(
result,
count + len(d[key]) - 1,
max_alloc,
)
for j in d[key]:

# realloc if needed
if count >= n_alloc:
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = j
count += 1
continue

# value not found
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
result = _maybe_resize_array(result, count, max_alloc)
result[count] = -1
count += 1
missing[count_missing] = i
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3364,3 +3364,15 @@ def test_getitem_loc_str_periodindex(self):
index = pd.period_range(start="2000", periods=20, freq="B")
series = Series(range(20), index=index)
assert series.loc["2000-01-14"] == 9

def test_loc_nonunique_masked_index(self):
# GH 57027
ids = list(range(11))
index = Index(ids * 1000, dtype="Int64")
df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index)
result = df.loc[ids]
expected = DataFrame(
{"val": index.argsort(kind="stable").astype(np.intp)},
index=Index(np.array(ids).repeat(1000), dtype="Int64"),
)
tm.assert_frame_equal(result, expected)

0 comments on commit 4591549

Please sign in to comment.