Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Series.map may raise TypeError in Categorical or DatetimeTz #12532

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ API changes

- ``Period`` and ``PeriodIndex`` now raises ``IncompatibleFrequency`` error which inherits ``ValueError`` rather than raw ``ValueError`` (:issue:`12615`)

- ``Series.apply`` for category dtype now applies passed function to each ``.categories`` (not ``.codes``), and returns "category" dtype if possible (:issue:`12473`)


- The default for ``.query()/.eval()`` is now ``engine=None``, which will use ``numexpr`` if it's installed; otherwise it will fallback to the ``python`` engine. This mimics the pre-0.18.1 behavior if ``numexpr`` is installed (and which Previously, if numexpr was not installed, ``.query()/.eval()`` would raise). (:issue:`12749`)
Expand Down Expand Up @@ -324,4 +325,8 @@ Bug Fixes
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
- ``pd.read_excel()`` now accepts column names associated with the keyword argument ``names`` (:issue:`12870`)


- Bug where ``fill_value`` was ignored if the argument to a binary operator is a constant (:issue:`12723`)


- Bug in ``Series.map`` raising ``TypeError`` when its dtype is ``category`` or tz-aware ``datetime`` (:issue:`12473`)
24 changes: 24 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,30 @@ def remove_unused_categories(self, inplace=False):
if not inplace:
return cat

def map(self, mapper):
    """
    Apply mapper function to its categories (not codes).

    Parameters
    ----------
    mapper : callable
        Function to be applied. When all categories are mapped
        to different categories, the result will be Categorical which has
        the same order property as the original. Otherwise, the result will
        be np.ndarray.

    Returns
    -------
    applied : Categorical or np.ndarray.
    """
    new_categories = self.categories.map(mapper)
    try:
        # Mapped categories stayed unique -> keep the categorical
        # structure: same codes, same ordered flag, relabelled categories.
        return Categorical.from_codes(self._codes.copy(),
                                      categories=new_categories,
                                      ordered=self.ordered)
    except ValueError:
        # The mapper collapsed two or more categories into one, so the
        # result cannot stay categorical; materialize per-element values.
        result = np.take(np.asarray(new_categories), self._codes)
        # Code -1 marks a missing value; np.take interprets -1 as
        # "last element", which would silently fill NaN slots with the
        # last category -- patch those positions back to NaN.
        if (self._codes == -1).any():
            result = result.astype(object)
            result[self._codes == -1] = np.nan
        return result
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this be:

Index(new_categories).take(self._codes)? maybe that's the same

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the result should be the same. I chose numpy logic to omit Index creation, but Index.take is preferable?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's fine, more of a style thing, but yes it should be the same


__eq__ = _cat_compare_op('__eq__')
__ne__ = _cat_compare_op('__ne__')
__lt__ = _cat_compare_op('__lt__')
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,7 +705,7 @@ def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
copy : if True always make a copy even if no upcast is required
"""

if is_internal_type(values):
if is_extension_type(values):
if copy:
values = values.copy()
else:
Expand Down Expand Up @@ -1714,7 +1714,7 @@ def is_datetimetz(array):
is_datetime64tz_dtype(array))


def is_internal_type(value):
def is_extension_type(value):
"""
if we are a klass that is preserved by the internals
these are internal klasses that we represent (and don't use a np.array)
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast,
is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like,
_maybe_box_datetimelike, is_categorical_dtype, is_object_dtype,
is_internal_type, is_datetimetz, _possibly_infer_to_datetimelike,
is_extension_type, is_datetimetz, _possibly_infer_to_datetimelike,
_dict_compat)
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import Index, MultiIndex, _ensure_index
Expand Down Expand Up @@ -2594,7 +2594,7 @@ def reindexer(value):
value = com._possibly_cast_to_datetime(value, dtype)

# return internal types directly
if is_internal_type(value):
if is_extension_type(value):
return value

# broadcast across multiple columns if necessary
Expand Down Expand Up @@ -4094,7 +4094,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):

# we cannot reduce using non-numpy dtypes,
# as demonstrated in gh-12244
if not is_internal_type(values):
if not is_extension_type(values):
# Create a dummy Series from an empty array
index = self._get_axis(axis)
empty_arr = np.empty(len(index), dtype=values.dtype)
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
_maybe_convert_string_to_object,
_maybe_convert_scalar,
is_categorical, is_datetimelike_v_numeric,
is_numeric_v_string_like, is_internal_type)
is_numeric_v_string_like, is_extension_type)
import pandas.core.algorithms as algos
from pandas.types.api import DatetimeTZDtype

Expand Down Expand Up @@ -1765,7 +1765,7 @@ def should_store(self, value):
return not (issubclass(value.dtype.type,
(np.integer, np.floating, np.complexfloating,
np.datetime64, np.bool_)) or
is_internal_type(value))
is_extension_type(value))

def replace(self, to_replace, value, inplace=False, filter=None,
regex=False, convert=True, mgr=None):
Expand Down Expand Up @@ -3388,10 +3388,10 @@ def set(self, item, value, check=False):
# FIXME: refactor, clearly separate broadcasting & zip-like assignment
# can prob also fix the various if tests for sparse/categorical

value_is_internal_type = is_internal_type(value)
value_is_extension_type = is_extension_type(value)

# categorical/sparse/datetimetz
if value_is_internal_type:
if value_is_extension_type:

def value_getitem(placement):
return value
Expand Down Expand Up @@ -3463,7 +3463,7 @@ def value_getitem(placement):
unfit_count = len(unfit_mgr_locs)

new_blocks = []
if value_is_internal_type:
if value_is_extension_type:
# This code (ab-)uses the fact that sparse blocks contain only
# one item.
new_blocks.extend(
Expand Down
40 changes: 25 additions & 15 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
is_categorical_dtype,
_possibly_cast_to_datetime,
_possibly_castable, _possibly_convert_platform,
_try_sort, is_internal_type, is_datetimetz,
_try_sort, is_extension_type, is_datetimetz,
_maybe_match_name, ABCSparseArray,
_coerce_to_dtype, SettingWithCopyError,
_maybe_box_datetimelike, ABCDataFrame,
Expand Down Expand Up @@ -2063,28 +2063,33 @@ def map(self, arg, na_action=None):
y : Series
same index as caller
"""
values = self.asobject

if na_action == 'ignore':
mask = isnull(values)

def map_f(values, f):
return lib.map_infer_mask(values, f, mask.view(np.uint8))
if is_extension_type(self.dtype):
values = self._values
if na_action is not None:
raise NotImplementedError
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, testing this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

map_f = lambda values, f: values.map(f)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would map_f = type(values).map work?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the background of this suggestion?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I just thought it would be a cleaner way.

else:
map_f = lib.map_infer
values = self.asobject

if na_action == 'ignore':
def map_f(values, f):
return lib.map_infer_mask(values, f,
isnull(values).view(np.uint8))
else:
map_f = lib.map_infer

if isinstance(arg, (dict, Series)):
if isinstance(arg, dict):
arg = self._constructor(arg, index=arg.keys())

indexer = arg.index.get_indexer(values)
new_values = algos.take_1d(arg._values, indexer)
return self._constructor(new_values,
index=self.index).__finalize__(self)
else:
mapped = map_f(values, arg)
return self._constructor(mapped,
index=self.index).__finalize__(self)
new_values = map_f(values, arg)

return self._constructor(new_values,
index=self.index).__finalize__(self)

def apply(self, func, convert_dtype=True, args=(), **kwds):
"""
Expand Down Expand Up @@ -2193,7 +2198,12 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
if isinstance(f, np.ufunc):
return f(self)

mapped = lib.map_infer(self.asobject, f, convert=convert_dtype)
if is_extension_type(self.dtype):
mapped = self._values.map(f)
else:
values = self.asobject
mapped = lib.map_infer(values, f, convert=convert_dtype)

if len(mapped) and isinstance(mapped[0], Series):
from pandas.core.frame import DataFrame
return DataFrame(mapped.tolist(), index=self.index)
Expand Down Expand Up @@ -2779,7 +2789,7 @@ def _try_cast(arr, take_fast_path):

try:
subarr = _possibly_cast_to_datetime(arr, dtype)
if not is_internal_type(subarr):
if not is_extension_type(subarr):
subarr = np.array(subarr, dtype=dtype, copy=copy)
except (ValueError, TypeError):
if is_categorical_dtype(dtype):
Expand Down
13 changes: 12 additions & 1 deletion pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2194,11 +2194,22 @@ def groupby(self, to_groupby):
-------
groups : dict
{group name -> group labels}

"""
return self._groupby(self.values, _values_from_object(to_groupby))

def map(self, mapper):
"""
Apply mapper function to its values.

Parameters
----------
mapper : callable
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, so it seems that Series.map and Index.map/Categorical.map have different signatures. (first does not accept a callable, only dict-like, second ONLY a callable, and no na_action arg).

hmm, can you create an issue so we can look at this. might be a bit non-trivial to fix

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, let me issue a separate issue (to close #12756 also).

Function to be applied.

Returns
-------
applied : array
"""
return self._arrmap(self.values, mapper)

def isin(self, values, level=None):
Expand Down
18 changes: 18 additions & 0 deletions pandas/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,24 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
na_value=-1)
return self._create_from_codes(taken)

def map(self, mapper):
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe in another PR can consolidate the doc-strings of map into base (when we combine them all there)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left it as it is, because Categorical.map needs some supplementary info. Using Appender doesn't make it very simple.

Apply mapper function to its categories (not codes).

Parameters
----------
mapper : callable
Function to be applied. When all categories are mapped
to different categories, the result will be Categorical which has
the same order property as the original. Otherwise, the result will
be np.ndarray.

Returns
-------
applied : Categorical or np.ndarray.
"""
return self.values.map(mapper)

def delete(self, loc):
"""
Make new Index with passed location(-s) deleted
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,33 @@ def test_min_max(self):
self.assertEqual(ci.min(), 'c')
self.assertEqual(ci.max(), 'b')

def test_map(self):
    # Mapping keeps the (ordered) categorical nature when the mapped
    # categories remain distinct.
    ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'),
                             ordered=True)
    result = ci.map(lambda x: x.lower())
    expected = pd.Categorical(list('ababc'), categories=list('cba'),
                              ordered=True)
    tm.assert_categorical_equal(result, expected)

    # Unordered index: category order and ordered flag are preserved;
    # the index name does not carry over to the Categorical result.
    ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
                             ordered=False, name='XXX')
    result = ci.map(lambda x: x.lower())
    expected = pd.Categorical(list('ababc'), categories=list('bac'),
                              ordered=False)
    tm.assert_categorical_equal(result, expected)

    # A constant mapper collapses the categories -> plain ndarray.
    tm.assert_numpy_array_equal(ci.map(lambda x: 1), np.array([1] * 5))

    # The mapper may change the dtype of the categories.
    ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
                             ordered=False)

    def f(x):
        return {'A': 10, 'B': 20, 'C': 30}.get(x)

    result = ci.map(f)
    expected = pd.Categorical([10, 20, 10, 20, 30],
                              categories=[20, 10, 30], ordered=False)
    tm.assert_categorical_equal(result, expected)

def test_append(self):

ci = self.create_index()
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1567,6 +1567,25 @@ def test_sortlevel(self):
res = s.sortlevel(['A', 'B'], sort_remaining=False)
assert_series_equal(s, res)

def test_apply_categorical(self):
    cats = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
                          ordered=True)
    ser = pd.Series(cats, name='XX', index=list('abcdefg'))

    # Applying a function that keeps the categories distinct should
    # return "category" dtype with the same ordering.
    result = ser.apply(lambda x: x.lower())
    exp_cats = pd.Categorical(list('abbabcd'), categories=list('dcba'),
                              ordered=True)
    expected = pd.Series(exp_cats, name='XX', index=list('abcdefg'))
    tm.assert_series_equal(result, expected)
    tm.assert_categorical_equal(result.values, expected.values)

    # Collapsing every value to a constant cannot stay categorical,
    # so the result falls back to object dtype.
    result = ser.apply(lambda x: 'A')
    expected = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
    tm.assert_series_equal(result, expected)
    self.assertEqual(result.dtype, np.object)

def test_shift_int(self):
ts = self.ts.astype(int)
shifted = ts.shift(1)
Expand Down
Loading