BUG: Series.map may raise TypeError in Categorical or DatetimeTz
sinhrks committed Apr 17, 2016
1 parent 3bed097 commit 027689e
Showing 13 changed files with 238 additions and 30 deletions.
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.18.1.txt
@@ -129,6 +129,7 @@ API changes

- ``Period`` and ``PeriodIndex`` now raise an ``IncompatibleFrequency`` error, which inherits from ``ValueError``, rather than a raw ``ValueError`` (:issue:`12615`)

- ``Series.apply`` for category dtype now applies the passed function to each of the ``.categories`` (not the ``.codes``), and returns ``category`` dtype if possible (:issue:`12473`)


- The default for ``.query()/.eval()`` is now ``engine=None``, which will use ``numexpr`` if it's installed; otherwise it will fall back to the ``python`` engine. This mimics the pre-0.18.1 behavior when ``numexpr`` was installed (previously, if ``numexpr`` was not installed, ``.query()/.eval()`` would raise) (:issue:`12749`)
@@ -324,4 +325,8 @@ Bug Fixes
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)
- ``pd.read_excel()`` now accepts column names associated with keyword argument ``names`` (:issue:`12870`)


- Bug where ``fill_value`` was ignored if the argument to a binary operator is a constant (:issue:`12723`)


- Bug in ``Series.map`` raising ``TypeError`` if its dtype is ``category`` or tz-aware ``datetime`` (:issue:`12473`)
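
For reference, a rough sketch of the two behaviors described in these notes (illustrative only, not part of the diff; output comments are approximate):

    import pandas as pd

    # Series.apply on category dtype applies the function to the categories
    # and keeps "category" dtype when the mapping stays one-to-one (GH 12473)
    s = pd.Series(list('aabc')).astype('category')
    s.apply(lambda x: x.upper())   # category dtype, categories ['A', 'B', 'C']

    # Series.map on tz-aware datetime values no longer raises TypeError
    t = pd.Series(pd.date_range('2016-01-01', periods=3, tz='US/Eastern'))
    t.map(lambda x: x.year)        # Series of years, one per element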
24 changes: 24 additions & 0 deletions pandas/core/categorical.py
@@ -883,6 +883,30 @@ def remove_unused_categories(self, inplace=False):
        if not inplace:
            return cat

    def map(self, mapper):
        """
        Apply mapper function to its categories (not codes).

        Parameters
        ----------
        mapper : callable
            Function to be applied. When all categories are mapped to
            different categories, the result will be a Categorical which has
            the same order property as the original. Otherwise, the result
            will be an np.ndarray.

        Returns
        -------
        applied : Categorical or np.ndarray
        """
        new_categories = self.categories.map(mapper)
        try:
            return Categorical.from_codes(self._codes.copy(),
                                          categories=new_categories,
                                          ordered=self.ordered)
        except ValueError:
            return np.take(new_categories, self._codes)

    __eq__ = _cat_compare_op('__eq__')
    __ne__ = _cat_compare_op('__ne__')
    __lt__ = _cat_compare_op('__lt__')
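A quick illustration of the new Categorical.map above (a sketch, not part of the diff): a mapper that keeps categories distinct preserves the categorical; one that collapses categories makes from_codes raise ValueError internally, so the result falls back to a plain ndarray.

    import pandas as pd

    cat = pd.Categorical(list('abca'), categories=list('cba'), ordered=True)

    # one-to-one over categories -> Categorical, codes and ordering preserved
    cat.map(lambda x: x.upper())   # ['A', 'B', 'C', 'A'], categories ['C' < 'B' < 'A']

    # collapsing mapper -> plain object ndarray
    cat.map(lambda x: 'x')         # array(['x', 'x', 'x', 'x'], dtype=object)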
4 changes: 2 additions & 2 deletions pandas/core/common.py
@@ -705,7 +705,7 @@ def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):
    copy : if True always make a copy even if no upcast is required
    """

    if is_internal_type(values):
    if is_extension_type(values):
        if copy:
            values = values.copy()
    else:
@@ -1714,7 +1714,7 @@ def is_datetimetz(array):
            is_datetime64tz_dtype(array))


def is_internal_type(value):
def is_extension_type(value):
    """
    if we are a klass that is preserved by the internals
    these are internal klasses that we represent (and don't use a np.array)
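The rename from is_internal_type to is_extension_type is mechanical; the helper still flags values backed by non-ndarray pandas types. A sketch of typical inputs (import path as in this diff; behavior assumed from the checks it performs):

    import numpy as np
    import pandas as pd
    from pandas.core.common import is_extension_type

    is_extension_type(pd.Categorical(list('abc')))                        # True  (categorical)
    is_extension_type(pd.date_range('2016-01-01', periods=3, tz='UTC'))   # True  (datetimetz)
    is_extension_type(np.array([1, 2, 3]))                                # False (plain ndarray)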
6 changes: 3 additions & 3 deletions pandas/core/frame.py
@@ -27,7 +27,7 @@
    isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast,
    is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like,
    _maybe_box_datetimelike, is_categorical_dtype, is_object_dtype,
    is_internal_type, is_datetimetz, _possibly_infer_to_datetimelike,
    is_extension_type, is_datetimetz, _possibly_infer_to_datetimelike,
    _dict_compat)
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -2594,7 +2594,7 @@ def reindexer(value):
            value = com._possibly_cast_to_datetime(value, dtype)

        # return internal types directly
        if is_internal_type(value):
        if is_extension_type(value):
            return value

        # broadcast across multiple columns if necessary
@@ -4094,7 +4094,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True):

            # we cannot reduce using non-numpy dtypes,
            # as demonstrated in gh-12244
            if not is_internal_type(values):
            if not is_extension_type(values):
                # Create a dummy Series from an empty array
                index = self._get_axis(axis)
                empty_arr = np.empty(len(index), dtype=values.dtype)
10 changes: 5 additions & 5 deletions pandas/core/internals.py
@@ -20,7 +20,7 @@
                                _maybe_convert_string_to_object,
                                _maybe_convert_scalar,
                                is_categorical, is_datetimelike_v_numeric,
                                is_numeric_v_string_like, is_internal_type)
                                is_numeric_v_string_like, is_extension_type)
import pandas.core.algorithms as algos
from pandas.types.api import DatetimeTZDtype

@@ -1765,7 +1765,7 @@ def should_store(self, value):
        return not (issubclass(value.dtype.type,
                               (np.integer, np.floating, np.complexfloating,
                                np.datetime64, np.bool_)) or
                    is_internal_type(value))
                    is_extension_type(value))

    def replace(self, to_replace, value, inplace=False, filter=None,
                regex=False, convert=True, mgr=None):
@@ -3388,10 +3388,10 @@ def set(self, item, value, check=False):
        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        # can prob also fix the various if tests for sparse/categorical

        value_is_internal_type = is_internal_type(value)
        value_is_extension_type = is_extension_type(value)

        # categorical/sparse/datetimetz
        if value_is_internal_type:
        if value_is_extension_type:

            def value_getitem(placement):
                return value
@@ -3463,7 +3463,7 @@ def value_getitem(placement):
            unfit_count = len(unfit_mgr_locs)

            new_blocks = []
            if value_is_internal_type:
            if value_is_extension_type:
                # This code (ab-)uses the fact that sparse blocks contain only
                # one item.
                new_blocks.extend(
40 changes: 25 additions & 15 deletions pandas/core/series.py
@@ -20,7 +20,7 @@
                                is_categorical_dtype,
                                _possibly_cast_to_datetime,
                                _possibly_castable, _possibly_convert_platform,
                                _try_sort, is_internal_type, is_datetimetz,
                                _try_sort, is_extension_type, is_datetimetz,
                                _maybe_match_name, ABCSparseArray,
                                _coerce_to_dtype, SettingWithCopyError,
                                _maybe_box_datetimelike, ABCDataFrame,
@@ -2063,28 +2063,33 @@ def map(self, arg, na_action=None):
        y : Series
            same index as caller
        """
        values = self.asobject

        if na_action == 'ignore':
            mask = isnull(values)

            def map_f(values, f):
                return lib.map_infer_mask(values, f, mask.view(np.uint8))
        if is_extension_type(self.dtype):
            values = self._values
            if na_action is not None:
                raise NotImplementedError
            map_f = lambda values, f: values.map(f)
        else:
            map_f = lib.map_infer
            values = self.asobject

            if na_action == 'ignore':
                def map_f(values, f):
                    return lib.map_infer_mask(values, f,
                                              isnull(values).view(np.uint8))
            else:
                map_f = lib.map_infer

        if isinstance(arg, (dict, Series)):
            if isinstance(arg, dict):
                arg = self._constructor(arg, index=arg.keys())

            indexer = arg.index.get_indexer(values)
            new_values = algos.take_1d(arg._values, indexer)
            return self._constructor(new_values,
                                     index=self.index).__finalize__(self)
        else:
            mapped = map_f(values, arg)
            return self._constructor(mapped,
                                     index=self.index).__finalize__(self)
            new_values = map_f(values, arg)

        return self._constructor(new_values,
                                 index=self.index).__finalize__(self)

    def apply(self, func, convert_dtype=True, args=(), **kwds):
        """
@@ -2193,7 +2198,12 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
        if isinstance(f, np.ufunc):
            return f(self)

        mapped = lib.map_infer(self.asobject, f, convert=convert_dtype)
        if is_extension_type(self.dtype):
            mapped = self._values.map(f)
        else:
            values = self.asobject
            mapped = lib.map_infer(values, f, convert=convert_dtype)

        if len(mapped) and isinstance(mapped[0], Series):
            from pandas.core.frame import DataFrame
            return DataFrame(mapped.tolist(), index=self.index)
@@ -2779,7 +2789,7 @@ def _try_cast(arr, take_fast_path):

        try:
            subarr = _possibly_cast_to_datetime(arr, dtype)
            if not is_internal_type(subarr):
            if not is_extension_type(subarr):
                subarr = np.array(subarr, dtype=dtype, copy=copy)
        except (ValueError, TypeError):
            if is_categorical_dtype(dtype):
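The net effect of the new branches in Series.map and Series.apply (an illustrative sketch, not part of the diff): extension-typed values are mapped through their own .map rather than lib.map_infer, and na_action is not yet supported for them.

    import pandas as pd

    s = pd.Series(list('abbc'), dtype='category')
    s.map(lambda x: x.upper())       # routed through Categorical.map, stays categorical

    t = pd.Series(pd.date_range('2016-01-01', periods=3, tz='Asia/Tokyo'))
    t.map(lambda x: x.hour)          # tz-aware datetime no longer raises TypeError

    try:
        s.map(lambda x: x.upper(), na_action='ignore')
    except NotImplementedError:
        pass                         # na_action not implemented for extension types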
13 changes: 12 additions & 1 deletion pandas/indexes/base.py
@@ -2194,11 +2194,22 @@ def groupby(self, to_groupby):
        -------
        groups : dict
            {group name -> group labels}
        """
        return self._groupby(self.values, _values_from_object(to_groupby))

    def map(self, mapper):
        """
        Apply mapper function to its values.

        Parameters
        ----------
        mapper : callable
            Function to be applied.

        Returns
        -------
        applied : array
        """
        return self._arrmap(self.values, mapper)

    def isin(self, values, level=None):
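Per the docstring above, Index.map applies the mapper elementwise and returns a plain array rather than a new Index. A minimal sketch:

    import pandas as pd

    idx = pd.Index(['a', 'b', 'c'])
    idx.map(lambda x: x.upper())   # array(['A', 'B', 'C'], dtype=object), not an Index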
18 changes: 18 additions & 0 deletions pandas/indexes/category.py
@@ -468,6 +468,24 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None):
                                           na_value=-1)
        return self._create_from_codes(taken)

    def map(self, mapper):
        """
        Apply mapper function to its categories (not codes).

        Parameters
        ----------
        mapper : callable
            Function to be applied. When all categories are mapped to
            different categories, the result will be a Categorical which has
            the same order property as the original. Otherwise, the result
            will be an np.ndarray.

        Returns
        -------
        applied : Categorical or np.ndarray
        """
        return self.values.map(mapper)

    def delete(self, loc):
        """
        Make new Index with passed location(-s) deleted
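CategoricalIndex.map delegates to Categorical.map on its values, so the result is a Categorical (or an ndarray when the mapping collapses categories) rather than a new index, as the tests below check. A minimal sketch:

    import pandas as pd

    ci = pd.CategoricalIndex(list('abca'), categories=list('cba'), ordered=True)
    ci.map(lambda x: x.upper())   # Categorical ['A', 'B', 'C', 'A'], not a CategoricalIndex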
27 changes: 27 additions & 0 deletions pandas/tests/indexes/test_category.py
@@ -201,6 +201,33 @@ def test_min_max(self):
        self.assertEqual(ci.min(), 'c')
        self.assertEqual(ci.max(), 'b')

    def test_map(self):
        ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'),
                                 ordered=True)
        result = ci.map(lambda x: x.lower())
        exp = pd.Categorical(list('ababc'), categories=list('cba'),
                             ordered=True)
        tm.assert_categorical_equal(result, exp)

        ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
                                 ordered=False, name='XXX')
        result = ci.map(lambda x: x.lower())
        exp = pd.Categorical(list('ababc'), categories=list('bac'),
                             ordered=False)
        tm.assert_categorical_equal(result, exp)

        tm.assert_numpy_array_equal(ci.map(lambda x: 1), np.array([1] * 5))

        # change categories dtype
        ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
                                 ordered=False)

        def f(x):
            return {'A': 10, 'B': 20, 'C': 30}.get(x)

        result = ci.map(f)
        exp = pd.Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30],
                             ordered=False)
        tm.assert_categorical_equal(result, exp)

    def test_append(self):

        ci = self.create_index()
19 changes: 19 additions & 0 deletions pandas/tests/series/test_analytics.py
@@ -1567,6 +1567,25 @@ def test_sortlevel(self):
        res = s.sortlevel(['A', 'B'], sort_remaining=False)
        assert_series_equal(s, res)

    def test_apply_categorical(self):
        values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'),
                                ordered=True)
        s = pd.Series(values, name='XX', index=list('abcdefg'))
        result = s.apply(lambda x: x.lower())

        # should be categorical dtype when the number of categories
        # is the same
        values = pd.Categorical(list('abbabcd'), categories=list('dcba'),
                                ordered=True)
        exp = pd.Series(values, name='XX', index=list('abcdefg'))
        tm.assert_series_equal(result, exp)
        tm.assert_categorical_equal(result.values, exp.values)

        result = s.apply(lambda x: 'A')
        exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg'))
        tm.assert_series_equal(result, exp)
        self.assertEqual(result.dtype, np.object)

    def test_shift_int(self):
        ts = self.ts.astype(int)
        shifted = ts.shift(1)