API: map() on Index returns an Index, not array

closes #12766 closes #12798 This is a follow on to #12798. Author: Nate Yoder <nate@whistle.com> Closes #14506 from nateyoder/index_map_index and squashes the following commits: 95e4440 [Nate Yoder] fix typo and add ref tag in whatsnew b36e83c [Nate Yoder] update whatsnew, fix documentation 4635e6a [Nate Yoder] compare as index a17ddab [Nate Yoder] Fix unused import and docstrings per pep8radius docformatter; change other uses of assert_index_equal to testing instead of self ab168e7 [Nate Yoder] Update whatsnew and add git PR to tests to denote changes 504c2a2 [Nate Yoder] Fix tests that weren't run by PyCharm 23c133d [Nate Yoder] Update tests to match dtype int64 07b772a [Nate Yoder] use the numpy results if we can to avoid repeating the computation just to create the object a110be9 [Nate Yoder] make map on time tseries indices return index if dtype of output is not a tseries; sphinx changes; fix docstring a596744 [Nate Yoder] introspect results from map so that if the output array has tuples we create a multiindex instead of an index 5fc66c3 [Nate Yoder] make map return an index if it operates on an index, multi index, or categorical index; map on a categorical will either return a categorical or an index (rather than a numpy array)
pandas-dev · Dec 16, 2016 · 6f4e36a · 6f4e36a
1 parent 2566223
commit 6f4e36a
Show file tree

Hide file tree

Showing 13 changed files with 188 additions and 61 deletions.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -91,8 +91,77 @@ Other enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. _whatsnew_0200.api:
 
+.. _whatsnew.api_breaking.index_map:
+
+Map on Index types now returns other Index types
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+-  ``map`` on an ``Index`` now returns an ``Index``, not a numpy array (:issue:`12766`)
+
+  .. ipython:: python
+
+     idx = Index([1, 2])
+     idx
+     mi = MultiIndex.from_tuples([(1, 2), (2, 4)])
+     mi
+
+  Previous Behavior:
+
+  .. code-block:: ipython
+
+     In [5]: idx.map(lambda x: x * 2)
+     Out[5]: array([2, 4])
+
+     In [6]: idx.map(lambda x: (x, x * 2))
+     Out[6]: array([(1, 2), (2, 4)], dtype=object)
+
+     In [7]: mi.map(lambda x: x)
+     Out[7]: array([(1, 2), (2, 4)], dtype=object)
+
+     In [8]: mi.map(lambda x: x[0])
+     Out[8]: array([1, 2])
+
+  New Behavior:
+
+  .. ipython:: python
+
+      idx.map(lambda x: x * 2)
+
+      idx.map(lambda x: (x, x * 2))
+
+      mi.map(lambda x: x)
+
+      mi.map(lambda x: x[0])
+
+
+-  ``map`` on a Series with datetime64 values may return int64 dtypes rather than int32
+
+  .. ipython:: python
+
+    s = Series(date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H').tz_localize('Asia/Tokyo'))
+    s
+
+  Previous Behavior:
+
+  .. code-block:: ipython
+
+    In [9]: s.map(lambda x: x.hour)
+    Out[9]:
+    0    0
+    1    1
+    2    2
+    dtype: int32
+
+
+  New Behavior:
+
+  .. ipython:: python
+
+    s.map(lambda x: x.hour)
+
+
+.. _whatsnew_0200.api:
 
 - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
 - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -930,8 +930,7 @@ def remove_unused_categories(self, inplace=False):
             return cat
 
     def map(self, mapper):
-        """
-        Apply mapper function to its categories (not codes).
+        """Apply mapper function to its categories (not codes).
 
         Parameters
         ----------
@@ -943,7 +942,8 @@ def map(self, mapper):
 
         Returns
         -------
-        applied : Categorical or np.ndarray.
+        applied : Categorical or Index.
+
         """
         new_categories = self.categories.map(mapper)
         try:

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -2427,8 +2427,7 @@ def groupby(self, values):
         return result
 
     def map(self, mapper):
-        """
-        Apply mapper function to its values.
+        """Apply mapper function to an index.
 
         Parameters
         ----------
@@ -2437,9 +2436,21 @@ def map(self, mapper):
 
         Returns
         -------
-        applied : array
+        applied : Union[Index, MultiIndex], inferred
+            The output of the mapping function applied to the index.
+            If the function returns a tuple with more than one element
+            a MultiIndex will be returned.
+
         """
-        return self._arrmap(self.values, mapper)
+        from .multi import MultiIndex
+        mapped_values = self._arrmap(self.values, mapper)
+        attributes = self._get_attributes_dict()
+        if mapped_values.size and isinstance(mapped_values[0], tuple):
+            return MultiIndex.from_tuples(mapped_values,
+                                          names=attributes.get('name'))
+
+        attributes['copy'] = False
+        return Index(mapped_values, **attributes)
 
     def isin(self, values, level=None):
         """

diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
@@ -517,22 +517,22 @@ def take(self, indices, axis=0, allow_fill=True,
         return self._create_from_codes(taken)
 
     def map(self, mapper):
-        """
-        Apply mapper function to its categories (not codes).
+        """Apply mapper function to its categories (not codes).
 
         Parameters
         ----------
         mapper : callable
             Function to be applied. When all categories are mapped
-            to different categories, the result will be Categorical which has
-            the same order property as the original. Otherwise, the result will
-            be np.ndarray.
+            to different categories, the result will be a CategoricalIndex
+            which has the same order property as the original. Otherwise,
+            the result will be an Index.
 
         Returns
         -------
-        applied : Categorical or np.ndarray.
+        applied : CategoricalIndex or Index
+
         """
-        return self.values.map(mapper)
+        return self._shallow_copy_with_infer(self.values.map(mapper))
 
     def delete(self, loc):
         """

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -767,6 +767,48 @@ def test_sub(self):
         self.assertRaises(TypeError, lambda: idx - idx.tolist())
         self.assertRaises(TypeError, lambda: idx.tolist() - idx)
 
+    def test_map_identity_mapping(self):
+        # GH 12766
+        for name, cur_index in self.indices.items():
+            tm.assert_index_equal(cur_index, cur_index.map(lambda x: x))
+
+    def test_map_with_tuples(self):
+        # GH 12766
+
+        # Test that returning a single tuple from an Index
+        #   returns an Index.
+        boolean_index = tm.makeIntIndex(3).map(lambda x: (x,))
+        expected = Index([(0,), (1,), (2,)])
+        tm.assert_index_equal(boolean_index, expected)
+
+        # Test that returning a tuple from a map of a single index
+        #   returns a MultiIndex object.
+        boolean_index = tm.makeIntIndex(3).map(lambda x: (x, x == 1))
+        expected = MultiIndex.from_tuples([(0, False), (1, True), (2, False)])
+        tm.assert_index_equal(boolean_index, expected)
+
+        # Test that returning a single object from a MultiIndex
+        #   returns an Index.
+        first_level = ['foo', 'bar', 'baz']
+        multi_index = MultiIndex.from_tuples(lzip(first_level, [1, 2, 3]))
+        reduced_index = multi_index.map(lambda x: x[0])
+        tm.assert_index_equal(reduced_index, Index(first_level))
+
+    def test_map_tseries_indices_return_index(self):
+        date_index = tm.makeDateIndex(10)
+        exp = Index([1] * 10)
+        tm.assert_index_equal(exp, date_index.map(lambda x: 1))
+
+        period_index = tm.makePeriodIndex(10)
+        tm.assert_index_equal(exp, period_index.map(lambda x: 1))
+
+        tdelta_index = tm.makeTimedeltaIndex(10)
+        tm.assert_index_equal(exp, tdelta_index.map(lambda x: 1))
+
+        date_index = tm.makeDateIndex(24, freq='h', name='hourly')
+        exp = Index(range(24), name='hourly')
+        tm.assert_index_equal(exp, date_index.map(lambda x: x.hour))
+
     def test_append_multiple(self):
         index = Index(['a', 'b', 'c', 'd', 'e', 'f'])
 
@@ -1194,16 +1236,16 @@ def check_slice(in_slice, expected):
             self.assert_index_equal(result, expected)
 
         for in_slice, expected in [
-                (SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''),
-                (SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'),
-                (SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'),
-                (SLC['y'::-4], 'yb'),
-                # absent labels
-                (SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'),
-                (SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'),
-                (SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'),
-                (SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''),
-                (SLC['m':'m':-1], '')
+            (SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''),
+            (SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'),
+            (SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'),
+            (SLC['y'::-4], 'yb'),
+            # absent labels
+            (SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'),
+            (SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'),
+            (SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'),
+            (SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''),
+            (SLC['m':'m':-1], '')
         ]:
             check_slice(in_slice, expected)
 

diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
@@ -207,19 +207,20 @@ def test_map(self):
         ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'),
                                  ordered=True)
         result = ci.map(lambda x: x.lower())
-        exp = pd.Categorical(list('ababc'), categories=list('cba'),
-                             ordered=True)
-        tm.assert_categorical_equal(result, exp)
+        exp = pd.CategoricalIndex(list('ababc'), categories=list('cba'),
+                                  ordered=True)
+        tm.assert_index_equal(result, exp)
 
         ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
                                  ordered=False, name='XXX')
         result = ci.map(lambda x: x.lower())
-        exp = pd.Categorical(list('ababc'), categories=list('bac'),
-                             ordered=False)
-        tm.assert_categorical_equal(result, exp)
+        exp = pd.CategoricalIndex(list('ababc'), categories=list('bac'),
+                                  ordered=False, name='XXX')
+        tm.assert_index_equal(result, exp)
 
-        tm.assert_numpy_array_equal(ci.map(lambda x: 1),
-                                    np.array([1] * 5, dtype=np.int64))
+        # GH 12766: Return an index not an array
+        tm.assert_index_equal(ci.map(lambda x: 1),
+                              Index(np.array([1] * 5, dtype=np.int64), name='XXX'))
 
         # change categories dtype
         ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'),
@@ -228,9 +229,9 @@ def f(x):
             return {'A': 10, 'B': 20, 'C': 30}.get(x)
 
         result = ci.map(f)
-        exp = pd.Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30],
-                             ordered=False)
-        tm.assert_categorical_equal(result, exp)
+        exp = pd.CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30],
+                                  ordered=False)
+        tm.assert_index_equal(result, exp)
 
     def test_where(self):
         i = self.create_index()

diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
@@ -123,8 +123,9 @@ def test_apply_datetimetz(self):
         tm.assert_series_equal(result, exp)
 
         # change dtype
+        # GH 14506 : Returned dtype changed from int32 to int64
         result = s.apply(lambda x: x.hour)
-        exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int32)
+        exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64)
         tm.assert_series_equal(result, exp)
 
         # not vectorized
@@ -317,8 +318,9 @@ def test_map_datetimetz(self):
         tm.assert_series_equal(result, exp)
 
         # change dtype
+        # GH 14506 : Returned dtype changed from int32 to int64
         result = s.map(lambda x: x.hour)
-        exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int32)
+        exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64)
         tm.assert_series_equal(result, exp)
 
         with tm.assertRaises(NotImplementedError):

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -1669,7 +1669,8 @@ def test_map(self):
         tm.assert_categorical_equal(result, exp)
 
         result = c.map(lambda x: 1)
-        tm.assert_numpy_array_equal(result, np.array([1] * 5, dtype=np.int64))
+        # GH 12766: Return an index not an array
+        tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
 
 
 class TestCategoricalAsBlock(tm.TestCase):

diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
@@ -27,7 +27,6 @@
 from pandas.util.decorators import Appender, cache_readonly
 import pandas.types.concat as _concat
 import pandas.tseries.frequencies as frequencies
-import pandas.algos as _algos
 
 
 class DatelikeOps(object):
@@ -330,11 +329,16 @@ def _nat_new(self, box=True):
     def map(self, f):
         try:
             result = f(self)
-            if not isinstance(result, (np.ndarray, Index)):
-                raise TypeError
+
+            # Try to use this result if we can
+            if isinstance(result, np.ndarray):
+                self._shallow_copy(result)
+
+            if not isinstance(result, Index):
+                raise TypeError('The map function must return an Index object')
             return result
         except Exception:
-            return _algos.arrmap_object(self.asobject.values, f)
+            return self.asobject.map(f)
 
     def sort_values(self, return_indexer=False, ascending=True):
         """

diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py
@@ -3,7 +3,7 @@
 import nose
 
 import numpy as np
-from pandas import Timestamp, Period
+from pandas import Timestamp, Period, Index
 from pandas.compat import u
 import pandas.util.testing as tm
 from pandas.tseries.offsets import Second, Milli, Micro
@@ -104,8 +104,8 @@ def test_dateindex_conversion(self):
         for freq in ('B', 'L', 'S'):
             dateindex = tm.makeDateIndex(k=10, freq=freq)
             rs = self.dtc.convert(dateindex, None, None)
-            xp = converter.dates.date2num(dateindex._mpl_repr())
-            tm.assert_almost_equal(rs, xp, decimals)
+            xp = Index(converter.dates.date2num(dateindex._mpl_repr()))
+            tm.assert_index_equal(rs, xp, decimals)
 
     def test_resolution(self):
         def _assert_less(ts1, ts2):

diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py
@@ -3521,8 +3521,8 @@ def test_map(self):
         tm.assert_index_equal(result, expected)
 
         result = index.map(lambda x: x.ordinal)
-        exp = np.array([x.ordinal for x in index], dtype=np.int64)
-        tm.assert_numpy_array_equal(result, exp)
+        exp = Index([x.ordinal for x in index])
+        tm.assert_index_equal(result, exp)
 
     def test_map_with_string_constructor(self):
         raw = [2005, 2007, 2009]
@@ -3534,20 +3534,17 @@ def test_map_with_string_constructor(self):
             types += text_type,
 
         for t in types:
-            expected = np.array(lmap(t, raw), dtype=object)
+            expected = Index(lmap(t, raw))
             res = index.map(t)
 
-            # should return an array
-            tm.assertIsInstance(res, np.ndarray)
+            # should return an Index
+            tm.assertIsInstance(res, Index)
 
             # preserve element types
             self.assertTrue(all(isinstance(resi, t) for resi in res))
 
-            # dtype should be object
-            self.assertEqual(res.dtype, np.dtype('object').type)
-
             # lastly, values should compare equal
-            tm.assert_numpy_array_equal(res, expected)
+            tm.assert_index_equal(res, expected)
 
     def test_convert_array_of_periods(self):
         rng = period_range('1/1/2000', periods=20, freq='D')