diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e8510a8d7c345..c4dcc01d68cd8 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -204,6 +204,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) +- :meth:`DataFrame.rank` now preserves the ``dtype_backend`` for extension arrays (:issue:`52829`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8126bd072a8dc..a2547af7a6dc3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1091,7 +1091,6 @@ def rank( ) else: raise TypeError("Array with ndim > 2 are not supported.") - return ranks diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 07c297b2c15ff..cd30588309a23 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2414,8 +2414,6 @@ def _rank( """ See Series.rank.__doc__. """ - if axis != 0: - raise NotImplementedError return rank( self, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 93a7de467dd97..1ef48f1a150bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9276,16 +9276,11 @@ def rank( msg = "na_option must be one of 'keep', 'top', or 'bottom'" raise ValueError(msg) - def ranker(data): - if data.ndim == 2: - # i.e. DataFrame, we cast to ndarray - values = data.values - else: - # i.e. Series, can dispatch to EA - values = data._values - - if isinstance(values, ExtensionArray): - ranks = values._rank( + def ranker(blk_values): + if axis_int == 0: + blk_values = blk_values.T + if isinstance(blk_values, ExtensionArray): + ranks = blk_values._rank( axis=axis_int, method=method, ascending=ascending, @@ -9294,16 +9289,16 @@ def ranker(data): ) else: ranks = algos.rank( - values, + blk_values, axis=axis_int, method=method, ascending=ascending, na_option=na_option, pct=pct, ) - - ranks_obj = self._constructor(ranks, **data._construct_axes_dict()) - return ranks_obj.__finalize__(self, method="rank") + if axis_int == 0: + ranks = ranks.T + return ranks if numeric_only: if self.ndim == 1 and not is_numeric_dtype(self.dtype): @@ -9316,7 +9311,16 @@ def ranker(data): else: data = self - return ranker(data) + should_transpose = axis_int == 1 + + if should_transpose: + data = data.T + applied = data._mgr.apply(ranker) + result = self._constructor_from_mgr(applied, axes=applied.axes) + if should_transpose: + result = result.T + + return result.__finalize__(self, method="rank") @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"]) def compare( diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 6c6c208ee0c78..258d4c60dd4b9 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -10,11 +10,14 @@ Infinity, NegInfinity, ) +import pandas.util._test_decorators as td from pandas import ( DataFrame, Index, Series, + to_datetime, + to_timedelta, ) import pandas._testing as tm @@ -498,3 +501,96 @@ def test_rank_string_dtype(self, string_dtype_no_object): exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "method,og_dtype,expected_dtype", + [ + ("average", "UInt32", "Float64"), + ("average", "Float32", "Float64"), + pytest.param( + "average", + "int32[pyarrow]", + "double[pyarrow]", + marks=td.skip_if_no("pyarrow"), + ), + ("min", "Int32", "UInt64"), + ("min", "Float32", "UInt64"), + pytest.param( + "min", + "int32[pyarrow]", + "uint64[pyarrow]", + marks=td.skip_if_no("pyarrow"), + ), + ], + ) + def test_rank_extension_array_dtype(self, method, og_dtype, expected_dtype): + # GH#52829 + result = DataFrame([4, 89, 33], dtype=og_dtype).rank(method=method) + if method == "average": + expected = DataFrame([1.0, 3.0, 2.0], dtype=expected_dtype) + else: + expected = DataFrame([1, 3, 2], dtype=expected_dtype) + tm.assert_frame_equal(result, expected) + + def test_rank_mixed_extension_array_dtype(self): + # GH#52829 + pytest.importorskip("pyarrow") + result = DataFrame( + { + "base": Series([4, 5, 6]), + "pyarrow": Series([7, 8, 9], dtype="int32[pyarrow]"), + } + ).rank(method="min") + expected = DataFrame( + { + "base": Series([1.0, 2.0, 3.0], dtype="float64"), + "pyarrow": Series([1, 2, 3], dtype="uint64[pyarrow]"), + } + ) + tm.assert_frame_equal(result, expected) + + def test_2d_extension_array_datetime(self): + # GH#52829 + df = DataFrame( + { + "year": to_datetime(["2012-1-1", "2013-1-1", "2014-1-1"]), + "week": to_datetime(["2012-1-2", "2012-1-9", "2012-1-16"]), + "day": to_datetime(["2012-1-3", "2012-1-4", "2012-1-5"]), + } + ) + axis0_expected = DataFrame( + {"year": [1.0, 2.0, 3.0], "week": [1.0, 2.0, 3.0], "day": [1.0, 2.0, 3.0]} + ) + axis1_expected = DataFrame( + {"year": [1.0, 3.0, 3.0], "week": [2.0, 2.0, 2.0], "day": [3.0, 1.0, 1.0]} + ) + tm.assert_frame_equal(df.rank(), axis0_expected) + tm.assert_frame_equal(df.rank(1), axis1_expected) + + def test_2d_extension_array_timedelta(self): + # GH#52829 + df = DataFrame( + { + "day": to_timedelta(["0 days", "1 day", "2 days"]), + "hourly": to_timedelta(["23 hours", "24 hours", "25 hours"]), + "minute": to_timedelta( + ["1439 minutes", "1440 minutes", "1441 minutes"] + ), + } + ) + axis0_expected = DataFrame( + { + "day": [1.0, 2.0, 3.0], + "hourly": [1.0, 2.0, 3.0], + "minute": [1.0, 2.0, 3.0], + } + ) + axis1_expected = DataFrame( + { + "day": [1.0, 2.0, 3.0], + "hourly": [2.0, 2.0, 2.0], + "minute": [3.0, 2.0, 1.0], + } + ) + tm.assert_frame_equal(df.rank(), axis0_expected) + tm.assert_frame_equal(df.rank(1), axis1_expected)