Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
1f8c628
init commit kendall spearman ordinal cats
pandeconscious Oct 23, 2025
906f1e4
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Oct 27, 2025
497dc7e
series test update and fixes
pandeconscious Oct 27, 2025
583aca6
cat desc longer in tests
pandeconscious Oct 27, 2025
e069810
testing frame corr
pandeconscious Oct 27, 2025
b90726f
pre commit fixes v2
pandeconscious Oct 27, 2025
65a506c
cleanup
pandeconscious Oct 27, 2025
ab3b8b9
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 4, 2025
e93ed83
test import scipy fix
pandeconscious Nov 4, 2025
ec4d97e
rst sorting autofix
pandeconscious Nov 4, 2025
ebfc3b0
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 4, 2025
8cfacef
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 5, 2025
7ef7fb2
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 12, 2025
588808a
refactor
pandeconscious Nov 12, 2025
c484552
fix dtype for duplicates
pandeconscious Nov 12, 2025
216475c
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 16, 2025
e997747
clean up
pandeconscious Nov 16, 2025
4184167
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 16, 2025
8bcd3dc
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 18, 2025
2673281
clean up
pandeconscious Nov 18, 2025
ff48847
import fix
pandeconscious Nov 18, 2025
1c69e29
test tranform ordered cat func
pandeconscious Nov 18, 2025
8b26a7d
tests and mypy fixes
pandeconscious Nov 18, 2025
a625520
type check fix
pandeconscious Nov 18, 2025
259424e
addressing review comments
pandeconscious Nov 18, 2025
f141e6a
Merge branch 'main' into ordered_cat_corr
pandeconscious Nov 18, 2025
d2d0f71
type fix corr.py
pandeconscious Nov 19, 2025
858d0c2
ruff format
pandeconscious Nov 19, 2025
a8c88c7
mypy fix
pandeconscious Nov 19, 2025
1a472e3
Merge branch 'main' into ordered_cat_corr
pandeconscious Nov 19, 2025
71305aa
scipy unavailable fix in test
pandeconscious Nov 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ Other enhancements
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes preserve ``NA`` values and return ``UInt64`` dtype where appropriate instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`)
Expand Down
25 changes: 25 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11680,6 +11680,10 @@ def corr(
data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()

if method in ("spearman", "kendall"):
data = data._transform_ord_cat_cols_to_coded_cols()

mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
Expand Down Expand Up @@ -11973,6 +11977,8 @@ def corrwith(
correl = num / dom

elif method in ["kendall", "spearman"] or callable(method):
left = left._transform_ord_cat_cols_to_coded_cols()
right = right._transform_ord_cat_cols_to_coded_cols()

def c(x):
return nanops.nancorr(x[0], x[1], method=method)
Expand Down Expand Up @@ -12004,6 +12010,25 @@ def c(x):

return correl

def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can simplify this a bit and make it more performant.

result = self
made_copy = False
for idx, dtype in enumerate(self.dtypes):
    if not dtype == "category" or not dtype.ordered:
        continue
    col = result._ixs(idx, axis=1)
    if not made_copy:
        made_copy = True
        result = result.copy(deep=False)
    result._iset_item(idx, col.cat.codes.replace(-1, np.nan))
return result

Can you move this to pandas.core.methods.corr (this file does not yet exist) and make it take a DataFrame as input - we can move the remaining parts of the implementation in a later PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

"""
any ordered categorical columns are transformed to the respective
categorical codes while other columns remain untouched
"""
categ = self.select_dtypes("category")
if len(categ.columns) == 0:
return self

cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns

if len(cols_convert) > 0:
data = self.copy(deep=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit wary of taking an entire copy of the dataframe in instances where there might be ordered categoricals; that's a potentially large performance hit, and the usage of this seems pretty niche

I see @rhshadrach commented on the original issue, so let's see what his thoughts are

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

deep=False shouldn't be large as it doesn't copy the underlying data, but agreed we should measure the performance here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rhshadrach are you suggesting an asv benchmark or to profile it and paste the results in the description of the PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For benchmarking, we don't have any ASVs that hit this case. You can just set up an example that hits this case and use timeit to compare this PR to main. Aim for 10-100ms in runtime so we aren't merely benchmarking overhead. If you want any assistance in setting this up, just let me know.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Time profiling stats added to the description of the PR, please let me know if it makes sense or something else is needed as well.

data[cols_convert] = data[cols_convert].transform(
lambda x: x.cat.codes.replace(-1, np.nan)
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will fail when a DataFrame has duplicate column names.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for catching this, fixing this!

return data
return self

# ----------------------------------------------------------------------
# ndarray-like stats methods

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2686,6 +2686,12 @@ def corr(
if len(this) == 0:
return np.nan

if method in ("spearman", "kendall"):
if this.dtype == "category" and this.cat.ordered:
this = this.cat.codes.replace(-1, np.nan)
if other.dtype == "category" and other.cat.ordered:
other = other.cat.codes.replace(-1, np.nan)

this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

Expand Down
89 changes: 89 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from itertools import combinations

import numpy as np
import pytest

Expand Down Expand Up @@ -252,6 +254,46 @@ def test_corr_numeric_only(self, meth, numeric_only):
with pytest.raises(ValueError, match="could not convert string to float"):
df.corr(meth, numeric_only=numeric_only)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(
self,
method,
):
pytest.importorskip("scipy")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless you are going to use the import, you can just add this as a @td.skip_if_no("scipy") decorator to the test

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, fixed

df = DataFrame(
{
"ord_cat": Series(
pd.Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me these tests are unnecessarily long. Can you simplify - e.g. remove the Series call here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

"ord_cat_none": Series(
pd.Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
)
),
"ord_int": Series([0, 1, 2, 3]),
"ord_float": Series([2.0, 3.0, 4.5, 6.5]),
"ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the value in testing these; aren't these tested elsewhere? Can you remove them?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed

"ord_cat_shuff": Series(
pd.Categorical(
["m", "h", "vh", "low"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"ord_int_shuff": Series([2, 3, 0, 1]),
}
)
corr_calc = df.corr(method=method)
for col1, col2 in combinations(df.columns, r=2):
corr_expected = df[col1].corr(df[col2], method=method)
tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)


class TestDataFrameCorrWith:
@pytest.mark.parametrize(
Expand Down Expand Up @@ -493,3 +535,50 @@ def test_cov_with_missing_values(self):
result2 = df.dropna().cov()
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(self, method):
    # Rank-based corrwith on frames holding ordered categoricals is checked
    # column by column against Series.corr on the matching columns.
    pytest.importorskip("scipy")

    four_levels = ["low", "m", "h", "vh"]
    three_levels = ["low", "m", "h"]

    cat_full = pd.Categorical(
        ["low", "m", "h", "vh"], categories=four_levels, ordered=True
    )
    # None is not a category, so this column carries a missing value.
    cat_missing = pd.Categorical(
        ["low", "m", "h", None], categories=three_levels, ordered=True
    )
    cat_shuffled = pd.Categorical(
        ["m", "h", "vh", "low"], categories=four_levels, ordered=True
    )

    left = DataFrame(
        {
            "a": Series(cat_full),
            "b": Series(cat_missing),
            "c": Series([0, 1, 2, 3]),
            "d": Series([2.0, 3.0, 4.5, 6.5]),
        }
    )
    right = DataFrame(
        {
            "a": Series([2.0, 3.0, 4.5, np.nan]),
            "b": Series(cat_shuffled),
            "c": Series([2, 3, 0, 1]),
            "d": Series([2.0, 3.0, 4.5, 6.5]),
        }
    )

    result = left.corrwith(right, method=method)
    for name in left.columns:
        expected = left[name].corr(right[name], method=method)
        tm.assert_almost_equal(result.get(name), expected)
74 changes: 74 additions & 0 deletions pandas/tests/series/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,77 @@ def test_corr_callable_method(self, datetime_series):
df = pd.DataFrame([s1, s2])
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is pretty long, to the point where it's unclear what its intent is. Maybe it's worth breaking up into a few tests? Or adding parameterization?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

self,
method,
):
stats = pytest.importorskip("scipy.stats")
method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
ser_ord_cat = Series(
pd.Categorical(
["low", "med", "high", "very_high"],
categories=["low", "med", "high", "very_high"],
ordered=True,
)
)
ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan)
ser_ord_int = Series([0, 1, 2, 3])
ser_ord_float = Series([2.0, 3.0, 4.5, 6.5])

corr_calc = ser_ord_cat.corr(ser_ord_int, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_int, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat.corr(ser_ord_float, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_float, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

ser_ord_cat_shuff = Series(
pd.Categorical(
["high", "low", "very_high", "med"],
categories=["low", "med", "high", "very_high"],
ordered=True,
)
)
ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan)

corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

ser_ord_cat_with_nan = Series(
pd.Categorical(
["h", "low", "vh", None, "m"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
)
ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace(
-1, np.nan
)
ser_ord_int = Series([2, 0, 1, 3, None])
corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)
Loading