Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ Other enhancements
- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support f-strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to the ``%`` format strings and callables (:issue:`49580`)
- :meth:`Series.corr`, :meth:`DataFrame.corr`, and :meth:`DataFrame.corrwith` with ``method="kendall"`` or ``method="spearman"`` now support ordered categorical data (:issue:`60306`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.nlargest` uses stable sort internally and will preserve original ordering in the case of equality (:issue:`55767`)
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@
treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
Expand Down Expand Up @@ -11718,6 +11719,10 @@ def corr(
data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()

if method in ("spearman", "kendall"):
data = transform_ord_cat_cols_to_coded_cols(data)

mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
Expand Down Expand Up @@ -12007,6 +12012,8 @@ def corrwith(
correl = num / dom

elif method in ["kendall", "spearman"] or callable(method):
left = transform_ord_cat_cols_to_coded_cols(left)
right = transform_ord_cat_cols_to_coded_cols(right)

def c(x):
return nanops.nancorr(x[0], x[1], method=method)
Expand Down
32 changes: 32 additions & 0 deletions pandas/core/methods/corr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Module for correlation related implementation
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas.core.dtypes.dtypes import CategoricalDtype

if TYPE_CHECKING:
from pandas import DataFrame


def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame:
    """
    Swap every ordered categorical column for its integer category codes.

    Columns are addressed by position, so frames with duplicate column
    labels are handled. A code of ``-1`` is replaced by ``NaN``. Columns
    that are not ordered categoricals are left as they are.

    Parameters
    ----------
    df : DataFrame
        Frame to transform.

    Returns
    -------
    DataFrame
        ``df`` itself when it has no ordered categorical column, otherwise
        a shallow copy of ``df`` with those columns replaced.
    """
    ordered_positions = [
        pos
        for pos, dtype in enumerate(df.dtypes)
        if isinstance(dtype, CategoricalDtype) and dtype.ordered
    ]
    if not ordered_positions:
        # Nothing to convert: hand back the caller's object untouched.
        return df

    # A single shallow copy so the replaced columns never land in ``df``.
    result = df.copy(deep=False)
    for pos in ordered_positions:
        codes = result._ixs(pos, axis=1).cat.codes
        result._iset_item(pos, codes.replace(-1, np.nan))
    return result
6 changes: 6 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2750,6 +2750,12 @@ def corr(
if len(this) == 0:
return np.nan

if method in ("spearman", "kendall"):
if this.dtype == "category" and this.cat.ordered:
this = this.cat.codes.replace(-1, np.nan)
if other.dtype == "category" and other.cat.ordered:
other = other.cat.codes.replace(-1, np.nan)

this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

Expand Down
92 changes: 92 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,59 @@ def test_corr_numeric_only(self, meth, numeric_only):
with pytest.raises(ValueError, match="could not convert string to float"):
df.corr(meth, numeric_only=numeric_only)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@pytest.mark.parametrize("col1", ["ord_cat", "ord_cat_none", "ord_cat_shuff"])
@pytest.mark.parametrize("col2", ["ord_cat", "ord_cat_none", "ord_cat_shuff"])
@td.skip_if_no("scipy")
def test_corr_rank_ordered_categorical(self, method, col1, col2):
# GH #60306
df = DataFrame(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add `# GH #60306` to the start of each test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

{
"ord_cat": pd.Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
),
"ord_cat_none": pd.Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
),
"ord_cat_shuff": pd.Categorical(
["m", "h", "vh", "low"],
categories=["low", "m", "h", "vh"],
ordered=True,
),
Comment on lines +263 to +277
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use two `pytest.mark.parametrize` decorators to simulate the `combinations` call in this test? It will be easier to see which combination fails if this test fails.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

}
)
corr_calc = df.corr(method=method)
corr_expected = df[col1].corr(df[col2], method=method)
tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@pytest.mark.parametrize("col1_idx", [0, 1, 2, 3, 4])
@pytest.mark.parametrize("col2_idx", [0, 1, 2, 3, 4])
@td.skip_if_no("scipy")
def test_corr_rank_ordered_categorical_duplicate_columns(
self, method, col1_idx, col2_idx
):
# GH #60306
cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
df = DataFrame(
{
"a": pd.array([1, 2, 3, 4], dtype=cat),
"b": pd.array([4, 3, 2, 1], dtype=cat),
"c": [4, 3, 2, 1],
"d": [10, 20, 30, 40],
"e": [100, 200, 300, 400],
Comment on lines +295 to +299
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

}
)
df.columns = ["a", "a", "c", "c", "e"]

corr_calc = df.corr(method=method)
corr_expected = df.iloc[:, col1_idx].corr(df.iloc[:, col2_idx], method=method)
tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected)


class TestDataFrameCorrWith:
@pytest.mark.parametrize(
Expand Down Expand Up @@ -493,3 +546,42 @@ def test_cov_with_missing_values(self):
result2 = df.dropna().cov()
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@pytest.mark.parametrize("col", ["a", "b", "c", "d"])
def test_corr_rank_ordered_categorical(self, method, col):
    """
    Check ``DataFrame.corrwith`` against ``Series.corr`` column by column.

    The two frames pair ordered categoricals (with and without a missing
    entry) with numeric columns and with each other.
    """
    # GH #60306
    pytest.importorskip("scipy")

    left_cat_full = pd.Categorical(
        ["low", "m", "h", "vh"],
        categories=["low", "m", "h", "vh"],
        ordered=True,
    )
    left_cat_missing = pd.Categorical(
        ["low", "m", "h", None],
        categories=["low", "m", "h"],
        ordered=True,
    )
    right_cat_shuffled = pd.Categorical(
        ["m", "h", "vh", "low"],
        categories=["low", "m", "h", "vh"],
        ordered=True,
    )

    df1 = DataFrame(
        {
            "a": left_cat_full,
            "b": left_cat_missing,
            "c": [0, 1, 2, 3],
            "d": [2.0, 3.0, 4.5, 6.5],
        }
    )
    df2 = DataFrame(
        {
            "a": [2.0, 3.0, 4.5, np.nan],
            "b": right_cat_shuffled,
            "c": [2, 3, 0, 1],
            "d": [2.0, 3.0, 4.5, 6.5],
        }
    )

    # The frame-level result for ``col`` must match the pairwise Series result.
    expected = df1[col].corr(df2[col], method=method)
    result = df1.corrwith(df2, method=method)
    tm.assert_almost_equal(result.get(col), expected)
132 changes: 132 additions & 0 deletions pandas/tests/methods/corr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""
Tests for core/methods/corr.py
"""

import numpy as np
import pytest

from pandas import (
Categorical,
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols


@pytest.mark.parametrize(
    ("input_df_dict", "expected_df_dict"),
    [
        pytest.param(
            # Case 1: two ordered categoricals, one holding a missing value.
            {
                "ord_cat": Categorical(
                    ["low", "m", "h", "vh"],
                    categories=["low", "m", "h", "vh"],
                    ordered=True,
                ),
                "ord_cat_none": Categorical(
                    ["low", "m", "h", None],
                    categories=["low", "m", "h"],
                    ordered=True,
                ),
            },
            {
                # No missing value: low=0, m=1, h=2, vh=3 as int8 codes.
                "ord_cat": Series([0, 1, 2, 3], dtype="int8"),
                # The missing entry becomes NaN, so the column is float.
                "ord_cat_none": [0.0, 1.0, 2.0, np.nan],
            },
            id="ordered-categoricals-basic",
        ),
        pytest.param(
            # Case 2: mixed dtypes, where only the ordered categorical changes.
            {
                "ordered": Categorical(
                    ["a", "c", "b"],
                    categories=["a", "b", "c"],
                    ordered=True,
                ),
                "unordered": Categorical(["x", "y", "x"], ordered=False),
                "num": [10, 20, 30],
                "text": ["u", "v", "w"],
            },
            {
                # a=0, c=2, b=1
                "ordered": Series([0, 2, 1], dtype="int8"),
                # The unordered categorical must come back as categorical.
                "unordered": Categorical(["x", "y", "x"], ordered=False),
                "num": [10, 20, 30],
                "text": ["u", "v", "w"],
            },
            id="mixed-types-only-ordered-changes",
        ),
    ],
)
def test_transform_ord_cat_cols_to_coded_cols(
    input_df_dict: dict, expected_df_dict: dict
) -> None:
    """Ordered categorical columns become codes; other columns are unchanged."""
    # GH #60306
    expected = DataFrame(expected_df_dict)
    result = transform_ord_cat_cols_to_coded_cols(DataFrame(input_df_dict))

    assert list(result.columns) == list(expected.columns)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("input_df_dict", "expected_df_dict"),
    [
        pytest.param(
            {
                "dup_1": Categorical(
                    ["low", "m", "h"],
                    categories=["low", "m", "h"],
                    ordered=True,
                ),
                "dup_2": [5, 6, 7],
            },
            {
                # Position 0 (ordered categorical) turns into codes [0, 1, 2];
                # position 1 keeps its numbers [5, 6, 7].
                "dup_1": Series([0, 1, 2], dtype="int8"),
                "dup_2": [5, 6, 7],
            },
            id="duplicate-names-ordered-first",
        ),
        pytest.param(
            {
                "dup_1": ["a", "b", "c"],  # plain strings, not categorical
                "dup_2": Categorical(
                    ["p", "q", None],
                    categories=["p", "q"],
                    ordered=True,
                ),
                "dup_3": Categorical(
                    ["low", "m", "h"],
                    categories=["low", "m", "h"],
                    ordered=True,
                ),
            },
            {
                # Position 0 is unchanged, position 1 becomes [0, 1, NaN],
                # and position 2 becomes [0, 1, 2].
                "dup_1": ["a", "b", "c"],
                "dup_2": [0.0, 1.0, np.nan],
                "dup_3": Series([0, 1, 2], dtype="int8"),
            },
            id="duplicate-names-ordered-and-non-categorical-and-none",
        ),
    ],
)
def test_transform_ord_cat_cols_to_coded_cols_duplicated_col(
    input_df_dict: dict, expected_df_dict: dict
) -> None:
    """The transform also works when every column carries the same label."""
    # GH #60306
    frame = DataFrame(input_df_dict)
    expected = DataFrame(expected_df_dict)

    # The dicts need unique keys, so the duplicate labels are applied here.
    frame.columns = ["dup"] * len(frame.columns)
    expected.columns = ["dup"] * len(expected.columns)

    result = transform_ord_cat_cols_to_coded_cols(frame)
    tm.assert_frame_equal(result, expected)
44 changes: 44 additions & 0 deletions pandas/tests/series/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Series,
Expand Down Expand Up @@ -184,3 +186,45 @@ def test_corr_callable_method(self, datetime_series):
df = pd.DataFrame([s1, s2])
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)

@td.skip_if_no("scipy")
@pytest.mark.parametrize("method", ["kendall", "spearman"])
@pytest.mark.parametrize(
"cat_series_inpt",
[
pd.Categorical( # ordered cat series
["low", "medium", "high"],
categories=["low", "medium", "high"],
ordered=True,
),
pd.Categorical( # ordered cat series with NA
["low", "medium", "high", None],
categories=["low", "medium", "high"],
ordered=True,
),
],
)
@pytest.mark.parametrize(
"other_series_inpt",
[
pd.Categorical( # other cat ordered series
["m", "l", "h"],
categories=["l", "m", "h"],
ordered=True,
),
# other non cat series
[2, 1, 3],
],
)
def test_corr_rank_ordered_categorical(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is pretty long, to the point where it's unclear what its intent is. Maybe it's worth breaking up into a few tests? Or adding parameterization?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

self,
method,
cat_series_inpt,
other_series_inpt,
):
# GH #60306
expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5}
cat_series = Series(cat_series_inpt)
other_series = Series(other_series_inpt)
corr_calc = cat_series.corr(other_series, method=method)
tm.assert_almost_equal(corr_calc, expected_corr[method])
Loading