Skip to content

Commit aa56714

Browse files
authored
Add Perc stat for computing percentiles (#3063)
* Add Perc stat
* Add Perc tests
* Fix orientation test
* Add Perc to API docs
* Get Literal from typing_extensions when necessary
* Make robust to missing data
* Numpy backcompat
* Add backcompat conditional in test too
* Add API examples
1 parent 5013aea commit aa56714

File tree

7 files changed

+300
-2
lines changed

7 files changed

+300
-2
lines changed

doc/_docstrings/objects.Perc.ipynb

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "2d44a326-029b-47ff-b560-5f4b6a4bb73f",
7+
"metadata": {
8+
"tags": [
9+
"hide"
10+
]
11+
},
12+
"outputs": [],
13+
"source": [
14+
"import seaborn.objects as so\n",
15+
"from seaborn import load_dataset\n",
16+
"diamonds = load_dataset(\"diamonds\")"
17+
]
18+
},
19+
{
20+
"cell_type": "raw",
21+
"id": "65e975a2-2559-4bf1-8851-8bbbf52bf22d",
22+
"metadata": {},
23+
"source": [
24+
"The default behavior computes the quartiles and min/max of the input data:"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": null,
30+
"id": "36f927f5-3b64-4871-a355-adadc4da769b",
31+
"metadata": {},
32+
"outputs": [],
33+
"source": [
34+
"p = (\n",
35+
" so.Plot(diamonds, \"cut\", \"price\")\n",
36+
" .scale(y=\"log\")\n",
37+
")\n",
38+
"p.add(so.Dot(), so.Perc())"
39+
]
40+
},
41+
{
42+
"cell_type": "raw",
43+
"id": "feba1b99-0f71-4b18-8e7e-bd5470cc2d0c",
44+
"metadata": {},
45+
"source": [
46+
"Passing an integer will compute that many evenly-spaced percentiles:"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": null,
52+
"id": "f030dd39-1223-475a-93e1-1759a8971a6c",
53+
"metadata": {},
54+
"outputs": [],
55+
"source": [
56+
"p.add(so.Dot(), so.Perc(20))"
57+
]
58+
},
59+
{
60+
"cell_type": "raw",
61+
"id": "85bd754b-122e-4475-8727-2d584a90a38e",
62+
"metadata": {},
63+
"source": [
64+
"Passing a list will compute exactly those percentiles:"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": null,
70+
"id": "2fde7549-45b5-411a-afba-eb0da754d9e9",
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"p.add(so.Dot(), so.Perc([10, 25, 50, 75, 90]))"
75+
]
76+
},
77+
{
78+
"cell_type": "raw",
79+
"id": "7be16a13-dfc8-4595-a904-42f9be10f4f6",
80+
"metadata": {},
81+
"source": [
82+
"Combine with a range mark to show a percentile interval:"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": null,
88+
"id": "05c561c6-0449-4a61-96d1-390611a1b694",
89+
"metadata": {},
90+
"outputs": [],
91+
"source": [
92+
"(\n",
93+
" so.Plot(diamonds, \"price\", \"cut\")\n",
94+
" .add(so.Dots(pointsize=1, alpha=.2), so.Jitter(.3))\n",
95+
" .add(so.Range(color=\"k\"), so.Perc([25, 75]), so.Shift(y=.2))\n",
96+
" .scale(x=\"log\")\n",
97+
")"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": null,
103+
"id": "d464157c-3187-49c1-9cd8-71f284ce4c50",
104+
"metadata": {},
105+
"outputs": [],
106+
"source": []
107+
}
108+
],
109+
"metadata": {
110+
"kernelspec": {
111+
"display_name": "py310",
112+
"language": "python",
113+
"name": "py310"
114+
},
115+
"language_info": {
116+
"codemirror_mode": {
117+
"name": "ipython",
118+
"version": 3
119+
},
120+
"file_extension": ".py",
121+
"mimetype": "text/x-python",
122+
"name": "python",
123+
"nbconvert_exporter": "python",
124+
"pygments_lexer": "ipython3",
125+
"version": "3.10.0"
126+
}
127+
},
128+
"nbformat": 4,
129+
"nbformat_minor": 5
130+
}

doc/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ Stat objects
8686
Agg
8787
Est
8888
Hist
89+
Perc
8990
PolyFit
9091

9192
Move objects

doc/whatsnew/v0.12.1.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ v0.12.1 (Unreleased)
44

55
- |Feature| Added the :class:`objects.Text` mark (:pr:`3051`).
66

7+
- |Feature| Added the :class:`objects.Perc` stat (:pr:`3063`).
8+
79
- |Feature| The :class:`Band` and :class:`Range` marks will now cover the full extent of the data if `min` / `max` variables are not explicitly assigned or added in a transform (:pr:`3056`).
810

911
- |Enhancement| Marks that sort along the orient axis (e.g. :class:`Line`) now use a stable algorithm (:pr:`3064`).

seaborn/_stats/aggregation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from seaborn._core.groupby import GroupBy
1010
from seaborn._stats.base import Stat
1111
from seaborn._statistics import EstimateAggregator
12-
1312
from seaborn._core.typing import Vector
1413

1514

seaborn/_stats/order.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
from __future__ import annotations
3+
from dataclasses import dataclass
4+
from typing import ClassVar, cast
5+
try:
6+
from typing import Literal
7+
except ImportError:
8+
from typing_extensions import Literal # type: ignore
9+
10+
import numpy as np
11+
from pandas import DataFrame
12+
13+
from seaborn._core.scales import Scale
14+
from seaborn._core.groupby import GroupBy
15+
from seaborn._stats.base import Stat
16+
from seaborn.external.version import Version
17+
18+
19+
# From https://github.com/numpy/numpy/blob/main/numpy/lib/function_base.pyi
20+
_MethodKind = Literal[
21+
"inverted_cdf",
22+
"averaged_inverted_cdf",
23+
"closest_observation",
24+
"interpolated_inverted_cdf",
25+
"hazen",
26+
"weibull",
27+
"linear",
28+
"median_unbiased",
29+
"normal_unbiased",
30+
"lower",
31+
"higher",
32+
"midpoint",
33+
"nearest",
34+
]
35+
36+
37+
@dataclass
class Perc(Stat):
    """
    Replace observations with percentile values.

    Parameters
    ----------
    k : list of numbers or int
        If a list of numbers, this gives the percentiles (in [0, 100]) to compute.
        If an integer, compute `k` evenly-spaced percentiles between 0 and 100.
        For example, `k=5` computes the 0, 25, 50, 75, and 100th percentiles.
    method : str
        Method for interpolating percentiles between observed datapoints.
        See :func:`numpy.percentile` for valid options and more information.

    Examples
    --------
    .. include:: ../docstrings/objects.Perc.rst

    """
    k: int | list[float] = 5
    method: str = "linear"

    group_by_orient: ClassVar[bool] = True

    def _percentile(self, data: DataFrame, var: str) -> DataFrame:
        """
        Compute the requested percentiles of one column.

        Parameters
        ----------
        data : DataFrame
            Observations; only column `var` is read.
        var : str
            Name of the column whose percentiles are computed.

        Returns
        -------
        DataFrame
            One row per requested percentile, with the computed values in a
            column named `var` and the percentile levels in "percentile".

        """
        # An integer `k` is expanded to that many evenly-spaced levels on [0, 100];
        # anything else is used as the list of levels directly.
        k = list(np.linspace(0, 100, self.k)) if isinstance(self.k, int) else self.k
        method = cast(_MethodKind, self.method)

        # Drop missing observations once, up front, and reuse the result in both
        # version branches so that NaNs never reach np.percentile.
        values = data[var].dropna()

        # The keyword that selects the estimation method is spelled differently
        # on either side of numpy 1.22.
        if Version(np.__version__) < Version("1.22.0"):
            res = np.percentile(values, k, interpolation=method)  # type: ignore
        else:
            res = np.percentile(values, k, method=method)
        return DataFrame({var: res, "percentile": k})

    def __call__(
        self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale],
    ) -> DataFrame:
        """
        Apply the percentile computation through `groupby`.

        The percentiles are taken over the variable opposite to `orient`
        ("y" when `orient` is "x", and vice versa). `scales` is accepted but
        not used here.

        """
        var = {"x": "y", "y": "x"}[orient]
        return groupby.apply(data, self._percentile, var)

seaborn/objects.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@
3737

3838
from seaborn._stats.base import Stat # noqa: F401
3939
from seaborn._stats.aggregation import Agg, Est # noqa: F401
40-
from seaborn._stats.regression import PolyFit # noqa: F401
4140
from seaborn._stats.histogram import Hist # noqa: F401
41+
from seaborn._stats.order import Perc # noqa: F401
42+
from seaborn._stats.regression import PolyFit # noqa: F401
4243

4344
from seaborn._core.moves import Dodge, Jitter, Norm, Shift, Stack, Move # noqa: F401
4445

tests/_stats/test_order.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
2+
import numpy as np
3+
import pandas as pd
4+
5+
import pytest
6+
from numpy.testing import assert_array_equal
7+
8+
from seaborn._core.groupby import GroupBy
9+
from seaborn._stats.order import Perc
10+
from seaborn.external.version import Version
11+
12+
13+
class Fixtures:
    """Shared fixture and helper for the order-statistic tests."""

    @pytest.fixture
    def df(self, rng):
        """Frame with a constant empty-string ``x`` and 30 ``rng.normal`` draws as ``y``."""
        # NOTE(review): `rng` is a fixture defined outside this file — presumably
        # a seeded numpy random generator; confirm in conftest.
        return pd.DataFrame({"x": "", "y": rng.normal(size=30)})

    def get_groupby(self, df, orient):
        """Build a GroupBy over every column except the one opposite to `orient`."""
        # TODO note, copied from aggregation
        value_var = {"x": "y", "y": "x"}[orient]
        grouping = [col for col in df if col != value_var]
        return GroupBy(grouping)
24+
25+
26+
class TestPerc(Fixtures):
    """Checks of the Perc stat against direct np.percentile calls."""

    def test_int_k(self, df):
        """An integer `k` yields that many evenly-spaced percentiles."""
        orient = "x"
        levels = [0, 50, 100]
        groupby = self.get_groupby(df, orient)
        result = Perc(3)(df, groupby, orient, {})
        assert_array_equal(result["percentile"], levels)
        assert_array_equal(result["y"], np.percentile(df["y"], levels))

    def test_list_k(self, df):
        """A list `k` yields exactly the listed percentiles."""
        orient = "x"
        levels = [0, 20, 100]
        groupby = self.get_groupby(df, orient)
        result = Perc(k=levels)(df, groupby, orient, {})
        assert_array_equal(result["percentile"], levels)
        assert_array_equal(result["y"], np.percentile(df["y"], levels))

    def test_orientation(self, df):
        """With orient "y", percentiles are computed on the "x" column."""
        swapped = df.rename(columns={"x": "y", "y": "x"})
        orient = "y"
        groupby = self.get_groupby(swapped, orient)
        result = Perc(k=3)(swapped, groupby, orient, {})
        assert_array_equal(result["x"], np.percentile(swapped["x"], [0, 50, 100]))

    def test_method(self, df):
        """The `method` parameter is forwarded to np.percentile."""
        orient = "x"
        method = "nearest"
        levels = [0, 25, 50, 75, 100]
        groupby = self.get_groupby(df, orient)
        result = Perc(k=5, method=method)(df, groupby, orient, {})
        # Same version switch as the implementation: the keyword name differs
        # on either side of numpy 1.22.
        old_numpy = Version(np.__version__) < Version("1.22.0")
        keyword = "interpolation" if old_numpy else "method"
        expected = np.percentile(df["y"], levels, **{keyword: method})
        assert_array_equal(result["y"], expected)

    def test_grouped(self, df, rng):
        """Percentiles are computed separately within each level of "x"."""
        orient = "x"
        df = df.assign(x=rng.choice(["a", "b", "c"], len(df)))
        groupby = self.get_groupby(df, orient)
        levels = [10, 90]
        result = Perc(levels)(df, groupby, orient, {})
        for level, group_result in result.groupby("x"):
            assert_array_equal(group_result["percentile"], levels)
            group_values = df.loc[df["x"] == level, "y"]
            assert_array_equal(group_result["y"], np.percentile(group_values, levels))

    def test_with_na(self, df):
        """Missing values are ignored rather than propagated."""
        orient = "x"
        df.loc[:5, "y"] = np.nan
        groupby = self.get_groupby(df, orient)
        levels = [10, 90]
        result = Perc(levels)(df, groupby, orient, {})
        observed = df["y"].dropna()
        assert_array_equal(result["y"], np.percentile(observed, levels))

0 commit comments

Comments
 (0)