Skip to content

Commit cff4600

Browse files
authored
Add proportion-based normalization in histplot (#2634)
Follows #2461. This gives the same result as `stat="probability"` but may be more intuitive for some people.
1 parent feef114 commit cff4600

File tree

4 files changed

+30
-23
lines changed

4 files changed

+30
-23
lines changed

doc/docstrings/histplot.ipynb

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -195,11 +195,11 @@
195195
]
196196
},
197197
{
198-
"cell_type": "markdown",
199-
"metadata": {},
200198
"source": [
201-
"It's also possible to normalize so that each bar's height shows a probability, which make more sense for discrete variables:"
202-
]
199+
"It's also possible to normalize so that each bar's height shows a probability, proportion, or percent, which makes more sense for discrete variables:"
200+
],
201+
"cell_type": "markdown",
202+
"metadata": {}
203203
},
204204
{
205205
"cell_type": "code",
@@ -208,7 +208,7 @@
208208
"outputs": [],
209209
"source": [
210210
"tips = sns.load_dataset(\"tips\")\n",
211-
"sns.histplot(data=tips, x=\"size\", stat=\"probability\", discrete=True)"
211+
"sns.histplot(data=tips, x=\"size\", stat=\"percent\", discrete=True)"
212212
]
213213
},
214214
{
@@ -480,4 +480,4 @@
480480
},
481481
"nbformat": 4,
482482
"nbformat_minor": 4
483-
}
483+
}

doc/releases/v0.11.2.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ This is primarily a bug fix release that addresses issues in the v0.11 series, a
1212

1313
- |Enhancement| In :func:`histplot`, improved performance with large datasets and many groupings/facets (:pr:`2559`, :pr:`2570`).
1414

15-
- |Enhancement| In :func:`histplot`, added `stat="percent"` as an option for normalization such that bar heights sum to 100 (:pr:`2461`).
15+
- |Enhancement| In :func:`histplot`, added `stat="percent"` as an option for normalization such that bar heights sum to 100 and `stat="proportion"` as an alias for the existing `stat="probability"` (:pr:`2461`, :pr:`2634`).
1616

1717
- |Enhancement| In :func:`kdeplot`, added the `warn_singular` parameter to silence the warning about data with zero variance (:pr:`2566`).
1818

seaborn/_statistics.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -209,13 +209,14 @@ def __init__(
209209
210210
Parameters
211211
----------
212-
stat : {"count", "frequency", "density", "probability", "percent"}
212+
stat : str
213213
Aggregate statistic to compute in each bin.
214214
215-
- ``count`` shows the number of observations
216-
- ``frequency`` shows the number of observations divided by the bin width
217-
- ``density`` normalizes counts so that the area of the histogram is 1
218-
- ``probability`` normalizes counts so that the sum of the bar heights is 1
215+
- `count`: show the number of observations in each bin
216+
- `frequency`: show the number of observations divided by the bin width
217+
- `probability` or `proportion`: normalize such that bar heights sum to 1
218+
- `percent`: normalize such that bar heights sum to 100
219+
- `density`: normalize such that the total area of the histogram equals 1
219220
220221
bins : str, number, vector, or a pair of such values
221222
Generic bin parameter that can be the name of a reference rule,
@@ -234,7 +235,9 @@ def __init__(
234235
If True, return the cumulative statistic.
235236
236237
"""
237-
stat_choices = ["count", "frequency", "density", "probability", "percent"]
238+
stat_choices = [
239+
"count", "frequency", "density", "probability", "proportion", "percent",
240+
]
238241
_check_argument("stat", stat_choices, stat)
239242

240243
self.stat = stat
@@ -341,7 +344,7 @@ def _eval_bivariate(self, x1, x2, weights):
341344
np.diff(bin_edges[1]),
342345
)
343346

344-
if self.stat == "probability":
347+
if self.stat == "probability" or self.stat == "proportion":
345348
hist = hist.astype(float) / hist.sum()
346349
elif self.stat == "percent":
347350
hist = hist.astype(float) / hist.sum() * 100
@@ -367,7 +370,7 @@ def _eval_univariate(self, x, weights):
367370
x, **bin_kws, weights=weights, density=density,
368371
)
369372

370-
if self.stat == "probability":
373+
if self.stat == "probability" or self.stat == "proportion":
371374
hist = hist.astype(float) / hist.sum()
372375
elif self.stat == "percent":
373376
hist = hist.astype(float) / hist.sum() * 100

seaborn/tests/test_distributions.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,27 +1299,31 @@ def test_density_stat_unique_norm(self, long_df):
12991299
bar_areas = np.multiply(bar_heights, bar_widths)
13001300
assert bar_areas.sum() == pytest.approx(1)
13011301

1302-
def test_probability_stat(self, flat_series):
1302+
@pytest.fixture(params=["probability", "proportion"])
1303+
def height_norm_arg(self, request):
1304+
return request.param
13031305

1304-
ax = histplot(flat_series, stat="probability")
1306+
def test_probability_stat(self, flat_series, height_norm_arg):
1307+
1308+
ax = histplot(flat_series, stat=height_norm_arg)
13051309
bar_heights = [b.get_height() for b in ax.patches]
13061310
assert sum(bar_heights) == pytest.approx(1)
13071311

1308-
def test_probability_stat_common_norm(self, long_df):
1312+
def test_probability_stat_common_norm(self, long_df, height_norm_arg):
13091313

13101314
ax = histplot(
13111315
data=long_df, x="x", hue="a",
1312-
stat="probability", common_norm=True, element="bars",
1316+
stat=height_norm_arg, common_norm=True, element="bars",
13131317
)
13141318
bar_heights = [b.get_height() for b in ax.patches]
13151319
assert sum(bar_heights) == pytest.approx(1)
13161320

1317-
def test_probability_stat_unique_norm(self, long_df):
1321+
def test_probability_stat_unique_norm(self, long_df, height_norm_arg):
13181322

13191323
n = 10
13201324
ax = histplot(
13211325
data=long_df, x="x", hue="a",
1322-
stat="probability", bins=n, common_norm=False, element="bars",
1326+
stat=height_norm_arg, bins=n, common_norm=False, element="bars",
13231327
)
13241328

13251329
bar_groups = ax.patches[:n], ax.patches[-n:]
@@ -1874,15 +1878,15 @@ def test_mesh_unique_norm(self, long_df):
18741878
density, (x_edges, y_edges) = sub_hist(sub_df["x"], sub_df["y"])
18751879
assert_array_equal(mesh_data.data, density.T.flat)
18761880

1877-
@pytest.mark.parametrize("stat", ["probability", "percent"])
1881+
@pytest.mark.parametrize("stat", ["probability", "proportion", "percent"])
18781882
def test_mesh_normalization(self, long_df, stat):
18791883

18801884
ax = histplot(
18811885
long_df, x="x", y="y", stat=stat,
18821886
)
18831887

18841888
mesh_data = ax.collections[0].get_array()
1885-
expected_sum = {"probability": 1, "percent": 100}[stat]
1889+
expected_sum = {"percent": 100}.get(stat, 1)
18861890
assert mesh_data.data.sum() == expected_sum
18871891

18881892
def test_mesh_colors(self, long_df):

0 commit comments

Comments
 (0)