Add proportion-based normalization in histplot (#2634)

mwaskom · web-flow · commit cff46009ea7e · 2021-08-07T13:33:54.000-04:00
Follows #2461. This gives the same result as `stat="probability"` but may be more intuitive for some people.
diff --git a/doc/docstrings/histplot.ipynb b/doc/docstrings/histplot.ipynb
@@ -195,11 +195,11 @@
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
    "source": [
-    "It's also possible to normalize so that each bar's height shows a probability, which make more sense for discrete variables:"
-   ]
+    "It's also possible to normalize so that each bar's height shows a probability, proportion, or percent, which make more sense for discrete variables:"
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
   },
   {
    "cell_type": "code",
@@ -208,7 +208,7 @@
    "outputs": [],
    "source": [
     "tips = sns.load_dataset(\"tips\")\n",
-    "sns.histplot(data=tips, x=\"size\", stat=\"probability\", discrete=True)"
+    "sns.histplot(data=tips, x=\"size\", stat=\"percent\", discrete=True)"
    ]
   },
   {
@@ -480,4 +480,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
diff --git a/doc/releases/v0.11.2.txt b/doc/releases/v0.11.2.txt
@@ -12,7 +12,7 @@ This is primarily a bug fix release that addresses issues in the v0.11 series, a
 
 - |Enhancement| In :func:`histplot`, improved performance with large datasets and many groupings/facets (:pr:`2559`, :pr:`2570`).
 
-- |Enhancement| In :func:`histplot`, added `stat="percent"` as an option for normalization such that bar heights sum to 100 (:pr:`2461`).
+- |Enhancement| In :func:`histplot`, added `stat="percent"` as an option for normalization such that bar heights sum to 100 and `stat="proportion"` as an alias for the existing `stat="probability"` (:pr:`2461`, :pr:`2634`).
 
 - |Enhancement| In :func:`kdeplot`, added the `warn_singular` parameter to silence the warning about data with zero variance (:pr:`2566`).
 
diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py
@@ -209,13 +209,14 @@ def __init__(
 
         Parameters
         ----------
-        stat : {"count", "frequency", "density", "probability", "percent"}
+        stat : str
             Aggregate statistic to compute in each bin.
 
-            - ``count`` shows the number of observations
-            - ``frequency`` shows the number of observations divided by the bin width
-            - ``density`` normalizes counts so that the area of the histogram is 1
-            - ``probability`` normalizes counts so that the sum of the bar heights is 1
+            - `count`: show the number of observations in each bin
+            - `frequency`: show the number of observations divided by the bin width
+            - `probability` or `proportion`: normalize such that bar heights sum to 1
+            - `percent`: normalize such that bar heights sum to 100
+            - `density`: normalize such that the total area of the histogram equals 1
 
         bins : str, number, vector, or a pair of such values
             Generic bin parameter that can be the name of a reference rule,
@@ -234,7 +235,9 @@ def __init__(
             If True, return the cumulative statistic.
 
         """
-        stat_choices = ["count", "frequency", "density", "probability", "percent"]
+        stat_choices = [
+            "count", "frequency", "density", "probability", "proportion", "percent",
+        ]
         _check_argument("stat", stat_choices, stat)
 
         self.stat = stat
@@ -341,7 +344,7 @@ def _eval_bivariate(self, x1, x2, weights):
             np.diff(bin_edges[1]),
         )
 
-        if self.stat == "probability":
+        if self.stat == "probability" or self.stat == "proportion":
             hist = hist.astype(float) / hist.sum()
         elif self.stat == "percent":
             hist = hist.astype(float) / hist.sum() * 100
@@ -367,7 +370,7 @@ def _eval_univariate(self, x, weights):
             x, **bin_kws, weights=weights, density=density,
         )
 
-        if self.stat == "probability":
+        if self.stat == "probability" or self.stat == "proportion":
             hist = hist.astype(float) / hist.sum()
         elif self.stat == "percent":
             hist = hist.astype(float) / hist.sum() * 100
diff --git a/seaborn/tests/test_distributions.py b/seaborn/tests/test_distributions.py
@@ -1299,27 +1299,31 @@ def test_density_stat_unique_norm(self, long_df):
             bar_areas = np.multiply(bar_heights, bar_widths)
             assert bar_areas.sum() == pytest.approx(1)
 
-    def test_probability_stat(self, flat_series):
+    @pytest.fixture(params=["probability", "proportion"])
+    def height_norm_arg(self, request):
+        return request.param
 
-        ax = histplot(flat_series, stat="probability")
+    def test_probability_stat(self, flat_series, height_norm_arg):
+
+        ax = histplot(flat_series, stat=height_norm_arg)
         bar_heights = [b.get_height() for b in ax.patches]
         assert sum(bar_heights) == pytest.approx(1)
 
-    def test_probability_stat_common_norm(self, long_df):
+    def test_probability_stat_common_norm(self, long_df, height_norm_arg):
 
         ax = histplot(
             data=long_df, x="x", hue="a",
-            stat="probability", common_norm=True, element="bars",
+            stat=height_norm_arg, common_norm=True, element="bars",
         )
         bar_heights = [b.get_height() for b in ax.patches]
         assert sum(bar_heights) == pytest.approx(1)
 
-    def test_probability_stat_unique_norm(self, long_df):
+    def test_probability_stat_unique_norm(self, long_df, height_norm_arg):
 
         n = 10
         ax = histplot(
             data=long_df, x="x", hue="a",
-            stat="probability", bins=n, common_norm=False, element="bars",
+            stat=height_norm_arg, bins=n, common_norm=False, element="bars",
         )
 
         bar_groups = ax.patches[:n], ax.patches[-n:]
@@ -1874,15 +1878,15 @@ def test_mesh_unique_norm(self, long_df):
             density, (x_edges, y_edges) = sub_hist(sub_df["x"], sub_df["y"])
             assert_array_equal(mesh_data.data, density.T.flat)
 
-    @pytest.mark.parametrize("stat", ["probability", "percent"])
+    @pytest.mark.parametrize("stat", ["probability", "proportion", "percent"])
     def test_mesh_normalization(self, long_df, stat):
 
         ax = histplot(
             long_df, x="x", y="y", stat=stat,
         )
 
         mesh_data = ax.collections[0].get_array()
-        expected_sum = {"probability": 1, "percent": 100}[stat]
+        expected_sum = {"percent": 100}.get(stat, 1)
         assert mesh_data.data.sum() == expected_sum
 
     def test_mesh_colors(self, long_df):

Original file line number	Diff line number	Diff line change
`@@ -195,11 +195,11 @@`
`195`	`195`	`]`
`196`	`196`	`},`
`197`	`197`	`{`
`198`		`- "cell_type": "markdown",`
`199`		`- "metadata": {},`
`200`	`198`	`"source": [`
`201`		`- "It's also possible to normalize so that each bar's height shows a probability, which make more sense for discrete variables:"`
`202`		`- ]`
	`199`	`+ "It's also possible to normalize so that each bar's height shows a probability, proportion, or percent, which make more sense for discrete variables:"`
	`200`	`+ ],`
	`201`	`+ "cell_type": "markdown",`
	`202`	`+ "metadata": {}`
`203`	`203`	`},`
`204`	`204`	`{`
`205`	`205`	`"cell_type": "code",`
`@@ -208,7 +208,7 @@`
`208`	`208`	`"outputs": [],`
`209`	`209`	`"source": [`
`210`	`210`	`"tips = sns.load_dataset(\"tips\")\n",`
`211`		`- "sns.histplot(data=tips, x=\"size\", stat=\"probability\", discrete=True)"`
	`211`	`+ "sns.histplot(data=tips, x=\"size\", stat=\"percent\", discrete=True)"`
`212`	`212`	`]`
`213`	`213`	`},`
`214`	`214`	`{`
`@@ -480,4 +480,4 @@`
`480`	`480`	`},`
`481`	`481`	`"nbformat": 4,`
`482`	`482`	`"nbformat_minor": 4`
`483`		`-}`
	`483`	`+}`