# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Bootstrap statistical inference methods for evaluation metrics."""

from collections.abc import Hashable, Mapping
import functools

import arch.bootstrap
import numpy as np
import xarray as xr

from weatherbenchX import aggregation
from weatherbenchX import xarray_tree
from weatherbenchX.metrics import base as metrics_base
from weatherbenchX.statistical_inference import base
from weatherbenchX.statistical_inference import utils


def stationary_bootstrap_indices(
    n_data: int,
    mean_block_length: float,
    n_replicates: int,
    dtype: np.typing.DTypeLike = np.int64,
) -> np.ndarray:
  """Samples stationary-bootstrap indices with shape (n_data, n_replicates).

  Each of the n_replicates columns resamples range(n_data) as a sequence of
  blocks of consecutive (circularly wrapped) indices, with block lengths
  geometrically distributed with mean `mean_block_length`.
  """
  end_block_prob = 1 / mean_block_length
  current_indices = np.random.randint(
      n_data, size=(n_replicates,), dtype=dtype)
  all_indices = [current_indices]
  for _ in range(1, n_data):
    end_block_flags = np.random.rand(n_replicates) < end_block_prob
    new_random_indices = np.random.randint(
        n_data, size=(n_replicates,), dtype=dtype)
    next_indices = (current_indices + 1) % n_data
    current_indices = np.where(
        end_block_flags, new_random_indices, next_indices)
    all_indices.append(current_indices)
  return np.stack(all_indices, axis=0)
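# Illustrative usage (not part of the library API): for a 1-D numpy array
# `series`, the sampled indices can be used to form bootstrap replicates of a
# statistic such as the mean. The block length of 5.0 and the 1000 replicates
# below are arbitrary placeholder values:
#
#   idx = stationary_bootstrap_indices(
#       n_data=len(series), mean_block_length=5.0, n_replicates=1000)
#   replicate_means = series[idx].mean(axis=0)  # shape (1000,)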


_REPLICATE_DIM = 'bootstrap_replicate'


class StationaryBootstrap(base.StatisticalInferenceMethod):
  r"""Stationary bootstrap method of Politis and Romano [1].

  With optimal block length selection from [2], [3].

  [1] Politis, D. N. & Romano, J. P. The stationary bootstrap. J. Am. Stat.
  Assoc. 89, 1303–1313 (1994).
  [2] Politis, D. N. & White, H. Automatic Block-Length Selection for the
  Dependent Bootstrap, Econometric Reviews, 23:1, 53-70 (2004).
  [3] Patton, A., Politis, D. N. & White, H. Correction to "Automatic
  Block-Length Selection for the Dependent Bootstrap" by D. Politis and
  H. White, Econometric Reviews, 28:4, 372-375 (2009).
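
  Example usage (an illustrative sketch: `metrics` and `agg_state` stand in
  for a metrics mapping and an aggregation.AggregationState constructed
  elsewhere, and 'init_time' is just a placeholder dimension name):

    bootstrap = StationaryBootstrap(
        metrics=metrics,
        aggregated_statistics=agg_state,
        experimental_unit_dim='init_time',
        n_replicates=1000)
    point_estimates = bootstrap.point_estimates()
    lower, upper = bootstrap.confidence_intervals(alpha=0.05)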
  """

  def __init__(
      self,
      metrics: Mapping[str, metrics_base.Metric],
      aggregated_statistics: aggregation.AggregationState,
      experimental_unit_dim: str,
      n_replicates: int,
      mean_block_length: float | None = None,
      block_length_rounding_resolution: float | None = 30.0,
      stationary_bootstrap_indices_cache_size: int = 10,
  ):
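    """Initializer.

    Args:
      metrics: Mapping from metric name to the Metric to bootstrap.
      aggregated_statistics: AggregationState whose statistics retain the
        `experimental_unit_dim` dimension, so that they can be resampled
        along it.
      experimental_unit_dim: Dimension indexing the experimental units
        (e.g. forecast initialization times) that are resampled.
      n_replicates: Number of bootstrap replicates to draw.
      mean_block_length: Mean block length for the stationary bootstrap. If
        None, an optimal block length is estimated separately for each
        variable/component using arch.bootstrap.optimal_block_length.
      block_length_rounding_resolution: If not None, estimated block lengths
        are logarithmically rounded at this resolution so that nearby values
        can share cached bootstrap indices.
      stationary_bootstrap_indices_cache_size: Maximum number of sampled
        bootstrap index arrays held in the LRU cache.
    """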
    self._experimental_unit_dim = experimental_unit_dim
    self._mean_block_length = mean_block_length
    self._n_replicates = n_replicates
    self._aggregated_statistics = aggregated_statistics
    self._block_length_rounding_resolution = block_length_rounding_resolution
    self._stationary_bootstrap_indices = functools.lru_cache(
        maxsize=stationary_bootstrap_indices_cache_size)(
            stationary_bootstrap_indices)
    self._original_values = {}
    self._resampled_values = {}
    for metric_name, metric in metrics.items():
      original_values, resampled_values = self._bootstrap_results_for_metric(
          metric)
      self._original_values[metric_name] = original_values
      self._resampled_values[metric_name] = resampled_values

  def _optimal_block_length(self, data_array: xr.DataArray) -> float:
    if self._mean_block_length is not None:
      return self._mean_block_length

    assert self._experimental_unit_dim in data_array.dims
    if data_array.sizes[self._experimental_unit_dim] < 8:
      # In particular, arch.bootstrap.optimal_block_length fails with an
      # unfriendly error if given a smaller array.
      raise ValueError(
          'Need at least 8 data points along experimental_unit_dim '
          f'{self._experimental_unit_dim} to set mean_block_length '
          'automatically -- and many more than 8 are recommended.')
    data_array = data_array.squeeze()
    assert data_array.ndim == 1

    # .stationary gives the mean block length for use with the stationary
    # bootstrap:
    result = arch.bootstrap.optimal_block_length(
        data_array.data).stationary.item()
    # Estimates below 1 can sometimes show up, but 1 is the minimum valid
    # mean block length.
    result = max(1.0, result)
    if self._block_length_rounding_resolution is not None:
      # Rounding this off makes it a useful key for LRU caching of the
      # bootstrap indices. These need to be sampled separately for each mean
      # block length used, and this forms a significant fraction of total
      # running time. The inference of an optimal block length is noisy enough
      # that rounding off to 1 or 2 significant figures (or the similar but
      # smoother logarithmic rounding below) should be perfectly acceptable.
      result = utils.logarithmic_round(
          result, self._block_length_rounding_resolution)
    return result

  def _bootstrap_results_for_metric(
      self, metric: metrics_base.Metric) -> tuple[
          Mapping[Hashable, xr.DataArray], Mapping[Hashable, xr.DataArray]]:

    overall_values = metrics_base.compute_metric_from_statistics(
        metric, self._aggregated_statistics.sum_along_dims(
            [self._experimental_unit_dim]).mean_statistics())
    per_unit_values = metrics_base.compute_metric_from_statistics(
        metric, self._aggregated_statistics.mean_statistics())
    sum_weighted_stats = {
        stat_name: self._aggregated_statistics.sum_weighted_statistics[
            stat.unique_name]
        for stat_name, stat in metric.statistics.items()
    }
    sum_weights = {
        stat_name: self._aggregated_statistics.sum_weights[
            stat.unique_name]
        for stat_name, stat in metric.statistics.items()
    }
    resampled_values = {}
    for var_name in overall_values.keys():
      # Results for different variables will need to be computed separately,
      # as the optimal block length will depend on the variable.
      #
      # We try to avoid recomputing results for *all* variables every time we
      # do a bootstrap resample whose block length was chosen for just one of
      # those variables, using the following logic:
      if (len(overall_values) > 1 and
          all(var_name in vars for vars in sum_weighted_stats.values())):
        # A corresponding variable is present in each Statistic. We assume
        # that this variable in the result only depends on these corresponding
        # variables in the statistics, and that we can therefore recompute the
        # Metric with the statistics restricted to just this single variable.
        # This saves us resampling statistics for all the other variables.
        sum_weighted_stats_for_this_var = {
            stat_name: {var_name: vars[var_name]}
            for stat_name, vars in sum_weighted_stats.items()
        }
        sum_weights_for_this_var = {
            stat_name: {var_name: vars[var_name]}
            for stat_name, vars in sum_weights.items()
        }
      else:
        # If there is only a single variable, it's fine to resample all the
        # statistics since this will only be done once.
        # If there are multiple variables and they don't correspond 1:1 to
        # variables in the statistics, then we can't do any better than
        # resampling all the statistics, even though this may result in some
        # redundant work. This should be a rare edge case though.
        sum_weighted_stats_for_this_var = sum_weighted_stats
        sum_weights_for_this_var = sum_weights

      # The optimal block length will also depend on the specific component
      # within the DataArray for the metric result; for example, different
      # degrees of autocorrelation may be observed for forecast metrics at
      # different lead times. Bootstrap indices therefore need to be sampled
      # separately for each component along any dimensions present in the
      # metric result.
      #
      # We assume that, where a dimension of the metric result also occurs in
      # the statistics, the metric values at index i along that dimension only
      # depend on the statistics at index i along the same dimension, and that
      # we can therefore slice the statistics down to a single index along any
      # such dimensions when computing a single index of the metric result.
      #
      # This assumption isn't strictly guaranteed, but it is true in the vast
      # majority of cases, including:
      # * The common case of a per-component metric like RMSE, which is a
      #   scalar quantity computed independently for each component.
      # * Metrics which introduce some additional internal dimensions on their
      #   statistics, but reduce them down to a scalar value in their output.
      # * Metrics which introduce some additional dimensions in their output
      #   which aren't present in the statistics, but use dimension names for
      #   them that differ from any dimensions used in the statistics.
      per_var_resampled_values = utils.apply_to_slices(
          functools.partial(self._bootstrap_results_for_metric_scalar,
                            metric, var_name),
          per_unit_values[var_name],
          sum_weighted_stats_for_this_var,
          sum_weights_for_this_var,
          dim=overall_values[var_name].dims,
      )
      resampled_values[var_name] = per_var_resampled_values
    return overall_values, resampled_values

  def _bootstrap_results_for_metric_scalar(
      self,
      metric: metrics_base.Metric,
      var_name: str,
      per_unit_values: xr.DataArray,
      sum_weighted_stats: Mapping[str, Mapping[Hashable, xr.DataArray]],
      sum_weights: Mapping[str, Mapping[Hashable, xr.DataArray]],
  ) -> xr.DataArray:
    n_data = per_unit_values.sizes[self._experimental_unit_dim]
    mean_block_length = self._optimal_block_length(per_unit_values)
    bootstrap_indices = self._stationary_bootstrap_indices(
        n_data=n_data,
        mean_block_length=mean_block_length,
        n_replicates=self._n_replicates,
    )
    bootstrap_indices = xr.DataArray(
        bootstrap_indices, dims=[self._experimental_unit_dim, _REPLICATE_DIM])

    def sum_of_resampled(data):
      return data.isel({self._experimental_unit_dim: bootstrap_indices}).sum(
          self._experimental_unit_dim)
    sum_weighted_stats, sum_weights = xarray_tree.map_structure(
        sum_of_resampled, (sum_weighted_stats, sum_weights))
    mean_stats = xarray_tree.map_structure(
        lambda x, y: x / y, sum_weighted_stats, sum_weights)
    del sum_weighted_stats, sum_weights

    return metric.values_from_mean_statistics(mean_stats)[var_name]

  def point_estimates(self) -> base.MetricValues:
    return self._original_values

  def standard_error_estimates(self) -> base.MetricValues:
    return xarray_tree.map_structure(
        lambda x: x.std(_REPLICATE_DIM, ddof=1), self._resampled_values)

  def confidence_intervals(
      self, alpha: float = 0.05
  ) -> tuple[base.MetricValues, base.MetricValues]:
    # TODO(matthjw): implement BCa intervals.
    return (
        xarray_tree.map_structure(
            lambda x: x.quantile(alpha / 2, _REPLICATE_DIM),
            self._resampled_values),
        xarray_tree.map_structure(
            lambda x: x.quantile(1 - alpha / 2, _REPLICATE_DIM),
            self._resampled_values),
    )

  def p_values(self, null_value: float = 0.) -> base.MetricValues:
    """p-values for two-sided tests against the given null hypothesis value."""

    # Obtained by inverting the percentile confidence interval above.
    # TODO(matthjw): replace with inverting the BCa interval when implemented.

    def p_value_numpy_1d(resampled: np.ndarray) -> float:
      data = np.sort(resampled)
      q = np.linspace(0, 1, data.shape[0])
      empirical_cdf_at_null = np.interp(null_value, data, q)
      return 2 * min(empirical_cdf_at_null, 1 - empirical_cdf_at_null)

    def p_value(resampled: xr.DataArray) -> xr.DataArray:
      return xr.apply_ufunc(
          p_value_numpy_1d,
          resampled,
          input_core_dims=[[_REPLICATE_DIM]],
          vectorize=True)

    return xarray_tree.map_structure(p_value, self._resampled_values)