FEAT-#7549: Emit metrics on auto-switch and casting behavior (#7550)

sfc-gh-jkew · sfc-gh-joshi · web-flow · commit 0d2f4bcf1c6a · 2025-05-22T18:28:17.000-07:00
Both the query compiler caster and the backend cost calculator now emit
new metrics that track backend-switching behavior. A metrics handler can
consume these metrics to implement debugging tools.

Co-authored-by: Jonathan Shi <jonathan.shi@snowflake.com>
diff --git a/modin/core/storage_formats/base/query_compiler_calculator.py b/modin/core/storage_formats/base/query_compiler_calculator.py
@@ -20,13 +20,15 @@
 """
 
 import logging
+import random
 from types import MappingProxyType
 from typing import Any, Optional
 
 from modin.core.storage_formats.base.query_compiler import (
     BaseQueryCompiler,
     QCCoercionCost,
 )
+from modin.logging.metrics import emit_metric
 
 
 class AggregatedBackendData:
@@ -102,6 +104,8 @@ def calculate(self) -> str:
         """
         if self._result_backend is not None:
             return self._result_backend
+        if len(self._qc_list) == 1:
+            return self._qc_list[0].get_backend()
         if len(self._qc_list) == 0:
             raise ValueError("No query compilers registered")
 
@@ -146,12 +150,32 @@ def calculate(self) -> str:
             logging.info(
                 f"BackendCostCalculator Results: {self._calc_result_log(self._result_backend)}"
             )
+            # Does not need to be secure, should not use system entropy
+            metrics_group = "%04x" % random.randrange(16**4)
+            for qc in self._qc_list:
+                max_shape = qc._max_shape()
+                backend = qc.get_backend()
+                emit_metric(
+                    f"hybrid.merge.candidate.{backend}.group.{metrics_group}.rows",
+                    max_shape[0],
+                )
+                emit_metric(
+                    f"hybrid.merge.candidate.{backend}.group.{metrics_group}.cols",
+                    max_shape[1],
+                )
+            for k, v in self._backend_data.items():
+                emit_metric(
+                    f"hybrid.merge.candidate.{k}.group.{metrics_group}.cost", v.cost
+                )
+            emit_metric(
+                f"hybrid.merge.decision.{self._result_backend}.group.{metrics_group}",
+                1,
+            )
 
         if self._result_backend is None:
             raise ValueError(
                 f"Cannot cast to any of the available backends, as the estimated cost is too high. Tried these backends: [{','.join(self._backend_data.keys())}]"
             )
-
         return self._result_backend
 
     def _add_cost_data(self, backend, cost):
diff --git a/modin/core/storage_formats/pandas/query_compiler_caster.py b/modin/core/storage_formats/pandas/query_compiler_caster.py
@@ -22,6 +22,7 @@
 import functools
 import inspect
 import logging
+import random
 from abc import ABC, abstractmethod
 from collections import defaultdict, namedtuple
 from types import FunctionType, MappingProxyType, MethodType
@@ -42,6 +43,7 @@
 )
 from modin.error_message import ErrorMessage
 from modin.logging import disable_logging
+from modin.logging.metrics import emit_metric
 from modin.utils import sentinel
 
 Fn = TypeVar("Fn", bound=Any)
@@ -724,6 +726,8 @@ def _get_backend_for_auto_switch(
     # backend.
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
 
+    # Does not need to be secure, should not use system entropy
+    metrics_group = "%04x" % random.randrange(16**4)
     starting_backend = input_qc.get_backend()
 
     min_move_stay_delta = None
@@ -734,6 +738,23 @@ def _get_backend_for_auto_switch(
         operation=function_name,
         arguments=arguments,
     )
+    data_max_shape = input_qc._max_shape()
+    emit_metric(
+        f"hybrid.auto.api.{class_of_wrapped_fn}.{function_name}.group.{metrics_group}",
+        1,
+    )
+    emit_metric(
+        f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.stay_cost",
+        stay_cost,
+    )
+    emit_metric(
+        f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.rows",
+        data_max_shape[0],
+    )
+    emit_metric(
+        f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.cols",
+        data_max_shape[1],
+    )
     for backend in Backend.get_active_backends():
         if backend in ("Ray", "Unidist", "Dask"):
             # Disable automatically switching to these engines for now, because
@@ -778,16 +799,32 @@ def _get_backend_for_auto_switch(
             ):
                 min_move_stay_delta = move_stay_delta
                 best_backend = backend
+            emit_metric(
+                f"hybrid.auto.candidate.{backend}.group.{metrics_group}.move_to_cost",
+                move_to_cost,
+            )
+            emit_metric(
+                f"hybrid.auto.candidate.{backend}.group.{metrics_group}.other_execute_cost",
+                other_execute_cost,
+            )
+            emit_metric(
+                f"hybrid.auto.candidate.{backend}.group.{metrics_group}.delta",
+                move_stay_delta,
+            )
+
             logging.info(
                 f"After {class_of_wrapped_fn} function {function_name}, "
                 + f"considered moving to backend {backend} with "
                 + f"(transfer_cost {move_to_cost} + other_execution_cost {other_execute_cost}) "
                 + f", stay_cost {stay_cost}, and move-stay delta "
                 + f"{move_stay_delta}"
             )
+
     if best_backend == starting_backend:
+        emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 0)
         logging.info(f"Chose not to switch backends after operation {function_name}")
     else:
+        emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 1)
         logging.info(f"Chose to move to backend {best_backend}")
     return best_backend
 
diff --git a/modin/tests/pandas/native_df_interoperability/test_compiler_caster.py b/modin/tests/pandas/native_df_interoperability/test_compiler_caster.py
@@ -44,6 +44,7 @@
     register_function_for_post_op_switch,
     register_function_for_pre_op_switch,
 )
+from modin.logging.metrics import add_metric_handler, clear_metric_handler
 from modin.pandas.api.extensions import register_pd_accessor
 from modin.tests.pandas.utils import create_test_dfs, df_equals, eval_general
 
@@ -1308,3 +1309,48 @@ class AQC(NativeQueryCompiler):
 
     assert qc._engine_max_size() == oldmax
     assert qc._transfer_threshold() == oldthresh
+
+
+def test_cast_metrics(pico_df, cluster_df):  # check that a cross-backend concat emits the "hybrid.merge" metrics
+    try:
+        count = 0  # number of merge metrics observed by the handler
+
+        def test_handler(metric: str, value) -> None:
+            nonlocal count
+            if metric.startswith("modin.hybrid.merge"):  # count merge metrics only; "modin." prefix presumably added by emit_metric -- confirm
+                count += 1
+
+        add_metric_handler(test_handler)
+        df3 = pd.concat([pico_df, cluster_df], axis=1)
+        assert df3.get_backend() == "Cluster"  # result should be on cluster
+        assert count == 7  # presumably 2 inputs x (rows, cols) + 2 backend costs + 1 decision -- confirm
+    finally:
+        clear_metric_handler(test_handler)  # in finally, so it runs even when an assertion above fails
+
+
+def test_switch_metrics(pico_df, cluster_df):  # check that a registered pre-op switch point emits the "hybrid.auto" metrics
+    with backend_test_context(
+        test_backend="Big_Data_Cloud",
+        choices=("Big_Data_Cloud", "Small_Data_Local"),
+    ):
+        try:
+            count = 0  # number of auto-switch metrics observed by the handler
+
+            def test_handler(metric: str, value) -> None:
+                nonlocal count
+                if metric.startswith("modin.hybrid.auto"):  # count auto-switch metrics only; "modin." prefix presumably added by emit_metric -- confirm
+                    count += 1
+
+            add_metric_handler(test_handler)
+
+            register_function_for_pre_op_switch(
+                class_name="DataFrame",
+                backend="Big_Data_Cloud",
+                method="describe",
+            )
+            df = pd.DataFrame([1] * 10)
+            assert df.get_backend() == "Big_Data_Cloud"
+            df.describe()  # the registered method; the metrics counted below are expected from this call
+            assert count == 8  # presumably 1 api + 3 current (stay_cost, rows, cols) + 3 for one candidate + 1 decision -- confirm
+        finally:
+            clear_metric_handler(test_handler)  # in finally, so it runs even when an assertion above fails