
Commit 6feaf34

Dataset CTE: support all check types including custom sql (#2482)
* Dataset CTE: cleanup and run only in cloud test command
* Dataset CTE: support all check types except custom sql
* Dataset CTE: support all check types except custom sql
* Dataset CTE: support custom sql checks
* fix tests
* fix tests
* small remarks
* small remarks
* Review remarks
1 parent 58a1530 commit 6feaf34

27 files changed, +663 -104 lines changed

soda-athena/src/soda_athena/common/data_sources/athena_data_source.py

Lines changed: 1 addition & 6 deletions
@@ -33,7 +33,7 @@ def __init__(self, data_source_model: AthenaDataSourceModel, connection: Optiona
         super().__init__(data_source_model=data_source_model, connection=connection)

     def _create_sql_dialect(self) -> SqlDialect:
-        return AthenaSqlDialect(self)
+        return AthenaSqlDialect(data_source_impl=self)

     def _create_data_source_connection(self) -> DataSourceConnection:
         return AthenaDataSourceConnection(
@@ -138,11 +138,6 @@ class AthenaSqlDialect(SqlDialect):
         (SodaDataTypeName.TIME, SodaDataTypeName.VARCHAR),
     )

-    # We need to pass the data source impl to the dialect to be able to access connection properties (such as the staging dir)
-    def __init__(self, data_source_impl: AthenaDataSourceImpl):
-        super().__init__()
-        self.data_source_impl = data_source_impl
-
     def default_casify(self, identifier: str) -> str:
         return identifier.lower()

soda-bigquery/src/soda_bigquery/common/data_sources/bigquery_data_source.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def __init__(self, data_source_model: BigQueryDataSourceModel, connection: Optio
         self.cached_location = None

     def _create_sql_dialect(self) -> SqlDialect:
-        return BigQuerySqlDialect()
+        return BigQuerySqlDialect(data_source_impl=self)

     def _create_data_source_connection(self) -> DataSourceConnection:
         return BigQueryDataSourceConnection(

soda-core/setup.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@
     "opentelemetry-exporter-otlp-proto-http>=1.16.0,<2.0.0",
     "tabulate[widechars]",
     "python-dotenv~=1.0",
+    "sqlglot",
 ]

 setup(

soda-core/src/soda_core/common/env_config_helper.py

Lines changed: 9 additions & 0 deletions
@@ -47,3 +47,12 @@ def soda_core_telemetry_local_debug_mode(self) -> bool:
     @property
     def soda_core_telemetry_test_mode(self) -> bool:
         return strtobool(os.getenv("SODA_CORE_TELEMETRY_TEST_MODE", "false"))
+
+    @property
+    def soda_instruction_id(self) -> str | None:
+        return os.getenv("SODA_INSTRUCTION_ID")
+
+    @property
+    def is_running_on_agent(self) -> bool:
+        # SODA_INSTRUCTION_ID is only set when running in Soda Agent
+        return self.soda_instruction_id is not None

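A minimal sketch of the new agent-detection behaviour. Only the SODA_INSTRUCTION_ID environment variable comes from the diff above; the standalone function and the example value are illustrative stand-ins for the new property.

import os

# Illustrative stand-in for the new is_running_on_agent property.
def is_running_on_agent() -> bool:
    # SODA_INSTRUCTION_ID is only set when running in Soda Agent
    return os.getenv("SODA_INSTRUCTION_ID") is not None

print(is_running_on_agent())                      # False while the variable is unset
os.environ["SODA_INSTRUCTION_ID"] = "example-id"  # hypothetical value
print(is_running_on_agent())                      # True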
soda-core/src/soda_core/common/sql_ast.py

Lines changed: 3 additions & 3 deletions
@@ -71,7 +71,7 @@ class FROM(BaseSqlExpression):
     table_name: str
     table_prefix: Optional[list[str]] = None
     alias: Optional[str] = None
-    sample_type: Optional[str] = None
+    sampler_type: Optional[str] = None
     sample_size: Optional[Number] = None

     def __post_init__(self):
@@ -85,8 +85,8 @@ def IN(self, table_prefix: str | list[str]) -> FROM:
         self.table_prefix = table_prefix if isinstance(table_prefix, list) else [table_prefix]
         return self

-    def SAMPLE(self, sample_type: str, sample_size: Number) -> FROM:
-        self.sample_type = sample_type
+    def SAMPLE(self, sampler_type: str, sample_size: Number) -> FROM:
+        self.sampler_type = sampler_type
         self.sample_size = sample_size
         return self

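A small usage sketch of the renamed builder, assuming FROM can be imported from soda_core.common.sql_ast as defined in this file; the table name, prefix, and sampler values are placeholders.

from soda_core.common.sql_ast import FROM

# Placeholder table, prefix, and sampler values; real callers pass the contract's
# sampler_type and sampler_limit (see invalidity_check.py below).
from_part = FROM("orders").IN(["analytics"]).SAMPLE(sampler_type="absolute_limit", sample_size=1000)
assert from_part.sampler_type == "absolute_limit"
assert from_part.sample_size == 1000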
soda-core/src/soda_core/common/sql_dialect.py

Lines changed: 29 additions & 5 deletions
@@ -5,7 +5,7 @@
 from datetime import date, datetime, time
 from numbers import Number
 from textwrap import indent
-from typing import Any, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Optional, Tuple

 from soda_core.common.data_source_results import QueryResult
 from soda_core.common.dataset_identifier import DatasetIdentifier
@@ -16,6 +16,7 @@
     SodaDataTypeName,
     SqlDataType,
 )
+from soda_core.common.soda_cloud_dto import SamplerType
 from soda_core.common.sql_ast import (
     ALTER_TABLE,
     ALTER_TABLE_ADD_COLUMN,
@@ -81,6 +82,10 @@
     SqlExpression,
     SqlExpressionStr,
 )
+from soda_core.common.sql_utils import apply_sampling_to_sql
+
+if TYPE_CHECKING:
+    from soda_core.common.data_source_impl import DataSourceImpl

 logger: logging.Logger = soda_logger

@@ -96,7 +101,12 @@ class SqlDialect:

     SODA_DATA_TYPE_SYNONYMS: tuple[tuple[SodaDataTypeName, ...]] = ()

-    def __init__(self):
+    def __init__(
+        self,
+        data_source_impl: DataSourceImpl,
+    ):
+        self.data_source_impl: DataSourceImpl = data_source_impl
+
         self._data_type_name_synonym_mappings: dict[str, str] = self._build_data_type_name_synonym_mappings(
             self._get_data_type_name_synonyms()
         )
@@ -727,8 +737,8 @@ def _build_from_part(self, from_part: FROM) -> str:
             )
         ]

-        if isinstance(from_part.sample_type, str) and isinstance(from_part.sample_size, Number):
-            from_parts.append(self._build_sample_sql(from_part.sample_type, from_part.sample_size))
+        if isinstance(from_part.sampler_type, str) and isinstance(from_part.sample_size, Number):
+            from_parts.append(self._build_sample_sql(from_part.sampler_type, from_part.sample_size))

         if isinstance(from_part.alias, str):
             from_parts.append(self._alias_format(from_part.alias))
@@ -976,7 +986,7 @@ def format_expr(e: SqlExpression) -> SqlExpression:
         string_to_hash = CONCAT_WS(separator="'||'", expressions=formatted_expressions)
         return self.build_expression_sql(STRING_HASH(string_to_hash))

-    def _build_sample_sql(self, sample_type: str, sample_size: Number) -> str:
+    def _build_sample_sql(self, sampler_type: str, sample_size: Number) -> str:
         raise NotImplementedError("Sampling not implemented for this dialect")

     def information_schema_namespace_elements(self, data_source_namespace: DataSourceNamespace) -> list[str]:
@@ -1191,6 +1201,20 @@ def get_sql_data_type_class(self) -> type:
     def supports_case_sensitive_column_names(self) -> bool:
         return True

+    def apply_sampling(
+        self,
+        sql: str,
+        sampler_limit: Number,
+        sampler_type: SamplerType,
+    ) -> str:
+        return apply_sampling_to_sql(
+            sql=sql,
+            sampler_limit=sampler_limit,
+            sampler_type=sampler_type,
+            read_dialect=self.data_source_impl.type_name,
+            write_dialect=self.data_source_impl.type_name,
+        )
+
     ########################################################
     # Metadata columns query
     ########################################################
soda-core/src/soda_core/common/sql_utils.py

Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from numbers import Number
+
+import sqlglot
+from soda_core.common.soda_cloud_dto import SamplerType
+from sqlglot import exp
+
+
+def build_sample_clause(sampler_limit: Number, sampler_type: SamplerType) -> exp.TableSample:
+    if sampler_limit <= 0:
+        raise ValueError("sampler_limit must be positive")
+
+    size = exp.Literal.number(sampler_limit)
+    sample = exp.TableSample()
+
+    if sampler_type == SamplerType.ABSOLUTE_LIMIT:
+        sample.set("size", size)
+    else:
+        raise ValueError(f"Unsupported sample type: {sampler_type}")
+
+    return sample
+
+
+def attach_sample_to_relation(rel: exp.Expression, sampler_limit: Number, sampler_type: SamplerType) -> None:
+    """
+    Attach a TableSample clause to a relation (Table or Subquery),
+    unless it already has one.
+    """
+    if rel is None:
+        return
+
+    if rel.args.get("sample"):
+        return
+
+    if isinstance(rel, (exp.Table, exp.Subquery)):
+        rel.set("sample", build_sample_clause(sampler_limit, sampler_type))
+
+
+def apply_sampling_to_sql(
+    sql: str,
+    sampler_limit: Number,
+    sampler_type: SamplerType,
+    read_dialect: str | None = None,
+    write_dialect: str | None = None,
+) -> str:
+    """
+    Add TABLESAMPLE / SAMPLE to every table-like source in all FROM and JOIN clauses,
+    including inside CTEs and subqueries.
+
+    Exact rendering is dialect-specific.
+    """
+    tree = sqlglot.parse_one(sql, read=read_dialect) if read_dialect else sqlglot.parse_one(sql)
+
+    # FROM sources (top-level, CTE bodies, nested subqueries)
+    # Keep track of CTEs and skip them as they are already sampled at their definition
+    ctes = {cte.alias_or_name for cte in tree.find_all(exp.CTE)}
+    for from_ in tree.find_all(exp.From):
+        if isinstance(from_.this, exp.Table) and from_.this.alias_or_name in ctes:
+            continue
+
+        attach_sample_to_relation(from_.this, sampler_limit, sampler_type)
+
+    # JOIN targets
+    for join in tree.find_all(exp.Join):
+        attach_sample_to_relation(join.this, sampler_limit, sampler_type)
+
+    return tree.sql(dialect=write_dialect) if write_dialect else tree.sql()

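A rough usage sketch of the new helper. The input query and limit are made up, and the exact TABLESAMPLE / SAMPLE rendering is dialect-specific; the read/write dialect arguments are omitted here because SqlDialect.apply_sampling passes the data source's type_name for both.

from soda_core.common.soda_cloud_dto import SamplerType
from soda_core.common.sql_utils import apply_sampling_to_sql

custom_sql = """
WITH recent AS (SELECT * FROM orders WHERE order_date > CURRENT_DATE - 30)
SELECT recent.id FROM recent JOIN customers c ON recent.customer_id = c.id
"""

sampled_sql = apply_sampling_to_sql(
    sql=custom_sql,
    sampler_limit=1000,
    sampler_type=SamplerType.ABSOLUTE_LIMIT,
)

# The physical sources (orders, customers) each get a sample clause; the reference to
# the CTE "recent" in the outer FROM is skipped because the CTE body is already sampled.
print(sampled_sql)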
soda-core/src/soda_core/contracts/impl/check_types/failed_rows_check.py

Lines changed: 10 additions & 1 deletion
@@ -85,11 +85,20 @@ def setup_metrics(
         self.failed_rows_count_metric_impl = self._resolve_metric(
             FailedRowsQueryMetricImpl(contract_impl=contract_impl, column_impl=column_impl, check_impl=self)
         )
+
+        sql = self.failed_rows_check_yaml.query
+
+        if contract_impl.should_apply_sampling:
+            sql = contract_impl.data_source_impl.sql_dialect.apply_sampling(
+                sql=sql,
+                sampler_limit=contract_impl.sampler_limit,
+                sampler_type=contract_impl.sampler_type,
+            )
         if contract_impl.data_source_impl:
             failed_rows_count_query: Query = FailedRowsCountQuery(
                 data_source_impl=contract_impl.data_source_impl,
                 metrics=[self.failed_rows_count_metric_impl],
-                failed_rows_query=self.failed_rows_check_yaml.query,
+                failed_rows_query=sql,
             )
             self.queries.append(failed_rows_count_query)

soda-core/src/soda_core/contracts/impl/check_types/invalidity_check.py

Lines changed: 36 additions & 34 deletions
@@ -89,6 +89,9 @@ def setup_metrics(self, contract_impl: ContractImpl, column_impl: ColumnImpl, ch
         )
         # this is used in the check extension to extract failed keys and rows
         self.ref_query = InvalidReferenceCountQuery(
+            cte=contract_impl.cte,
+            sampler_type=contract_impl.sampler_type,
+            sampler_limit=contract_impl.sampler_limit,
             metric_impl=self.invalid_count_metric_impl,
             dataset_filter=self.contract_impl.filter,
             check_filter=self.check_yaml.filter,
@@ -191,62 +194,63 @@ class DatasetAlias(Enum):
 class InvalidReferenceCountQuery(Query):
     def __init__(
         self,
+        cte: CTE,
+        sampler_type: Optional[str],
+        sampler_limit: Optional[Number],
         metric_impl: InvalidReferenceCountMetricImpl,
         dataset_filter: Optional[str],
         check_filter: Optional[str],
         data_source_impl: Optional[DataSourceImpl],
     ):
         super().__init__(data_source_impl=data_source_impl, metrics=[metric_impl])
         self.metric_impl = metric_impl
-        self.dataset_filter = dataset_filter
         self.check_filter = check_filter

         self.referencing_alias: str = DatasetAlias.CONTRACT.value
         self.referenced_alias: str = DatasetAlias.REFERENCE.value
+        self._referenced_cte_name: str = "_soda_filtered_referenced_dataset"

-        sql_ast = self.build_query(SELECT(COUNT(STAR())))
+        self.sampler_type: Optional[str] = sampler_type
+        self.sampler_limit: Optional[Number] = sampler_limit
+
+        sql_ast = self.build_query(cte=cte)
         self.sql = self.data_source_impl.sql_dialect.build_select_sql(sql_ast)

-    def build_query(self, select_expression: SqlExpression) -> SqlExpression:
-        sql_ast: list = [select_expression]
-        sql_ast.extend(self.query_from())
+    def build_query(self, cte: CTE) -> list[SqlExpression]:
+        query = [
+            WITH([cte, self.referenced_cte()]),
+            SELECT(COUNT(STAR())),
+            FROM(cte.alias).AS(self.referencing_alias),
+            WHERE.optional(SqlExpressionStr.optional(self.check_filter)),
+        ]
+
+        query.extend(self.query_join())

-        if self.dataset_filter or self.check_filter:
-            dataset_filter_expr: Optional[SqlExpressionStr] = None
-            check_filter_expr: Optional[SqlExpressionStr] = None
-            combined_filter_expr: Optional[SqlExpression] = None
+        return query

-            if self.dataset_filter:
-                dataset_filter_expr = SqlExpressionStr(self.dataset_filter)
-                combined_filter_expr = dataset_filter_expr
+    def referenced_cte(self) -> CTE:
+        valid_reference_data: ValidReferenceData = self.metric_impl.missing_and_validity.valid_reference_data
+        referenced_dataset_name: str = valid_reference_data.dataset_name
+        referenced_dataset_prefix: Optional[list[str]] = valid_reference_data.dataset_prefix

-            if self.check_filter:
-                check_filter_expr = SqlExpressionStr(self.check_filter)
-                combined_filter_expr = check_filter_expr
+        cte = CTE(self._referenced_cte_name).AS(
+            [
+                SELECT(STAR()),
+                FROM(referenced_dataset_name).IN(referenced_dataset_prefix),
+            ]
+        )

-            if dataset_filter_expr and check_filter_expr:
-                combined_filter_expr = AND([dataset_filter_expr, check_filter_expr])
+        if self.sampler_type and self.sampler_limit:
+            cte.cte_query[1] = cte.cte_query[1].SAMPLE(self.sampler_type, self.sampler_limit)

-            original_from = sql_ast[1].AS(None)
-            sql_ast[1] = FROM("filtered_dataset").AS(self.referencing_alias)
-            sql_ast = [
-                WITH([CTE("filtered_dataset").AS([SELECT(STAR()), original_from, WHERE(combined_filter_expr)])]),
-            ] + sql_ast
-            return sql_ast
+        return cte

-    def query_from(self) -> SqlExpression:
+    def query_join(self) -> SqlExpression:
         valid_reference_data: ValidReferenceData = self.metric_impl.missing_and_validity.valid_reference_data

-        referencing_dataset_name: str = self.metric_impl.contract_impl.dataset_name
-        referencing_dataset_prefix: Optional[str] = self.metric_impl.contract_impl.dataset_prefix
         referencing_column_name: str = self.metric_impl.column_impl.column_yaml.name

-        referenced_dataset_name: str = valid_reference_data.dataset_name
-        referenced_dataset_prefix: Optional[list[str]] = (
-            valid_reference_data.dataset_prefix
-            if valid_reference_data.dataset_prefix is not None
-            else self.metric_impl.contract_impl.dataset_prefix
-        )
+        referenced_dataset_name: str = self._referenced_cte_name
         referenced_column: str = valid_reference_data.column

         # The variant to get the failed rows is:
@@ -265,9 +269,7 @@ def query_from(self) -> SqlExpression:
         )

         return [
-            FROM(referencing_dataset_name).IN(referencing_dataset_prefix).AS(self.referencing_alias),
             LEFT_INNER_JOIN(referenced_dataset_name)
-            .IN(referenced_dataset_prefix)
             .ON(
                 EQ(
                     COLUMN(referencing_column_name).IN(self.referencing_alias),

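The heart of the rewrite above: the referenced dataset now lives in its own CTE, sampling is attached once at the CTE's definition, and the join targets the CTE name instead of the raw table. A minimal sketch of that pattern, assuming the builders are importable from soda_core.common.sql_ast and using placeholder dataset and sampler values.

from soda_core.common.sql_ast import CTE, FROM, SELECT, STAR

# Placeholder referenced dataset; the real values come from valid_reference_data.
referenced_cte = CTE("_soda_filtered_referenced_dataset").AS(
    [
        SELECT(STAR()),
        FROM("dim_customer").IN(["analytics"]),
    ]
)

# When a sampler is configured, the SAMPLE clause is attached to the FROM inside the
# CTE, as referenced_cte() does above ("absolute_limit" and 1000 stand in for
# contract_impl.sampler_type and contract_impl.sampler_limit).
referenced_cte.cte_query[1] = referenced_cte.cte_query[1].SAMPLE("absolute_limit", 1000)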
soda-core/src/soda_core/contracts/impl/check_types/metric_check.py

Lines changed: 8 additions & 1 deletion
@@ -78,14 +78,21 @@ def setup_metrics(
             )

         elif self.metric_check_yaml.query:
+            sql = self.metric_check_yaml.query
+
+            if contract_impl.should_apply_sampling:
+                sql = contract_impl.data_source_impl.sql_dialect.apply_sampling(
+                    sql, contract_impl.sampler_limit, sampler_type=contract_impl.sampler_type
+                )
+
             self.numeric_metric_impl = self._resolve_metric(
                 MetricQueryMetricImpl(contract_impl=contract_impl, column_impl=column_impl, check_impl=self)
             )
             if contract_impl.data_source_impl:
                 metric_query: Query = MetricQuery(
                     data_source_impl=contract_impl.data_source_impl,
                     metrics=[self.numeric_metric_impl],
-                    sql=self.metric_check_yaml.query,
+                    sql=sql,
                 )
                 self.queries.append(metric_query)
