Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 30 additions & 20 deletions src/databricks/labs/dqx/profiler/generator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import logging
import datetime

from databricks.labs.dqx.base import DQEngineBase
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.profiler.common import val_maybe_to_str
from databricks.labs.dqx.profiler.profiler import DQProfile
from databricks.labs.dqx.telemetry import telemetry_logger

Expand Down Expand Up @@ -70,49 +70,59 @@ def dq_generate_min_max(column: str, level: str = "error", **params: dict):
Generates a data quality rule to check if a column's value is within a specified range.

Args:
column: The name of the column to check.
level: The criticality level of the rule (default is "error").
params: Additional parameters, including the minimum and maximum values.
column: The name of the column to check.
level: The criticality level of the rule (default is "error").
params: Additional parameters, including the minimum and maximum values.

Returns:
A dictionary representing the data quality rule, or None if no limits are provided.
A dictionary representing the data quality rule, or None if no limits are provided.
"""
min_limit = params.get("min")
max_limit = params.get("max")

if not isinstance(min_limit, int) or not isinstance(max_limit, int):
return None # TODO handle timestamp and dates: https://github.com/databrickslabs/dqx/issues/71
if min_limit is None and max_limit is None:
return None

def _is_num(value):
return isinstance(value, int)
Comment on lines +86 to +87
Copy link

Copilot AI Oct 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _is_num function only checks for int type but doesn't include float or other numeric types like Decimal. This could miss valid numeric values for min/max checks.

Copilot uses AI. Check for mistakes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think we should also support `float` and `Decimal` here.


def _is_temporal(value):
return isinstance(value, (datetime.date, datetime.datetime))

def _same_family(value_a, value_b):
# numeric with numeric OR temporal with temporal
if value_a is None or value_b is None:
return True
return (_is_num(value_a) and _is_num(value_b)) or (_is_temporal(value_a) and _is_temporal(value_b))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return (_is_num(value_a) and _is_num(value_b)) or (_is_temporal(value_a) and _is_temporal(value_b))
return any([
_is_num(value_a) and _is_num(value_b),
_is_temporal(value_a) and _is_temporal(value_b),
])

to simplify and make it easier to extend


if min_limit is not None and max_limit is not None:
# Both bounds
if min_limit is not None and max_limit is not None and _same_family(min_limit, max_limit):
return {
"check": {
"function": "is_in_range",
"arguments": {
"column": column,
"min_limit": val_maybe_to_str(min_limit, include_sql_quotes=False),
"max_limit": val_maybe_to_str(max_limit, include_sql_quotes=False),
# pass through Python ints or datetime/date without stringification
"min_limit": min_limit,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't that cause issues down the line? Shouldn't `val_maybe_to_str` still be used here?

"max_limit": max_limit,
},
},
"name": f"{column}_isnt_in_range",
"criticality": level,
}

if max_limit is not None:
# Only max
if max_limit is not None and (_is_num(max_limit) or _is_temporal(max_limit)):
return {
"check": {
"function": "is_not_greater_than",
"arguments": {"column": column, "limit": val_maybe_to_str(max_limit, include_sql_quotes=False)},
},
"check": {"function": "is_not_greater_than", "arguments": {"column": column, "limit": max_limit}},
"name": f"{column}_not_greater_than",
"criticality": level,
}

if min_limit is not None:
# Only min
if min_limit is not None and (_is_num(min_limit) or _is_temporal(min_limit)):
return {
"check": {
"function": "is_not_less_than",
"arguments": {"column": column, "limit": val_maybe_to_str(min_limit, include_sql_quotes=False)},
},
"check": {"function": "is_not_less_than", "arguments": {"column": column, "limit": min_limit}},
"name": f"{column}_not_less_than",
"criticality": level,
}
Expand Down
39 changes: 24 additions & 15 deletions tests/integration/test_rules_generator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
import datetime
Comment on lines +1 to 2
Copy link

Copilot AI Oct 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] `import datetime` is placed after `import logging`. PEP 8 only requires grouping imports (standard library, third-party, local); alphabetical ordering within a group is an isort-style convention, under which `import datetime` would come first.

Suggested change
import logging
import datetime
import datetime
import logging

Copilot uses AI. Check for mistakes.
from decimal import Decimal

from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.profiler.profiler import DQProfile
Expand All @@ -22,19 +22,6 @@
parameters={"min": datetime.date(2020, 1, 1), "max": None},
description="Real min/max values were used",
),
DQProfile(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why were these cases removed?

name="min_max",
column="product_expiry_ts",
parameters={"min": None, "max": datetime.datetime(2020, 1, 1)},
description="Real min/max values were used",
),
DQProfile(name="is_random", column="vendor_id", parameters={"in": ["1", "4", "2"]}),
DQProfile(
name='min_max',
column='d1',
description='Real min/max values were used',
parameters={'max': Decimal('333323.00'), 'min': Decimal('1.23')},
),
]


Expand Down Expand Up @@ -71,6 +58,14 @@ def test_generate_dq_rules(ws):
"name": "rate_code_id_isnt_in_range",
"criticality": "error",
},
{
"check": {
"function": "is_not_less_than",
"arguments": {"column": "product_launch_date", "limit": datetime.date(2020, 1, 1)},
},
"name": "product_launch_date_not_less_than",
"criticality": "error",
},
]
assert expectations == expected

Expand Down Expand Up @@ -108,13 +103,27 @@ def test_generate_dq_rules_warn(ws):
"name": "rate_code_id_isnt_in_range",
"criticality": "warn",
},
{
"check": {
"function": "is_not_less_than",
"arguments": {"column": "product_launch_date", "limit": datetime.date(2020, 1, 1)},
},
"name": "product_launch_date_not_less_than",
"criticality": "warn",
},
]
assert expectations == expected


def test_generate_dq_rules_logging(ws, caplog):
# capture INFO from the generator module where the skip log is emitted
caplog.set_level(logging.INFO, logger="databricks.labs.dqx.profiler.generator")

generator = DQGenerator(ws)
generator.generate_dq_rules(test_rules)
# add an unknown rule to trigger the "skipping..." log
unknown_rule = DQProfile(name="is_random", column="vendor_id")
generator.generate_dq_rules(test_rules + [unknown_rule])

assert "No rule 'is_random' for column 'vendor_id'. skipping..." in caplog.text


Expand Down
32 changes: 32 additions & 0 deletions tests/unit/test_generator_temporal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import datetime

from databricks.labs.dqx.profiler.generator import DQGenerator


def test_date_both_bounds_is_in_range():
    """A date minimum plus a date maximum yields an ``is_in_range`` rule.

    The generated arguments are expected to hold the original
    ``datetime.date`` objects, not string renderings of them.
    """
    lower_bound = datetime.date(2020, 1, 1)
    upper_bound = datetime.date(2020, 12, 31)

    rule = DQGenerator.dq_generate_min_max("dcol", min=lower_bound, max=upper_bound)

    check = rule["check"]
    assert check["function"] == "is_in_range"
    arguments = check["arguments"]
    assert arguments["column"] == "dcol"
    assert arguments["min_limit"] == lower_bound
    assert arguments["max_limit"] == upper_bound


def test_timestamp_only_min_is_not_less_than():
    """A timestamp minimum with no maximum yields an ``is_not_less_than`` rule.

    The limit is expected to be the same ``datetime.datetime`` value that
    was passed in.
    """
    lower_bound = datetime.datetime(2024, 6, 1, 12, 0, 0)

    rule = DQGenerator.dq_generate_min_max("tscol", min=lower_bound, max=None)

    check = rule["check"]
    assert check["function"] == "is_not_less_than"
    arguments = check["arguments"]
    assert arguments["column"] == "tscol"
    assert arguments["limit"] == lower_bound


def test_timestamp_only_max_is_not_greater_than():
    """A timestamp maximum with no minimum yields an ``is_not_greater_than`` rule.

    The limit is expected to be the same ``datetime.datetime`` value that
    was passed in.
    """
    upper_bound = datetime.datetime(2024, 6, 30, 23, 59, 59)

    rule = DQGenerator.dq_generate_min_max("tscol", min=None, max=upper_bound)

    check = rule["check"]
    assert check["function"] == "is_not_greater_than"
    arguments = check["arguments"]
    assert arguments["column"] == "tscol"
    assert arguments["limit"] == upper_bound