-
Couldn't load subscription status.
- Fork 67
feat(generator): emit temporal checks for date/datetime min/max #624
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,8 +1,8 @@ | ||||||||||||
| import logging | ||||||||||||
| import datetime | ||||||||||||
|
|
||||||||||||
| from databricks.labs.dqx.base import DQEngineBase | ||||||||||||
| from databricks.labs.dqx.engine import DQEngine | ||||||||||||
| from databricks.labs.dqx.profiler.common import val_maybe_to_str | ||||||||||||
| from databricks.labs.dqx.profiler.profiler import DQProfile | ||||||||||||
| from databricks.labs.dqx.telemetry import telemetry_logger | ||||||||||||
|
|
||||||||||||
|
|
@@ -70,49 +70,59 @@ def dq_generate_min_max(column: str, level: str = "error", **params: dict): | |||||||||||
| Generates a data quality rule to check if a column's value is within a specified range. | ||||||||||||
|
|
||||||||||||
| Args: | ||||||||||||
| column: The name of the column to check. | ||||||||||||
| level: The criticality level of the rule (default is "error"). | ||||||||||||
| params: Additional parameters, including the minimum and maximum values. | ||||||||||||
| column: The name of the column to check. | ||||||||||||
| level: The criticality level of the rule (default is "error"). | ||||||||||||
| params: Additional parameters, including the minimum and maximum values. | ||||||||||||
|
|
||||||||||||
| Returns: | ||||||||||||
| A dictionary representing the data quality rule, or None if no limits are provided. | ||||||||||||
| A dictionary representing the data quality rule, or None if no limits are provided. | ||||||||||||
| """ | ||||||||||||
| min_limit = params.get("min") | ||||||||||||
| max_limit = params.get("max") | ||||||||||||
|
|
||||||||||||
| if not isinstance(min_limit, int) or not isinstance(max_limit, int): | ||||||||||||
| return None # TODO handle timestamp and dates: https://github.com/databrickslabs/dqx/issues/71 | ||||||||||||
| if min_limit is None and max_limit is None: | ||||||||||||
| return None | ||||||||||||
|
|
||||||||||||
| def _is_num(value): | ||||||||||||
| return isinstance(value, int) | ||||||||||||
|
|
||||||||||||
| def _is_temporal(value): | ||||||||||||
| return isinstance(value, (datetime.date, datetime.datetime)) | ||||||||||||
|
|
||||||||||||
| def _same_family(value_a, value_b): | ||||||||||||
| # numeric with numeric OR temporal with temporal | ||||||||||||
| if value_a is None or value_b is None: | ||||||||||||
| return True | ||||||||||||
| return (_is_num(value_a) and _is_num(value_b)) or (_is_temporal(value_a) and _is_temporal(value_b)) | ||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
to simplify and make it easier to extend |
||||||||||||
|
|
||||||||||||
| if min_limit is not None and max_limit is not None: | ||||||||||||
| # Both bounds | ||||||||||||
| if min_limit is not None and max_limit is not None and _same_family(min_limit, max_limit): | ||||||||||||
| return { | ||||||||||||
| "check": { | ||||||||||||
| "function": "is_in_range", | ||||||||||||
| "arguments": { | ||||||||||||
| "column": column, | ||||||||||||
| "min_limit": val_maybe_to_str(min_limit, include_sql_quotes=False), | ||||||||||||
| "max_limit": val_maybe_to_str(max_limit, include_sql_quotes=False), | ||||||||||||
| # pass through Python ints or datetime/date without stringification | ||||||||||||
| "min_limit": min_limit, | ||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wouldn't that cause issues down the line? shouldn't the |
||||||||||||
| "max_limit": max_limit, | ||||||||||||
| }, | ||||||||||||
| }, | ||||||||||||
| "name": f"{column}_isnt_in_range", | ||||||||||||
| "criticality": level, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| if max_limit is not None: | ||||||||||||
| # Only max | ||||||||||||
| if max_limit is not None and (_is_num(max_limit) or _is_temporal(max_limit)): | ||||||||||||
| return { | ||||||||||||
| "check": { | ||||||||||||
| "function": "is_not_greater_than", | ||||||||||||
| "arguments": {"column": column, "limit": val_maybe_to_str(max_limit, include_sql_quotes=False)}, | ||||||||||||
| }, | ||||||||||||
| "check": {"function": "is_not_greater_than", "arguments": {"column": column, "limit": max_limit}}, | ||||||||||||
| "name": f"{column}_not_greater_than", | ||||||||||||
| "criticality": level, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| if min_limit is not None: | ||||||||||||
| # Only min | ||||||||||||
| if min_limit is not None and (_is_num(min_limit) or _is_temporal(min_limit)): | ||||||||||||
| return { | ||||||||||||
| "check": { | ||||||||||||
| "function": "is_not_less_than", | ||||||||||||
| "arguments": {"column": column, "limit": val_maybe_to_str(min_limit, include_sql_quotes=False)}, | ||||||||||||
| }, | ||||||||||||
| "check": {"function": "is_not_less_than", "arguments": {"column": column, "limit": min_limit}}, | ||||||||||||
| "name": f"{column}_not_less_than", | ||||||||||||
| "criticality": level, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,5 +1,5 @@ | ||||||||||
| import logging | ||||||||||
| import datetime | ||||||||||
|
Comment on lines
+1
to
2
|
||||||||||
| import logging | |
| import datetime | |
| import datetime | |
| import logging |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why are these cases remove?
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| import datetime | ||
|
|
||
| from databricks.labs.dqx.profiler.generator import DQGenerator | ||
|
|
||
|
|
||
| def test_date_both_bounds_is_in_range(): | ||
| result = DQGenerator.dq_generate_min_max( | ||
| "dcol", **{"min": datetime.date(2020, 1, 1), "max": datetime.date(2020, 12, 31)} | ||
| ) | ||
| assert result["check"]["function"] == "is_in_range" | ||
| args = result["check"]["arguments"] | ||
| assert args["column"] == "dcol" | ||
| assert args["min_limit"] == datetime.date(2020, 1, 1) | ||
| assert args["max_limit"] == datetime.date(2020, 12, 31) | ||
|
|
||
|
|
||
| def test_timestamp_only_min_is_not_less_than(): | ||
| timestamp = datetime.datetime(2024, 6, 1, 12, 0, 0) | ||
| result = DQGenerator.dq_generate_min_max("tscol", **{"min": timestamp, "max": None}) | ||
| assert result["check"]["function"] == "is_not_less_than" | ||
| args = result["check"]["arguments"] | ||
| assert args["column"] == "tscol" | ||
| assert args["limit"] == timestamp | ||
|
|
||
|
|
||
| def test_timestamp_only_max_is_not_greater_than(): | ||
| timestamp = datetime.datetime(2024, 6, 30, 23, 59, 59) | ||
| result = DQGenerator.dq_generate_min_max("tscol", **{"min": None, "max": timestamp}) | ||
| assert result["check"]["function"] == "is_not_greater_than" | ||
| args = result["check"]["arguments"] | ||
| assert args["column"] == "tscol" | ||
| assert args["limit"] == timestamp |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
_is_numfunction only checks forinttype but doesn't includefloator other numeric types likeDecimal. This could miss valid numeric values for min/max checks.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah, i guess we should also support float and decimal here