Skip to content

Commit 2a5a6e8

Browse files
authored
feat(MLOP-2236): add NTZ (#360)
* feat: NTZ and new tests
1 parent cbda73d commit 2a5a6e8

File tree

4 files changed

+84
-3
lines changed

4 files changed

+84
-3
lines changed

butterfree/constants/data_type.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
IntegerType,
1313
LongType,
1414
StringType,
15+
TimestampNTZType,
1516
TimestampType,
1617
)
1718
from typing_extensions import final
@@ -21,6 +22,7 @@
2122
class DataType(Enum):
2223
"""Holds constants for data types within Butterfree."""
2324

25+
TIMESTAMP_NTZ = (TimestampNTZType(), "timestamp", "TIMESTAMP_NTZ")
2426
TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP")
2527
BINARY = (BinaryType(), "boolean", "BINARY")
2628
BOOLEAN = (BooleanType(), "boolean", "BOOLEAN")

butterfree/transform/features/timestamp_feature.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class TimestampFeature(Feature):
4141

4242
def __init__(
4343
self,
44+
dtype: Optional[DataType] = DataType.TIMESTAMP,
4445
from_column: Optional[str] = None,
4546
transformation: Optional[TransformComponent] = None,
4647
from_ms: bool = False,
@@ -51,7 +52,7 @@ def __init__(
5152
name=TIMESTAMP_COLUMN,
5253
description=description,
5354
from_column=from_column,
54-
dtype=DataType.TIMESTAMP,
55+
dtype=dtype,
5556
transformation=transformation,
5657
)
5758
self.from_ms = from_ms

tests/integration/butterfree/pipelines/test_feature_set_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def create_temp_view(dataframe: DataFrame, name):
5050

5151

5252
def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table):
53-
spark.sql(f"drop schema {table_reader_db} cascade")
53+
spark.sql(f"drop schema if exists {table_reader_db} cascade")
5454
spark.sql(f"create database {table_reader_db}")
5555
spark.sql(f"use {table_reader_db}")
5656
spark.sql(

tests/unit/butterfree/transform/features/test_timestamp_feature.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,26 @@
1-
from pyspark.sql.types import StringType
1+
from datetime import datetime
22

3+
import pytz
4+
from pyspark.sql.types import StringType, StructField, StructType
5+
6+
from butterfree.clients import SparkClient
37
from butterfree.constants import DataType
48
from butterfree.constants.columns import TIMESTAMP_COLUMN
59
from butterfree.transform.features import TimestampFeature
610

11+
# from pyspark.sql.types import *
12+
713

814
class TestTimestampFeature:
915
def test_args_without_transformation(self):
1016

1117
test_key = TimestampFeature(from_column="ts")
18+
test_key_ntz = TimestampFeature(dtype=DataType.TIMESTAMP_NTZ, from_column="ts")
1219

1320
assert test_key.name == TIMESTAMP_COLUMN
1421
assert test_key.from_column == "ts"
1522
assert test_key.dtype == DataType.TIMESTAMP
23+
assert test_key_ntz.dtype == DataType.TIMESTAMP_NTZ
1624

1725
def test_transform(self, feature_set_dataframe):
1826

@@ -70,3 +78,73 @@ def test_transform_mask(self, feature_set_dataframe_date):
7078

7179
assert df[0]["timestamp"] == "2020-02-07 00:00:00"
7280
assert df[1]["timestamp"] == "2020-02-08 00:00:00"
81+
82+
def test_timezone_configs(self):
83+
84+
spark = SparkClient()
85+
now = datetime.now()
86+
87+
# Testing a new timezone
88+
spark.conn.conf.set("spark.sql.session.timeZone", "GMT-5")
89+
90+
time_list = [(now, now)]
91+
rdd = spark.conn.sparkContext.parallelize(time_list)
92+
93+
schema = StructType(
94+
[
95+
StructField("ts", DataType.TIMESTAMP.spark, True),
96+
StructField("ts_ntz", DataType.TIMESTAMP_NTZ.spark, True),
97+
]
98+
)
99+
df = spark.conn.createDataFrame(rdd, schema)
100+
df.createOrReplaceTempView("temp_tz_table")
101+
102+
df1 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""")
103+
df2 = df1.withColumns(
104+
{"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())}
105+
)
106+
df2_vals = df2.collect()[0]
107+
108+
assert df2_vals.ts != df2_vals.ts_ntz
109+
110+
# New TZ. Column with TZ must have a != value; Column NTZ must keep its value
111+
spark.conn.conf.set("spark.sql.session.timeZone", "GMT-7")
112+
113+
df3 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""")
114+
df4 = df3.withColumns(
115+
{"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())}
116+
)
117+
df4_vals = df4.collect()[0]
118+
119+
assert df4_vals.ts != df2_vals.ts
120+
assert df4_vals.ts_ntz == df2_vals.ts_ntz
121+
122+
def test_timezone(self):
123+
124+
spark = SparkClient()
125+
126+
my_date = datetime.now(pytz.timezone("US/Pacific"))
127+
128+
datetime_mask = "%Y-%m-%d %H:%M"
129+
130+
data = [
131+
{"id": 1, TIMESTAMP_COLUMN: str(my_date), "feature": 100},
132+
{"id": 2, TIMESTAMP_COLUMN: str(my_date), "feature": 200},
133+
]
134+
135+
df = spark.conn.read.json(spark.conn._sc.parallelize(data, 1))
136+
df.createOrReplaceTempView("time_table")
137+
138+
df2 = spark.sql("SELECT TIMESTAMP AS ts FROM time_table")
139+
140+
time_value = datetime.fromisoformat(df2.collect()[0].ts).strftime(datetime_mask)
141+
142+
df_different_timezone = df2.withColumn(
143+
"ts", df2.ts.cast(DataType.TIMESTAMP.spark)
144+
)
145+
df_no_timezone = df2.withColumn("ts", df2.ts.cast(DataType.TIMESTAMP_NTZ.spark))
146+
147+
assert (
148+
df_different_timezone.collect()[0].ts.strftime(datetime_mask) != time_value
149+
)
150+
assert df_no_timezone.collect()[0].ts.strftime(datetime_mask) == time_value

0 commit comments

Comments (0)