@@ -1,18 +1,26 @@
-from pyspark.sql.types import StringType
+from datetime import datetime
 
+import pytz
+from pyspark.sql.types import StringType, StructField, StructType
+
+from butterfree.clients import SparkClient
 from butterfree.constants import DataType
 from butterfree.constants.columns import TIMESTAMP_COLUMN
 from butterfree.transform.features import TimestampFeature
 
+# from pyspark.sql.types import *
+
 
 class TestTimestampFeature:
     def test_args_without_transformation(self):
 
         test_key = TimestampFeature(from_column="ts")
+        test_key_ntz = TimestampFeature(dtype=DataType.TIMESTAMP_NTZ, from_column="ts")
 
         assert test_key.name == TIMESTAMP_COLUMN
         assert test_key.from_column == "ts"
         assert test_key.dtype == DataType.TIMESTAMP
+        assert test_key_ntz.dtype == DataType.TIMESTAMP_NTZ
 
     def test_transform(self, feature_set_dataframe):
 
@@ -70,3 +78,73 @@ def test_transform_mask(self, feature_set_dataframe_date):
 
         assert df[0]["timestamp"] == "2020-02-07 00:00:00"
         assert df[1]["timestamp"] == "2020-02-08 00:00:00"
+
+    def test_timezone_configs(self):
+
+        spark = SparkClient()
+        now = datetime.now()
+
+        # Testing a new timezone
+        spark.conn.conf.set("spark.sql.session.timeZone", "GMT-5")
+
+        time_list = [(now, now)]
+        rdd = spark.conn.sparkContext.parallelize(time_list)
+
+        schema = StructType(
+            [
+                StructField("ts", DataType.TIMESTAMP.spark, True),
+                StructField("ts_ntz", DataType.TIMESTAMP_NTZ.spark, True),
+            ]
+        )
+        df = spark.conn.createDataFrame(rdd, schema)
+        df.createOrReplaceTempView("temp_tz_table")
+
+        df1 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""")
+        df2 = df1.withColumns(
+            {"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())}
+        )
+        df2_vals = df2.collect()[0]
+
+        assert df2_vals.ts != df2_vals.ts_ntz
+
+        # New session TZ: the TZ-aware column must render a different value; the NTZ column must keep its value
+        spark.conn.conf.set("spark.sql.session.timeZone", "GMT-7")
+
+        df3 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""")
+        df4 = df3.withColumns(
+            {"ts": df3.ts.cast(StringType()), "ts_ntz": df3.ts_ntz.cast(StringType())}
+        )
+        df4_vals = df4.collect()[0]
+
+        assert df4_vals.ts != df2_vals.ts
+        assert df4_vals.ts_ntz == df2_vals.ts_ntz
+
+    def test_timezone(self):
+
+        spark = SparkClient()
+
+        my_date = datetime.now(pytz.timezone("US/Pacific"))
+
+        datetime_mask = "%Y-%m-%d %H:%M"
+
+        data = [
+            {"id": 1, TIMESTAMP_COLUMN: str(my_date), "feature": 100},
+            {"id": 2, TIMESTAMP_COLUMN: str(my_date), "feature": 200},
+        ]
+
+        df = spark.conn.read.json(spark.conn.sparkContext.parallelize(data, 1))
+        df.createOrReplaceTempView("time_table")
+
+        df2 = spark.sql("SELECT TIMESTAMP AS ts FROM time_table")
+
+        time_value = datetime.fromisoformat(df2.collect()[0].ts).strftime(datetime_mask)
+
+        df_different_timezone = df2.withColumn(
+            "ts", df2.ts.cast(DataType.TIMESTAMP.spark)
+        )
+        df_no_timezone = df2.withColumn("ts", df2.ts.cast(DataType.TIMESTAMP_NTZ.spark))
+
+        assert (
+            df_different_timezone.collect()[0].ts.strftime(datetime_mask) != time_value
+        )
+        assert df_no_timezone.collect()[0].ts.strftime(datetime_mask) == time_value
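For reviewers, a minimal standalone sketch of the semantics these tests pin down, assuming plain PySpark >= 3.4 (where TimestampNTZType is available) and that butterfree's DataType.TIMESTAMP.spark / DataType.TIMESTAMP_NTZ.spark map to Spark's TimestampType / TimestampNTZType: a TIMESTAMP value is rendered in the session time zone, while a TIMESTAMP_NTZ value is a wall-clock reading that never shifts. This sketch uses no butterfree code; names like render are hypothetical.

# Standalone sketch (plain PySpark, not butterfree): flipping
# spark.sql.session.timeZone changes how TIMESTAMP renders, but not TIMESTAMP_NTZ.
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StringType,
    StructField,
    StructType,
    TimestampNTZType,
    TimestampType,
)

spark = SparkSession.builder.master("local[1]").getOrCreate()

schema = StructType(
    [
        StructField("ts", TimestampType(), True),
        StructField("ts_ntz", TimestampNTZType(), True),
    ]
)
now = datetime(2020, 2, 7, 12, 0, 0)
df = spark.createDataFrame([(now, now)], schema)


def render(frame):
    # Casting to string applies the session time zone to TIMESTAMP only.
    row = frame.select(
        frame.ts.cast(StringType()).alias("ts"),
        frame.ts_ntz.cast(StringType()).alias("ts_ntz"),
    ).collect()[0]
    return row.ts, row.ts_ntz


spark.conf.set("spark.sql.session.timeZone", "GMT-5")
ts_gmt5, ntz_gmt5 = render(df)

spark.conf.set("spark.sql.session.timeZone", "GMT-7")
ts_gmt7, ntz_gmt7 = render(df)

assert ts_gmt5 != ts_gmt7  # TIMESTAMP follows the session time zone
assert ntz_gmt5 == ntz_gmt7  # TIMESTAMP_NTZ keeps its wall-clock value

This is the same re-render-after-reconfigure pattern the new tests use: the stored instant is fixed once the DataFrame is built, so only the session-time-zone rendering of the TZ-aware column moves.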