
Commit a9853bb

moromimay authored and ralphrass committed
[MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278)
* Add interval branch modifications.
* Add interval_runs notebook.
* Add tests.
* Apply style (black, flake8 and mypy).
* Fix tests.
* Change version to create package dev.
1 parent c23522b commit a9853bb

8 files changed: +382 -7 lines changed


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -46,6 +46,10 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each
 * [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316))
 * Fix method to generate agg feature name. ([#326](https://github.com/quintoandar/butterfree/pull/326))
 
+## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3)
+### Added
+* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282))
+
 ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3)
 ### Added
 * [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273))

butterfree/load/writers/historical_feature_store_writer.py

Lines changed: 47 additions & 1 deletion
@@ -146,7 +146,25 @@ def write(
         """
         dataframe = self._create_partitions(dataframe)
 
-        dataframe = self._apply_transformations(dataframe)
+        partition_df = self._apply_transformations(dataframe)
+
+        if self.debug_mode:
+            dataframe = partition_df
+        else:
+            dataframe = self.check_schema(
+                spark_client, partition_df, feature_set.name, self.database
+            )
+
+        if self.interval_mode:
+            if self.debug_mode:
+                spark_client.create_temporary_view(
+                    dataframe=dataframe,
+                    name=f"historical_feature_store__{feature_set.name}",
+                )
+                return
+
+            self._incremental_mode(feature_set, dataframe, spark_client)
+            return
 
         if self.interval_mode:
             partition_overwrite_mode = spark_client.conn.conf.get(
@@ -191,6 +209,34 @@ def write(
                 **self.db_config.get_options(s3_key),
             )
 
+    def _incremental_mode(
+        self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient
+    ) -> None:
+
+        partition_overwrite_mode = spark_client.conn.conf.get(
+            "spark.sql.sources.partitionOverwriteMode"
+        ).lower()
+
+        if partition_overwrite_mode != "dynamic":
+            raise RuntimeError(
+                "m=load_incremental_table, "
+                "spark.sql.sources.partitionOverwriteMode={}, "
+                "msg=partitionOverwriteMode have to be configured to 'dynamic'".format(
+                    partition_overwrite_mode
+                )
+            )
+
+        s3_key = os.path.join("historical", feature_set.entity, feature_set.name)
+        options = {"path": self.db_config.get_options(s3_key).get("path")}
+
+        spark_client.write_dataframe(
+            dataframe=dataframe,
+            format_=self.db_config.format_,
+            mode=self.db_config.mode,
+            **options,
+            partitionBy=self.PARTITION_BY,
+        )
+
     def _assert_validation_count(
         self, table_name: str, written_count: int, dataframe_count: int
     ) -> None:
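For context, a minimal sketch of how a caller would drive this new interval write path. The writer, config and client names come from this diff; the write_interval helper, its arguments, and the import paths (as used elsewhere in the repository) are assumptions for illustration, not part of the commit.

from butterfree.clients import SparkClient
from butterfree.configs.db import MetastoreConfig
from butterfree.load.writers import HistoricalFeatureStoreWriter


def write_interval(feature_set, dataframe):
    # Hypothetical helper wiring together the pieces shown in this diff.
    spark_client = SparkClient()

    # _incremental_mode raises RuntimeError unless dynamic partition overwrite is set.
    spark_client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

    writer = HistoricalFeatureStoreWriter(db_config=MetastoreConfig(), interval_mode=True)

    # With debug_mode off, write() validates the schema via check_schema and then
    # delegates to _incremental_mode, overwriting only the affected partitions.
    writer.write(feature_set=feature_set, dataframe=dataframe, spark_client=spark_client)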

butterfree/load/writers/online_feature_store_writer.py

Lines changed: 17 additions & 1 deletion
@@ -7,7 +7,7 @@
 from pyspark.sql.functions import col, row_number
 from pyspark.sql.streaming import StreamingQuery
 
-from butterfree.clients import SparkClient
+from butterfree.clients import CassandraClient, SparkClient
 from butterfree.configs.db import AbstractWriteConfig, CassandraConfig
 from butterfree.constants.columns import TIMESTAMP_COLUMN
 from butterfree.hooks import Hook
@@ -182,6 +182,22 @@ def write(
         """
         table_name = feature_set.entity if self.write_to_entity else feature_set.name
 
+        if not self.debug_mode:
+            config = (
+                self.db_config
+                if self.db_config == CassandraConfig
+                else CassandraConfig()
+            )
+
+            cassandra_client = CassandraClient(
+                host=[config.host],
+                keyspace=config.keyspace,
+                user=config.username,
+                password=config.password,
+            )
+
+            dataframe = self.check_schema(cassandra_client, dataframe, table_name)
+
 if dataframe.isStreaming:
             dataframe = self._apply_transformations(dataframe)
             if self.debug_mode:
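As an illustration of the new non-debug path above, a sketch of how a CassandraClient can be built from the writer's CassandraConfig before check_schema runs. The build_schema_check_client helper is hypothetical; the constructor arguments mirror this diff.

from typing import Optional

from butterfree.clients import CassandraClient
from butterfree.configs.db import CassandraConfig


def build_schema_check_client(config: Optional[CassandraConfig] = None) -> CassandraClient:
    # Fall back to a default CassandraConfig when none is supplied, mirroring
    # the fallback in OnlineFeatureStoreWriter.write().
    config = config or CassandraConfig()
    return CassandraClient(
        host=[config.host],
        keyspace=config.keyspace,
        user=config.username,
        password=config.password,
    )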

tests/integration/butterfree/load/test_sink.py

Lines changed: 5 additions & 1 deletion
@@ -9,7 +9,7 @@
 )
 
 
-def test_sink(input_dataframe, feature_set):
+def test_sink(input_dataframe, feature_set, mocker):
     # arrange
     client = SparkClient()
     client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
@@ -44,6 +44,10 @@ def test_sink(input_dataframe, feature_set):
     )
     online_writer = OnlineFeatureStoreWriter(db_config=online_config)
 
+    online_writer.check_schema_hook = mocker.stub("check_schema_hook")
+    online_writer.check_schema_hook.run = mocker.stub("run")
+    online_writer.check_schema_hook.run.return_value = feature_set_df
+
     writers = [historical_writer, online_writer]
     sink = Sink(writers)
 
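The stubbing pattern added here can be summarized as follows; stub_schema_check is an illustrative helper, not part of the commit, showing how the writer's check_schema_hook is replaced so the integration test needs no live Cassandra.

def stub_schema_check(online_writer, feature_set_df, mocker):
    # Replace the hook with pytest-mock stubs; run() simply returns the
    # expected dataframe instead of querying Cassandra for the real schema.
    online_writer.check_schema_hook = mocker.stub("check_schema_hook")
    online_writer.check_schema_hook.run = mocker.stub("run")
    online_writer.check_schema_hook.run.return_value = feature_set_df
    return online_writer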

tests/integration/butterfree/pipelines/test_feature_set_pipeline.py

Lines changed: 245 additions & 0 deletions
@@ -4,6 +4,7 @@
 from pyspark.sql import DataFrame
 from pyspark.sql import functions as F
 
+from butterfree.clients import SparkClient
 from butterfree.configs import environment
 from butterfree.configs.db import MetastoreConfig
 from butterfree.constants import DataType
@@ -411,3 +412,247 @@ def test_pipeline_interval_run(
 
         # tear down
         shutil.rmtree("test_folder")
+
+    def test_feature_set_pipeline_with_dates(
+        self,
+        mocked_date_df,
+        spark_session,
+        fixed_windows_output_feature_set_date_dataframe,
+        feature_set_pipeline,
+        mocker,
+    ):
+        # arrange
+        table_reader_table = "b_table"
+        create_temp_view(dataframe=mocked_date_df, name=table_reader_table)
+
+        historical_writer = HistoricalFeatureStoreWriter(debug_mode=True)
+
+        feature_set_pipeline.sink.writers = [historical_writer]
+
+        # act
+        feature_set_pipeline.run(start_date="2016-04-12", end_date="2016-04-13")
+
+        df = spark_session.sql("select * from historical_feature_store__feature_set")
+
+        # assert
+        assert_dataframe_equality(df, fixed_windows_output_feature_set_date_dataframe)
+
+    def test_feature_set_pipeline_with_execution_date(
+        self,
+        mocked_date_df,
+        spark_session,
+        fixed_windows_output_feature_set_date_dataframe,
+        feature_set_pipeline,
+        mocker,
+    ):
+        # arrange
+        table_reader_table = "b_table"
+        create_temp_view(dataframe=mocked_date_df, name=table_reader_table)
+
+        target_df = fixed_windows_output_feature_set_date_dataframe.filter(
+            "timestamp < '2016-04-13'"
+        )
+
+        historical_writer = HistoricalFeatureStoreWriter(debug_mode=True)
+
+        feature_set_pipeline.sink.writers = [historical_writer]
+
+        # act
+        feature_set_pipeline.run_for_date(execution_date="2016-04-12")
+
+        df = spark_session.sql("select * from historical_feature_store__feature_set")
+
+        # assert
+        assert_dataframe_equality(df, target_df)
+
+    def test_pipeline_with_hooks(self, spark_session, mocker):
+        # arrange
+        hook1 = AddHook(value=1)
+
+        spark_session.sql(
+            "select 1 as id, timestamp('2020-01-01') as timestamp, 0 as feature"
+        ).createOrReplaceTempView("test")
+
+        target_df = spark_session.sql(
+            "select 1 as id, timestamp('2020-01-01') as timestamp, 6 as feature, 2020 "
+            "as year, 1 as month, 1 as day"
+        )
+
+        historical_writer = HistoricalFeatureStoreWriter(debug_mode=True)
+
+        test_pipeline = FeatureSetPipeline(
+            source=Source(
+                readers=[TableReader(id="reader", table="test",).add_post_hook(hook1)],
+                query="select * from reader",
+            ).add_post_hook(hook1),
+            feature_set=FeatureSet(
+                name="feature_set",
+                entity="entity",
+                description="description",
+                features=[
+                    Feature(
+                        name="feature",
+                        description="test",
+                        transformation=SQLExpressionTransform(expression="feature + 1"),
+                        dtype=DataType.INTEGER,
+                    ),
+                ],
+                keys=[
+                    KeyFeature(
+                        name="id",
+                        description="The user's Main ID or device ID",
+                        dtype=DataType.INTEGER,
+                    )
+                ],
+                timestamp=TimestampFeature(),
+            )
+            .add_pre_hook(hook1)
+            .add_post_hook(hook1),
+            sink=Sink(writers=[historical_writer],).add_pre_hook(hook1),
+        )
+
+        # act
+        test_pipeline.run()
+        output_df = spark_session.table("historical_feature_store__feature_set")
+
+        # assert
+        output_df.show()
+        assert_dataframe_equality(output_df, target_df)
+
+    def test_pipeline_interval_run(
+        self, mocked_date_df, pipeline_interval_run_target_dfs, spark_session
+    ):
+        """Testing pipeline's idempotent interval run feature.
+        Source data:
+        +-------+---+-------------------+-------------------+
+        |feature| id|                 ts|          timestamp|
+        +-------+---+-------------------+-------------------+
+        |    200|  1|2016-04-11 11:31:11|2016-04-11 11:31:11|
+        |    300|  1|2016-04-12 11:44:12|2016-04-12 11:44:12|
+        |    400|  1|2016-04-13 11:46:24|2016-04-13 11:46:24|
+        |    500|  1|2016-04-14 12:03:21|2016-04-14 12:03:21|
+        +-------+---+-------------------+-------------------+
+        The test executes 3 runs for different time intervals. The input data has 4 data
+        points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The following run
+        specifications are:
+        1) Interval: from 2016-04-11 to 2016-04-13
+            Target table result:
+            +---+-------+---+-----+------+-------------------+----+
+            |day|feature| id|month|run_id|          timestamp|year|
+            +---+-------+---+-----+------+-------------------+----+
+            | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
+            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
+            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
+            +---+-------+---+-----+------+-------------------+----+
+        2) Interval: only 2016-04-14.
+            Target table result:
+            +---+-------+---+-----+------+-------------------+----+
+            |day|feature| id|month|run_id|          timestamp|year|
+            +---+-------+---+-----+------+-------------------+----+
+            | 11|    200|  1|    4|     1|2016-04-11 11:31:11|2016|
+            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
+            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
+            | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
+            +---+-------+---+-----+------+-------------------+----+
+        3) Interval: only 2016-04-11.
+            Target table result:
+            +---+-------+---+-----+------+-------------------+----+
+            |day|feature| id|month|run_id|          timestamp|year|
+            +---+-------+---+-----+------+-------------------+----+
+            | 11|    200|  1|    4|     3|2016-04-11 11:31:11|2016|
+            | 12|    300|  1|    4|     1|2016-04-12 11:44:12|2016|
+            | 13|    400|  1|    4|     1|2016-04-13 11:46:24|2016|
+            | 14|    500|  1|    4|     2|2016-04-14 12:03:21|2016|
+            +---+-------+---+-----+------+-------------------+----+
+        """
+        # arrange
+        create_temp_view(dataframe=mocked_date_df, name="input_data")
+
+        db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE")
+        path = "test_folder/historical/entity/feature_set"
+
+        spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
+        spark_session.sql(f"create database if not exists {db}")
+        spark_session.sql(
+            f"create table if not exists {db}.feature_set_interval "
+            f"(id int, timestamp timestamp, feature int, "
+            f"run_id int, year int, month int, day int);"
+        )
+
+        dbconfig = MetastoreConfig()
+        dbconfig.get_options = Mock(
+            return_value={"mode": "overwrite", "format_": "parquet", "path": path}
+        )
+
+        historical_writer = HistoricalFeatureStoreWriter(
+            db_config=dbconfig, interval_mode=True
+        )
+
+        first_run_hook = RunHook(id=1)
+        second_run_hook = RunHook(id=2)
+        third_run_hook = RunHook(id=3)
+
+        (
+            first_run_target_df,
+            second_run_target_df,
+            third_run_target_df,
+        ) = pipeline_interval_run_target_dfs
+
+        test_pipeline = FeatureSetPipeline(
+            source=Source(
+                readers=[
+                    TableReader(id="id", table="input_data",).with_incremental_strategy(
+                        IncrementalStrategy("ts")
+                    ),
+                ],
+                query="select * from id ",
+            ),
+            feature_set=FeatureSet(
+                name="feature_set_interval",
+                entity="entity",
+                description="",
+                keys=[KeyFeature(name="id", description="", dtype=DataType.INTEGER,)],
+                timestamp=TimestampFeature(from_column="ts"),
+                features=[
+                    Feature(name="feature", description="", dtype=DataType.INTEGER),
+                    Feature(name="run_id", description="", dtype=DataType.INTEGER),
+                ],
+            ),
+            sink=Sink([historical_writer],),
+        )
+
+        # act and assert
+        dbconfig.get_path_with_partitions = Mock(
+            return_value=[
+                "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
+                "test_folder/historical/entity/feature_set/year=2016/month=4/day=12",
+                "test_folder/historical/entity/feature_set/year=2016/month=4/day=13",
+            ]
+        )
+        test_pipeline.feature_set.add_pre_hook(first_run_hook)
+        test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11")
+        first_run_output_df = spark_session.read.parquet(path)
+        assert_dataframe_equality(first_run_output_df, first_run_target_df)
+
+        dbconfig.get_path_with_partitions = Mock(
+            return_value=[
+                "test_folder/historical/entity/feature_set/year=2016/month=4/day=14",
+            ]
+        )
+        test_pipeline.feature_set.add_pre_hook(second_run_hook)
+        test_pipeline.run_for_date("2016-04-14")
+        second_run_output_df = spark_session.read.parquet(path)
+        assert_dataframe_equality(second_run_output_df, second_run_target_df)
+
+        dbconfig.get_path_with_partitions = Mock(
+            return_value=[
+                "test_folder/historical/entity/feature_set/year=2016/month=4/day=11",
+            ]
+        )
+        test_pipeline.feature_set.add_pre_hook(third_run_hook)
+        test_pipeline.run_for_date("2016-04-11")
+        third_run_output_df = spark_session.read.parquet(path)
+        assert_dataframe_equality(third_run_output_df, third_run_target_df)
+
+        # tear down
+        shutil.rmtree("test_folder")
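A compact sketch of the two interval entry points exercised by these tests. Method names and keyword arguments are taken from the diff; the backfill_and_daily_run helper and the `pipeline` argument (any configured FeatureSetPipeline) are illustrative assumptions.

def backfill_and_daily_run(pipeline):
    # Backfill a closed interval: the partitions between the two dates are rewritten.
    pipeline.run(start_date="2016-04-11", end_date="2016-04-13")

    # Single-day run: only that date's partition is overwritten, which is what makes
    # re-running a past date idempotent (see run 3 in the docstring above).
    pipeline.run_for_date(execution_date="2016-04-14")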

0 commit comments