
Commit 2213264

Mayara Moromisato (moromimay) authored and committed
[MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278)
* Add interval branch modifications.
* Add interval_runs notebook.
* Add tests.
* Apply style (black, flake8 and mypy).
* Fix tests.
* Change version to create package dev.
1 parent 4e17b81 commit 2213264

15 files changed (+304, -345 lines)

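The diffs below wire the new interval/incremental behaviour through the writers. As a rough orientation only, a hypothetical snippet of how the touched flags are passed, assuming the writers are exported from butterfree.load.writers as the file paths below suggest:

from butterfree.load.writers import (  # assumed export path
    HistoricalFeatureStoreWriter,
    OnlineFeatureStoreWriter,
)

# interval_mode routes the historical writer through the new
# dynamic-partition-overwrite branch; debug_mode keeps output in temp views.
historical_writer = HistoricalFeatureStoreWriter(interval_mode=True, debug_mode=False)

# The online writer now derives a CassandraConfig itself when none is given.
online_writer = OnlineFeatureStoreWriter(write_to_entity=False)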
CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -33,6 +33,10 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each
 * [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316))
 * Fix method to generate agg feature name. ([#326](https://github.com/quintoandar/butterfree/pull/326))
 
+## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3)
+### Added
+* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282))
+
 ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3)
 ### Added
 * [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273))

butterfree/clients/cassandra_client.py

Lines changed: 29 additions & 40 deletions
@@ -3,15 +3,9 @@
 from typing import Dict, List, Optional
 
 from cassandra.auth import PlainTextAuthProvider
-from cassandra.cluster import (
-    EXEC_PROFILE_DEFAULT,
-    Cluster,
-    ExecutionProfile,
-    ResponseFuture,
-    Session,
-)
-from cassandra.policies import DCAwareRoundRobinPolicy
-from cassandra.query import ConsistencyLevel, dict_factory
+from cassandra.cluster import Cluster, ResponseFuture, Session
+from cassandra.policies import RoundRobinPolicy
+from cassandra.query import dict_factory
 from typing_extensions import TypedDict
 
 from butterfree.clients import AbstractClient
@@ -61,36 +55,29 @@ def __init__(
     @property
     def conn(self, *, ssl_path: str = None) -> Session:  # type: ignore
         """Establishes a Cassandra connection."""
-        if not self._session:
-            auth_provider = (
-                PlainTextAuthProvider(username=self.user, password=self.password)
-                if self.user is not None
-                else None
-            )
-            ssl_opts = (
-                {
-                    "ca_certs": ssl_path,
-                    "ssl_version": PROTOCOL_TLSv1,
-                    "cert_reqs": CERT_REQUIRED,
-                }
-                if ssl_path is not None
-                else None
-            )
-
-            execution_profiles = {
-                EXEC_PROFILE_DEFAULT: ExecutionProfile(
-                    load_balancing_policy=DCAwareRoundRobinPolicy(),
-                    consistency_level=ConsistencyLevel.LOCAL_QUORUM,
-                    row_factory=dict_factory,
-                )
+        auth_provider = (
+            PlainTextAuthProvider(username=self.user, password=self.password)
+            if self.user is not None
+            else None
+        )
+        ssl_opts = (
+            {
+                "ca_certs": ssl_path,
+                "ssl_version": PROTOCOL_TLSv1,
+                "cert_reqs": CERT_REQUIRED,
             }
-            cluster = Cluster(
-                contact_points=self.host,
-                auth_provider=auth_provider,
-                ssl_options=ssl_opts,
-                execution_profiles=execution_profiles,
-            )
-            self._session = cluster.connect(self.keyspace)
+            if ssl_path is not None
+            else None
+        )
+
+        cluster = Cluster(
+            contact_points=self.host,
+            auth_provider=auth_provider,
+            ssl_options=ssl_opts,
+            load_balancing_policy=RoundRobinPolicy(),
+        )
+        self._session = cluster.connect(self.keyspace)
+        self._session.row_factory = dict_factory
         return self._session
 
     def sql(self, query: str) -> ResponseFuture:
@@ -100,9 +87,11 @@ def sql(self, query: str) -> ResponseFuture:
             query: desired query.
 
         """
-        return self.conn.execute(query)
+        if not self._session:
+            raise RuntimeError("There's no session available for this query.")
+        return self._session.execute(query)
 
-    def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]:
+    def get_schema(self, table: str) -> List[Dict[str, str]]:
         """Returns desired table schema.
 
         Attributes:

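Outside the diff, the connection pattern the new conn property relies on can be reproduced directly with the cassandra-driver API. A minimal sketch with placeholder host, keyspace and credentials (none of these values come from this commit):

from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from cassandra.policies import RoundRobinPolicy
from cassandra.query import dict_factory

# Placeholder connection settings; substitute real values.
hosts = ["127.0.0.1"]
keyspace = "feature_store"

# PlainTextAuthProvider mirrors the diff; omit it for unauthenticated clusters.
auth_provider = PlainTextAuthProvider(username="cassandra", password="cassandra")

cluster = Cluster(
    contact_points=hosts,
    auth_provider=auth_provider,
    load_balancing_policy=RoundRobinPolicy(),
)
session = cluster.connect(keyspace)

# Rows come back as plain dicts instead of namedtuples.
session.row_factory = dict_factory
rows = session.execute("SELECT release_version FROM system.local")
print(rows.one()["release_version"])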
butterfree/clients/spark_client.py

Lines changed: 3 additions & 67 deletions
@@ -1,6 +1,5 @@
 """SparkClient entity."""
 
-import json
 from typing import Any, Dict, List, Optional, Union
 
 from pyspark.sql import DataFrame, DataFrameReader, SparkSession
@@ -58,7 +57,7 @@ def read(
         """
         if not isinstance(format, str):
             raise ValueError("format needs to be a string with the desired read format")
-        if path and not isinstance(path, (str, list)):
+        if not isinstance(path, (str, list)):
            raise ValueError("path needs to be a string or a list of string")
 
        df_reader: Union[
@@ -67,7 +66,7 @@ def read(
 
         df_reader = df_reader.schema(schema) if schema else df_reader
 
-        return df_reader.format(format).load(path=path, **options)  # type: ignore
+        return df_reader.format(format).load(path, **options)  # type: ignore
 
     def read_table(self, table: str, database: str = None) -> DataFrame:
         """Use the SparkSession.read interface to read a metastore table.
@@ -217,8 +216,7 @@ def write_table(
             **options,
         )
 
-    @staticmethod
-    def create_temporary_view(dataframe: DataFrame, name: str) -> Any:
+    def create_temporary_view(self, dataframe: DataFrame, name: str) -> Any:
         """Create a temporary view from a given dataframe.
 
         Args:
@@ -273,65 +271,3 @@ def add_table_partitions(
         )
 
         self.conn.sql(command)
-
-    @staticmethod
-    def _filter_schema(schema: DataFrame) -> List[str]:
-        """Returns filtered schema with the desired information.
-
-        Attributes:
-            schema: desired table.
-
-        Returns:
-            A list of strings in the format
-            ['{"column_name": "example1", type: "Spark_type"}', ...]
-
-        """
-        return (
-            schema.filter(
-                ~schema.col_name.isin(
-                    ["# Partition Information", "# col_name", "year", "month", "day"]
-                )
-            )
-            .toJSON()
-            .collect()
-        )
-
-    def _convert_schema(self, schema: DataFrame) -> List[Dict[str, str]]:
-        """Returns schema with the desired information.
-
-        Attributes:
-            schema: desired table.
-
-        Returns:
-            A list of dictionaries in the format
-            [{"column_name": "example1", type: "Spark_type"}, ...]
-
-        """
-        schema_list = self._filter_schema(schema)
-        converted_schema = []
-        for row in schema_list:
-            converted_schema.append(json.loads(row))
-
-        return converted_schema
-
-    def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]:
-        """Returns desired table schema.
-
-        Attributes:
-            table: desired table.
-
-        Returns:
-            A list of dictionaries in the format
-            [{"column_name": "example1", type: "Spark_type"}, ...]
-
-        """
-        query = f"DESCRIBE {database}.{table} "  # noqa
-
-        response = self.sql(query)
-
-        if not response:
-            raise RuntimeError(
-                f"No columns found for table: {table}" f"in database: {database}"
-            )
-
-        return self._convert_schema(response)

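For context on the read change: PySpark's DataFrameReader.load accepts the path positionally, either as a single string or as a list of strings, which is why the path=path keyword (and the guard allowing a missing path) could go. A small sketch with hypothetical parquet locations:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# DataFrameReader.load takes the path positionally: a single string or a list.
single = spark.read.format("parquet").load("s3a://bucket/feature_set/")
several = spark.read.format("parquet").load(
    ["s3a://bucket/feature_set/year=2020/", "s3a://bucket/feature_set/year=2021/"]
)

# Roughly what the (now instance-level) create_temporary_view is expected to do
# for a batch dataframe: register it as a temp view on the same session.
single.createOrReplaceTempView("historical_feature_store__feature_set")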
butterfree/load/writers/historical_feature_store_writer.py

Lines changed: 51 additions & 20 deletions
@@ -1,7 +1,7 @@
 """Holds the Historical Feature Store writer class."""
 
 import os
-from typing import Any
+from typing import Any, Union
 
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.functions import dayofmonth, month, year
@@ -106,17 +106,16 @@ class HistoricalFeatureStoreWriter(Writer):
 
     def __init__(
         self,
-        db_config: AbstractWriteConfig = None,
+        db_config: Union[AbstractWriteConfig, MetastoreConfig] = None,
         database: str = None,
         num_partitions: int = None,
         validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD,
         debug_mode: bool = False,
         interval_mode: bool = False,
         check_schema_hook: Hook = None,
     ):
-        super(HistoricalFeatureStoreWriter, self).__init__(
-            db_config or MetastoreConfig(), debug_mode, interval_mode
-        )
+        super(HistoricalFeatureStoreWriter, self).__init__(debug_mode, interval_mode)
+        self.db_config = db_config or MetastoreConfig()
         self.database = database or environment.get_variable(
             "FEATURE_STORE_HISTORICAL_DATABASE"
         )
@@ -141,20 +140,25 @@ def write(
         """
         dataframe = self._create_partitions(dataframe)
 
-        dataframe = self._apply_transformations(dataframe)
+        partition_df = self._apply_transformations(dataframe)
+
+        if self.debug_mode:
+            dataframe = partition_df
+        else:
+            dataframe = self.check_schema(
+                spark_client, partition_df, feature_set.name, self.database
+            )
 
         if self.interval_mode:
-            partition_overwrite_mode = spark_client.conn.conf.get(
-                "spark.sql.sources.partitionOverwriteMode"
-            ).lower()
-
-            if partition_overwrite_mode != "dynamic":
-                raise RuntimeError(
-                    "m=load_incremental_table, "
-                    "spark.sql.sources.partitionOverwriteMode={}, "
-                    "msg=partitionOverwriteMode have to "
-                    "be configured to 'dynamic'".format(partition_overwrite_mode)
+            if self.debug_mode:
+                spark_client.create_temporary_view(
+                    dataframe=dataframe,
+                    name=f"historical_feature_store__{feature_set.name}",
                 )
+                return
+
+            self._incremental_mode(feature_set, dataframe, spark_client)
+            return
 
         if self.debug_mode:
             spark_client.create_temporary_view(
@@ -173,6 +177,34 @@ def write(
             **self.db_config.get_options(s3_key),
         )
 
+    def _incremental_mode(
+        self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient
+    ) -> None:
+
+        partition_overwrite_mode = spark_client.conn.conf.get(
+            "spark.sql.sources.partitionOverwriteMode"
+        ).lower()
+
+        if partition_overwrite_mode != "dynamic":
+            raise RuntimeError(
+                "m=load_incremental_table, "
+                "spark.sql.sources.partitionOverwriteMode={}, "
+                "msg=partitionOverwriteMode have to be configured to 'dynamic'".format(
+                    partition_overwrite_mode
+                )
+            )
+
+        s3_key = os.path.join("historical", feature_set.entity, feature_set.name)
+        options = {"path": self.db_config.get_options(s3_key).get("path")}
+
+        spark_client.write_dataframe(
+            dataframe=dataframe,
+            format_=self.db_config.format_,
+            mode=self.db_config.mode,
+            **options,
+            partitionBy=self.PARTITION_BY,
+        )
+
     def _assert_validation_count(
         self, table_name: str, written_count: int, dataframe_count: int
     ) -> None:
@@ -199,9 +231,10 @@ def validate(
         Raises:
             AssertionError: if count of written data doesn't match count in current
             feature set dataframe.
+
         """
         table_name = (
-            os.path.join("historical", feature_set.entity, feature_set.name)
+            f"{feature_set.name}"
             if self.interval_mode and not self.debug_mode
             else (
                 f"{self.database}.{feature_set.name}"
@@ -213,9 +246,7 @@ def validate(
         written_count = (
             spark_client.read(
                 self.db_config.format_,
-                path=self.db_config.get_path_with_partitions(
-                    table_name, self._create_partitions(dataframe)
-                ),
+                path=self.db_config.get_path_with_partitions(table_name, dataframe),
             ).count()
            if self.interval_mode and not self.debug_mode
            else spark_client.read_table(table_name).count()

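The new _incremental_mode refuses to write unless Spark's dynamic partition overwrite is enabled, so interval runs only replace the partitions present in the incoming dataframe. A minimal sketch of the session config it checks and of an equivalent partitioned overwrite, using placeholder data and a placeholder output path (not part of this commit):

from pyspark.sql import SparkSession

# The config _incremental_mode checks; without it the writer raises RuntimeError.
spark = (
    SparkSession.builder
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

# With dynamic mode, an overwrite only replaces the partitions present in the
# incoming dataframe rather than truncating the whole output path.
df = spark.range(3).selectExpr("id", "2021 AS year", "4 AS month", "14 AS day")
(
    df.write.mode("overwrite")
    .partitionBy("year", "month", "day")
    .parquet("/tmp/historical/entity/feature_set")  # placeholder path
)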
butterfree/load/writers/online_feature_store_writer.py

Lines changed: 21 additions & 7 deletions
@@ -7,7 +7,7 @@
 from pyspark.sql.functions import col, row_number
 from pyspark.sql.streaming import StreamingQuery
 
-from butterfree.clients import SparkClient
+from butterfree.clients import CassandraClient, SparkClient
 from butterfree.configs.db import AbstractWriteConfig, CassandraConfig
 from butterfree.constants.columns import TIMESTAMP_COLUMN
 from butterfree.hooks import Hook
@@ -80,18 +80,16 @@ class OnlineFeatureStoreWriter(Writer):
 
     def __init__(
         self,
-        db_config: AbstractWriteConfig = None,
-        database: str = None,
+        db_config: Union[AbstractWriteConfig, CassandraConfig] = None,
         debug_mode: bool = False,
         write_to_entity: bool = False,
         interval_mode: bool = False,
         check_schema_hook: Hook = None,
     ):
-        super(OnlineFeatureStoreWriter, self).__init__(
-            db_config or CassandraConfig(), debug_mode, interval_mode, write_to_entity
-        )
+        super(OnlineFeatureStoreWriter, self).__init__(debug_mode, interval_mode)
+        self.db_config = db_config or CassandraConfig()
+        self.write_to_entity = write_to_entity
         self.check_schema_hook = check_schema_hook
-        self.database = database
 
     @staticmethod
     def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame:
@@ -182,6 +180,22 @@ def write(
         """
         table_name = feature_set.entity if self.write_to_entity else feature_set.name
 
+        if not self.debug_mode:
+            config = (
+                self.db_config
+                if self.db_config == CassandraConfig
+                else CassandraConfig()
+            )
+
+            cassandra_client = CassandraClient(
+                host=[config.host],
+                keyspace=config.keyspace,
+                user=config.username,
+                password=config.password,
+            )
+
+            dataframe = self.check_schema(cassandra_client, dataframe, table_name)
+
         if dataframe.isStreaming:
             dataframe = self._apply_transformations(dataframe)
             if self.debug_mode:

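The schema check added above needs a live Cassandra session, so the writer builds a CassandraClient out of its CassandraConfig before writing. A rough sketch of that wiring, with the attribute names taken from the diff and a placeholder table name:

from butterfree.clients import CassandraClient
from butterfree.configs.db import CassandraConfig

# Assumed: CassandraConfig picks up connection details from environment
# variables when they are not passed explicitly.
config = CassandraConfig()

cassandra_client = CassandraClient(
    host=[config.host],
    keyspace=config.keyspace,
    user=config.username,
    password=config.password,
)

# get_schema (see cassandra_client.py above) now takes only the table name;
# the writer uses a client like this to validate the dataframe schema first.
schema = cassandra_client.get_schema(table="feature_set_entity")  # placeholder table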