
Commit 5e6f416

fix: Rollback to latest stable release (#391)
# What

We are reverting the `staging` and `master` branches to the codebase of version `1.4.0`, which is currently the stable, tested release.

## Changes reverted

- **Revert "fix: move incremental filter (#388)"**
- **Revert "fix: rollback repartition (#386)"**
- **Revert "chore: level (#382)"**
- **Revert "fix: performance adjustments, migrate (#378)"**
- **Revert "fix: performance improvements (#374)"**

## Changes maintained

- #385
- #376

## What's next

We'll reintroduce the new features in upcoming PRs (and releases).
2 parents b7c7d48 + 4e4293d commit 5e6f416

7 files changed: +39 -88 lines changed

butterfree/_cli/migrate.py

Lines changed: 2 additions & 12 deletions
@@ -5,7 +5,7 @@
 import os
 import pkgutil
 import sys
-from typing import Set, Type
+from typing import Set

 import boto3
 import setuptools
@@ -90,18 +90,8 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]:

             instances.add(value)

-    def create_instance(cls: Type[FeatureSetPipeline]) -> FeatureSetPipeline:
-        sig = inspect.signature(cls.__init__)
-        parameters = sig.parameters
-
-        if "run_date" in parameters:
-            run_date = datetime.datetime.today().strftime("%Y-%m-%d")
-            return cls(run_date)
-
-        return cls()
-
     logger.info("Creating instances...")
-    return set(create_instance(value) for value in instances)  # type: ignore
+    return set(value() for value in instances)  # type: ignore


 PATH = typer.Argument(
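A note on what this revert changes in pipeline discovery: the removed `create_instance` helper inspected each pipeline class's constructor and injected today's date whenever a `run_date` parameter was present; after the rollback, discovered classes are instantiated with a bare `value()` call again, so they must be constructible without arguments. A minimal sketch of the dropped inspection pattern, using a hypothetical `MyPipeline` class that is not part of butterfree:

```python
import datetime
import inspect


class MyPipeline:  # hypothetical stand-in for a discovered FeatureSetPipeline subclass
    def __init__(self, run_date: str = None):
        self.run_date = run_date


def create_instance(cls):
    """Instantiate cls, passing today's date if its __init__ accepts `run_date`."""
    parameters = inspect.signature(cls.__init__).parameters
    if "run_date" in parameters:
        return cls(datetime.datetime.today().strftime("%Y-%m-%d"))
    return cls()


print(create_instance(MyPipeline).run_date)  # e.g. "2024-01-31"
print(MyPipeline().run_date)                 # plain cls() call, as restored here: None
```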

butterfree/extract/source.py

Lines changed: 4 additions & 10 deletions
@@ -3,7 +3,6 @@
 from typing import List, Optional

 from pyspark.sql import DataFrame
-from pyspark.storagelevel import StorageLevel

 from butterfree.clients import SparkClient
 from butterfree.extract.readers.reader import Reader
@@ -96,21 +95,16 @@ def construct(
             DataFrame with the query result against all readers.

         """
-        # Step 1: Build temporary views for each reader
         for reader in self.readers:
-            reader.build(client=client, start_date=start_date, end_date=end_date)
+            reader.build(
+                client=client, start_date=start_date, end_date=end_date
+            )  # create temporary views for each reader

-        # Step 2: Execute SQL query on the combined readers
         dataframe = client.sql(self.query)

-        # Step 3: Cache the dataframe if necessary, using memory and disk storage
         if not dataframe.isStreaming and self.eager_evaluation:
-            # Persist to ensure the DataFrame is stored in mem and disk (if necessary)
-            dataframe.persist(StorageLevel.MEMORY_AND_DISK)
-            # Trigger the cache/persist operation by performing an action
-            dataframe.count()
+            dataframe.cache().count()

-        # Step 4: Run post-processing hooks on the dataframe
         post_hook_df = self.run_post_hooks(dataframe)

         return post_hook_df
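Why this part of the revert is mostly a simplification: for DataFrames, `cache()` persists at Spark's default storage level (a memory-and-disk level in recent versions), and in both variants it is the `count()` action that actually materializes the data. A standalone sketch assuming a local SparkSession, not butterfree code:

```python
from pyspark.sql import SparkSession
from pyspark.storagelevel import StorageLevel

spark = SparkSession.builder.master("local[1]").appName("cache-sketch").getOrCreate()

df = spark.range(1_000)
df.cache()    # lazily marks the DataFrame for caching at the default storage level
df.count()    # the action that actually fills the cache

df2 = spark.range(1_000)
df2.persist(StorageLevel.MEMORY_AND_DISK)  # explicit storage level, same laziness
df2.count()

spark.stop()
```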

butterfree/migrations/database_migration/cassandra_migration.py

Lines changed: 2 additions & 18 deletions
@@ -78,9 +78,6 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str:
     def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str:
         """Creates CQL statement to alter columns' types.

-        In Cassandra 3.4.x to 3.11.x alter type is not allowed.
-        This method creates a temp column to comply.
-
         Args:
             columns: list of Diff objects with ALTER_TYPE kind.
             table_name: table name.
@@ -89,23 +86,10 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str:
             Alter column type query.

         """
-        temp_column_name = f"{column.column}_temp"
-
-        add_temp_column_query = (
-            f"ALTER TABLE {table_name} ADD {temp_column_name} {column.value};"
-        )
-        copy_data_to_temp_query = (
-            f"UPDATE {table_name} SET {temp_column_name} = {column.column};"
-        )
-
-        drop_old_column_query = f"ALTER TABLE {table_name} DROP {column.column};"
-        rename_temp_column_query = (
-            f"ALTER TABLE {table_name} RENAME {temp_column_name} TO {column.column};"
-        )
+        parsed_columns = self._get_parsed_columns([column])

         return (
-            f"{add_temp_column_query} {copy_data_to_temp_query} "
-            f"{drop_old_column_query} {rename_temp_column_query};"
+            f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};"
         )

     @staticmethod
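The restored method emits a single `ALTER TABLE ... ALTER <column> TYPE <type>` statement by replacing the space in the parsed "<column> <type>" pair. An illustration of just that string handling, with hypothetical values standing in for what `_get_parsed_columns` would return:

```python
# Hypothetical inputs; the real values come from the Diff object and
# CassandraMigration._get_parsed_columns.
table_name = "feature_set"
parsed_columns = "feature1 float"  # "<column> <type>" pair

query = f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};"
print(query)  # ALTER TABLE feature_set ALTER feature1 TYPE float;
```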

butterfree/pipelines/feature_set_pipeline.py

Lines changed: 4 additions & 16 deletions
@@ -2,8 +2,6 @@

 from typing import List, Optional

-from pyspark.storagelevel import StorageLevel
-
 from butterfree.clients import SparkClient
 from butterfree.dataframe_service import repartition_sort_df
 from butterfree.extract import Source
@@ -211,45 +209,35 @@ def run(
                 soon. Use only if strictly necessary.

         """
-        # Step 1: Construct input dataframe from the source.
         dataframe = self.source.construct(
             client=self.spark_client,
             start_date=self.feature_set.define_start_date(start_date),
             end_date=end_date,
         )

-        # Step 2: Repartition and sort if required, avoid if not necessary.
         if partition_by:
             order_by = order_by or partition_by
             dataframe = repartition_sort_df(
                 dataframe, partition_by, order_by, num_processors
             )

-        # Step 3: Construct the feature set dataframe using defined transformations.
-        transformed_dataframe = self.feature_set.construct(
+        dataframe = self.feature_set.construct(
             dataframe=dataframe,
             client=self.spark_client,
             start_date=start_date,
             end_date=end_date,
             num_processors=num_processors,
         )

-        if transformed_dataframe.storageLevel != StorageLevel(
-            False, False, False, False, 1
-        ):
-            dataframe.unpersist()  # Clear the data from the cache (disk and memory)
-
-        # Step 4: Load the data into the configured sink.
         self.sink.flush(
-            dataframe=transformed_dataframe,
+            dataframe=dataframe,
             feature_set=self.feature_set,
             spark_client=self.spark_client,
         )

-        # Step 5: Validate the output if not streaming and data volume is reasonable.
-        if not transformed_dataframe.isStreaming:
+        if not dataframe.isStreaming:
             self.sink.validate(
-                dataframe=transformed_dataframe,
+                dataframe=dataframe,
                 feature_set=self.feature_set,
                 spark_client=self.spark_client,
             )
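Context for the removed persistence bookkeeping: an unpersisted DataFrame reports a storage level of `StorageLevel(False, False, False, False, 1)`, so the deleted comparison effectively asked whether `feature_set.construct` had persisted its result before unpersisting the upstream dataframe. A standalone sketch of that intent using `is_cached`, assuming a local SparkSession (not butterfree code):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("unpersist-sketch").getOrCreate()

source_df = spark.range(100)
transformed_df = source_df.selectExpr("id", "id * 2 AS feature")

transformed_df.cache().count()  # pretend the transformation step cached its result

# Intent of the removed block: once the downstream result is persisted,
# the upstream dataframe's cache is no longer needed.
if transformed_df.is_cached:
    source_df.unpersist()

spark.stop()
```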

butterfree/transform/aggregated_feature_set.py

Lines changed: 21 additions & 29 deletions
@@ -387,7 +387,6 @@ def _aggregate(
         ]

         groupby = self.keys_columns.copy()
-
         if window is not None:
             dataframe = dataframe.withColumn("window", window.get())
             groupby.append("window")
@@ -411,23 +410,19 @@ def _aggregate(
                 "keep_rn", functions.row_number().over(partition_window)
             ).filter("keep_rn = 1")

-        current_partitions = dataframe.rdd.getNumPartitions()
-        optimal_partitions = num_processors or current_partitions
-
-        if current_partitions != optimal_partitions:
-            dataframe = repartition_df(
-                dataframe,
-                partition_by=groupby,
-                num_processors=optimal_partitions,
-            )
-
+        # repartition to have all rows for each group at the same partition
+        # by doing that, we won't have to shuffle data on grouping by id
+        dataframe = repartition_df(
+            dataframe,
+            partition_by=groupby,
+            num_processors=num_processors,
+        )
         grouped_data = dataframe.groupby(*groupby)

-        if self._pivot_column and self._pivot_values:
+        if self._pivot_column:
             grouped_data = grouped_data.pivot(self._pivot_column, self._pivot_values)

         aggregated = grouped_data.agg(*aggregations)
-
         return self._with_renamed_columns(aggregated, features, window)

     def _with_renamed_columns(
@@ -576,12 +571,14 @@ def construct(

         pre_hook_df = self.run_pre_hooks(dataframe)

-        output_df = pre_hook_df
-        for feature in self.keys + [self.timestamp]:
-            output_df = feature.transform(output_df)
+        output_df = reduce(
+            lambda df, feature: feature.transform(df),
+            self.keys + [self.timestamp],
+            pre_hook_df,
+        )

         if self._windows and end_date is not None:
-            # Run aggregations for each window
+            # run aggregations for each window
             agg_list = [
                 self._aggregate(
                     dataframe=output_df,
@@ -601,12 +598,13 @@ def construct(

             # keeping this logic to maintain the same behavior for already implemented
             # feature sets
+
             if self._windows[0].slide == "1 day":
                 base_df = self._get_base_dataframe(
                     client=client, dataframe=output_df, end_date=end_date
                 )

-                # Left join each aggregation result to our base dataframe
+                # left join each aggregation result to our base dataframe
                 output_df = reduce(
                     lambda left, right: self._dataframe_join(
                         left,
@@ -639,18 +637,12 @@ def construct(
             output_df = output_df.select(*self.columns).replace(  # type: ignore
                 float("nan"), None
             )
-
-        if not output_df.isStreaming and self.deduplicate_rows:
-            output_df = self._filter_duplicated_rows(output_df)
+        if not output_df.isStreaming:
+            if self.deduplicate_rows:
+                output_df = self._filter_duplicated_rows(output_df)
+            if self.eager_evaluation:
+                output_df.cache().count()

         post_hook_df = self.run_post_hooks(output_df)

-        # Eager evaluation, only if needed and managable
-        if not output_df.isStreaming and self.eager_evaluation:
-            # Small dataframes only
-            if output_df.count() < 1_000_000:
-                post_hook_df.cache().count()
-            else:
-                post_hook_df.cache()  # Cache without materialization for large volumes
-
         return post_hook_df
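The restored `construct` threads the dataframe through each key and timestamp transform with `reduce`. A minimal sketch of that idiom, with a hypothetical `TransformStub` standing in for a butterfree feature and assuming a local SparkSession:

```python
from functools import reduce

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F


class TransformStub:  # hypothetical; real butterfree features implement transform()
    def __init__(self, column: str):
        self.column = column

    def transform(self, df: DataFrame) -> DataFrame:
        return df.withColumn(self.column, F.lit(0))


spark = SparkSession.builder.master("local[1]").appName("reduce-sketch").getOrCreate()

features = [TransformStub("key"), TransformStub("timestamp")]
output_df = reduce(lambda df, feature: feature.transform(df), features, spark.range(5))

print(output_df.columns)  # ['id', 'key', 'timestamp']
spark.stop()
```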

butterfree/transform/feature_set.py

Lines changed: 5 additions & 2 deletions
@@ -436,8 +436,11 @@ def construct(
             pre_hook_df,
         ).select(*self.columns)

-        if not output_df.isStreaming and self.deduplicate_rows:
-            output_df = self._filter_duplicated_rows(output_df)
+        if not output_df.isStreaming:
+            if self.deduplicate_rows:
+                output_df = self._filter_duplicated_rows(output_df)
+            if self.eager_evaluation:
+                output_df.cache().count()

         output_df = self.incremental_strategy.filter_with_incremental_strategy(
             dataframe=output_df, start_date=start_date, end_date=end_date
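The restored guard keeps both post-processing steps batch-only, since `count()` cannot run on a streaming DataFrame. A sketch of that structure as a standalone function, with `dropDuplicates()` standing in for the internal `_filter_duplicated_rows` (the `finalize` helper is hypothetical, not butterfree code):

```python
from pyspark.sql import DataFrame


def finalize(output_df: DataFrame, deduplicate_rows: bool, eager_evaluation: bool) -> DataFrame:
    if not output_df.isStreaming:  # cache()/count() are batch-only operations
        if deduplicate_rows:
            output_df = output_df.dropDuplicates()  # stand-in for _filter_duplicated_rows
        if eager_evaluation:
            output_df.cache().count()  # force evaluation so later stages reuse the cache
    return output_df
```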

tests/unit/butterfree/transform/test_feature_set.py

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ def test_construct(
             + feature_divide.get_output_columns()
         )
         assert_dataframe_equality(result_df, feature_set_dataframe)
-        assert not result_df.is_cached
+        assert result_df.is_cached

     def test_construct_invalid_df(
         self, key_id, timestamp_c, feature_add, feature_divide
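The flipped assertion relies on `DataFrame.is_cached` turning True as soon as `cache()` registers the plan, which `construct` now does again under eager evaluation. A quick standalone check of that property, assuming a local SparkSession (not part of the butterfree test suite):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("is-cached-sketch").getOrCreate()

df = spark.range(10)
assert not df.is_cached

df.cache()
assert df.is_cached  # registered for caching even before an action runs

spark.stop()
```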
