@@ -2,6 +2,8 @@
 
 from typing import List, Optional
 
+from pyspark.storagelevel import StorageLevel
+
 from butterfree.clients import SparkClient
 from butterfree.dataframe_service import repartition_sort_df
 from butterfree.extract import Source
@@ -209,35 +211,46 @@ def run(
             soon. Use only if strictly necessary.
 
         """
+        # Step 1: Construct input dataframe from the source.
         dataframe = self.source.construct(
             client=self.spark_client,
             start_date=self.feature_set.define_start_date(start_date),
             end_date=end_date,
         )
 
+        # Step 2: Repartition and sort if required, avoid if not necessary.
         if partition_by:
             order_by = order_by or partition_by
-            dataframe = repartition_sort_df(
-                dataframe, partition_by, order_by, num_processors
-            )
-
-        dataframe = self.feature_set.construct(
+            current_partitions = dataframe.rdd.getNumPartitions()
+            optimal_partitions = num_processors or current_partitions
+            if current_partitions != optimal_partitions:
+                dataframe = repartition_sort_df(
+                    dataframe, partition_by, order_by, num_processors
+                )
+
+        # Step 3: Construct the feature set dataframe using defined transformations.
+        transformed_dataframe = self.feature_set.construct(
             dataframe=dataframe,
             client=self.spark_client,
             start_date=start_date,
             end_date=end_date,
             num_processors=num_processors,
         )
 
+        if dataframe.storageLevel != StorageLevel.NONE:
+            dataframe.unpersist()  # Clear the data from the cache (disk and memory)
+
+        # Step 4: Load the data into the configured sink.
         self.sink.flush(
-            dataframe=dataframe,
+            dataframe=transformed_dataframe,
             feature_set=self.feature_set,
             spark_client=self.spark_client,
         )
 
-        if not dataframe.isStreaming:
+        # Step 5: Validate the output if not streaming and data volume is reasonable.
+        if not transformed_dataframe.isStreaming:
             self.sink.validate(
-                dataframe=dataframe,
+                dataframe=transformed_dataframe,
                 feature_set=self.feature_set,
                 spark_client=self.spark_client,
             )
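
As a quick illustration of the two patterns this change introduces, here is a minimal standalone sketch (not part of the PR; the Spark session, dataframe, key column, and target_partitions value are illustrative assumptions): repartition only when the partition count would actually change, and release a cached dataframe once its downstream results have been materialized.

# Minimal sketch, not from the PR: df, "key", and target_partitions are assumptions.
from pyspark.sql import SparkSession, functions as F
from pyspark.storagelevel import StorageLevel

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.range(1000).withColumn("key", F.col("id") % 10)

target_partitions = 4  # stand-in for num_processors
current_partitions = df.rdd.getNumPartitions()

# Repartition only when the layout would actually change; repartitioning to the
# same number of partitions is a full shuffle with no benefit.
if current_partitions != target_partitions:
    df = df.repartition(target_partitions, "key").sortWithinPartitions("key")

df.persist(StorageLevel.MEMORY_AND_DISK)
result = df.groupBy("key").count()  # downstream work derived from df
result.collect()

# Once the derived results are materialized, drop the cached blocks, mirroring
# the storageLevel check before unpersist() in the diff above.
if df.storageLevel != StorageLevel.NONE:
    df.unpersist()

In the pipeline itself, the same guard keeps repartition_sort_df from shuffling a dataframe that already has the requested number of partitions, and unpersist() releases the source dataframe's cache (if it was cached at all) once the feature set has been constructed from it.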