
Commit 3b2d2aa

[MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278)
* Add interval branch modifications.
* Add interval_runs notebook.
* Add tests.
* Apply style (black, flake8 and mypy).
* Fix tests.
* Change version to create package dev.
1 parent: daf7ca1

File tree: 58 files changed (+4736 additions, -389 deletions)

Some content is hidden: large commits have part of their diff hidden by default, so not every changed file is shown in full here.


.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -47,6 +47,7 @@ coverage.xml
 *.cover
 .hypothesis/
 *cov.xml
+test_folder/

 # Translations
 *.mo
```

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
```diff
@@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file.

 Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization.

+## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3)
+### Added
+* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282))
+
 ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3)
 ### Added
 * [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273))
```

butterfree/clients/cassandra_client.py

Lines changed: 19 additions & 22 deletions
```diff
@@ -33,33 +33,31 @@ class CassandraClient(AbstractClient):
     """Cassandra Client.

     Attributes:
-        cassandra_user: username to use in connection.
-        cassandra_password: password to use in connection.
-        cassandra_key_space: key space used in connection.
-        cassandra_host: cassandra endpoint used in connection.
+        user: username to use in connection.
+        password: password to use in connection.
+        keyspace: key space used in connection.
+        host: cassandra endpoint used in connection.
     """

     def __init__(
         self,
-        cassandra_host: List[str],
-        cassandra_key_space: str,
-        cassandra_user: Optional[str] = None,
-        cassandra_password: Optional[str] = None,
+        host: List[str],
+        keyspace: str,
+        user: Optional[str] = None,
+        password: Optional[str] = None,
     ) -> None:
-        self.cassandra_host = cassandra_host
-        self.cassandra_key_space = cassandra_key_space
-        self.cassandra_user = cassandra_user
-        self.cassandra_password = cassandra_password
+        self.host = host
+        self.keyspace = keyspace
+        self.user = user
+        self.password = password
        self._session: Optional[Session] = None

     @property
     def conn(self, *, ssl_path: str = None) -> Session:  # type: ignore
         """Establishes a Cassandra connection."""
         auth_provider = (
-            PlainTextAuthProvider(
-                username=self.cassandra_user, password=self.cassandra_password
-            )
-            if self.cassandra_user is not None
+            PlainTextAuthProvider(username=self.user, password=self.password)
+            if self.user is not None
             else None
         )
         ssl_opts = (
@@ -73,12 +71,12 @@ def conn(self, *, ssl_path: str = None) -> Session:  # type: ignore
         )

         cluster = Cluster(
-            contact_points=self.cassandra_host,
+            contact_points=self.host,
             auth_provider=auth_provider,
             ssl_options=ssl_opts,
             load_balancing_policy=RoundRobinPolicy(),
         )
-        self._session = cluster.connect(self.cassandra_key_space)
+        self._session = cluster.connect(self.keyspace)
         self._session.row_factory = dict_factory
         return self._session

@@ -106,16 +104,15 @@ def get_schema(self, table: str) -> List[Dict[str, str]]:
         """
         query = (
             f"SELECT column_name, type FROM system_schema.columns "  # noqa
-            f"WHERE keyspace_name = '{self.cassandra_key_space}' "  # noqa
+            f"WHERE keyspace_name = '{self.keyspace}' "  # noqa
             f" AND table_name = '{table}';"  # noqa
         )

         response = list(self.sql(query))

         if not response:
             raise RuntimeError(
-                f"No columns found for table: {table}"
-                f"in key space: {self.cassandra_key_space}"
+                f"No columns found for table: {table}" f"in key space: {self.keyspace}"
             )

         return response
@@ -143,7 +140,7 @@ def _get_create_table_query(
         else:
             columns_str = joined_parsed_columns

-        query = f"CREATE TABLE {self.cassandra_key_space}.{table} " f"({columns_str}); "
+        query = f"CREATE TABLE {self.keyspace}.{table} " f"({columns_str}); "

         return query
```
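For reference, a minimal usage sketch of the renamed constructor arguments; the hostnames, keyspace, credentials and table name below are placeholders, not values from this commit:

```python
from butterfree.clients import CassandraClient

# Placeholder connection settings; use your cluster's values.
client = CassandraClient(
    host=["127.0.0.1"],
    keyspace="feature_store",
    user="cassandra",
    password="cassandra",
)

# conn lazily builds the Cluster and returns a Session using dict_factory rows,
# so get_schema yields a list of {"column_name": ..., "type": ...} dicts.
schema = client.get_schema(table="user_features")
```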

butterfree/clients/spark_client.py

Lines changed: 53 additions & 5 deletions
```diff
@@ -34,9 +34,10 @@ def conn(self) -> SparkSession:
     def read(
         self,
         format: str,
-        options: Dict[str, Any],
+        path: Optional[Union[str, List[str]]] = None,
         schema: Optional[StructType] = None,
         stream: bool = False,
+        **options: Any,
     ) -> DataFrame:
         """Use the SparkSession.read interface to load data into a dataframe.

@@ -45,24 +46,27 @@ def read(

         Args:
             format: string with the format to be used by the DataframeReader.
-            options: options to setup the DataframeReader.
+            path: optional string or a list of string for file-system.
             stream: flag to indicate if data must be read in stream mode.
             schema: an optional pyspark.sql.types.StructType for the input schema.
+            options: options to setup the DataframeReader.

         Returns:
             Dataframe

         """
         if not isinstance(format, str):
             raise ValueError("format needs to be a string with the desired read format")
-        if not isinstance(options, dict):
-            raise ValueError("options needs to be a dict with the setup configurations")
+        if not isinstance(path, (str, list)):
+            raise ValueError("path needs to be a string or a list of string")

         df_reader: Union[
             DataStreamReader, DataFrameReader
         ] = self.conn.readStream if stream else self.conn.read
+
         df_reader = df_reader.schema(schema) if schema else df_reader
-        return df_reader.format(format).options(**options).load()
+
+        return df_reader.format(format).load(path, **options)  # type: ignore

     def read_table(self, table: str, database: str = None) -> DataFrame:
         """Use the SparkSession.read interface to read a metastore table.
```
```diff
@@ -223,3 +227,47 @@ def create_temporary_view(self, dataframe: DataFrame, name: str) -> Any:
         if not dataframe.isStreaming:
             return dataframe.createOrReplaceTempView(name)
         return dataframe.writeStream.format("memory").queryName(name).start()
+
+    def add_table_partitions(
+        self, partitions: List[Dict[str, Any]], table: str, database: str = None
+    ) -> None:
+        """Add partitions to an existing table.
+
+        Args:
+            partitions: partitions to add to the table.
+                It's expected a list of partition dicts to add to the table.
+                Example: `[{"year": 2020, "month": 8, "day": 14}, ...]`
+            table: table to add the partitions.
+            database: name of the database where the table is saved.
+        """
+        for partition_dict in partitions:
+            if not all(
+                (
+                    isinstance(key, str)
+                    and (isinstance(value, str) or isinstance(value, int))
+                )
+                for key, value in partition_dict.items()
+            ):
+                raise ValueError(
+                    "Partition keys must be column names "
+                    "and values must be string or int."
+                )
+
+        database_expr = f"`{database}`." if database else ""
+        key_values_expr = [
+            ", ".join(
+                [
+                    "{} = {}".format(k, v)
+                    if not isinstance(v, str)
+                    else "{} = '{}'".format(k, v)
+                    for k, v in partition.items()
+                ]
+            )
+            for partition in partitions
+        ]
+        partitions_expr = " ".join(f"PARTITION ( {expr} )" for expr in key_values_expr)
+        command = (
+            f"ALTER TABLE {database_expr}`{table}` ADD IF NOT EXISTS {partitions_expr}"
+        )
+
+        self.conn.sql(command)
```
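As a usage sketch of the new method, each partition dict becomes one PARTITION clause in a single ALTER TABLE statement; the table and database names below are placeholders:

```python
from butterfree.clients import SparkClient

spark_client = SparkClient()

# Registers metastore partitions without rewriting any data.
spark_client.add_table_partitions(
    partitions=[
        {"year": 2020, "month": 8, "day": 14},
        {"year": 2020, "month": 8, "day": 15},
    ],
    table="user_features",
    database="feature_store",
)
# Roughly equivalent to:
# ALTER TABLE `feature_store`.`user_features` ADD IF NOT EXISTS
#   PARTITION ( year = 2020, month = 8, day = 14 )
#   PARTITION ( year = 2020, month = 8, day = 15 )
```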

butterfree/configs/db/metastore_config.py

Lines changed: 28 additions & 0 deletions
```diff
@@ -3,8 +3,11 @@
 import os
 from typing import Any, Dict, List, Optional

+from pyspark.sql import DataFrame
+
 from butterfree.configs import environment
 from butterfree.configs.db import AbstractWriteConfig
+from butterfree.dataframe_service import extract_partition_values


 class MetastoreConfig(AbstractWriteConfig):
@@ -87,6 +90,31 @@ def get_options(self, key: str) -> Dict[Optional[str], Optional[str]]:
             "path": os.path.join(f"{self.file_system}://{self.path}/", key),
         }

+    def get_path_with_partitions(self, key: str, dataframe: DataFrame) -> List:
+        """Get options for AWS S3 from partitioned parquet file.
+
+        Options will be a dictionary with the write and read configuration for
+        Spark to AWS S3.
+
+        Args:
+            key: path to save data into AWS S3 bucket.
+            dataframe: spark dataframe containing data from a feature set.
+
+        Returns:
+            A list of string for file-system backed data sources.
+        """
+        path_list = []
+        dataframe_values = extract_partition_values(
+            dataframe, partition_columns=["year", "month", "day"]
+        )
+        for row in dataframe_values:
+            path_list.append(
+                f"{self.file_system}://{self.path}/{key}/year={row['year']}/"
+                f"month={row['month']}/day={row['day']}"
+            )
+
+        return path_list
+
     def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Translate feature set spark schema to the corresponding database."""
         pass
```
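A sketch of the paths this new method produces, assuming a MetastoreConfig constructed with file_system "s3a" and path "my-bucket" (illustrative values, and assuming the existing constructor accepts these keyword arguments) and a dataframe partitioned by year/month/day:

```python
from pyspark.sql import SparkSession

from butterfree.configs.db import MetastoreConfig

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 2020, 8, 14), (2, 2020, 8, 15)], ["id", "year", "month", "day"]
)

config = MetastoreConfig(path="my-bucket", file_system="s3a")
paths = config.get_path_with_partitions(key="user_features", dataframe=df)
# ["s3a://my-bucket/user_features/year=2020/month=8/day=14",
#  "s3a://my-bucket/user_features/year=2020/month=8/day=15"]
```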

butterfree/configs/environment.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -35,8 +35,8 @@ def get_variable(variable_name: str, default_value: str = None) -> Optional[str]
     """Gets an environment variable.

     The variable comes from it's explicitly declared value in the running
-    environment or from the default value declared in the environment.yaml
-    specification or from the default_value.
+    environment or from the default value declared in specification or from the
+    default_value.

     Args:
         variable_name: environment variable name.
```
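For context, a small usage sketch of the lookup order the docstring describes (explicit environment value first, then the declared specification default, then the caller's default_value); the variable name is assumed to be one declared in the library's specification:

```python
import os

from butterfree.configs import environment

# 1. An explicitly exported environment value takes precedence.
os.environ["FEATURE_STORE_S3_BUCKET"] = "my-bucket"
bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET")  # "my-bucket"

# 2. When the variable is unset and has no declared default,
#    the default_value argument is returned instead.
del os.environ["FEATURE_STORE_S3_BUCKET"]
bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET", default_value="fallback-bucket")
```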
Lines changed: 16 additions & 0 deletions
```diff
@@ -0,0 +1,16 @@
+"""Allowed windows units and lengths in seconds."""
+
+ALLOWED_WINDOWS = {
+    "second": 1,
+    "seconds": 1,
+    "minute": 60,
+    "minutes": 60,
+    "hour": 3600,
+    "hours": 3600,
+    "day": 86400,
+    "days": 86400,
+    "week": 604800,
+    "weeks": 604800,
+    "year": 29030400,
+    "years": 29030400,
+}
```
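To illustrate how these constants resolve window definitions, a small sketch converting a (length, unit) pair into seconds; the helper name is hypothetical and ALLOWED_WINDOWS is assumed importable from the new constants module shown above:

```python
def window_to_seconds(length: int, unit: str) -> int:
    """Convert a window definition such as (7, "days") into seconds."""
    if unit not in ALLOWED_WINDOWS:
        raise ValueError(f"Unsupported window unit: {unit}")
    return length * ALLOWED_WINDOWS[unit]


assert window_to_seconds(7, "days") == 604800     # 7 * 86400
assert window_to_seconds(2, "weeks") == 1209600   # 2 * 604800
```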
butterfree/dataframe_service/__init__.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -1,4 +1,11 @@
 """Dataframe optimization components regarding Butterfree."""
+from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy
+from butterfree.dataframe_service.partitioning import extract_partition_values
 from butterfree.dataframe_service.repartition import repartition_df, repartition_sort_df

-__all__ = ["repartition_df", "repartition_sort_df"]
+__all__ = [
+    "extract_partition_values",
+    "IncrementalStrategy",
+    "repartition_df",
+    "repartition_sort_df",
+]
```
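Based on how MetastoreConfig.get_path_with_partitions consumes it above, extract_partition_values appears to return one dict per distinct combination of the requested partition columns. A hedged sketch of the newly exported helper; the sample dataframe is illustrative:

```python
from pyspark.sql import SparkSession

from butterfree.dataframe_service import extract_partition_values

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 2020, 8, 14), (2, 2020, 8, 14), (3, 2020, 8, 15)],
    ["id", "year", "month", "day"],
)

values = extract_partition_values(df, partition_columns=["year", "month", "day"])
# Expected shape:
# [{"year": 2020, "month": 8, "day": 14}, {"year": 2020, "month": 8, "day": 15}]
```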
