-
Notifications
You must be signed in to change notification settings - Fork 1.2k
feat: Support compute engine to use multi feature views as source #5482
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
87429ca
98b6179
89bac63
8ef4f4f
74a5dee
0d64399
53bb6fb
ea50f43
5e74aa8
07f5e0b
d2eb23f
9122f4b
15f8bde
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ | |
import copy | ||
import warnings | ||
from datetime import datetime, timedelta | ||
from typing import Dict, List, Optional, Tuple, Type | ||
from typing import Dict, List, Optional, Tuple, Type, Union | ||
|
||
from google.protobuf.duration_pb2 import Duration | ||
from google.protobuf.message import Message | ||
|
@@ -90,6 +90,7 @@ class FeatureView(BaseFeatureView): | |
ttl: Optional[timedelta] | ||
batch_source: DataSource | ||
stream_source: Optional[DataSource] | ||
source_views: Optional[List["FeatureView"]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In same for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch |
||
entity_columns: List[Field] | ||
features: List[Field] | ||
online: bool | ||
|
@@ -103,7 +104,8 @@ def __init__( | |
self, | ||
*, | ||
name: str, | ||
source: DataSource, | ||
source: Union[DataSource, "FeatureView", List["FeatureView"]], | ||
sink_source: Optional[DataSource] = None, | ||
schema: Optional[List[Field]] = None, | ||
entities: Optional[List[Entity]] = None, | ||
ttl: Optional[timedelta] = timedelta(days=0), | ||
|
@@ -144,22 +146,45 @@ def __init__( | |
self.ttl = ttl | ||
schema = schema or [] | ||
|
||
# Initialize data sources. | ||
# Normalize source | ||
self.stream_source = None | ||
self.data_source: Optional[DataSource] = None | ||
self.source_views: List[FeatureView] = [] | ||
|
||
if isinstance(source, DataSource): | ||
self.data_source = source | ||
elif isinstance(source, FeatureView): | ||
self.source_views = [source] | ||
elif isinstance(source, list) and all( | ||
isinstance(sv, FeatureView) for sv in source | ||
): | ||
self.source_views = source | ||
else: | ||
raise TypeError( | ||
"source must be a DataSource, a FeatureView, or a list of FeatureView." | ||
) | ||
|
||
# Set up stream, batch and derived view sources | ||
if ( | ||
isinstance(source, PushSource) | ||
or isinstance(source, KafkaSource) | ||
or isinstance(source, KinesisSource) | ||
isinstance(self.data_source, PushSource) | ||
or isinstance(self.data_source, KafkaSource) | ||
or isinstance(self.data_source, KinesisSource) | ||
): | ||
self.stream_source = source | ||
if not source.batch_source: | ||
# Stream source definition | ||
self.stream_source = self.data_source | ||
if not self.data_source.batch_source: | ||
raise ValueError( | ||
f"A batch_source needs to be specified for stream source `{source.name}`" | ||
f"A batch_source needs to be specified for stream source `{self.data_source.name}`" | ||
) | ||
else: | ||
self.batch_source = source.batch_source | ||
self.batch_source = self.data_source.batch_source | ||
elif self.data_source: | ||
# Batch source definition | ||
self.batch_source = self.data_source | ||
else: | ||
self.stream_source = None | ||
self.batch_source = source | ||
# Derived view source definition | ||
if not sink_source: | ||
raise ValueError("Derived FeatureView must specify `sink_source`.") | ||
self.batch_source = sink_source | ||
|
||
# Initialize features and entity columns. | ||
features: List[Field] = [] | ||
|
@@ -201,25 +226,26 @@ def __init__( | |
) | ||
|
||
# TODO(felixwang9817): Add more robust validation of features. | ||
cols = [field.name for field in schema] | ||
for col in cols: | ||
if ( | ||
self.batch_source.field_mapping is not None | ||
and col in self.batch_source.field_mapping.keys() | ||
): | ||
raise ValueError( | ||
f"The field {col} is mapped to {self.batch_source.field_mapping[col]} for this data source. " | ||
f"Please either remove this field mapping or use {self.batch_source.field_mapping[col]} as the " | ||
f"Entity or Feature name." | ||
) | ||
if self.batch_source is not None: | ||
cols = [field.name for field in schema] | ||
for col in cols: | ||
if ( | ||
self.batch_source.field_mapping is not None | ||
and col in self.batch_source.field_mapping.keys() | ||
): | ||
raise ValueError( | ||
f"The field {col} is mapped to {self.batch_source.field_mapping[col]} for this data source. " | ||
f"Please either remove this field mapping or use {self.batch_source.field_mapping[col]} as the " | ||
f"Entity or Feature name." | ||
) | ||
|
||
super().__init__( | ||
name=name, | ||
features=features, | ||
description=description, | ||
tags=tags, | ||
owner=owner, | ||
source=source, | ||
source=self.batch_source, | ||
) | ||
self.online = online | ||
self.offline = offline | ||
|
@@ -348,13 +374,18 @@ def to_proto(self) -> FeatureViewProto: | |
meta = self.to_proto_meta() | ||
ttl_duration = self.get_ttl_duration() | ||
|
||
batch_source_proto = self.batch_source.to_proto() | ||
batch_source_proto.data_source_class_type = f"{self.batch_source.__class__.__module__}.{self.batch_source.__class__.__name__}" | ||
batch_source_proto = None | ||
if self.batch_source: | ||
batch_source_proto = self.batch_source.to_proto() | ||
batch_source_proto.data_source_class_type = f"{self.batch_source.__class__.__module__}.{self.batch_source.__class__.__name__}" | ||
|
||
stream_source_proto = None | ||
if self.stream_source: | ||
stream_source_proto = self.stream_source.to_proto() | ||
stream_source_proto.data_source_class_type = f"{self.stream_source.__class__.__module__}.{self.stream_source.__class__.__name__}" | ||
source_view_protos = None | ||
if self.source_views: | ||
source_view_protos = [view.to_proto().spec for view in self.source_views] | ||
spec = FeatureViewSpecProto( | ||
name=self.name, | ||
entities=self.entities, | ||
|
@@ -368,6 +399,7 @@ def to_proto(self) -> FeatureViewProto: | |
offline=self.offline, | ||
batch_source=batch_source_proto, | ||
stream_source=stream_source_proto, | ||
source_views=source_view_protos, | ||
) | ||
|
||
return FeatureViewProto(spec=spec, meta=meta) | ||
|
@@ -403,12 +435,21 @@ def from_proto(cls, feature_view_proto: FeatureViewProto): | |
Returns: | ||
A FeatureViewProto object based on the feature view protobuf. | ||
""" | ||
batch_source = DataSource.from_proto(feature_view_proto.spec.batch_source) | ||
batch_source = ( | ||
DataSource.from_proto(feature_view_proto.spec.batch_source) | ||
if feature_view_proto.spec.HasField("batch_source") | ||
else None | ||
) | ||
stream_source = ( | ||
DataSource.from_proto(feature_view_proto.spec.stream_source) | ||
if feature_view_proto.spec.HasField("stream_source") | ||
else None | ||
) | ||
source_views = [ | ||
FeatureView.from_proto(FeatureViewProto(spec=view_spec, meta=None)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We might need to handle this in FeatureView.from_proto(). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, do we not need to store metadata for nested feature views ? meta=None ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need both cycle detection and de-duplication during serialization. It may not cause issue for few feature views but if there are many feature views, it could cause slowness. A -> [B, C] When serializing FeatureViewA = FeatureViewD and FeatureViewE get serialized twice. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right make sense. |
||
for view_spec in feature_view_proto.spec.source_views | ||
] | ||
|
||
feature_view = cls( | ||
name=feature_view_proto.spec.name, | ||
description=feature_view_proto.spec.description, | ||
|
@@ -421,7 +462,7 @@ def from_proto(cls, feature_view_proto: FeatureViewProto): | |
if feature_view_proto.spec.ttl.ToNanoseconds() == 0 | ||
else feature_view_proto.spec.ttl.ToTimedelta() | ||
), | ||
source=batch_source, | ||
source=batch_source if batch_source else source_views, | ||
) | ||
if stream_source: | ||
feature_view.stream_source = stream_source | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from typing import List, Set | ||
|
||
from feast.infra.compute_engines.dag.node import DAGNode | ||
|
||
|
||
def topo_sort(root: DAGNode) -> List[DAGNode]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not call it topological_sort? |
||
""" | ||
Topologically sort a DAG starting from a single root node. | ||
Args: | ||
root: The root DAGNode. | ||
Returns: | ||
A list of DAGNodes in topological order (dependencies first). | ||
""" | ||
return topo_sort_multiple([root]) | ||
|
||
|
||
def topo_sort_multiple(roots: List[DAGNode]) -> List[DAGNode]: | ||
""" | ||
Topologically sort a DAG with multiple roots. | ||
Args: | ||
roots: List of root DAGNodes. | ||
Returns: | ||
A list of all reachable DAGNodes in execution-safe order. | ||
""" | ||
visited: Set[int] = set() | ||
ordered: List[DAGNode] = [] | ||
|
||
def dfs(node: DAGNode): | ||
if id(node) in visited: | ||
return | ||
visited.add(id(node)) | ||
for input_node in node.inputs: | ||
dfs(input_node) | ||
ordered.append(node) | ||
|
||
for root in roots: | ||
dfs(root) | ||
|
||
return ordered |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we should just call it sink?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good, will update it