-
Notifications
You must be signed in to change notification settings - Fork 1.2k
feat: Support aggregation in odfv #5666
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a101f0f
398dbfa
5ccd4ea
6a39899
62af43f
81466fa
b9541a1
2e85113
6c7cf7e
41d20bc
db15d1f
08d0f2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |
| RequestDataNotFoundInEntityRowsException, | ||
| ) | ||
| from feast.field import Field | ||
| from feast.infra.compute_engines.backends.pandas_backend import PandasBackend | ||
| from feast.infra.key_encoding_utils import deserialize_entity_key | ||
| from feast.protos.feast.serving.ServingService_pb2 import ( | ||
| FieldStatus, | ||
|
|
@@ -561,6 +562,76 @@ def construct_response_feature_vector( | |
| ) | ||
|
|
||
|
|
||
| def _get_aggregate_operations(agg_specs) -> dict: | ||
| """ | ||
| Convert Aggregation specs to agg_ops format for PandasBackend. | ||
|
|
||
| Reused from LocalFeatureBuilder logic. | ||
| TODO: This logic is duplicated from feast.infra.compute_engines.local.feature_builder.LocalFeatureBuilder._get_aggregate_operations(). | ||
| Consider refactoring to a shared utility module in the future. | ||
| """ | ||
| agg_ops = {} | ||
| for agg in agg_specs: | ||
| if agg.time_window is not None: | ||
| raise ValueError( | ||
| "Time window aggregation is not supported in online serving." | ||
| ) | ||
| alias = f"{agg.function}_{agg.column}" | ||
| agg_ops[alias] = (agg.function, agg.column) | ||
| return agg_ops | ||
|
|
||
|
|
||
| def _apply_aggregations_to_response( | ||
| response_data: Union[pyarrow.Table, Dict[str, List[Any]]], | ||
| aggregations, | ||
| group_keys: Optional[List[str]], | ||
| mode: str, | ||
| ) -> Union[pyarrow.Table, Dict[str, List[Any]]]: | ||
| """ | ||
| Apply aggregations using PandasBackend. | ||
|
|
||
| Args: | ||
| response_data: Either a pyarrow.Table or dict of lists containing the data | ||
| aggregations: List of Aggregation objects to apply | ||
| group_keys: List of column names to group by (optional) | ||
| mode: Transformation mode ("python", "pandas", or "substrait") | ||
|
|
||
| Returns: | ||
| Aggregated data in the same format as input | ||
|
|
||
| TODO: Consider refactoring to support backends other than pandas in the future. | ||
| """ | ||
| if not aggregations: | ||
| return response_data | ||
|
|
||
| backend = PandasBackend() | ||
|
|
||
| # Convert to pandas DataFrame | ||
| if isinstance(response_data, dict): | ||
| df = pd.DataFrame(response_data) | ||
| else: # pyarrow.Table | ||
| df = backend.from_arrow(response_data) | ||
|
|
||
| if df.empty: | ||
| return response_data | ||
|
|
||
| # Convert aggregations to agg_ops format | ||
| agg_ops = _get_aggregate_operations(aggregations) | ||
|
|
||
| # Apply aggregations using PandasBackend | ||
| if group_keys: | ||
| result_df = backend.groupby_agg(df, group_keys, agg_ops) | ||
| else: | ||
| # No grouping - aggregate over entire dataset | ||
| result_df = backend.groupby_agg(df, [], agg_ops) | ||
HaoXuAI marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # Convert back to original format | ||
| if mode == "python": | ||
| return {col: result_df[col].tolist() for col in result_df.columns} | ||
| else: # pandas or substrait | ||
| return backend.to_arrow(result_df) | ||
|
|
||
|
|
||
| def _augment_response_with_on_demand_transforms( | ||
| online_features_response: GetOnlineFeaturesResponse, | ||
| feature_refs: List[str], | ||
|
|
@@ -605,7 +676,31 @@ def _augment_response_with_on_demand_transforms( | |
| for odfv_name, _feature_refs in odfv_feature_refs.items(): | ||
| odfv = requested_odfv_map[odfv_name] | ||
| if not odfv.write_to_online_store: | ||
| if odfv.mode == "python": | ||
| # Apply aggregations if configured. | ||
| if odfv.aggregations: | ||
| if odfv.mode == "python": | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently if this is a python input, we don't have aggregation function to use OOTD, so it use the
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does OOTD mean ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OOTB :) |
||
| if initial_response_dict is None: | ||
| initial_response_dict = initial_response.to_dict() | ||
| initial_response_dict = _apply_aggregations_to_response( | ||
| initial_response_dict, | ||
| odfv.aggregations, | ||
| odfv.entities, | ||
| odfv.mode, | ||
| ) | ||
| elif odfv.mode in {"pandas", "substrait"}: | ||
| if initial_response_arrow is None: | ||
| initial_response_arrow = initial_response.to_arrow() | ||
| initial_response_arrow = _apply_aggregations_to_response( | ||
| initial_response_arrow, | ||
| odfv.aggregations, | ||
| odfv.entities, | ||
| odfv.mode, | ||
| ) | ||
|
|
||
| # Apply transformation. Note: aggregations and transformation configs are mutually exclusive | ||
| # TODO: Fix to make it work for having both aggregation and transformation | ||
| # ticket: https://github.com/feast-dev/feast/issues/5689 | ||
| elif odfv.mode == "python": | ||
| if initial_response_dict is None: | ||
| initial_response_dict = initial_response.to_dict() | ||
| transformed_features_dict: Dict[str, List[Any]] = odfv.transform_dict( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| # Copyright 2025 The Feast Authors | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # https://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """Tests for OnDemandFeatureView aggregations in online serving.""" | ||
|
|
||
| import pyarrow as pa | ||
|
|
||
| from feast.aggregation import Aggregation | ||
| from feast.utils import _apply_aggregations_to_response | ||
|
|
||
|
|
||
| def test_aggregation_python_mode(): | ||
| """Test aggregations in Python mode (dict format).""" | ||
| data = { | ||
| "driver_id": [1, 1, 2, 2], | ||
| "trips": [10, 20, 15, 25], | ||
| } | ||
| aggs = [Aggregation(column="trips", function="sum")] | ||
|
|
||
| result = _apply_aggregations_to_response(data, aggs, ["driver_id"], "python") | ||
|
|
||
| assert result == {"driver_id": [1, 2], "sum_trips": [30, 40]} | ||
|
|
||
|
|
||
| def test_aggregation_pandas_mode(): | ||
| """Test aggregations in Pandas mode (Arrow table format).""" | ||
| table = pa.table( | ||
| { | ||
| "driver_id": [1, 1, 2, 2], | ||
| "trips": [10, 20, 15, 25], | ||
| } | ||
| ) | ||
| aggs = [Aggregation(column="trips", function="sum")] | ||
|
|
||
| result = _apply_aggregations_to_response(table, aggs, ["driver_id"], "pandas") | ||
|
|
||
| assert isinstance(result, pa.Table) | ||
| result_df = result.to_pandas() | ||
| assert list(result_df["driver_id"]) == [1, 2] | ||
| assert list(result_df["sum_trips"]) == [30, 40] | ||
|
|
||
|
|
||
| def test_multiple_aggregations(): | ||
| """Test multiple aggregation functions.""" | ||
| data = { | ||
| "driver_id": [1, 1, 2, 2], | ||
| "trips": [10, 20, 15, 25], | ||
| "revenue": [100.0, 200.0, 150.0, 250.0], | ||
| } | ||
| aggs = [ | ||
| Aggregation(column="trips", function="sum"), | ||
| Aggregation(column="revenue", function="mean"), | ||
| ] | ||
|
|
||
| result = _apply_aggregations_to_response(data, aggs, ["driver_id"], "python") | ||
|
|
||
| assert result["driver_id"] == [1, 2] | ||
| assert result["sum_trips"] == [30, 40] | ||
| assert result["mean_revenue"] == [150.0, 200.0] | ||
|
|
||
|
|
||
| def test_no_aggregations_returns_original(): | ||
| """Test that no aggregations returns original data.""" | ||
| data = {"driver_id": [1, 2], "trips": [10, 20]} | ||
|
|
||
| result = _apply_aggregations_to_response(data, [], ["driver_id"], "python") | ||
|
|
||
| assert result == data | ||
|
|
||
|
|
||
| def test_empty_data_returns_empty(): | ||
| """Test that empty data returns empty result.""" | ||
| data = {"driver_id": [], "trips": []} | ||
| aggs = [Aggregation(column="trips", function="sum")] | ||
|
|
||
| result = _apply_aggregations_to_response(data, aggs, ["driver_id"], "python") | ||
|
|
||
| assert result == data |
Uh oh!
There was an error while loading. Please reload this page.