from feast import FeatureView, Field
from feast.types import Float32, Int32
+ from tests.integration.feature_repos.repo_configuration import (
+     construct_universal_feature_views,
+ )
from tests.integration.feature_repos.universal.entities import driver

- # TODO(felixwang9817): Add a unit test that checks that write_to_offline_store can reorder columns.
- # This should only happen after https://github.com/feast-dev/feast/issues/2797 is fixed.
-

@pytest.mark.integration
@pytest.mark.universal_offline_stores
- @pytest.mark.universal_online_stores(only=["sqlite"])
- def test_writing_incorrect_schema_fails(environment, universal_data_sources):
-     """Tests that writing a dataframe with an incorrect schema fails."""
+ def test_reorder_columns(environment, universal_data_sources):
+     """Tests that a dataframe with columns in the wrong order is reordered."""
    store = environment.feature_store
    _, _, data_sources = universal_data_sources
-     driver_entity = driver()
-     driver_stats = FeatureView(
-         name="driver_stats",
-         entities=[driver_entity],
-         schema=[
-             Field(name="avg_daily_trips", dtype=Int32),
-             Field(name="conv_rate", dtype=Float32),
-             Field(name="acc_rate", dtype=Float32),
-         ],
-         source=data_sources.driver,
-     )
+     feature_views = construct_universal_feature_views(data_sources)
+     driver_fv = feature_views.driver
+     store.apply([driver(), driver_fv])

    now = datetime.utcnow()
    ts = pd.Timestamp(now).round("ms")

-     entity_df = pd.DataFrame.from_dict(
-         {"driver_id": [1001, 1002], "event_timestamp": [ts - timedelta(hours=3), ts]}
+     # This dataframe has columns in the wrong order.
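+     # Assumption: write_to_offline_store reorders these to match the feature view schema.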
+     df_to_write = pd.DataFrame.from_dict(
+         {
+             "avg_daily_trips": [random.randint(0, 10), random.randint(0, 10)],
+             "created": [ts, ts],
+             "conv_rate": [random.random(), random.random()],
+             "event_timestamp": [ts, ts],
+             "acc_rate": [random.random(), random.random()],
+             "driver_id": [1001, 1001],
+         },
    )

-     store.apply([driver_entity, driver_stats])
-     df = store.get_historical_features(
-         entity_df=entity_df,
-         features=[
-             "driver_stats:conv_rate",
-             "driver_stats:acc_rate",
-             "driver_stats:avg_daily_trips",
-         ],
-         full_feature_names=False,
-     ).to_df()
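+     # The write should succeed even though the columns are out of order.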
+     store.write_to_offline_store(
+         driver_fv.name, df_to_write, allow_registry_cache=False
+     )

-     assert df["conv_rate"].isnull().all()
-     assert df["acc_rate"].isnull().all()
-     assert df["avg_daily_trips"].isnull().all()
+
+ @pytest.mark.integration
+ @pytest.mark.universal_offline_stores
+ def test_writing_incorrect_schema_fails(environment, universal_data_sources):
+     """Tests that writing a dataframe with an incorrect schema fails."""
+     store = environment.feature_store
+     _, _, data_sources = universal_data_sources
+     feature_views = construct_universal_feature_views(data_sources)
+     driver_fv = feature_views.driver
+     store.apply([driver(), driver_fv])
+
+     now = datetime.utcnow()
+     ts = pd.Timestamp(now).round("ms")

    expected_df = pd.DataFrame.from_dict(
        {
@@ -65,13 +67,12 @@ def test_writing_incorrect_schema_fails(environment, universal_data_sources):
    )
    with pytest.raises(ValueError):
        store.write_to_offline_store(
-             driver_stats.name, expected_df, allow_registry_cache=False
+             driver_fv.name, expected_df, allow_registry_cache=False
        )


@pytest.mark.integration
@pytest.mark.universal_offline_stores
- @pytest.mark.universal_online_stores(only=["sqlite"])
def test_writing_consecutively_to_offline_store(environment, universal_data_sources):
    store = environment.feature_store
    _, _, data_sources = universal_data_sources
@@ -96,7 +97,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sources):
    entity_df = pd.DataFrame.from_dict(
        {
            "driver_id": [1001, 1001],
-             "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)],
+             "event_timestamp": [ts + timedelta(hours=3), ts + timedelta(hours=4)],
        }
    )

@@ -117,7 +118,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sources):

    first_df = pd.DataFrame.from_dict(
        {
-             "event_timestamp": [ts - timedelta(hours=4), ts - timedelta(hours=3)],
+             "event_timestamp": [ts + timedelta(hours=3), ts + timedelta(hours=4)],
            "driver_id": [1001, 1001],
            "conv_rate": [random.random(), random.random()],
            "acc_rate": [random.random(), random.random()],
@@ -155,7 +156,7 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sources):

    second_df = pd.DataFrame.from_dict(
        {
-             "event_timestamp": [ts - timedelta(hours=1), ts],
+             "event_timestamp": [ts + timedelta(hours=5), ts + timedelta(hours=6)],
            "driver_id": [1001, 1001],
            "conv_rate": [random.random(), random.random()],
            "acc_rate": [random.random(), random.random()],
@@ -172,10 +173,10 @@ def test_writing_consecutively_to_offline_store(environment, universal_data_sources):
        {
            "driver_id": [1001, 1001, 1001, 1001],
            "event_timestamp": [
-                 ts - timedelta(hours=4),
-                 ts - timedelta(hours=3),
-                 ts - timedelta(hours=1),
-                 ts,
+                 ts + timedelta(hours=3),
+                 ts + timedelta(hours=4),
+                 ts + timedelta(hours=5),
+                 ts + timedelta(hours=6),
            ],
        }
    )