6 changes: 6 additions & 0 deletions auto_doc.py

@@ -72,6 +72,12 @@
             "hsfs.storage_connector.StorageConnector"
         ),
     },
+    "query_vs_dataframe.md": {
+        "query_methods": keras_autodoc.get_methods("hsfs.constructor.query.Query"),
+        "query_properties": keras_autodoc.get_properties(
+            "hsfs.constructor.query.Query"
+        ),
+    },
     "api/connection_api.md": {
         "connection": ["hsfs.connection.Connection"],
         "connection_properties": keras_autodoc.get_properties(
4 changes: 2 additions & 2 deletions docs/overview.md

@@ -50,7 +50,7 @@ Entities within the Feature Store are organized hierarchically. On the most gran

[**Feature Groups**](generated/feature_group.md) are entities that contain metadata about the grouped features, information about the jobs used to ingest the data contained in a feature group, and the actual location of the data (HopsFS or external storage, such as S3). Typically, feature groups represent a logical set of features coming from the same data source and sharing a common primary key. Feature groups also contain the schema and type information of the features, so that users know how to interpret the data.

-Feature groups can also be used to compute [Statistics](generated/statistics.md) over features, or to define [Data Validation Rules](generated/data_validation.md) using the statistics and schema information.
+Feature groups can also be used to compute Statistics over features, or to define Data Validation Rules using the statistics and schema information.

In order to enable [online serving](overview.md#online-vs-offline-feature-store) for features of a feature group, the feature group needs to be made available as an online feature group.

@@ -60,7 +60,7 @@ In order to be able to train machine learning models efficiently, the feature da

Training datasets can be created with features from any number of feature groups, as long as the feature groups can be joined in a meaningful way.

-Users are also able to compute [Statistics](generated/statistics.md) for training datasets, which makes it easy to understand a dataset's characteristics in the future.
+Users are also able to compute Statistics for training datasets, which makes it easy to understand a dataset's characteristics in the future.

The Hopsworks Feature Store has support for writing training datasets either to the distributed file system of Hopsworks - HopsFS - or to external storage such as S3.

2 changes: 1 addition & 1 deletion docs/quickstart.md

@@ -23,7 +23,7 @@ The Hopsworks feature store library is called `hsfs` (**H**opswork**s**
The library is Apache V2 licensed and available [here](https://github.com/logicalclocks/feature-store-api). The library is currently available for Python and JVM languages such as Scala and Java.
If you want to connect to the Feature Store from outside Hopsworks, see our [integration guides](setup.md).

-The library is built around metadata-objects, representing entities within the Feature Store. You can modify metadata by changing it in the metadata-objects and subsequently persisting it to the Feature Store. In fact, the Feature Store itself is also represented by an object. Furthermore, these objects have methods to save data along with the entities in the feature store. This data can be materialized from [Spark or Pandas DataFrames, or the `HSFS`-**Query** abstraction](generated/programming_interface.md).
+The library is built around metadata-objects, representing entities within the Feature Store. You can modify metadata by changing it in the metadata-objects and subsequently persisting it to the Feature Store. In fact, the Feature Store itself is also represented by an object. Furthermore, these objects have methods to save data along with the entities in the feature store. This data can be materialized from [Spark or Pandas DataFrames, or the `HSFS`-**Query** abstraction](generated/query_vs_dataframe.md).

### Guide Notebooks

70 changes: 70 additions & 0 deletions docs/templates/query_vs_dataframe.md

@@ -0,0 +1,70 @@
# Query vs DataFrame

HSFS provides a DataFrame API to ingest data into the Hopsworks Feature Store. You can also retrieve feature data as a DataFrame, which can either be used directly to train models or be [materialized to file(s)](training_dataset.md) for later use in training.
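
As a minimal round-trip sketch (the connection boilerplate and the exact `create_feature_group`/`save` signatures here are illustrative assumptions, not the definitive API):

```python
import hsfs
import pandas as pd

# connect to Hopsworks and get a feature store handle
connection = hsfs.connection()
fs = connection.get_feature_store()

# ingest a Pandas DataFrame as a feature group
df = pd.DataFrame({"location_id": [1, 2], "weekly_rainfall": [3.4, 7.1]})
rain_fg = fs.create_feature_group("rain_fg", version=1, primary_key=["location_id"])
rain_fg.save(df)

# retrieve the feature data back as a DataFrame
rain_df = rain_fg.select_all().read()
```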

The idea of the Feature Store is to have pre-computed features available for both training and serving models. The key functionality required to generate training datasets from reusable features is feature selection, joins, filters and point-in-time queries. To enable this functionality, we are introducing a new expressive Query abstraction in `HSFS` that provides these operations and guarantees reproducible creation of training datasets from features in the Feature Store.

The new joining functionality is heavily inspired by the APIs used by Pandas to merge DataFrames. The APIs allow you to specify which features to select from which feature group, how to join them and which features to use in join conditions.

```python
# create a query
feature_join = rain_fg.select_all() \
    .join(temperature_fg.select_all(), on=["date", "location_id"]) \
    .join(location_fg.select_all())

td = fs.create_training_dataset("rain_dataset",
                                version=1,
                                label="weekly_rain",
                                data_format="tfrecords")

# materialize query in the specified file format
td.save(feature_join)

# use materialized training dataset for training, possibly in a different environment
td = fs.get_training_dataset("rain_dataset", version=1)

# get TFRecordDataset to use in a TensorFlow model
dataset = td.tf_data().tf_record_dataset(batch_size=32, num_epochs=100)

# reproduce query for online feature store and drop label for inference
jdbc_querystring = td.get_query(online=True, with_label=False)
```

> **Review comment (Contributor):** I'd keep this first example simple and also mention how the joining key is selected.
>
> **Reply (Author):** added more examples also with scala equivalents.

If a data scientist wants to use a new feature that is not yet available in the Feature Store, she can write code to compute the new feature (using existing features or external data) and ingest the new feature values into the Feature Store. If the new feature is based solely on existing feature values in the Feature Store, we call it a derived feature. The same HSFS APIs can be used to compute derived features as well as features based on external data sources.
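
For example, a derived feature could be computed and ingested like this (a sketch; the feature group name `rain_derived_fg` and the `create_feature_group`/`save` signatures are illustrative assumptions):

```python
# read existing feature values out of the Feature Store
rain_df = rain_fg.select(["location_id", "weekly_rainfall"]).read()

# compute a derived feature purely from existing feature values
mean, std = rain_df["weekly_rainfall"].mean(), rain_df["weekly_rainfall"].std()
rain_df["rainfall_zscore"] = (rain_df["weekly_rainfall"] - mean) / std

# ingest the derived feature as its own feature group
derived_fg = fs.create_feature_group(
    "rain_derived_fg", version=1, primary_key=["location_id"]
)
derived_fg.save(rain_df[["location_id", "rainfall_zscore"]])
```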

## The Query Abstraction

Most operations performed on `FeatureGroup` metadata objects will return a `Query` with the applied operation.

### Examples

For example, selecting features from a feature group is a lazy operation, returning a query with the selected
features only:

```python
rain_fg = fs.get_feature_group("rain_fg")

# Returns Query
feature_join = rain_fg.select(["location_id", "weekly_rainfall"])
```
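
Because the query is lazy, nothing is read from the Feature Store until it is executed, for example with the `read` and `show` methods listed under [Methods](#methods) below:

```python
# preview the first five rows of the query result
feature_join.show(5)

# execute the query and return the result as a DataFrame
df = feature_join.read()
```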

Similarly, joins return queries:

```python
feature_join = rain_fg.select_all() \
    .join(temperature_fg.select_all(), on=["date", "location_id"]) \
    .join(location_fg.select_all())
```
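
If no `on` argument is given (as for `location_fg` above), the join key defaults to the matching subset of the feature groups' primary keys. Differently named keys and the join type can be passed explicitly; the keyword names below follow the `Query.join` signature, but treat the sketch as illustrative:

```python
# join on differently named keys, with an explicit join type
feature_join = rain_fg.select_all() \
    .join(temperature_fg.select_all(),
          left_on=["location_id"],
          right_on=["station_id"],
          join_type="left")
```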

Filters also return queries:
```python
feature_join = rain_fg.filter(rain_fg.location_id == 10)
```
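
Filters can be combined with the `&` and `|` operators, backed by the `Filter` and `Logic` classes added in this PR. Note the parentheses: Python binds `&` and `|` more tightly than comparisons, so each condition has to be wrapped:

```python
feature_join = rain_fg.filter(
    (rain_fg.location_id == 10) & (rain_fg.weekly_rainfall >= 5)
)
```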

## Methods

{{query_methods}}

## Properties

{{query_properties}}
6 changes: 3 additions & 3 deletions mkdocs.yml

@@ -50,9 +50,9 @@ nav:
       - Storage Connector: generated/storage_connector.md
       - Feature: generated/feature.md
       - Training Dataset: generated/training_dataset.md
-      - Dataframe vs. Query: guides/programming_interface.md
-      - Statistics: guides/statistics.md
-      - Data Validation: guides/data_validation.md
+      - Dataframe vs. Query: generated/query_vs_dataframe.md
+      # - Statistics: guides/statistics.md
+      # - Data Validation: guides/data_validation.md
   - API Reference:
       - Connection: generated/api/connection_api.md
       - FeatureStore: generated/api/feature_store_api.md
15 changes: 15 additions & 0 deletions python/hsfs/constructor/__init__.py

@@ -0,0 +1,15 @@
#
# Copyright 2020 Logical Clocks AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
140 changes: 140 additions & 0 deletions python/hsfs/constructor/filter.py

@@ -0,0 +1,140 @@
#
# Copyright 2020 Logical Clocks AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json

from hsfs import util


class Filter:
GE = "GREATER_THAN_OR_EQUAL"
GT = "GREATER_THAN"
NE = "NOT_EQUALS"
EQ = "EQUALS"
LE = "LESS_THAN_OR_EQUAL"
LT = "LESS_THAN"

def __init__(self, feature, condition, value):
self._feature = feature
self._condition = condition
self._value = value

def json(self):
return json.dumps(self, cls=util.FeatureStoreEncoder)

def to_dict(self):
return {
"feature": self._feature,
"condition": self._condition,
"value": str(self._value),
}

def __and__(self, other):
if isinstance(other, Filter):
return Logic.And(left_f=self, right_f=other)
elif isinstance(other, Logic):
return Logic.And(left_f=self, right_l=other)
else:
raise TypeError(
"Operator `&` expected type `Filter` or `Logic`, got `{}`".format(
type(other)
)
)

def __or__(self, other):
if isinstance(other, Filter):
return Logic.Or(left_f=self, right_f=other)
elif isinstance(other, Logic):
return Logic.Or(left_f=self, right_l=other)
else:
raise TypeError(
"Operator `|` expected type `Filter` or `Logic`, got `{}`".format(
type(other)
)
)

def __repr__(self):
return f"Filter({self._feature!r}, {self._condition!r}, {self._value!r})"

def __str__(self):
return self.json()


class Logic:
AND = "AND"
OR = "OR"
SINGLE = "SINGLE"

def __init__(self, type, left_f=None, right_f=None, left_l=None, right_l=None):
self._type = type
self._left_f = left_f
self._right_f = right_f
self._left_l = left_l
self._right_l = right_l

def json(self):
return json.dumps(self, cls=util.FeatureStoreEncoder)

def to_dict(self):
return {
"type": self._type,
"leftFilter": self._left_f,
"rightFilter": self._right_f,
"leftLogic": self._left_l,
"rightLogic": self._right_l,
}

@classmethod
def And(cls, left_f=None, right_f=None, left_l=None, right_l=None):
return cls(cls.AND, left_f, right_f, left_l, right_l)

@classmethod
def Or(cls, left_f=None, right_f=None, left_l=None, right_l=None):
return cls(cls.OR, left_f, right_f, left_l, right_l)

@classmethod
def Single(cls, left_f):
return cls(cls.SINGLE, left_f)

def __and__(self, other):
if isinstance(other, Filter):
return Logic.And(left_l=self, right_f=other)
elif isinstance(other, Logic):
return Logic.And(left_l=self, right_l=other)
else:
raise TypeError(
"Operator `&` expected type `Filter` or `Logic`, got `{}`".format(
type(other)
)
)

def __or__(self, other):
if isinstance(other, Filter):
return Logic.Or(left_l=self, right_f=other)
elif isinstance(other, Logic):
return Logic.Or(left_l=self, right_l=other)
else:
raise TypeError(
"Operator `|` expected type `Filter` or `Logic`, got `{}`".format(
type(other)
)
)

def __repr__(self):
return f"Logic({self._type!r}, {self._left_f!r}, {self._right_f!r}, {self._left_l!r}, {self._right_l!r})"

def __str__(self):
return self.json()
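
For orientation, a sketch of how these operator overloads compose (constructing `Filter` directly from a plain feature name is an illustrative shortcut; in `hsfs` the instances normally come from comparing feature objects, as in the docs above):

```python
f1 = Filter("location_id", Filter.EQ, 10)
f2 = Filter("weekly_rainfall", Filter.GE, 5)

combined = f1 & f2      # Logic.And(left_f=f1, right_f=f2)
nested = combined | f1  # Logic.Or(left_l=combined, right_f=f1)

# to_dict() mirrors the backend's left/right filter and logic fields
print(nested.to_dict()["type"])  # "OR"
```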
@@ -15,7 +15,7 @@
#

 import humps
-from hsfs.core import hudi_feature_group_alias, on_demand_feature_group_alias
+from hsfs.constructor import hudi_feature_group_alias, on_demand_feature_group_alias


 class FsQuery:
File renamed without changes.
@@ -16,13 +16,15 @@

import humps

-from hsfs import on_demand_feature_group as on_demand_fg
+from hsfs import feature_group


 class OnDemandFeatureGroupAlias:
     def __init__(self, on_demand_feature_group, alias):
-        self._on_demand_feature_group = on_demand_fg.OnDemandFeatureGroup.from_response_json(
-            on_demand_feature_group
+        self._on_demand_feature_group = (
+            feature_group.OnDemandFeatureGroup.from_response_json(
+                on_demand_feature_group
+            )
         )
         self._alias = alias
