Skip to content

Commit 4c48a3b

Browse files
[HOPSWORKS-2280] Save statistics configuration also for training datasets (#217)
1 parent 9302c27 commit 4c48a3b

22 files changed

+344
-127
lines changed

auto_doc.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@
7878
"hsfs.constructor.query.Query"
7979
),
8080
},
81+
"statistics.md": {
82+
"statistics_config": ["hsfs.statistics_config.StatisticsConfig"],
83+
"statistics_config_properties": keras_autodoc.get_properties(
84+
"hsfs.statistics_config.StatisticsConfig"
85+
),
86+
},
8187
"api/connection_api.md": {
8288
"connection": ["hsfs.connection.Connection"],
8389
"connection_properties": keras_autodoc.get_properties(
@@ -130,6 +136,12 @@
130136
"hsfs.storage_connector.StorageConnector"
131137
),
132138
},
139+
"api/statistics_config_api.md": {
140+
"statistics_config": ["hsfs.statistics_config.StatisticsConfig"],
141+
"statistics_config_properties": keras_autodoc.get_properties(
142+
"hsfs.statistics_config.StatisticsConfig"
143+
),
144+
},
133145
}
134146

135147
hsfs_dir = pathlib.Path(__file__).resolve().parents[0]
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# StatisticsConfig
2+
3+
{{statistics_config}}
4+
5+
## Properties
6+
7+
{{statistics_config_properties}}

docs/templates/statistics.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Statistics
2+
3+
HSFS provides functionality to compute statistics for [training datasets](training_dataset.md) and [feature groups](feature_group.md) and save these along with their other metadata in the [feature store](feature_store.md).
4+
These statistics are meant to help Data Scientists perform exploratory data analysis and identify suitable [features](feature.md) or [training datasets](training_dataset.md) for their models.
5+
6+
Statistics are configured on a training dataset or feature group level using a `StatisticsConfig` object.
7+
This object can be passed at creation time of the dataset or group or it can later on be updated through the API.
8+
9+
{{statistics_config}}
10+
11+
For example, to enable all statistics (descriptive, histograms and correlations) for a training dataset:
12+
13+
=== "Python"
14+
```python
15+
from hsfs.statistics_config import StatisticsConfig
16+
17+
td = fs.create_training_dataset("rain_dataset",
18+
version=1,
19+
label="weekly_rain",
20+
data_format="tfrecords",
21+
statistics_config=StatisticsConfig(True, True, True))
22+
23+
```
24+
=== "Scala"
25+
```scala
26+
val td = (fs.createTrainingDataset()
27+
.name("rain_dataset")
28+
.version(1)
29+
.label("weekly_rain")
30+
.dataFormat("tfrecords")
31+
.statisticsConfig(new StatisticsConfig(true, true, true))
32+
.build())
33+
```
34+
35+
And similarly for feature groups.
36+
37+
!!! note "Default StatisticsConfig"
38+
By default all training datasets and feature groups will be configured such that only descriptive statistics
39+
are computed. However, you can also enable `histograms` and `correlations` or limit the features for which
40+
statistics are computed.
41+
42+
## Properties
43+
44+
{{statistics_config_properties}}

java/src/main/java/com/logicalclocks/hsfs/FeatureGroup.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,7 @@ public class FeatureGroup extends FeatureGroupBase {
7575
public FeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description,
7676
List<String> primaryKeys, List<String> partitionKeys, String hudiPrecombineKey,
7777
boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List<Feature> features,
78-
Boolean statisticsEnabled, Boolean histograms, Boolean correlations,
79-
List<String> statisticColumns) {
78+
StatisticsConfig statisticsConfig) {
8079
this.featureStore = featureStore;
8180
this.name = name;
8281
this.version = version;
@@ -87,10 +86,7 @@ public FeatureGroup(FeatureStore featureStore, @NonNull String name, Integer ver
8786
this.onlineEnabled = onlineEnabled;
8887
this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI;
8988
this.features = features;
90-
this.statisticsEnabled = statisticsEnabled != null ? statisticsEnabled : true;
91-
this.histograms = histograms;
92-
this.correlations = correlations;
93-
this.statisticColumns = statisticColumns;
89+
this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig();
9490
}
9591

9692
public FeatureGroup() {
@@ -183,7 +179,7 @@ public void save(Dataset<Row> featureData, Map<String, String> writeOptions)
183179
throws FeatureStoreException, IOException {
184180
featureGroupEngine.saveFeatureGroup(this, featureData, primaryKeys, partitionKeys, hudiPrecombineKey,
185181
writeOptions);
186-
if (statisticsEnabled) {
182+
if (statisticsConfig.getEnabled()) {
187183
statisticsEngine.computeStatistics(this, featureData);
188184
}
189185
}

java/src/main/java/com/logicalclocks/hsfs/OnDemandFeatureGroup.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@ public class OnDemandFeatureGroup extends FeatureGroupBase {
6868
public OnDemandFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String query,
6969
OnDemandDataFormat dataFormat, String path, Map<String, String> options,
7070
@NonNull StorageConnector storageConnector, String description, List<Feature> features,
71-
Boolean statisticsEnabled, Boolean histograms, Boolean correlations,
72-
List<String> statisticColumns) {
71+
StatisticsConfig statisticsConfig) {
7372
this.featureStore = featureStore;
7473
this.name = name;
7574
this.version = version;
@@ -83,10 +82,7 @@ public OnDemandFeatureGroup(FeatureStore featureStore, @NonNull String name, Int
8382
this.description = description;
8483
this.storageConnector = storageConnector;
8584
this.features = features;
86-
this.statisticsEnabled = statisticsEnabled != null ? statisticsEnabled : true;
87-
this.histograms = histograms;
88-
this.correlations = correlations;
89-
this.statisticColumns = statisticColumns;
85+
this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig();
9086
}
9187

9288
public OnDemandFeatureGroup() {
@@ -95,7 +91,7 @@ public OnDemandFeatureGroup() {
9591
public void save() throws FeatureStoreException, IOException {
9692
onDemandFeatureGroupEngine.saveFeatureGroup(this);
9793

98-
if (statisticsEnabled) {
94+
if (statisticsConfig.getEnabled()) {
9995
statisticsEngine.computeStatistics(this, read());
10096
}
10197
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright (c) 2021 Logical Clocks AB
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
*
14+
* See the License for the specific language governing permissions and limitations under the License.
15+
*/
16+
17+
package com.logicalclocks.hsfs;
18+
19+
import lombok.AllArgsConstructor;
20+
import lombok.Builder;
21+
import lombok.Getter;
22+
import lombok.NoArgsConstructor;
23+
import lombok.Setter;
24+
25+
import java.util.ArrayList;
26+
import java.util.List;
27+
28+
@AllArgsConstructor
29+
@NoArgsConstructor
30+
@Builder
31+
public class StatisticsConfig {
32+
@Getter
33+
@Setter
34+
private Boolean enabled = true;
35+
36+
@Getter
37+
@Setter
38+
private Boolean histograms = false;
39+
40+
@Getter
41+
@Setter
42+
private Boolean correlations = false;
43+
44+
@Getter
45+
@Setter
46+
private List<String> columns = new ArrayList<>();
47+
48+
public StatisticsConfig(Boolean enabled, Boolean histograms, Boolean correlations) {
49+
this.enabled = enabled;
50+
this.histograms = histograms;
51+
this.correlations = correlations;
52+
}
53+
}

java/src/main/java/com/logicalclocks/hsfs/TrainingDataset.java

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -90,23 +90,7 @@ public class TrainingDataset {
9090

9191
@Getter
9292
@Setter
93-
@JsonIgnore
94-
private Boolean statisticsEnabled = true;
95-
96-
@Getter
97-
@Setter
98-
@JsonIgnore
99-
private Boolean histograms;
100-
101-
@Getter
102-
@Setter
103-
@JsonIgnore
104-
private Boolean correlations;
105-
106-
@Getter
107-
@Setter
108-
@JsonIgnore
109-
private List<String> statisticColumns;
93+
private StatisticsConfig statisticsConfig = new StatisticsConfig();
11094

11195
@Getter
11296
@Setter
@@ -123,8 +107,7 @@ public class TrainingDataset {
123107
@Builder
124108
public TrainingDataset(@NonNull String name, Integer version, String description, DataFormat dataFormat,
125109
StorageConnector storageConnector, String location, List<Split> splits, Long seed,
126-
FeatureStore featureStore, Boolean statisticsEnabled, Boolean histograms,
127-
Boolean correlations, List<String> statisticColumns, List<String> label) {
110+
FeatureStore featureStore, StatisticsConfig statisticsConfig, List<String> label) {
128111
this.name = name;
129112
this.version = version;
130113
this.description = description;
@@ -142,10 +125,7 @@ public TrainingDataset(@NonNull String name, Integer version, String description
142125
this.splits = splits;
143126
this.seed = seed;
144127
this.featureStore = featureStore;
145-
this.statisticsEnabled = statisticsEnabled != null ? statisticsEnabled : true;
146-
this.histograms = histograms;
147-
this.correlations = correlations;
148-
this.statisticColumns = statisticColumns;
128+
this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig();
149129
this.label = label;
150130
}
151131

@@ -195,7 +175,7 @@ public void save(Query query, Map<String, String> writeOptions) throws FeatureSt
195175
public void save(Dataset<Row> dataset, Map<String, String> writeOptions)
196176
throws FeatureStoreException, IOException {
197177
trainingDatasetEngine.save(this, dataset, writeOptions, label);
198-
if (statisticsEnabled) {
178+
if (statisticsConfig.getEnabled()) {
199179
statisticsEngine.computeStatistics(this, dataset);
200180
}
201181
}
@@ -314,12 +294,24 @@ public void show(int numRows) {
314294
* @throws IOException
315295
*/
316296
public Statistics computeStatistics() throws FeatureStoreException, IOException {
317-
if (statisticsEnabled) {
297+
if (statisticsConfig.getEnabled()) {
318298
return statisticsEngine.computeStatistics(this, read());
319299
}
320300
return null;
321301
}
322302

303+
/**
304+
* Update the statistics configuration of the training dataset.
305+
* Change the `enabled`, `histograms`, `correlations` or `columns` attributes and persist
306+
* the changes by calling this method.
307+
*
308+
* @throws FeatureStoreException
309+
* @throws IOException
310+
*/
311+
public void updateStatisticsConfig() throws FeatureStoreException, IOException {
312+
trainingDatasetEngine.updateStatisticsConfig(this);
313+
}
314+
323315
/**
324316
* Get the last statistics commit for the training dataset.
325317
*

java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupBaseEngine.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@ public void appendFeatures(FeatureGroupBase featureGroup, List<Feature> features
6666
FeatureGroup apiFG = featureGroupApi.updateMetadata(fgBaseSend, "updateMetadata");
6767
featureGroup.setFeatures(apiFG.getFeatures());
6868
}
69-
70-
public void updateStatisticsConfig(FeatureGroupBase featureGroup) throws FeatureStoreException, IOException {
71-
FeatureGroup apiFG = featureGroupApi.updateMetadata(featureGroup, "updateStatsSettings");
72-
featureGroup.setCorrelations(apiFG.getCorrelations());
73-
featureGroup.setHistograms(apiFG.getHistograms());
69+
70+
public void updateStatisticsConfig(FeatureGroup featureGroup) throws FeatureStoreException, IOException {
71+
FeatureGroup apiFG = featureGroupApi.updateMetadata(featureGroup, "updateStatsConfig");
72+
featureGroup.getStatisticsConfig().setCorrelations(apiFG.getStatisticsConfig().getCorrelations());
73+
featureGroup.getStatisticsConfig().setHistograms(apiFG.getStatisticsConfig().getHistograms());
7474
}
7575
}

java/src/main/java/com/logicalclocks/hsfs/engine/FeatureGroupEngine.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,7 @@ public void saveFeatureGroup(FeatureGroup featureGroup, Dataset<Row> dataset, Li
109109
featureGroup.setVersion(apiFG.getVersion());
110110
featureGroup.setLocation(apiFG.getLocation());
111111
featureGroup.setId(apiFG.getId());
112-
featureGroup.setCorrelations(apiFG.getCorrelations());
113-
featureGroup.setHistograms(apiFG.getHistograms());
112+
featureGroup.setStatisticsConfig(apiFG.getStatisticsConfig());
114113

115114
/* if hudi precombine key was not provided and TimeTravelFormat is HUDI, retrieve from backend and set */
116115
if (featureGroup.getTimeTravelFormat() == TimeTravelFormat.HUDI & hudiPrecombineKey == null) {

java/src/main/java/com/logicalclocks/hsfs/engine/StatisticsEngine.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,17 @@ public StatisticsEngine(EntityEndpointType entityType) {
4444

4545
public Statistics computeStatistics(TrainingDataset trainingDataset, Dataset<Row> dataFrame)
4646
throws FeatureStoreException, IOException {
47-
return statisticsApi.post(trainingDataset, computeStatistics(dataFrame, trainingDataset.getStatisticColumns(),
48-
trainingDataset.getHistograms(), trainingDataset.getCorrelations()));
47+
return statisticsApi.post(trainingDataset, computeStatistics(dataFrame,
48+
trainingDataset.getStatisticsConfig().getColumns(),
49+
trainingDataset.getStatisticsConfig().getHistograms(),
50+
trainingDataset.getStatisticsConfig().getCorrelations()));
4951
}
5052

5153
public Statistics computeStatistics(FeatureGroupBase featureGroup, Dataset<Row> dataFrame)
5254
throws FeatureStoreException, IOException {
53-
return statisticsApi.post(featureGroup, computeStatistics(dataFrame, featureGroup.getStatisticColumns(),
54-
featureGroup.getHistograms(), featureGroup.getCorrelations()));
55+
return statisticsApi.post(featureGroup, computeStatistics(dataFrame,
56+
featureGroup.getStatisticsConfig().getColumns(),
57+
featureGroup.getStatisticsConfig().getHistograms(), featureGroup.getStatisticsConfig().getCorrelations()));
5558
}
5659

5760
private Statistics computeStatistics(Dataset<Row> dataFrame, List<String> statisticColumns, Boolean histograms,

0 commit comments

Comments
 (0)