Commit 12bf1ee

[HOPSWORKS-2197] Add ADLS connector support (#221)

1 parent 023f97e

File tree: 10 files changed, +292 −98 lines


java/src/main/java/com/logicalclocks/hsfs/StorageConnector.java

Lines changed: 38 additions & 6 deletions

@@ -17,6 +17,7 @@
 package com.logicalclocks.hsfs;

 import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.logicalclocks.hsfs.metadata.Option;
 import com.logicalclocks.hsfs.util.Constants;
 import lombok.AllArgsConstructor;
 import lombok.Getter;
@@ -28,6 +29,7 @@
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;

@@ -120,18 +122,48 @@ public class StorageConnector {
   @Setter
   private String arguments;

+  @Getter
+  @Setter
+  private Integer generation;
+
+  @Getter
+  @Setter
+  private String directoryId;
+
+  @Getter
+  @Setter
+  private String applicationId;
+
+  @Getter
+  @Setter
+  private String serviceCredentials;
+
+  @Getter
+  @Setter
+  private String accountName;
+
+  @Getter
+  @Setter
+  private String containerName;
+
+  @Getter
+  @Setter
+  private List<Option> sparkOptions;
+
   @Getter
   @Setter
   private StorageConnectorType storageConnectorType;

   @JsonIgnore
-  public Map<String, String> getSparkOptions() throws FeatureStoreException {
-    if (StorageConnectorType.JDBC.equals(storageConnectorType)) {
-      return getJdbcOptions();
-    } else if (StorageConnectorType.REDSHIFT.equals(storageConnectorType)) {
-      return getRedshiftOptions();
+  public Map<String, String> getSparkOptionsInt() throws FeatureStoreException {
+    switch (storageConnectorType) {
+      case JDBC:
+        return getJdbcOptions();
+      case REDSHIFT:
+        return getRedshiftOptions();
+      default:
+        throw new FeatureStoreException("Spark options are not supported for connector " + storageConnectorType);
     }
-    throw new FeatureStoreException("Spark options are not supported for connector " + storageConnectorType);
   }

   @JsonIgnore
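
The rename from getSparkOptions() to getSparkOptionsInt() frees that name for the Lombok-generated getter on the new sparkOptions field, and the switch makes unsupported connector types fail fast instead of falling through. A minimal sketch of the resulting behavior, assuming a no-args constructor is reachable (in practice connectors are Jackson-deserialized from the backend response rather than constructed directly):

import com.logicalclocks.hsfs.FeatureStoreException;
import com.logicalclocks.hsfs.StorageConnector;
import com.logicalclocks.hsfs.StorageConnectorType;

public class SparkOptionsSketch {
  public static void main(String[] args) {
    // Assumption: a no-args constructor exists; normally the connector
    // comes from the feature store API, not direct construction.
    StorageConnector connector = new StorageConnector();
    connector.setStorageConnectorType(StorageConnectorType.ADLS);
    try {
      connector.getSparkOptionsInt();
    } catch (FeatureStoreException e) {
      // Every type other than JDBC and REDSHIFT lands in the default branch:
      // "Spark options are not supported for connector ADLS"
      System.out.println(e.getMessage());
    }
  }
}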

java/src/main/java/com/logicalclocks/hsfs/StorageConnectorType.java

Lines changed: 2 additions & 1 deletion

@@ -20,5 +20,6 @@ public enum StorageConnectorType {
   HOPSFS,
   S3,
   JDBC,
-  REDSHIFT
+  REDSHIFT,
+  ADLS
 }

java/src/main/java/com/logicalclocks/hsfs/TrainingDataset.java

Lines changed: 3 additions & 8 deletions

@@ -21,6 +21,7 @@
 import com.logicalclocks.hsfs.engine.StatisticsEngine;
 import com.logicalclocks.hsfs.engine.TrainingDatasetEngine;
 import com.logicalclocks.hsfs.constructor.Query;
+import com.logicalclocks.hsfs.engine.Utils;
 import com.logicalclocks.hsfs.metadata.Statistics;
 import lombok.Builder;
 import lombok.Getter;
@@ -73,7 +74,6 @@ public class TrainingDataset {

   @Getter
   @Setter
-  @JsonIgnore
   private StorageConnector storageConnector;

   @Getter
@@ -103,6 +103,7 @@ public class TrainingDataset {

   private TrainingDatasetEngine trainingDatasetEngine = new TrainingDatasetEngine();
   private StatisticsEngine statisticsEngine = new StatisticsEngine(EntityEndpointType.TRAINING_DATASET);
+  private Utils utils = new Utils();

   @Builder
   public TrainingDataset(@NonNull String name, Integer version, String description, DataFormat dataFormat,
@@ -115,13 +116,7 @@ public TrainingDataset(@NonNull String name, Integer version, String description
     this.location = location;
     this.storageConnector = storageConnector;

-    if (storageConnector != null) {
-      if (storageConnector.getStorageConnectorType() == StorageConnectorType.S3) {
-        // Default it's already HOPSFS_TRAINING_DATASET
-        this.trainingDatasetType = TrainingDatasetType.EXTERNAL_TRAINING_DATASET;
-      }
-    }
-
+    this.trainingDatasetType = utils.getTrainingDatasetType(storageConnector);
     this.splits = splits;
     this.seed = seed;
     this.featureStore = featureStore;
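
Dropping @JsonIgnore means the storage connector now travels with the training dataset metadata sent to the backend, and the constructor delegates the type decision to Utils so that any non-HOPSFS connector, ADLS included, marks the dataset as external. A sketch of the resulting builder behavior (the connector setup, dataset name, and the no-args constructor are hypothetical assumptions):

// Hypothetical sketch: an ADLS-backed training dataset is now typed as external.
StorageConnector adls = new StorageConnector();  // assumes a no-args constructor
adls.setStorageConnectorType(StorageConnectorType.ADLS);

TrainingDataset td = TrainingDataset.builder()
    .name("sales_model_data")                    // hypothetical name
    .version(1)
    .storageConnector(adls)
    .build();
// td's trainingDatasetType is now EXTERNAL_TRAINING_DATASET; previously only an
// S3 connector triggered this, while HOPSFS or a null connector still yields
// HOPSFS_TRAINING_DATASET.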

java/src/main/java/com/logicalclocks/hsfs/engine/SparkEngine.java

Lines changed: 64 additions & 46 deletions

@@ -26,10 +26,10 @@
 import com.logicalclocks.hsfs.OnDemandFeatureGroup;
 import com.logicalclocks.hsfs.Split;
 import com.logicalclocks.hsfs.StorageConnector;
-import com.logicalclocks.hsfs.StorageConnectorType;
 import com.logicalclocks.hsfs.TimeTravelFormat;
 import com.logicalclocks.hsfs.TrainingDataset;
 import com.logicalclocks.hsfs.metadata.OnDemandOptions;
+import com.logicalclocks.hsfs.metadata.Option;
 import com.logicalclocks.hsfs.util.Constants;
 import lombok.Getter;
 import org.apache.hadoop.fs.Path;
@@ -80,7 +80,7 @@ public Dataset<Row> sql(String query) {
   }

   public Dataset<Row> jdbc(String query, StorageConnector storageConnector) throws FeatureStoreException {
-    Map<String, String> readOptions = storageConnector.getSparkOptions();
+    Map<String, String> readOptions = storageConnector.getSparkOptionsInt();
     if (!Strings.isNullOrEmpty(query)) {
       readOptions.put("query", query);
     }
@@ -124,42 +124,6 @@ public void registerHudiTemporaryTable(FeatureGroup featureGroup, String alias,
         leftFeaturegroupStartTimestamp, leftFeaturegroupEndTimestamp, readOptions);
   }

-  public void configureConnector(StorageConnector storageConnector) {
-    if (storageConnector.getStorageConnectorType() == StorageConnectorType.S3) {
-      configureS3Connector(storageConnector);
-    }
-  }
-
-  public static String sparkPath(String path) {
-    if (path.startsWith(Constants.S3_SCHEME)) {
-      return path.replaceFirst(Constants.S3_SCHEME, Constants.S3_SPARK_SCHEME);
-    }
-    return path;
-  }
-
-  private void configureS3Connector(StorageConnector storageConnector) {
-    if (!Strings.isNullOrEmpty(storageConnector.getAccessKey())
-        && Strings.isNullOrEmpty(storageConnector.getSessionToken())) {
-      sparkSession.conf().set(Constants.S3_ACCESS_KEY_ENV, storageConnector.getAccessKey());
-      sparkSession.conf().set(Constants.S3_SECRET_KEY_ENV, storageConnector.getSecretKey());
-    }
-    if (!Strings.isNullOrEmpty(storageConnector.getSessionToken())) {
-      sparkSession.conf().set(Constants.S3_CREDENTIAL_PROVIDER_ENV, Constants.S3_TEMPORARY_CREDENTIAL_PROVIDER);
-      sparkSession.conf().set(Constants.S3_ACCESS_KEY_ENV, storageConnector.getAccessKey());
-      sparkSession.conf().set(Constants.S3_SECRET_KEY_ENV, storageConnector.getSecretKey());
-      sparkSession.conf().set(Constants.S3_SESSION_KEY_ENV, storageConnector.getSessionToken());
-    }
-    if (!Strings.isNullOrEmpty(storageConnector.getServerEncryptionAlgorithm())) {
-      sparkSession.conf().set(
-          "fs.s3a.server-side-encryption-algorithm",
-          storageConnector.getServerEncryptionAlgorithm()
-      );
-    }
-    if (!Strings.isNullOrEmpty(storageConnector.getServerEncryptionKey())) {
-      sparkSession.conf().set("fs.s3a.server-side-encryption.key", storageConnector.getServerEncryptionKey());
-    }
-  }
-
   /**
    * Setup Spark to write the data on the File System.
    *
@@ -171,9 +135,8 @@ private void configureS3Connector(StorageConnector storageConnector) {
   public void write(TrainingDataset trainingDataset, Dataset<Row> dataset,
                     Map<String, String> writeOptions, SaveMode saveMode) {

-    if (trainingDataset.getStorageConnector() != null) {
-      SparkEngine.getInstance().configureConnector(trainingDataset.getStorageConnector());
-    }
+    setupConnectorHadoopConf(trainingDataset.getStorageConnector());
+
     if (trainingDataset.getSplits() == null) {
       // Write a single dataset

@@ -296,10 +259,7 @@ private void writeSingle(Dataset<Row> dataset, DataFormat dataFormat,
   // OnDemand Feature Group in TFRecords format. However Spark does not use an enum but a string.
   public Dataset<Row> read(StorageConnector storageConnector, String dataFormat,
                            Map<String, String> readOptions, String path) {
-
-    if (storageConnector.getStorageConnectorType() == StorageConnectorType.S3) {
-      configureS3Connector(storageConnector);
-    }
+    setupConnectorHadoopConf(storageConnector);

     return SparkEngine.getInstance().getSparkSession()
         .read()
@@ -322,7 +282,7 @@ public Dataset<Row> read(StorageConnector storageConnector, String dataFormat,
   public Map<String, String> getOnlineOptions(Map<String, String> providedWriteOptions,
                                               FeatureGroup featureGroup,
                                               StorageConnector storageConnector) throws FeatureStoreException {
-    Map<String, String> writeOptions = storageConnector.getSparkOptions();
+    Map<String, String> writeOptions = storageConnector.getSparkOptionsInt();
     writeOptions.put(Constants.JDBC_TABLE, utils.getFgName(featureGroup));

     // add user provided configuration
@@ -400,4 +360,62 @@ public String profile(Dataset<Row> df, boolean correlation, boolean histogram) {
   public String profile(Dataset<Row> df) {
     return profile(df, null, true, true);
   }
+
+  public void setupConnectorHadoopConf(StorageConnector storageConnector) {
+    if (storageConnector == null) {
+      return;
+    }
+
+    switch (storageConnector.getStorageConnectorType()) {
+      case S3:
+        setupS3ConnectorHadoopConf(storageConnector);
+        break;
+      case ADLS:
+        setupAdlsConnectorHadoopConf(storageConnector);
+        break;
+      default:
+        // No-OP
+        break;
+    }
+  }
+
+  public static String sparkPath(String path) {
+    if (path.startsWith(Constants.S3_SCHEME)) {
+      return path.replaceFirst(Constants.S3_SCHEME, Constants.S3_SPARK_SCHEME);
+    }
+    return path;
+  }
+
+  private void setupS3ConnectorHadoopConf(StorageConnector storageConnector) {
+    if (!Strings.isNullOrEmpty(storageConnector.getAccessKey())) {
+      sparkSession.sparkContext().hadoopConfiguration()
+          .set(Constants.S3_ACCESS_KEY_ENV, storageConnector.getAccessKey());
+    }
+    if (!Strings.isNullOrEmpty(storageConnector.getSecretKey())) {
+      sparkSession.sparkContext().hadoopConfiguration()
+          .set(Constants.S3_SECRET_KEY_ENV, storageConnector.getSecretKey());
+    }
+    if (!Strings.isNullOrEmpty(storageConnector.getServerEncryptionAlgorithm())) {
+      sparkSession.sparkContext().hadoopConfiguration().set(
+          "fs.s3a.server-side-encryption-algorithm",
+          storageConnector.getServerEncryptionAlgorithm()
+      );
+    }
+    if (!Strings.isNullOrEmpty(storageConnector.getServerEncryptionKey())) {
+      sparkSession.sparkContext().hadoopConfiguration()
+          .set("fs.s3a.server-side-encryption.key", storageConnector.getServerEncryptionKey());
+    }
+    if (!Strings.isNullOrEmpty(storageConnector.getSessionToken())) {
+      sparkSession.sparkContext().hadoopConfiguration()
+          .set(Constants.S3_CREDENTIAL_PROVIDER_ENV, Constants.S3_TEMPORARY_CREDENTIAL_PROVIDER);
+      sparkSession.sparkContext().hadoopConfiguration()
+          .set(Constants.S3_SESSION_KEY_ENV, storageConnector.getSessionToken());
+    }
+  }
+
+  private void setupAdlsConnectorHadoopConf(StorageConnector storageConnector) {
+    for (Option confOption : storageConnector.getSparkOptions()) {
+      sparkSession.sparkContext().hadoopConfiguration().set(confOption.getName(), confOption.getValue());
+    }
+  }
 }
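
Besides the per-type dispatch, the connector setup now writes to sparkSession.sparkContext().hadoopConfiguration() rather than the session-scoped sparkSession.conf(), which is the configuration the S3A and ABFS filesystem clients actually consult. For ADLS, setupAdlsConnectorHadoopConf applies the backend-provided name/value pairs verbatim; the sketch below shows the kind of standard hadoop-azure OAuth settings such a list would plausibly carry. The keys and placeholder values are illustrative assumptions — the real entries arrive in storageConnector.getSparkOptions(), assembled by the backend from the connector's applicationId, serviceCredentials, and directoryId fields:

// Illustrative only: standard hadoop-azure (ABFS) client-credentials settings
// for ADLS Gen2, written the same way setupAdlsConnectorHadoopConf would.
org.apache.hadoop.conf.Configuration conf =
    sparkSession.sparkContext().hadoopConfiguration();
conf.set("fs.azure.account.auth.type", "OAuth");
conf.set("fs.azure.account.oauth.provider.type",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider");
conf.set("fs.azure.account.oauth2.client.id", "<application-id>");
conf.set("fs.azure.account.oauth2.client.secret", "<service-credential>");
conf.set("fs.azure.account.oauth2.client.endpoint",
    "https://login.microsoftonline.com/<directory-id>/oauth2/token");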

java/src/main/java/com/logicalclocks/hsfs/engine/TrainingDatasetEngine.java

Lines changed: 0 additions & 4 deletions

@@ -114,10 +114,6 @@ public void insert(TrainingDataset trainingDataset, Dataset<Row> dataset,
   }

   public Dataset<Row> read(TrainingDataset trainingDataset, String split, Map<String, String> providedOptions) {
-    if (trainingDataset.getStorageConnector() != null) {
-      SparkEngine.getInstance().configureConnector(trainingDataset.getStorageConnector());
-    }
-
     String path = "";
     if (com.google.common.base.Strings.isNullOrEmpty(split)) {
       // ** glob means "all sub directories"

java/src/main/java/com/logicalclocks/hsfs/engine/Utils.java

Lines changed: 12 additions & 0 deletions

@@ -19,8 +19,10 @@
 import com.logicalclocks.hsfs.Feature;
 import com.logicalclocks.hsfs.FeatureStoreException;
 import com.logicalclocks.hsfs.FeatureGroup;
+import com.logicalclocks.hsfs.StorageConnectorType;
 import com.logicalclocks.hsfs.TrainingDatasetFeature;
 import com.logicalclocks.hsfs.StorageConnector;
+import com.logicalclocks.hsfs.TrainingDatasetType;
 import com.logicalclocks.hsfs.metadata.StorageConnectorApi;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.sql.Dataset;
@@ -84,6 +86,16 @@ public void trainingDatasetSchemaMatch(Dataset<Row> dataset, List<TrainingDatase
     }
   }

+  public TrainingDatasetType getTrainingDatasetType(StorageConnector storageConnector) {
+    if (storageConnector == null) {
+      return TrainingDatasetType.HOPSFS_TRAINING_DATASET;
+    } else if (storageConnector.getStorageConnectorType() == StorageConnectorType.HOPSFS) {
+      return TrainingDatasetType.HOPSFS_TRAINING_DATASET;
+    } else {
+      return TrainingDatasetType.EXTERNAL_TRAINING_DATASET;
+    }
+  }
+
   // TODO(Fabio): this should be moved in the backend
   public String getTableName(FeatureGroup offlineFeatureGroup) {
     return offlineFeatureGroup.getFeatureStore().getName() + "."
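
getTrainingDatasetType centralizes the decision that the TrainingDataset constructor previously inlined for S3 only: a missing or HOPSFS connector keeps the dataset internal, everything else is external. A quick sketch of the three cases (hopsfsConnector and adlsConnector stand for hypothetical instances whose storageConnectorType is set accordingly):

// Sketch of the mapping; connector instances are hypothetical.
Utils utils = new Utils();
assert utils.getTrainingDatasetType(null) == TrainingDatasetType.HOPSFS_TRAINING_DATASET;
assert utils.getTrainingDatasetType(hopsfsConnector) == TrainingDatasetType.HOPSFS_TRAINING_DATASET;
// S3, JDBC, REDSHIFT and the new ADLS all fall into the external branch:
assert utils.getTrainingDatasetType(adlsConnector) == TrainingDatasetType.EXTERNAL_TRAINING_DATASET;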
java/src/main/java/com/logicalclocks/hsfs/metadata/Option.java

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Logical Clocks AB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package com.logicalclocks.hsfs.metadata;
+
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+@AllArgsConstructor
+@NoArgsConstructor
+public class Option {
+  @Getter
+  @Setter
+  private String name;
+
+  @Getter
+  @Setter
+  private String value;
+}
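
Option is a plain name/value carrier for the connector-specific settings the backend returns; Lombok supplies the all-args and no-args constructors plus accessors. A minimal usage sketch (the key/value pair shown is illustrative):

// All-args constructor from @AllArgsConstructor; accessors from @Getter/@Setter.
Option opt = new Option("fs.azure.account.auth.type", "OAuth");  // illustrative pair
String name = opt.getName();    // "fs.azure.account.auth.type"
String value = opt.getValue();  // "OAuth"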
