This is an automated email from the ASF dual-hosted git repository.
wlo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new 694ed3764 [GOBBLIN-1967] Add external data node for generic
ingress/egress on GaaS (#3838)
694ed3764 is described below
commit 694ed37649ce4d48e4f4810287c7245339d19069
Author: William Lo <[email protected]>
AuthorDate: Fri Dec 1 12:59:36 2023 -0500
[GOBBLIN-1967] Add external data node for generic ingress/egress on GaaS
(#3838)
* Add external data node for generic ingress/egress on GaaS
* Address reviews and cleanup
* Use URI representation for external dataset descriptor node
* Fix error message in containing check
* Address review
---
.../dataset/ExternalUriDatasetDescriptor.java | 64 ++++++++++++++++++++++
.../flowgraph/DatasetDescriptorConfigKeys.java | 2 +
.../flowgraph/datanodes/ExternalUriDataNode.java | 44 +++++++++++++++
.../dataset/ExternalUriDatasetDescriptorTest.java | 53 ++++++++++++++++++
.../flowgraph/datanodes/ExternalDataNodeTest.java | 46 ++++++++++++++++
5 files changed, 209 insertions(+)
diff --git
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java
new file mode 100644
index 000000000..5bc47a59e
--- /dev/null
+++
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.dataset;
+
+import com.google.common.base.Preconditions;
+import com.typesafe.config.Config;
+import java.io.IOException;
+import java.util.ArrayList;
+import lombok.Getter;
+import
org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys;
+import
org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorErrorUtils;
+
+
+/**
+ * A {@link DatasetDescriptor} for an external dataset that does not live on HDFS and is identified by a URI.
+ * Meant for generic usage with the Data-Integration-Library, see:
+ * https://github.com/linkedin/data-integration-library/tree/master
+ *
+ * <p>Datasets under ExternalUriDatasetDescriptor can also be represented by more specific dataset
+ * descriptors, e.g. HttpDatasetDescriptor, SqlDatasetDescriptor, etc.
+ *
+ * <p>Example URIs:
+ * <ul>
+ *   <li>{@code https://some-api:443/user/123/names} for an HTTP URI</li>
+ *   <li>{@code jdbc:mysql://some-db:3306/db} for a SQL URI</li>
+ * </ul>
+ */
+public class ExternalUriDatasetDescriptor extends BaseDatasetDescriptor implements DatasetDescriptor {
+
+  /** Value read from {@link DatasetDescriptorConfigKeys#URI_KEY}; also what {@link #getPath()} returns. */
+  @Getter
+  private final String uri;
+
+  /**
+   * Creates a descriptor from the given config.
+   *
+   * @param config descriptor config; must define {@link DatasetDescriptorConfigKeys#URI_KEY}
+   * @throws IllegalArgumentException if the config has no value at the URI key
+   * @throws IOException declared by this constructor; presumably raised by the base-class constructor
+   */
+  public ExternalUriDatasetDescriptor(Config config) throws IOException {
+    super(config);
+    Preconditions.checkArgument(
+        config.hasPath(DatasetDescriptorConfigKeys.URI_KEY),
+        "Dataset descriptor config must specify a URI");
+    // The external URI of the dataset, see
+    // https://github.com/linkedin/data-integration-library/blob/master/docs/parameters/ms.source.uri.md
+    this.uri = config.getString(DatasetDescriptorConfigKeys.URI_KEY);
+  }
+
+  /**
+   * @return the URI of this dataset; a URI descriptor has no separate path
+   */
+  @Override
+  public String getPath() {
+    return this.uri;
+  }
+
+  /**
+   * Compares this descriptor's URI against the path of another dataset descriptor.
+   *
+   * <p>The comparison itself is delegated to
+   * {@link DatasetDescriptorErrorUtils#populateErrorForDatasetDescriptorKey}, which presumably appends a
+   * message to the list when the two values do not match.
+   *
+   * @param inputDatasetDescriptorConfig descriptor whose path should be in the format of an external
+   *        path (e.g. http url)
+   * @return the list handed to the helper; it starts out empty
+   */
+  @Override
+  protected ArrayList<String> isPathContaining(DatasetDescriptor inputDatasetDescriptorConfig) {
+    final ArrayList<String> mismatches = new ArrayList<>();
+    final String candidateUri = inputDatasetDescriptorConfig.getPath();
+    DatasetDescriptorErrorUtils.populateErrorForDatasetDescriptorKey(
+        mismatches,
+        inputDatasetDescriptorConfig.getIsInputDataset(),
+        DatasetDescriptorConfigKeys.URI_KEY,
+        this.getPath(),
+        candidateUri,
+        false);
+    return mismatches;
+  }
+}
diff --git
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
index 36d473b05..3e98346f1 100644
---
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
+++
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
@@ -33,6 +33,8 @@ public class DatasetDescriptorConfigKeys {
public static final String PATH_KEY = "path";
public static final String SUBPATHS_KEY = "subPaths";
public static final String FS_URI_KEY = "fs.uri";
+
+ public static final String URI_KEY = "uri";
public static final String DATABASE_KEY = "databaseName";
public static final String TABLE_KEY = "tableName";
public static final String FORMAT_KEY = "format";
diff --git
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java
new file mode 100644
index 000000000..903654d6e
--- /dev/null
+++
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.flowgraph.datanodes;
+
+import com.typesafe.config.Config;
+import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor;
+import org.apache.gobblin.service.modules.flowgraph.BaseDataNode;
+
+
+/**
+ * A {@link BaseDataNode} for generic ingress/egress data movement outside of HDFS (HTTP or otherwise).
+ */
+public class ExternalUriDataNode extends BaseDataNode {
+  /** Platform name reported by {@link #getDefaultDatasetDescriptorPlatform()}. */
+  public static final String EXTERNAL_PLATFORM_NAME = "external";
+
+  /**
+   * @param nodeProps node configuration, passed unchanged to the {@link BaseDataNode} constructor
+   * @throws DataNodeCreationException declared by this constructor; presumably raised by the base class
+   */
+  public ExternalUriDataNode(Config nodeProps) throws DataNodeCreationException {
+    super(nodeProps);
+  }
+
+  /**
+   * @return the canonical class name of {@link ExternalUriDatasetDescriptor}
+   */
+  @Override
+  public String getDefaultDatasetDescriptorClass() {
+    final Class<ExternalUriDatasetDescriptor> descriptorType = ExternalUriDatasetDescriptor.class;
+    return descriptorType.getCanonicalName();
+  }
+
+  /**
+   * @return {@link #EXTERNAL_PLATFORM_NAME}
+   */
+  @Override
+  public String getDefaultDatasetDescriptorPlatform() {
+    return EXTERNAL_PLATFORM_NAME;
+  }
+}
diff --git
a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java
new file mode 100644
index 000000000..4c5b313c9
--- /dev/null
+++
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.dataset;
+
+import com.typesafe.config.Config;
+import com.typesafe.config.ConfigFactory;
+import com.typesafe.config.ConfigValueFactory;
+import java.io.IOException;
+import
org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys;
+import org.junit.Assert;
+import org.testng.annotations.Test;
+
+
+/**
+ * Tests for {@link ExternalUriDatasetDescriptor}.
+ */
+public class ExternalUriDatasetDescriptorTest {
+
+  /**
+   * Builds a descriptor on the "external" platform for the given URI.
+   *
+   * @param uri value stored under {@link DatasetDescriptorConfigKeys#URI_KEY}
+   */
+  private static ExternalUriDatasetDescriptor descriptorFor(String uri) throws IOException {
+    Config config = ConfigFactory.empty()
+        .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, ConfigValueFactory.fromAnyRef("external"))
+        .withValue(DatasetDescriptorConfigKeys.URI_KEY, ConfigValueFactory.fromAnyRef(uri));
+    return new ExternalUriDatasetDescriptor(config);
+  }
+
+  /**
+   * Checks that {@code contains} reports no errors for an identical URI and at least one error for a
+   * different URI.
+   */
+  @Test
+  public void testContains() throws IOException {
+    ExternalUriDatasetDescriptor descriptor1 = descriptorFor("https://a.com/b");
+
+    // Assertions come from org.junit.Assert, whose argument order is (expected, actual).
+
+    // Verify that same path points to same dataset
+    ExternalUriDatasetDescriptor descriptor2 = descriptorFor("https://a.com/b");
+    Assert.assertEquals(0, descriptor2.contains(descriptor1).size());
+
+    // Verify that different path points to different dataset
+    ExternalUriDatasetDescriptor descriptor3 = descriptorFor("https://a.com/c");
+    Assert.assertNotEquals(0, descriptor3.contains(descriptor1).size());
+  }
+}
\ No newline at end of file
diff --git
a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java
new file mode 100644
index 000000000..f44c58dcd
--- /dev/null
+++
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.flowgraph.datanodes;
+
+import com.typesafe.config.Config;
+import com.typesafe.config.ConfigFactory;
+import com.typesafe.config.ConfigValueFactory;
+import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor;
+import org.apache.gobblin.service.modules.flowgraph.DataNode;
+import org.apache.gobblin.service.modules.flowgraph.FlowGraphConfigurationKeys;
+import org.junit.Assert;
+import org.testng.annotations.Test;
+
+
+/**
+ * Tests for {@link ExternalUriDataNode}.
+ */
+public class ExternalDataNodeTest {
+
+  /**
+   * Checks that a node built from a config carrying only a node id reports that id, together with the
+   * "external" platform and the {@link ExternalUriDatasetDescriptor} class as its defaults.
+   */
+  @Test
+  public void testConfig() throws DataNode.DataNodeCreationException {
+    String expectedNodeId = "some-node-id";
+
+    Config config = ConfigFactory.empty()
+        .withValue(FlowGraphConfigurationKeys.DATA_NODE_ID_KEY, ConfigValueFactory.fromAnyRef(expectedNodeId));
+    ExternalUriDataNode node = new ExternalUriDataNode(config);
+
+    // Assertions come from org.junit.Assert, whose argument order is (expected, actual).
+
+    // Verify the node id
+    String id = node.getId();
+    Assert.assertEquals(expectedNodeId, id);
+    Assert.assertEquals("external", node.getDefaultDatasetDescriptorPlatform());
+    Assert.assertEquals(ExternalUriDatasetDescriptor.class.getCanonicalName(),
+        node.getDefaultDatasetDescriptorClass());
+  }
+}
\ No newline at end of file