This is an automated email from the ASF dual-hosted git repository.

wlo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/gobblin.git


The following commit(s) were added to refs/heads/master by this push:
     new 694ed3764 [GOBBLIN-1967] Add external data node for generic 
ingress/egress on GaaS (#3838)
694ed3764 is described below

commit 694ed37649ce4d48e4f4810287c7245339d19069
Author: William Lo <[email protected]>
AuthorDate: Fri Dec 1 12:59:36 2023 -0500

    [GOBBLIN-1967] Add external data node for generic ingress/egress on GaaS 
(#3838)
    
    * Add external data node for generic ingress/egress on GaaS
    
    * Address reviews and cleanup
    
    * Use URI representation for external dataset descriptor node
    
    * Fix error message in containing check
    
    * Address review
---
 .../dataset/ExternalUriDatasetDescriptor.java      | 64 ++++++++++++++++++++++
 .../flowgraph/DatasetDescriptorConfigKeys.java     |  2 +
 .../flowgraph/datanodes/ExternalUriDataNode.java   | 44 +++++++++++++++
 .../dataset/ExternalUriDatasetDescriptorTest.java  | 53 ++++++++++++++++++
 .../flowgraph/datanodes/ExternalDataNodeTest.java  | 46 ++++++++++++++++
 5 files changed, 209 insertions(+)

diff --git 
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java
 
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java
new file mode 100644
index 000000000..5bc47a59e
--- /dev/null
+++ 
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptor.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.dataset;
+
+import com.google.common.base.Preconditions;
+import com.typesafe.config.Config;
+import java.io.IOException;
+import java.util.ArrayList;
+import lombok.Getter;
+import 
org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys;
+import 
org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorErrorUtils;
+
+
+/**
+ * Describes an external dataset not on HDFS, for usage with 
Data-Integration-Library in a generic way - see: 
https://github.com/linkedin/data-integration-library/tree/master
+ * Datasets under ExternalUriDatasetDescriptor can also be represented by more 
specific dataset descriptors, e.g. HttpDatasetDescriptor, SqlDatasetDescriptor, 
etc.
+ * e.g., https://some-api:443/user/123/names for an HTTP URI
+ * e.g., jdbc:mysql://some-db:3306/db for a SQL URI
+ */
+public class ExternalUriDatasetDescriptor extends BaseDatasetDescriptor 
implements DatasetDescriptor {
+
+  @Getter
+  private final String uri;
+
+  public ExternalUriDatasetDescriptor(Config config) throws IOException {
+    super(config);
+    
Preconditions.checkArgument(config.hasPath(DatasetDescriptorConfigKeys.URI_KEY),
 "Dataset descriptor config must specify a URI");
+    // refers to an external URI of a given dataset, see 
https://github.com/linkedin/data-integration-library/blob/master/docs/parameters/ms.source.uri.md
+    this.uri = config.getString(DatasetDescriptorConfigKeys.URI_KEY);
+  }
+
+  @Override
+  public String getPath() {
+    return this.uri;
+  }
+
+  /**
+   * Check if this dataset descriptor is equivalent to another dataset 
descriptor
+   *
+   * @param inputDatasetDescriptorConfig whose path should be in the format of 
an external path (e.g. http url)
+   */
+  @Override
+  protected ArrayList<String> isPathContaining(DatasetDescriptor 
inputDatasetDescriptorConfig) {
+    ArrayList<String> errors = new ArrayList<>();
+    String otherPath = inputDatasetDescriptorConfig.getPath();
+    DatasetDescriptorErrorUtils.populateErrorForDatasetDescriptorKey(errors, 
inputDatasetDescriptorConfig.getIsInputDataset(), 
DatasetDescriptorConfigKeys.URI_KEY, this.getPath(), otherPath, false);
+    return errors;
+  }
+}
diff --git 
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
 
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
index 36d473b05..3e98346f1 100644
--- 
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
+++ 
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/DatasetDescriptorConfigKeys.java
@@ -33,6 +33,8 @@ public class DatasetDescriptorConfigKeys {
   public static final String PATH_KEY = "path";
   public static final String SUBPATHS_KEY = "subPaths";
   public static final String FS_URI_KEY = "fs.uri";
+
+  public static final String URI_KEY = "uri";
   public static final String DATABASE_KEY = "databaseName";
   public static final String TABLE_KEY = "tableName";
   public static final String FORMAT_KEY = "format";
diff --git 
a/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java
 
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java
new file mode 100644
index 000000000..903654d6e
--- /dev/null
+++ 
b/gobblin-service/src/main/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalUriDataNode.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.flowgraph.datanodes;
+
+import com.typesafe.config.Config;
+import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor;
+import org.apache.gobblin.service.modules.flowgraph.BaseDataNode;
+
+
+/**
+ * A DataNode for generic ingress/egress data movement outside of HDFS (HTTP 
or otherwise)
+ */
+public class ExternalUriDataNode extends BaseDataNode {
+  public static final String EXTERNAL_PLATFORM_NAME = "external";
+
+  public ExternalUriDataNode(Config nodeProps) throws 
DataNodeCreationException {
+    super(nodeProps);
+  }
+
+  @Override
+  public String getDefaultDatasetDescriptorPlatform() {
+    return EXTERNAL_PLATFORM_NAME;
+  }
+
+  @Override
+  public String getDefaultDatasetDescriptorClass() {
+    return ExternalUriDatasetDescriptor.class.getCanonicalName();
+  }
+}
diff --git 
a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java
 
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java
new file mode 100644
index 000000000..4c5b313c9
--- /dev/null
+++ 
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/dataset/ExternalUriDatasetDescriptorTest.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.dataset;
+
+import com.typesafe.config.Config;
+import com.typesafe.config.ConfigFactory;
+import com.typesafe.config.ConfigValueFactory;
+import java.io.IOException;
+import 
org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys;
+import org.junit.Assert;
+import org.testng.annotations.Test;
+
+
+public class ExternalUriDatasetDescriptorTest {
+
+  @Test
+  public void testContains() throws IOException {
+    Config config1 = ConfigFactory.empty()
+        .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, 
ConfigValueFactory.fromAnyRef("external"))
+        .withValue(DatasetDescriptorConfigKeys.URI_KEY, 
ConfigValueFactory.fromAnyRef("https://a.com/b"));
+    ExternalUriDatasetDescriptor descriptor1 = new 
ExternalUriDatasetDescriptor(config1);
+
+    // Verify that same path points to same dataset
+    Config config2 = ConfigFactory.empty()
+        .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, 
ConfigValueFactory.fromAnyRef("external"))
+        .withValue(DatasetDescriptorConfigKeys.URI_KEY, 
ConfigValueFactory.fromAnyRef("https://a.com/b"));
+    ExternalUriDatasetDescriptor descriptor2 = new 
ExternalUriDatasetDescriptor(config2);
+    Assert.assertEquals(descriptor2.contains(descriptor1).size(), 0);
+
+    // Verify that different path points to different dataset
+    Config config3 = ConfigFactory.empty()
+        .withValue(DatasetDescriptorConfigKeys.PLATFORM_KEY, 
ConfigValueFactory.fromAnyRef("external"))
+        .withValue(DatasetDescriptorConfigKeys.URI_KEY, 
ConfigValueFactory.fromAnyRef("https://a.com/c"));
+    ExternalUriDatasetDescriptor descriptor3 = new 
ExternalUriDatasetDescriptor(config3);
+    Assert.assertNotEquals(descriptor3.contains(descriptor1).size(), 0);
+
+  }
+}
\ No newline at end of file
diff --git 
a/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java
 
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java
new file mode 100644
index 000000000..f44c58dcd
--- /dev/null
+++ 
b/gobblin-service/src/test/java/org/apache/gobblin/service/modules/flowgraph/datanodes/ExternalDataNodeTest.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gobblin.service.modules.flowgraph.datanodes;
+
+import com.typesafe.config.Config;
+import com.typesafe.config.ConfigFactory;
+import com.typesafe.config.ConfigValueFactory;
+import org.apache.gobblin.service.modules.dataset.ExternalUriDatasetDescriptor;
+import org.apache.gobblin.service.modules.flowgraph.DataNode;
+import org.apache.gobblin.service.modules.flowgraph.FlowGraphConfigurationKeys;
+import org.junit.Assert;
+import org.testng.annotations.Test;
+
+
+public class ExternalDataNodeTest {
+
+  @Test
+  public void testConfig() throws DataNode.DataNodeCreationException {
+    String expectedNodeId = "some-node-id";
+
+    Config config = ConfigFactory.empty()
+        .withValue(FlowGraphConfigurationKeys.DATA_NODE_ID_KEY, 
ConfigValueFactory.fromAnyRef(expectedNodeId));
+    ExternalUriDataNode node = new ExternalUriDataNode(config);
+
+    // Verify the node id
+    String id = node.getId();
+    Assert.assertEquals(id, expectedNodeId);
+    Assert.assertEquals(node.getDefaultDatasetDescriptorPlatform(), 
"external");
+    Assert.assertEquals(node.getDefaultDatasetDescriptorClass(), 
ExternalUriDatasetDescriptor.class.getCanonicalName());
+  }
+}
\ No newline at end of file

Reply via email to