This is an automated email from the ASF dual-hosted git repository.
lesun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git
The following commit(s) were added to refs/heads/master by this push:
new fe35cb9 [GOBBLIN-1341] Make cluster name resolution more robust
fe35cb9 is described below
commit fe35cb9717a5803d5eb00fe99768a170177289a1
Author: aprokofiev <[email protected]>
AuthorDate: Tue Jan 5 15:00:04 2021 -0800
[GOBBLIN-1341] Make cluster name resolution more robust
[GOBBLIN-1341] Make cluster name resolution more
robust
We extend cluster name conversion to handle names
with
and without url schemes in config. In practice,
scheme
and port numbers are irrelevant, as a single host
usually
maps to a single cluster.
Handle use cases with local path and invalid urls
Clarify the logic based on PR comments
Closes #3177 from aplex/cluster-names
---
.../org/apache/gobblin/util/ClustersNames.java | 78 ++++++++++++++++------
.../org/apache/gobblin/util/ClustersNamesTest.java | 71 +++++++++++++++-----
.../test/resources/GobblinClustersNames.properties | 4 +-
3 files changed, 113 insertions(+), 40 deletions(-)
diff --git
a/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java
b/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java
index 5819236..5ea6a37 100644
--- a/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java
+++ b/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java
@@ -17,22 +17,19 @@
package org.apache.gobblin.util;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.InetAddress;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.net.UnknownHostException;
-import java.util.Properties;
-
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.google.common.base.Preconditions;
-import com.google.common.io.Closer;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
/**
@@ -78,11 +75,30 @@ public class ClustersNames {
}
}
+ /**
+ * Returns human-readable name of the cluster.
+ *
+ * Method first checks config for exact cluster url match. If nothing is
found,
+ * it will also check host:port and just hostname match.
+ * If it still could not find a match, hostname from the url will be
returned.
+ *
+ * For incomplete or invalid urls, we'll return a name based on clusterUrl,
+ * that will have only alphanumeric characters, dashes, underscores and dots.
+ * */
public String getClusterName(String clusterUrl) {
- if (null == clusterUrl)
+ if (null == clusterUrl) {
return null;
- String res = this.urlToNameMap.getProperty(clusterUrl);
- return null != res ? res : normalizeClusterUrl(clusterUrl);
+ }
+
+ List<String> candidates = generateUrlMatchCandidates(clusterUrl);
+ for (String candidate : candidates) {
+ String name = this.urlToNameMap.getProperty(candidate);
+ if (name != null) {
+ return name;
+ }
+ }
+
+ return candidates.get(candidates.size() - 1);
}
public void addClusterMapping(String clusterUrl, String clusterName) {
@@ -97,21 +113,39 @@ public class ClustersNames {
this.urlToNameMap.put(clusterUrl.toString(), clusterName);
}
- // Strip out the port number if it is a valid URI
- private static String normalizeClusterUrl(String clusterIdentifier) {
+ /**
+ * @see #getClusterName(String) for logic description.
+ */
+ private static List<String> generateUrlMatchCandidates(String
clusterIdentifier) {
+ ArrayList<String> candidates = new ArrayList<>();
+ candidates.add(clusterIdentifier);
+
try {
URI uri = new URI(clusterIdentifier.trim());
- // URIs without protocol prefix
- if (!uri.isOpaque() && null != uri.getHost()) {
- clusterIdentifier = uri.getHost();
+ if (uri.getHost() != null) {
+ if (uri.getPort() != -1) {
+ candidates.add(uri.getHost() + ":" + uri.getPort());
+ }
+
+ // we prefer a config entry with 'host:port', but if it's missing
+ // we'll consider just 'host' config entry
+ candidates.add(uri.getHost());
+ } else if (uri.getScheme() != null && uri.getPath() != null) {
+ // we have a scheme and a path, but not the host name
+ // assuming local host
+ candidates.add("localhost");
} else {
- clusterIdentifier = uri.toString().replaceAll("[/:]","
").trim().replaceAll(" ", "_");
+ candidates.add(getSafeFallbackName(clusterIdentifier));
}
} catch (URISyntaxException e) {
- //leave ID as is
+ candidates.add(getSafeFallbackName(clusterIdentifier));
}
- return clusterIdentifier;
+ return candidates;
+ }
+
+ private static String getSafeFallbackName(String clusterIdentifier) {
+ return clusterIdentifier.replaceAll("[^\\w-\\.]", "_");
}
/**
diff --git
a/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java
b/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java
index 34ad41f..cffb792 100644
---
a/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java
+++
b/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java
@@ -21,23 +21,60 @@ import org.testng.Assert;
import org.testng.annotations.Test;
-/** Unit tests for {@link ClustersNames}. This test relies on the
ClustersNames.properties file */
+/**
+ * Unit tests for {@link ClustersNames}. This test relies on the
ClustersNames.properties file
+ */
public class ClustersNamesTest {
-
- @Test
- public void testClustersNames() {
ClustersNames clustersNames = ClustersNames.getInstance();
-
Assert.assertEquals(clustersNames.getClusterName("http://cluster1-rm.some.company.com"),
- "cluster1");
-
Assert.assertEquals(clustersNames.getClusterName("http://cluster2-rm.some.company.com:12345"),
- "cluster2");
-
Assert.assertEquals(clustersNames.getClusterName("cluster1-rm.some.company.com"),
- "cluster1-rm.some.company.com");
-
Assert.assertEquals(clustersNames.getClusterName("http://cluster3-rm.some.company.com:12345"),
- "cluster3-rm.some.company.com");
- Assert.assertEquals(clustersNames.getClusterName("file:///"),
- "file");
- Assert.assertEquals(clustersNames.getClusterName("uri:fancy-uri"),
- "uri_fancy-uri");
- }
+
+ @Test
+ public void testRegisteredUrls() {
+
Assert.assertEquals(clustersNames.getClusterName("http://cluster1-rm.some.company.com"),
+ "cluster1");
+
Assert.assertEquals(clustersNames.getClusterName("http://cluster2-rm.some.company.com:12345"),
+ "cluster2");
+ }
+
+ @Test
+ public void testHostNameWithoutScheme() {
+
Assert.assertEquals(clustersNames.getClusterName("cluster1-rm.some.company.com"),
+ "cluster1-rm.some.company.com");
+
Assert.assertEquals(clustersNames.getClusterName("cluster-host-name-4.some.company.com"),
+ "cluster4");
+ }
+
+ @Test
+ public void testUnregisteredUrl() {
+
Assert.assertEquals(clustersNames.getClusterName("http://nonexistent-cluster-rm.some.company.com:12345"),
+ "nonexistent-cluster-rm.some.company.com");
+ }
+
+ @Test
+ public void testPortSpecificOverrides() {
+
Assert.assertEquals(clustersNames.getClusterName("http://cluster-host-name-4.some.company.com/"),
+ "cluster4");
+
Assert.assertEquals(clustersNames.getClusterName("http://cluster-host-name-4.some.company.com:12345"),
+ "cluster4");
+
Assert.assertEquals(clustersNames.getClusterName("http://cluster-host-name-4.some.company.com:789"),
+ "cluster4-custom-port");
+ }
+
+ @Test
+ public void testLocalPaths() {
+ Assert.assertEquals(clustersNames.getClusterName("file:///"),
"localhost");
+ Assert.assertEquals(clustersNames.getClusterName("file:/home/test"),
"localhost");
+ }
+
+ @Test
+ public void testEmptyNames() {
+ Assert.assertEquals(clustersNames.getClusterName(""), "");
+ Assert.assertNull(clustersNames.getClusterName((String) null));
+ }
+
+ @Test
+ public void testInvalidUrls() {
+ Assert.assertEquals(clustersNames.getClusterName("uri:fancy-uri"),
"uri_fancy-uri");
+ Assert.assertEquals(clustersNames.getClusterName("test/path"),
"test_path");
+
Assert.assertEquals(clustersNames.getClusterName("http://host/?s=^test"),
"http___host__s__test");
+ }
}
diff --git a/gobblin-utility/src/test/resources/GobblinClustersNames.properties
b/gobblin-utility/src/test/resources/GobblinClustersNames.properties
index 09bfc02..dc756c3 100644
--- a/gobblin-utility/src/test/resources/GobblinClustersNames.properties
+++ b/gobblin-utility/src/test/resources/GobblinClustersNames.properties
@@ -18,4 +18,6 @@
# IMPORTANT: you need to escape colons(:) as the properties loader may
interpret them as name/value
# separators.
http\://cluster1-rm.some.company.com=cluster1
-http\://cluster2-rm.some.company.com\:12345=cluster2
\ No newline at end of file
+http\://cluster2-rm.some.company.com\:12345=cluster2
+cluster-host-name-4.some.company.com=cluster4
+http\://cluster-host-name-4.some.company.com\:789=cluster4-custom-port