This is an automated email from the ASF dual-hosted git repository.

lesun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-gobblin.git


The following commit(s) were added to refs/heads/master by this push:
     new fe35cb9  [GOBBLIN-1341] Make cluster name resolution more robust
fe35cb9 is described below

commit fe35cb9717a5803d5eb00fe99768a170177289a1
Author: aprokofiev <[email protected]>
AuthorDate: Tue Jan 5 15:00:04 2021 -0800

    [GOBBLIN-1341] Make cluster name resolution more robust
    
    [GOBBLIN-1341] Make cluster name resolution more
    robust
    
    We extend cluster name conversion to handle names
    with
    and without url schemes in config. In practice,
    scheme
    and port numbers are irrelevant, as a single host
    usually
    maps to a single cluster.
    
    Handle use cases with local path and invalid urls
    
    Clarify the logic based on PR comments
    
    Closes #3177 from aplex/cluster-names
---
 .../org/apache/gobblin/util/ClustersNames.java     | 78 ++++++++++++++++------
 .../org/apache/gobblin/util/ClustersNamesTest.java | 71 +++++++++++++++-----
 .../test/resources/GobblinClustersNames.properties |  4 +-
 3 files changed, 113 insertions(+), 40 deletions(-)

diff --git 
a/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java 
b/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java
index 5819236..5ea6a37 100644
--- a/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java
+++ b/gobblin-utility/src/main/java/org/apache/gobblin/util/ClustersNames.java
@@ -17,22 +17,19 @@
 
 package org.apache.gobblin.util;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.InetAddress;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.net.UnknownHostException;
-import java.util.Properties;
-
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closer;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.base.Preconditions;
-import com.google.common.io.Closer;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
 
 
 /**
@@ -78,11 +75,30 @@ public class ClustersNames {
     }
   }
 
+  /**
+   * Returns human-readable name of the cluster.
+   *
+   * Method first checks config for exact cluster url match. If nothing is 
found,
+   * it will also check host:port and just hostname match.
+   * If it still could not find a match, hostname from the url will be 
returned.
+   *
+   * For incomplete or invalid urls, we'll return a name based on clusterUrl,
+   * that will have only alphanumeric characters, dashes, underscores and dots.
+   * */
   public String getClusterName(String clusterUrl) {
-    if (null == clusterUrl)
+    if (null == clusterUrl) {
       return null;
-    String res = this.urlToNameMap.getProperty(clusterUrl);
-    return null != res ? res : normalizeClusterUrl(clusterUrl);
+    }
+
+    List<String> candidates = generateUrlMatchCandidates(clusterUrl);
+    for (String candidate : candidates) {
+      String name = this.urlToNameMap.getProperty(candidate);
+      if (name != null) {
+        return name;
+      }
+    }
+
+    return candidates.get(candidates.size() - 1);
   }
 
   public void addClusterMapping(String clusterUrl, String clusterName) {
@@ -97,21 +113,39 @@ public class ClustersNames {
     this.urlToNameMap.put(clusterUrl.toString(), clusterName);
   }
 
-  // Strip out the port number if it is a valid URI
-  private static String normalizeClusterUrl(String clusterIdentifier) {
+  /**
+   * @see #getClusterName(String) for logic description.
+   */
+  private static List<String> generateUrlMatchCandidates(String 
clusterIdentifier) {
+    ArrayList<String> candidates = new ArrayList<>();
+    candidates.add(clusterIdentifier);
+
     try {
       URI uri = new URI(clusterIdentifier.trim());
-      // URIs without protocol prefix
-      if (!uri.isOpaque() && null != uri.getHost()) {
-        clusterIdentifier = uri.getHost();
+      if (uri.getHost() != null) {
+        if (uri.getPort() != -1) {
+          candidates.add(uri.getHost() + ":" + uri.getPort());
+        }
+
+        // we prefer a config entry with 'host:port', but if it's missing
+        // we'll consider just 'host' config entry
+        candidates.add(uri.getHost());
+      } else if (uri.getScheme() != null && uri.getPath() != null) {
+        // we have a scheme and a path, but not the host name
+        // assuming local host
+        candidates.add("localhost");
       } else {
-        clusterIdentifier = uri.toString().replaceAll("[/:]"," 
").trim().replaceAll(" ", "_");
+        candidates.add(getSafeFallbackName(clusterIdentifier));
       }
     } catch (URISyntaxException e) {
-      //leave ID as is
+      candidates.add(getSafeFallbackName(clusterIdentifier));
     }
 
-    return clusterIdentifier;
+    return candidates;
+  }
+
+  private static String getSafeFallbackName(String clusterIdentifier) {
+    return clusterIdentifier.replaceAll("[^\\w-\\.]", "_");
   }
 
   /**
diff --git 
a/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java 
b/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java
index 34ad41f..cffb792 100644
--- 
a/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java
+++ 
b/gobblin-utility/src/test/java/org/apache/gobblin/util/ClustersNamesTest.java
@@ -21,23 +21,60 @@ import org.testng.Assert;
 import org.testng.annotations.Test;
 
 
-/** Unit tests for {@link ClustersNames}. This test relies on the 
ClustersNames.properties file */
+/**
+ * Unit tests for {@link ClustersNames}. This test relies on the 
ClustersNames.properties file
+ */
 public class ClustersNamesTest {
-
-  @Test
-  public void testClustersNames() {
     ClustersNames clustersNames = ClustersNames.getInstance();
-    
Assert.assertEquals(clustersNames.getClusterName("http://cluster1-rm.some.company.com";),
-                        "cluster1");
-    
Assert.assertEquals(clustersNames.getClusterName("http://cluster2-rm.some.company.com:12345";),
-                        "cluster2");
-    
Assert.assertEquals(clustersNames.getClusterName("cluster1-rm.some.company.com"),
-                        "cluster1-rm.some.company.com");
-    
Assert.assertEquals(clustersNames.getClusterName("http://cluster3-rm.some.company.com:12345";),
-        "cluster3-rm.some.company.com");
-    Assert.assertEquals(clustersNames.getClusterName("file:///"),
-        "file");
-    Assert.assertEquals(clustersNames.getClusterName("uri:fancy-uri"),
-        "uri_fancy-uri");
-  }
+
+    @Test
+    public void testRegisteredUrls() {
+        
Assert.assertEquals(clustersNames.getClusterName("http://cluster1-rm.some.company.com";),
+                "cluster1");
+        
Assert.assertEquals(clustersNames.getClusterName("http://cluster2-rm.some.company.com:12345";),
+                "cluster2");
+    }
+
+    @Test
+    public void testHostNameWithoutScheme() {
+        
Assert.assertEquals(clustersNames.getClusterName("cluster1-rm.some.company.com"),
+                "cluster1-rm.some.company.com");
+        
Assert.assertEquals(clustersNames.getClusterName("cluster-host-name-4.some.company.com"),
+                "cluster4");
+    }
+
+    @Test
+    public void testUnregisteredUrl() {
+        
Assert.assertEquals(clustersNames.getClusterName("http://nonexistent-cluster-rm.some.company.com:12345";),
+                "nonexistent-cluster-rm.some.company.com");
+    }
+
+    @Test
+    public void testPortSpecificOverrides() {
+        
Assert.assertEquals(clustersNames.getClusterName("http://cluster-host-name-4.some.company.com/";),
+                "cluster4");
+        
Assert.assertEquals(clustersNames.getClusterName("http://cluster-host-name-4.some.company.com:12345";),
+                "cluster4");
+        
Assert.assertEquals(clustersNames.getClusterName("http://cluster-host-name-4.some.company.com:789";),
+                "cluster4-custom-port");
+    }
+
+    @Test
+    public void testLocalPaths() {
+        Assert.assertEquals(clustersNames.getClusterName("file:///"), 
"localhost");
+        Assert.assertEquals(clustersNames.getClusterName("file:/home/test"), 
"localhost");
+    }
+
+    @Test
+    public void testEmptyNames() {
+        Assert.assertEquals(clustersNames.getClusterName(""), "");
+        Assert.assertNull(clustersNames.getClusterName((String) null));
+    }
+
+    @Test
+    public void testInvalidUrls() {
+        Assert.assertEquals(clustersNames.getClusterName("uri:fancy-uri"), 
"uri_fancy-uri");
+        Assert.assertEquals(clustersNames.getClusterName("test/path"), 
"test_path");
+        
Assert.assertEquals(clustersNames.getClusterName("http://host/?s=^test";), 
"http___host__s__test");
+    }
 }
diff --git a/gobblin-utility/src/test/resources/GobblinClustersNames.properties 
b/gobblin-utility/src/test/resources/GobblinClustersNames.properties
index 09bfc02..dc756c3 100644
--- a/gobblin-utility/src/test/resources/GobblinClustersNames.properties
+++ b/gobblin-utility/src/test/resources/GobblinClustersNames.properties
@@ -18,4 +18,6 @@
 # IMPORTANT: you need to escape colons(:) as the properties loader may 
interpret them as name/value
 # separators.
 http\://cluster1-rm.some.company.com=cluster1
-http\://cluster2-rm.some.company.com\:12345=cluster2
\ No newline at end of file
+http\://cluster2-rm.some.company.com\:12345=cluster2
+cluster-host-name-4.some.company.com=cluster4
+http\://cluster-host-name-4.some.company.com\:789=cluster4-custom-port

Reply via email to