HADOOP-13590. Retry until TGT expires even if the UGI renewal thread 
encountered exception.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/367c3d41
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/367c3d41
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/367c3d41

Branch: refs/heads/YARN-4752
Commit: 367c3d41217728c2e61252c5a5235e5bc1f9822f
Parents: 280357c
Author: Xiao Chen <x...@apache.org>
Authored: Wed Nov 9 09:07:12 2016 -0800
Committer: Xiao Chen <x...@apache.org>
Committed: Wed Nov 9 09:07:12 2016 -0800

----------------------------------------------------------------------
 .../hadoop/security/UserGroupInformation.java   |  70 ++++++++-
 .../hadoop/security/TestUGIWithMiniKdc.java     | 144 +++++++++++++++++++
 .../security/TestUserGroupInformation.java      |  93 ++++++++++++
 3 files changed, 303 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/367c3d41/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
----------------------------------------------------------------------
diff --git 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
index 111c3f8..82603a4 100644
--- 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
+++ 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
@@ -43,6 +43,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import javax.security.auth.Subject;
 import javax.security.auth.callback.CallbackHandler;
@@ -54,14 +55,18 @@ import javax.security.auth.login.LoginContext;
 import javax.security.auth.login.LoginException;
 import javax.security.auth.spi.LoginModule;
 
+import org.apache.hadoop.io.retry.RetryPolicies;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.metrics2.annotation.Metric;
 import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.lib.MetricsRegistry;
+import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
 import org.apache.hadoop.metrics2.lib.MutableQuantiles;
 import org.apache.hadoop.metrics2.lib.MutableRate;
 import org.apache.hadoop.security.SaslRpcServer.AuthMethod;
@@ -85,7 +90,8 @@ import org.slf4j.LoggerFactory;
 @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce", "HBase", "Hive", 
"Oozie"})
 @InterfaceStability.Evolving
 public class UserGroupInformation {
-  private static final Logger LOG = LoggerFactory.getLogger(
+  @VisibleForTesting
+  static final Logger LOG = LoggerFactory.getLogger(
       UserGroupInformation.class);
 
   /**
@@ -121,6 +127,10 @@ public class UserGroupInformation {
     MutableRate loginFailure;
     @Metric("GetGroups") MutableRate getGroups;
     MutableQuantiles[] getGroupsQuantiles;
+    @Metric("Renewal failures since startup")
+    private MutableGaugeLong renewalFailuresTotal;
+    @Metric("Renewal failures since last successful login")
+    private MutableGaugeInt renewalFailures;
 
     static UgiMetrics create() {
       return DefaultMetricsSystem.instance().register(new UgiMetrics());
@@ -138,6 +148,10 @@ public class UserGroupInformation {
         }
       }
     }
+
+    MutableGaugeInt getRenewalFailures() {
+      return renewalFailures;
+    }
   }
   
   /**
@@ -963,6 +977,7 @@ public class UserGroupInformation {
           return;
         }
         long nextRefresh = getRefreshTime(tgt);
+        RetryPolicy rp = null;
         while (true) {
           try {
             long now = Time.now();
@@ -986,13 +1001,40 @@ public class UserGroupInformation {
             }
             nextRefresh = Math.max(getRefreshTime(tgt),
               now + kerberosMinSecondsBeforeRelogin);
+            metrics.renewalFailures.set(0);
+            rp = null;
           } catch (InterruptedException ie) {
             LOG.warn("Terminating renewal thread");
             return;
           } catch (IOException ie) {
-            LOG.warn("Exception encountered while running the" +
-                " renewal command. Aborting renew thread. " + ie);
-            return;
+            metrics.renewalFailuresTotal.incr();
+            final long tgtEndTime = tgt.getEndTime().getTime();
+            LOG.warn("Exception encountered while running the renewal "
+                    + "command for {}. (TGT end time:{}, renewalFailures: {},"
+                    + "renewalFailuresTotal: {})", getUserName(), tgtEndTime,
+                metrics.renewalFailures, metrics.renewalFailuresTotal, ie);
+            final long now = Time.now();
+            if (rp == null) {
+              // Use a dummy maxRetries to create the policy. The policy will
+              // only be used to get next retry time with exponential back-off.
+              // The final retry time will be later limited within the
+              // tgt endTime in getNextTgtRenewalTime.
+              rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
+                  kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS);
+            }
+            try {
+              nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp);
+            } catch (Exception e) {
+              LOG.error("Exception when calculating next tgt renewal time", e);
+              return;
+            }
+            metrics.renewalFailures.incr();
+            // retry until close enough to tgt endTime.
+            if (now > nextRefresh) {
+              LOG.error("TGT is expired. Aborting renew thread for {}.",
+                  getUserName());
+              return;
+            }
           }
         }
       }
@@ -1001,6 +1043,26 @@ public class UserGroupInformation {
     t.setName("TGT Renewer for " + getUserName());
     t.start();
   }
+
+  /**
+   * Get time for next login retry. This will allow the thread to retry with
+   * exponential back-off, until tgt endtime.
+   * Last retry is {@link #kerberosMinSecondsBeforeRelogin} before endtime.
+   *
+   * @param tgtEndTime EndTime of the tgt.
+   * @param now Current time.
+   * @param rp The retry policy.
+   * @return Time for next login retry.
+   */
+  @VisibleForTesting
+  static long getNextTgtRenewalTime(final long tgtEndTime, final long now,
+      final RetryPolicy rp) throws Exception {
+    final long lastRetryTime = tgtEndTime - kerberosMinSecondsBeforeRelogin;
+    final RetryPolicy.RetryAction ra = rp.shouldRetry(null,
+        metrics.renewalFailures.value(), 0, false);
+    return Math.min(lastRetryTime, now + ra.delayMillis);
+  }
+
   /**
    * Log a user in from a keytab file. Loads a user identity from a keytab
    * file and logs them in. They become the currently logged-in user.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/367c3d41/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java
----------------------------------------------------------------------
diff --git 
a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java
 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java
new file mode 100644
index 0000000..0016a65
--- /dev/null
+++ 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUGIWithMiniKdc.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.security;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.minikdc.MiniKdc;
+import org.apache.hadoop.security.authentication.util.KerberosUtil;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.LambdaTestUtils;
+import org.apache.hadoop.util.PlatformName;
+import org.apache.log4j.Level;
+import org.junit.After;
+import org.junit.Test;
+
+import javax.security.auth.Subject;
+import javax.security.auth.kerberos.KerberosPrincipal;
+import javax.security.auth.login.AppConfigurationEntry;
+import javax.security.auth.login.LoginContext;
+import java.io.File;
+import java.security.Principal;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import static 
org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN;
+
+/**
+ * Test {@link UserGroupInformation} with a minikdc.
+ */
+public class TestUGIWithMiniKdc {
+
+  private static MiniKdc kdc;
+
+  @After
+  public void teardown() {
+    UserGroupInformation.reset();
+    if (kdc != null) {
+      kdc.stop();
+    }
+  }
+
+  private void setupKdc() throws Exception {
+    Properties kdcConf = MiniKdc.createConf();
+    // tgt expire time = 30 seconds
+    kdcConf.setProperty(MiniKdc.MAX_TICKET_LIFETIME, "30");
+    kdcConf.setProperty(MiniKdc.MIN_TICKET_LIFETIME, "30");
+    File kdcDir = new File(System.getProperty("test.dir", "target"));
+    kdc = new MiniKdc(kdcConf, kdcDir);
+    kdc.start();
+  }
+
+  @Test(timeout = 120000)
+  public void testAutoRenewalThreadRetryWithKdc() throws Exception {
+    GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG);
+    final Configuration conf = new Configuration();
+    // Relogin every 1 second
+    conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, 1);
+    SecurityUtil.setAuthenticationMethod(
+        UserGroupInformation.AuthenticationMethod.KERBEROS, conf);
+    UserGroupInformation.setConfiguration(conf);
+
+    LoginContext loginContext = null;
+    try {
+      final String principal = "foo";
+      final File workDir = new File(System.getProperty("test.dir", "target"));
+      final File keytab = new File(workDir, "foo.keytab");
+      final Set<Principal> principals = new HashSet<>();
+      principals.add(new KerberosPrincipal(principal));
+      setupKdc();
+      kdc.createPrincipal(keytab, principal);
+
+      // client login
+      final Subject subject =
+          new Subject(false, principals, new HashSet<>(), new HashSet<>());
+
+      loginContext = new LoginContext("", subject, null,
+          new javax.security.auth.login.Configuration() {
+            @Override
+            public AppConfigurationEntry[] getAppConfigurationEntry(
+                String name) {
+              Map<String, String> options = new HashMap<>();
+              options.put("principal", principal);
+              options.put("refreshKrb5Config", "true");
+              if (PlatformName.IBM_JAVA) {
+                options.put("useKeytab", keytab.getPath());
+                options.put("credsType", "both");
+              } else {
+                options.put("keyTab", keytab.getPath());
+                options.put("useKeyTab", "true");
+                options.put("storeKey", "true");
+                options.put("doNotPrompt", "true");
+                options.put("useTicketCache", "true");
+                options.put("renewTGT", "true");
+                options.put("isInitiator", Boolean.toString(true));
+              }
+              String ticketCache = System.getenv("KRB5CCNAME");
+              if (ticketCache != null) {
+                options.put("ticketCache", ticketCache);
+              }
+              options.put("debug", "true");
+              return new AppConfigurationEntry[] {new AppConfigurationEntry(
+                  KerberosUtil.getKrb5LoginModuleName(),
+                  AppConfigurationEntry.LoginModuleControlFlag.REQUIRED,
+                  options)};
+            }
+          });
+      loginContext.login();
+      final Subject loginSubject = loginContext.getSubject();
+      UserGroupInformation.loginUserFromSubject(loginSubject);
+
+      // Verify retry happens. Do not verify retry count to reduce flakiness.
+      // Detailed back-off logic is tested separately in
+      // TestUserGroupInformation#testGetNextRetryTime
+      LambdaTestUtils.await(30000, 500,
+          () -> {
+            final int count =
+                UserGroupInformation.metrics.getRenewalFailures().value();
+            UserGroupInformation.LOG.info("Renew failure count is {}", count);
+            return count > 0;
+          });
+    } finally {
+      if (loginContext != null) {
+        loginContext.logout();
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/367c3d41/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
----------------------------------------------------------------------
diff --git 
a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
index a52cd46..ed0f58c 100644
--- 
a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
+++ 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
@@ -20,6 +20,8 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.retry.RetryPolicies;
+import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.ipc.TestRpcBase.TestTokenIdentifier;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.security.SaslRpcServer.AuthMethod;
@@ -27,13 +29,18 @@ import 
org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
 import org.apache.hadoop.security.authentication.util.KerberosName;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.security.token.TokenIdentifier;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Time;
+import org.apache.log4j.Level;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import javax.security.auth.Subject;
 import javax.security.auth.kerberos.KerberosPrincipal;
@@ -49,10 +56,13 @@ import java.lang.reflect.Method;
 import java.security.PrivilegedExceptionAction;
 import java.util.Collection;
 import java.util.ConcurrentModificationException;
+import java.util.Date;
 import java.util.LinkedHashSet;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import static 
org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_USER_GROUP_METRICS_PERCENTILES_INTERVALS;
+import static 
org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN;
 import static 
org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTH_TO_LOCAL;
 import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
 import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
@@ -74,6 +84,9 @@ import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
 public class TestUserGroupInformation {
+
+  static final Logger LOG = LoggerFactory.getLogger(
+      TestUserGroupInformation.class);
   final private static String USER_NAME = "us...@hadoop.apache.org";
   final private static String GROUP1_NAME = "group1";
   final private static String GROUP2_NAME = "group2";
@@ -1027,6 +1040,86 @@ public class TestUserGroupInformation {
         return null;
       }
     });
+  }
+
+  @Test
+  public void testGetNextRetryTime() throws Exception {
+    GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG);
+    final long reloginInterval = 1;
+    final long reloginIntervalMs = reloginInterval * 1000;
+    // Relogin happens every 1 second.
+    conf.setLong(HADOOP_KERBEROS_MIN_SECONDS_BEFORE_RELOGIN, reloginInterval);
+    SecurityUtil.setAuthenticationMethod(AuthenticationMethod.KERBEROS, conf);
+    UserGroupInformation.setConfiguration(conf);
+
+    // Suppose tgt start time is now, end time is 20 seconds from now.
+    final long now = Time.now();
+    final Date endDate = new Date(now + 20000);
+
+    // Explicitly test the exponential back-off logic.
+    // Suppose some time (10 seconds) passed.
+    // Verify exponential backoff and max=(login interval before endTime).
+    final long currentTime = now + 10000;
+    final long endTime = endDate.getTime();
+
+    assertEquals(0, UserGroupInformation.metrics.getRenewalFailures().value());
+    RetryPolicy rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
+        1000, TimeUnit.MILLISECONDS);
+    long lastRetry =
+        UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
+    assertWithinBounds(
+        UserGroupInformation.metrics.getRenewalFailures().value(),
+        lastRetry, reloginIntervalMs, currentTime);
+
+    UserGroupInformation.metrics.getRenewalFailures().incr();
+    lastRetry =
+        UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
+    assertWithinBounds(
+        UserGroupInformation.metrics.getRenewalFailures().value(),
+        lastRetry, reloginIntervalMs, currentTime);
+
+    UserGroupInformation.metrics.getRenewalFailures().incr();
+    lastRetry =
+        UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
+    assertWithinBounds(
+        UserGroupInformation.metrics.getRenewalFailures().value(),
+        lastRetry, reloginIntervalMs, currentTime);
+
+    UserGroupInformation.metrics.getRenewalFailures().incr();
+    lastRetry =
+        UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
+    assertWithinBounds(
+        UserGroupInformation.metrics.getRenewalFailures().value(),
+        lastRetry, reloginIntervalMs, currentTime);
+
+    // last try should be right before expiry.
+    UserGroupInformation.metrics.getRenewalFailures().incr();
+    lastRetry =
+        UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
+    String str =
+        "5th retry, now:" + currentTime + ", retry:" + lastRetry;
+    LOG.info(str);
+    assertEquals(str, endTime - reloginIntervalMs, lastRetry);
+
+    // make sure no more retries after (tgt endTime - login interval).
+    UserGroupInformation.metrics.getRenewalFailures().incr();
+    lastRetry =
+        UserGroupInformation.getNextTgtRenewalTime(endTime, currentTime, rp);
+    str = "overflow retry, now:" + currentTime + ", retry:" + lastRetry;
+    LOG.info(str);
+    assertEquals(str, endTime - reloginIntervalMs, lastRetry);
+  }
 
+  private void assertWithinBounds(final int numFailures, final long lastRetry,
+      final long reloginIntervalMs, long now) {
+    // shift is 2 to the power of (numFailure).
+    int shift = numFailures + 1;
+    final long lower = now + reloginIntervalMs * (long)((1 << shift) * 0.5);
+    final long upper = now + reloginIntervalMs * (long)((1 << shift) * 1.5);
+    final String str = new String("Retry#" + (numFailures + 1) + ", now:" + now
+        + ", lower bound:" + lower + ", upper bound:" + upper
+        + ", retry:" + lastRetry);
+    LOG.info(str);
+    assertTrue(str, lower <= lastRetry && lastRetry < upper);
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to