Oleksiy Sayankin created HIVE-19587:
---------------------------------------

             Summary: HeartBeat thread uses cancelled delegation token while 
connecting to meta on KERBEROS cluster
                 Key: HIVE-19587
                 URL: https://issues.apache.org/jira/browse/HIVE-19587
             Project: Hive
          Issue Type: Bug
            Reporter: Oleksiy Sayankin
            Assignee: Oleksiy Sayankin


*STEP 1. Create test data*

{code}
create table t1 (id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY  ",";
create table t2 (id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY  ",";
{code}

Generate 10 000 000 lines of random data

{code}
package com.test.app;

import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.concurrent.ThreadLocalRandom;

public class App {
  public static void main(String[] args) throws FileNotFoundException {
    try (PrintWriter out = new PrintWriter("table.data");) {
      int min = 0;
      int max = 10_000;
      int numRows = 10_000_000;
      for (int i = 0; i <= numRows - 1; i++){
        int randomNum = ThreadLocalRandom.current().nextInt(min, max + 1);
        out.println(randomNum);
      }
    }
  }
}
{code}

Upload data to Hive tables

{code}
load data local inpath '/home/myuser/table.data' into table t1;
load data local inpath '/home/myuser/table.data' into table t2;
{code}

*STEP 2. Configure transactions in hive-site.xml*

{code}
<!-- Transactions -->

   <property>
      <name>hive.exec.dynamic.partition.mode</name>
      <value>nonstrict</value>
   </property>
   <property>
      <name>hive.support.concurrency</name>
      <value>true</value>
   </property>
   <property>
      <name>hive.enforce.bucketing</name>
      <value>true</value>
   </property>
   <property>
      <name>hive.txn.manager</name>
      <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
   </property>
   <property>
      <name>hive.compactor.initiator.on</name>
      <value>true</value>
   </property>
   <property>
      <name>hive.compactor.worker.threads</name>
      <value>1</value>
   </property>
{code}

*STEP 3. Configure hive.txn.timeout in hive-site.xml*

{code}
<!-- Delegation token issue -->
   <property>
      <name>hive.txn.timeout</name>
      <value>10s</value>
   </property>
{code}

*STEP 4. Connect via beeline to HS2 with KERBEROS*

{code}
!connect 
jdbc:hive2://node8.cluster:10000/default;principal=myuser/node8.cluster@NODE8;ssl=true;sslTrustStore=/opt/myuser/conf/ssl_truststore
{code}

{code}
select count(*) from t1;
{code}

*STEP 5. Close connection and reconnect*

{code}
!close
{code}

{code}
!connect 
jdbc:hive2://node8.cluster:10000/default;principal=myuser/node8.cluster@NODE8;ssl=true;sslTrustStore=/opt/myuser/conf/ssl_truststore
{code}

*STEP 6. Perform long playing query*

This query lasts about 600s

{code}
select count(*) from t1 join t2 on t1.id = t2.id;
{code}


*EXPECTED RESULT*

Query finishes successfully

*ACTUAL RESULT*

{code}
2018-05-17T13:54:54,921 ERROR [pool-7-thread-10] transport.TSaslTransport: SASL 
negotiation failure
javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password
        at 
com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:598)
 
        at 
com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:244)
        at 
org.apache.thrift.transport.TSaslTransport$SaslParticipant.evaluateChallengeOrResponse(TSaslTransport.java:539)
 
        at 
org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:283)
        at 
org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
 
        at 
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
 
        at java.security.AccessController.doPrivileged(Native Method) 
        at javax.security.auth.Subject.doAs(Subject.java:360)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
 
        at 
org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 
        at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.security.token.SecretManager$InvalidToken: token 
expired or does not exist: owner=myuser, renewer=myuser, realUser=, 
issueDate=1526565229297, maxDate=1527170029297, sequenceNumber=1, masterKeyId=1
        at 
org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:104)
 
        at 
org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:56)
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$SaslDigestCallbackHandler.getPassword(HadoopThriftAuthBridge.java:472)
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$SaslDigestCallbackHandler.handle(HadoopThriftAuthBridge.java:503)
 
        at 
com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:589)
        ... 15 more
2018-05-17T13:54:54,966 ERROR [pool-7-thread-10] server.TThreadPoolServer: 
Error occurred during processing of message.
java.lang.RuntimeException: org.apache.thrift.transport.TTransportException: 
DIGEST-MD5: IO error acquiring password
        at 
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:219)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
        at java.security.AccessController.doPrivileged(Native Method) 
        at javax.security.auth.Subject.doAs(Subject.java:360)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
 
        at 
org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748) 
Caused by: org.apache.thrift.transport.TTransportException: DIGEST-MD5: IO 
error acquiring password
        at 
org.apache.thrift.transport.TSaslTransport.sendAndThrowMessage(TSaslTransport.java:232)
 
        at 
org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:316) 
        at 
org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
 
        at 
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
 
        ... 10 more
{code}


*ROOT-CAUSE*

HS2 uses cancelled delegation token to connect to metastore. From logs

1. Metastore creates delegation token (issueDate=1526565229297)

{code}
2018-05-17T13:53:49,297  INFO [pool-7-thread-3] 
delegation.AbstractDelegationTokenSecretManager: Creating password for 
identifier: owner=myuser, renewer=myuser, realUser=, issueDate=1526565229297, m$
2018-05-17T13:53:50,789  INFO [Thread-18] txn.TxnHandler: 'Cleaner' locked by 
'node8.cluster'
2018-05-17T13:53:50,793  INFO [Thread-18] txn.TxnHandler: 'Cleaner' unlocked by 
'node8.cluster'
2018-05-17T13:53:54,840  INFO [pool-7-thread-4] metastore.HiveMetaStore: 3: 
Opening raw store with implementation 
class:org.apache.hadoop.hive.metastore.ObjectStore
{code}

2. Metastore cancels delegation token (issueDate=1526565229297)

{code}
2018-05-17T13:54:32,380  INFO [pool-7-thread-4] 
thrift.TokenStoreDelegationTokenSecretManager: Token cancelation requested for 
identifier: owner=mapr, renewer=mapr, realUser=, issueDate=1526565229297, 
maxDate=1527170029297,$
2018-05-17T13:54:32,416  INFO [pool-7-thread-4] metastore.HiveMetaStore: 3: 
Cleaning up thread local RawStore...
{code}

3. HS2 uses canceled delegation token and crashes (issueDate=1526565229297):

{code}
2018-05-17T13:54:54,921 ERROR [pool-7-thread-10] transport.TSaslTransport: SASL 
negotiation failure
javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password
        at 
com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:598)
 
        at 
com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:244)
        at 
org.apache.thrift.transport.TSaslTransport$SaslParticipant.evaluateChallengeOrResponse(TSaslTransport.java:539)
 
        at 
org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:283)
        at 
org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
 
        at 
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
 
        at java.security.AccessController.doPrivileged(Native Method) 
        at javax.security.auth.Subject.doAs(Subject.java:360)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
 
        at 
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
 
        at 
org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 
        at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.security.token.SecretManager$InvalidToken: token 
expired or does not exist: owner=myuser, renewer=myuser, realUser=, 
issueDate=1526565229297, maxDate=1527170029297, sequenceNumber=1, masterKeyId=1
        at 
org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:104)
{code}


*WORKAROUND*

>From Hive sources:

{code}
heartBeatInterval = HIVE_TXN_TIMEOUT / 2.
{code}

So to skip HB during job one needs to set

{code}
HIVE_TXN_TIMEOUT = (max_job_duration * 2) * 1.25.
{code}

Here I added 1.25 factor to be sure there is no HB. In my case max_job_duration 
= ~600s, then 

{code}
HIVE_TXN_TIMEOUT = 600s * 2 * 1.25 = 1500s
{code}

When I {{set hive.txn.timeout}} = 1500s issue was gone on my local env.




--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to