[
https://issues.apache.org/jira/browse/TRAFODION-2468?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15935262#comment-15935262
]
ASF GitHub Bot commented on TRAFODION-2468:
-------------------------------------------
Github user prashanth-vasudev commented on a diff in the pull request:
https://github.com/apache/incubator-trafodion/pull/993#discussion_r107266154
--- Diff:
core/sqf/src/seatrans/tm/hbasetmlib2/src/main/java/org/trafodion/dtm/HBaseTxClient.java
---
@@ -1232,8 +1203,218 @@ public void run() {
}
if(LOG.isDebugEnabled()) LOG.debug("Exiting recovery
thread for tm ID: " + tmID);
}
- }
+
+ private Map<Long, TransactionState> getTransactionsFromRegions(
+ Map<String, byte[]> regions)
+ throws IOException, KeeperException,
+ DeserializationException
+ {
+ if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV THREAD: in-doubt
region size " + regions.size());
+ for (Map.Entry<String, byte[]> regionEntry : regions.entrySet()) {
+ Map<Long, TransactionState> transactionStates =
+ new HashMap<Long, TransactionState>();
+ List<Long> TxRecoverList = new ArrayList<Long>();
+ String hostnamePort = regionEntry.getKey();
+ byte[] regionBytes = regionEntry.getValue();
+ if (LOG.isDebugEnabled())
+ LOG.debug("TRAF RCOV THREAD:Recovery Thread Processing
region: " + new String(regionBytes));
+ if (recoveryIterations == 0) {
+ if(LOG.isWarnEnabled()) {
+ // Let's get the host name
+ final byte [] delimiter = ",".getBytes();
+ String[] hostname = hostnamePort.split(new
String(delimiter), 3);
+ if (hostname.length < 2) {
+ throw new IllegalArgumentException("hostnamePort
format is incorrect");
+ }
+
+ LOG.warn ("TRAF RCOV THREAD:Starting recovery with " +
regions.size() +
+ " regions to recover. First region hostname: " +
hostnamePort +
+ " Recovery iterations: " + recoveryIterations);
+ }
+ }
+ else {
+ if(recoveryIterations % 10 == 0) {
+ if(LOG.isWarnEnabled()) {
+ // Let's get the host name
+ final byte [] delimiter = ",".getBytes();
+ String[] hostname = hostnamePort.split(new
String(delimiter), 3);
+ if (hostname.length < 2) {
+ throw new IllegalArgumentException("hostnamePort
format is incorrect");
+ }
+ LOG.warn("TRAF RCOV THREAD:Recovery thread
encountered " + regions.size() +
+ " regions to recover. First region hostname: " +
hostnamePort +
+ " Recovery iterations: " + recoveryIterations);
+ }
+ }
+ }
+ try {
+ TxRecoverList = txnManager.recoveryRequest(hostnamePort,
regionBytes, tmID);
+ }
+ catch (IOException e) {
+ // For all cases of Exception, we rely on the region to
redrive the request.
+ // Likely there is nothing to recover, due to a stale
region entry, but it is always safe to redrive.
+ // We log a warning event and delete the ZKNode entry.
+ LOG.warn("TRAF RCOV THREAD:Exception calling
txnManager.recoveryRequest. " + "TM: " +
+ tmID + " regionBytes: [" + regionBytes + "].
Deleting zookeeper region entry. \n exception: ", e);
+ zookeeper.deleteRegionEntry(regionEntry);
+
+ // In the case of NotServingRegionException we will repost
the ZKNode after refreshing the table.
+ if ((e instanceof NotServingRegionException) ||
(e.getCause() instanceof NotServingRegionException)){
+ // Create a local HTable object using the regionInfo
+ HTable table = new HTable(config,
HRegionInfo.parseFrom(regionBytes).getTable().getNameAsString());
+ // Repost a zookeeper entry for all current regions in
the table
+ zookeeper.postAllRegionEntries(table);
+ }
+ } // IOException
+
+ if (TxRecoverList != null) {
+ if (LOG.isDebugEnabled()) LOG.trace("TRAF RCOV THREAD:size
of TxRecoverList " + TxRecoverList.size());
+ if (TxRecoverList.size() == 0) {
+ // First delete the zookeeper entry
+ LOG.warn("TRAF RCOV THREAD:Leftover Znode calling
txnManager.recoveryRequest. " + "TM: " +
+ tmID + " regionBytes: [" + regionBytes + "].
Deleting zookeeper region entry. ");
+ zookeeper.deleteRegionEntry(regionEntry);
+ }
+ for (Long txid : TxRecoverList) {
+ TransactionState ts = transactionStates.get(txid);
+ if (ts == null) {
+ ts = new TransactionState(txid);
+
+ //Identify if DDL is part of this transaction and
valid
+ if(hbtx.useDDLTrans){
+ TmDDL tmDDL = hbtx.getTmDDL();
+ StringBuilder state = new StringBuilder ();
+ tmDDL.getState(txid,state);
+ if(state.toString().equals("VALID"))
+ ts.setDDLTx(true);
+ }
+ }
+ this.addRegionToTS(hostnamePort, regionBytes, ts);
+ transactionStates.put(txid, ts);
+ }
+ }
+ else if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV
THREAD:size od TxRecoverList is NULL ");
+
+ return transactionStates;
+ }
+ return null;
+ }
+
+ private Map<Long, TransactionState> getTransactionsFromTmDDL()
+ throws IOException
+ {
+ if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV THREAD: Checking for
DDL only recovery");
+
+ //Access TMDDL, return null if not enabled.
+ if(! hbtx.useDDLTrans)
+ return null;
+
+ Map<Long, TransactionState> transactionStates = null;
+ TmDDL tmDDL = hbtx.getTmDDL();
+ List<Long> txIdList = tmDDL.getTxIdList(tmID);
+
+ //This list of txID is specific to tmID owner.
+ //This list may include txId that are:
+ //1. currently in ACTIVE state. RecoverTransactions() call takes
care of
+ //ignoring TxId which are currently actively in progress.
+ //2. Txids regions which have not yet requested for help(regions
requesting help
+ //from zookeeper) , probably will, could be timing.
+ //3. Txids regions which have already requested for help.
+ //4. Txids whose regions have already serviced, but only require
recovery
+ //from DDL perspective.
+ //For 2 and 3 use cases above, those regions will ultimately seek
help if
+ //they need help. So no need to handle those regions here. We are
only
+ //interested to handle use case 4. If usecase 4 also involves DML
regions
+ //it is ok to recover the DDL only here and not dependent on DML
regions.
+ //
+ //Note that recoverTransactions() attempts recovery, its a no-op if
those
+ //txids are completed for some reason, some of the regions might
have completed
+ //processing, ignoreUnknownTransactionException is enabled.
+ if(txIdList != null && txIdList.size() > 0)
+ {
+ transactionStates = new HashMap<Long, TransactionState>();
+ for (Long txid : txIdList)
+ {
+ //build ts object
+ TransactionState ts = new TransactionState(txid);
+ ts.setDDLTx(true);
+ transactionStates.put(txid, ts);
+ }
+ }
+ return transactionStates;
+ }
+
+ private void recoverTransactions(Map<Long, TransactionState>
transactionStates) throws IOException
+ {
+ if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV THREAD: in-doubt
transaction size " + transactionStates.size());
+
+ for (Map.Entry<Long, TransactionState> tsEntry :
transactionStates.entrySet()) {
+ int isTransactionStillAlive = 0;
+ TransactionState ts = tsEntry.getValue();
+ Long txID = ts.getTransactionId();
+ // TransactionState ts = new TransactionState(txID);
+
+ //It is possible for long prepare situations that involve
multiple DDL
+ //operations, multiple prompts from RS is received. Hence
check to see if there
+ //is a TS object in main TS list and transaction is still
active.
+ //Note that tsEntry is local TS object.
+ if (hbtx.mapTransactionStates.get(txID) != null) {
+ if
(hbtx.mapTransactionStates.get(txID).getStatus().toString().contains("ACTIVE"))
{
+ isTransactionStillAlive = 1;
+ }
+ if (LOG.isInfoEnabled())
+ LOG.info("TRAF RCOV THREAD: TID " + txID
+ + " still has TS object in TM memory. TS details: "
+ + hbtx.mapTransactionStates.get(txID).toString()
+ + " transactionAlive: " + isTransactionStillAlive);
+ if(isTransactionStillAlive == 1)
+ continue; //for loop
+ }
+
+ try {
+ audit.getTransactionState(ts);
+ if
(ts.getStatus().equals(TransState.STATE_COMMITTED.toString())) {
+ if (LOG.isDebugEnabled())
+ LOG.debug("TRAF RCOV THREAD:Redriving commit for "
+ txID + " number of regions " + ts.getParticipatingRegions().size() +
+ " and tolerating
UnknownTransactionExceptions");
+ txnManager.doCommit(ts, true /*ignore
UnknownTransactionException*/);
+ if(useTlog && useForgotten) {
+ long nextAsn =
tLog.getNextAuditSeqNum((int)TransactionState.getNodeId(txID));
+ tLog.putSingleRecord(txID, ts.getCommitId(),
"FORGOTTEN", null, forceForgotten, nextAsn);
+ }
+ } else if
(ts.getStatus().equals(TransState.STATE_ABORTED.toString())) {
+ if (LOG.isDebugEnabled())
+ LOG.debug("TRAF RCOV THREAD:Redriving abort for "
+ txID);
+ txnManager.abort(ts);
+ } else {
+ if (LOG.isDebugEnabled())
+ LOG.debug("TRAF RCOV THREAD:Redriving abort for "
+ txID);
+ LOG.warn("Recovering transaction " + txID + ", status
is not set to COMMITTED or ABORTED. Aborting.");
+ txnManager.abort(ts);
+ }
+
+ } catch (UnsuccessfulDDLException ddle) {
+ LOG.error("UnsuccessfulDDLException encountered by
Recovery Thread. Registering for retry. txID: " + txID + "Exception " , ddle);
+
+ //Note that there may not be anymore redrive triggers from
region server point of view for DDL operation.
+ //Register this DDL transaction for subsequent redrive
from Audit Control Event.
+ //TODO: Launch a new Redrive Thread out of
auditControlPoint().
+ TmDDL tmDDL = hbtx.getTmDDL();
+ tmDDL.setState(txID,"REDRIVE");
--- End diff --
Will remove the REDRIVE tag from tmDDL and set the recovery thread to loop
back for a recheck.
> Recovery of DDL transaction may fail upon TM restart
> ----------------------------------------------------
>
> Key: TRAFODION-2468
> URL: https://issues.apache.org/jira/browse/TRAFODION-2468
> Project: Apache Trafodion
> Issue Type: Bug
> Components: dtm
> Affects Versions: 2.1-incubating
> Reporter: Prashanth Vasudev
> Assignee: Prashanth Vasudev
>
> Depending on the timing of the TM process going down and restarting, if there
> is a DDL operation in flight that has not been registered in the TMDDL table,
> recovery of the DDL operation may go unnoticed.
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)