deniskuzZ commented on code in PR #4384:
URL: https://github.com/apache/hive/pull/4384#discussion_r1309120157
##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java:
##########
@@ -5755,80 +5807,86 @@ private void timeOutLocks(Connection dbConn) {
*/
@RetrySemantics.Idempotent
public void performTimeOuts() {
- Connection dbConn = null;
- Statement stmt = null;
- ResultSet rs = null;
try {
- dbConn = getDbConn(Connection.TRANSACTION_READ_COMMITTED);
- //We currently commit after selecting the TXNS to abort. So whether
SERIALIZABLE
- //READ_COMMITTED, the effect is the same. We could use FOR UPDATE on
Select from TXNS
- //and do the whole performTimeOuts() in a single huge transaction, but
the only benefit
- //would be to make sure someone cannot heartbeat one of these txns at
the same time.
- //The attempt to heartbeat would block and fail immediately after it's
unblocked.
- //With current (RC + multiple txns) implementation it is possible for
someone to send
- //heartbeat at the very end of the expire interval, and just after the
Select from TXNS
- //is made, in which case heartbeat will succeed but txn will still be
Aborted.
- //Solving this corner case is not worth the perf penalty. The client
should heartbeat in a
- //timely way.
- timeOutLocks(dbConn);
- while(true) {
- stmt = dbConn.createStatement();
- String s = " \"TXN_ID\" FROM \"TXNS\" WHERE \"TXN_STATE\" = " +
TxnStatus.OPEN +
- " AND (" +
- "\"TXN_TYPE\" != " + TxnType.REPL_CREATED.getValue() +
- " AND \"TXN_LAST_HEARTBEAT\" < " + getEpochFn(dbProduct) +
"-" + timeout +
- " OR " +
- " \"TXN_TYPE\" = " + TxnType.REPL_CREATED.getValue() +
- " AND \"TXN_LAST_HEARTBEAT\" < " + getEpochFn(dbProduct) +
"-" + replicationTxnTimeout +
- ")";
- //safety valve for extreme cases
- s = sqlGenerator.addLimitClause(10 * TIMED_OUT_TXN_ABORT_BATCH_SIZE,
s);
- LOG.debug("Going to execute query <{}>", s);
- rs = stmt.executeQuery(s);
- if(!rs.next()) {
- return;//no more timedout txns
- }
- List<List<Long>> timedOutTxns = new ArrayList<>();
- List<Long> currentBatch = new
ArrayList<>(TIMED_OUT_TXN_ABORT_BATCH_SIZE);
- timedOutTxns.add(currentBatch);
- do {
- if(currentBatch.size() == TIMED_OUT_TXN_ABORT_BATCH_SIZE) {
- currentBatch = new ArrayList<>(TIMED_OUT_TXN_ABORT_BATCH_SIZE);
- timedOutTxns.add(currentBatch);
- }
- currentBatch.add(rs.getLong(1));
- } while(rs.next());
- dbConn.commit();
- close(rs, stmt, null);
- int numTxnsAborted = 0;
- for(List<Long> batchToAbort : timedOutTxns) {
- if (abortTxns(dbConn, batchToAbort, true, false, false,
TxnErrorMsg.ABORT_TIMEOUT) == batchToAbort.size()) {
- dbConn.commit();
- numTxnsAborted += batchToAbort.size();
- //todo: add TXNS.COMMENT filed and set it to 'aborted by system
due to timeout'
- }
- else {
- //could not abort all txns in this batch - this may happen because
in parallel with this
- //operation there was activity on one of the txns in this batch
(commit/abort/heartbeat)
- //This is not likely but may happen if client experiences long
pause between heartbeats or
- //unusually long/extreme pauses between heartbeat() calls and
other logic in checkLock(),
- //lock(), etc.
- dbConn.rollback();
+ retryHandler.executeWithoutRetry(
+ new RetryCallProperties()
+ .withCallerId("performTimeOuts()")
+ .withDataSource(POOL_TX)
+ .withExceptionSupplier(e -> new MetaException("Aborting timed
out transactions failed due to " + RetryHandler.getMessage(e))),
+ (DataSourceWrapper dataSourceWrapper) -> {
Review Comment:
Could we extract the logic function into a lambda variable and use it later in
retryHandler?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]