singhpk234 commented on code in PR #1517: URL: https://github.com/apache/polaris/pull/1517#discussion_r2074454437
########## extension/persistence/relational-jdbc/src/main/java/org/apache/polaris/extension/persistence/relational/jdbc/DatasourceOperations.java: ########## @@ -173,23 +190,82 @@ public int executeUpdate(String query) throws SQLException { * @throws SQLException : Exception caught during transaction execution. */ public void runWithinTransaction(TransactionCallback callback) throws SQLException { - try (Connection connection = borrowConnection()) { - boolean autoCommit = connection.getAutoCommit(); - connection.setAutoCommit(false); - boolean success = false; + withRetries( + () -> { + try (Connection connection = borrowConnection()) { + boolean autoCommit = connection.getAutoCommit(); + boolean success = false; + connection.setAutoCommit(false); + try { + try (Statement statement = connection.createStatement()) { + success = callback.execute(statement); + } + } finally { + if (success) { + connection.commit(); + } else { + connection.rollback(); + } + connection.setAutoCommit(autoCommit); + } + } + return null; + }); + } + + private boolean isRetryable(SQLException e) { + String sqlState = e.getSQLState(); + + if (sqlState != null) { + return sqlState.equals(DEADLOCK_SQL_CODE) + || // Deadlock detected + sqlState.equals(SERIALIZATION_FAILURE_SQL_CODE); // Serialization failure + } + + // Additionally, one might check for specific error messages or other conditions + return e.getMessage().contains("connection refused") + || e.getMessage().contains("connection reset"); + } + + public <T> T withRetries(Operation<T> operation) throws SQLException { + int attempts = 0; + // maximum number of retries. + int maxAttempts = relationalJdbcConfiguration.maxRetries().orElse(1); + // How long we should try, since the first attempt. + long maxDuration = relationalJdbcConfiguration.maxDurationInMs().orElse(100L); + // How long to wait before first failure. + long delay = relationalJdbcConfiguration.initialDelayInMs().orElse(100L); + + // maximum time we will retry till. 
+ long maxRetryTime = Instant.now().toEpochMilli() + maxDuration; + + while (attempts < maxAttempts) { try { - try (Statement statement = connection.createStatement()) { - success = callback.execute(statement); + return operation.execute(); + } catch (SQLException e) { + attempts++; + long timeLeft = Math.max((maxRetryTime - Instant.now().toEpochMilli()), 0L); + if (attempts >= maxAttempts || !isRetryable(e) || timeLeft == 0) { + throw e; } - } finally { - if (success) { - connection.commit(); - } else { - connection.rollback(); + // Add jitter + long timeToSleep = Math.min(timeLeft, delay + (long) (random.nextDouble() * 0.2 * delay)); Review Comment: delay is not a fixed component; it grows exponentially, essentially as (2 ^ attempt). So the sleep time we get is a random value in [delay, 1.2 * delay), i.e. roughly [(2 ^ attempt), 1.2 * (2 ^ attempt)). This way we get both exponential backoff and enough jitter that retries do not collide exactly. A sleep time of 0 is not possible, since both timeLeft and delay + (long) (random.nextDouble() * 0.2 * delay) are > 0. Please let me know your thoughts considering the above. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@polaris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org