dimas-b commented on code in PR #1517: URL: https://github.com/apache/polaris/pull/1517#discussion_r2074467913
########## extension/persistence/relational-jdbc/src/main/java/org/apache/polaris/extension/persistence/relational/jdbc/DatasourceOperations.java: ########## @@ -173,23 +190,82 @@ public int executeUpdate(String query) throws SQLException { * @throws SQLException : Exception caught during transaction execution. */ public void runWithinTransaction(TransactionCallback callback) throws SQLException { - try (Connection connection = borrowConnection()) { - boolean autoCommit = connection.getAutoCommit(); - connection.setAutoCommit(false); - boolean success = false; + withRetries( + () -> { + try (Connection connection = borrowConnection()) { + boolean autoCommit = connection.getAutoCommit(); + boolean success = false; + connection.setAutoCommit(false); + try { + try (Statement statement = connection.createStatement()) { + success = callback.execute(statement); + } + } finally { + if (success) { + connection.commit(); + } else { + connection.rollback(); + } + connection.setAutoCommit(autoCommit); + } + } + return null; + }); + } + + private boolean isRetryable(SQLException e) { + String sqlState = e.getSQLState(); + + if (sqlState != null) { + return sqlState.equals(DEADLOCK_SQL_CODE) + || // Deadlock detected + sqlState.equals(SERIALIZATION_FAILURE_SQL_CODE); // Serialization failure + } + + // Additionally, one might check for specific error messages or other conditions + return e.getMessage().contains("connection refused") + || e.getMessage().contains("connection reset"); + } + + public <T> T withRetries(Operation<T> operation) throws SQLException { + int attempts = 0; + // maximum number of retries. + int maxAttempts = relationalJdbcConfiguration.maxRetries().orElse(1); + // How long we should try, since the first attempt. + long maxDuration = relationalJdbcConfiguration.maxDurationInMs().orElse(100L); + // How long to wait before first failure. + long delay = relationalJdbcConfiguration.initialDelayInMs().orElse(100L); + + // maximum time we will retry till. 
+ long maxRetryTime = Instant.now().toEpochMilli() + maxDuration; + + while (attempts < maxAttempts) { try { - try (Statement statement = connection.createStatement()) { - success = callback.execute(statement); + return operation.execute(); + } catch (SQLException e) { + attempts++; + long timeLeft = Math.max((maxRetryTime - Instant.now().toEpochMilli()), 0L); + if (attempts >= maxAttempts || !isRetryable(e) || timeLeft == 0) { + throw e; } - } finally { - if (success) { - connection.commit(); - } else { - connection.rollback(); + // Add jitter + long timeToSleep = Math.min(timeLeft, delay + (long) (random.nextDouble() * 0.2 * delay)); Review Comment: nit: `double` is probably overkill. We do not need that level of precision for the delay computation. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@polaris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org