This is an automated email from the ASF dual-hosted git repository.

liaoxin01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new c8ed7602af3 [fix](s3) Avoid retrying object storage SlowDown errors 
(#63776)
c8ed7602af3 is described below

commit c8ed7602af3abf242beefb2a8fc21a026b2fdfc3
Author: Yixuan Wang <[email protected]>
AuthorDate: Tue Jun 9 21:52:48 2026 +0800

    [fix](s3) Avoid retrying object storage SlowDown errors (#63776)
    
    ### What problem does this PR solve?
    
    Object storage throttling errors can be retried by the SDK retry policy.
    When requests are already rate limited, these retries add **extra sleep
    time** and delay the caller from entering the next processing flow.
    This change disables retry for throttling responses in object storage
    clients:
    - S3 `SlowDown` errors are not retried.
    - Azure 429 `TooManyRequests` is not added to retryable status codes.
    Other retryable errors keep the existing retry behavior.
---
 be/src/util/s3_util.cpp            |  3 +--
 cloud/src/common/config.h          |  2 ++
 cloud/src/recycler/s3_accessor.cpp |  8 +++++---
 common/cpp/obj_retry_strategy.cpp  | 10 ++++++++--
 common/cpp/obj_retry_strategy.h    |  7 +++++--
 5 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp
index 851fbf43edf..fc4c81d4bd9 100644
--- a/be/src/util/s3_util.cpp
+++ b/be/src/util/s3_util.cpp
@@ -349,7 +349,6 @@ std::shared_ptr<io::ObjStorageClient> 
S3ClientFactory::_create_azure_client(
     }
 
     Azure::Storage::Blobs::BlobClientOptions options;
-    
options.Retry.StatusCodes.insert(Azure::Core::Http::HttpStatusCode::TooManyRequests);
     options.Retry.MaxRetries = config::max_s3_client_retry;
     
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
     if (_ca_cert_file_path.empty()) {
@@ -526,7 +525,7 @@ std::shared_ptr<io::ObjStorageClient> 
S3ClientFactory::_create_s3_client(
     }
 
     aws_config.retryStrategy = std::make_shared<S3CustomRetryStrategy>(
-            config::max_s3_client_retry /*scaleFactor = 25*/);
+            config::max_s3_client_retry /*scaleFactor = 25*/, 
/*retry_slow_down=*/false);
 
     std::shared_ptr<Aws::S3::S3Client> new_client = 
std::make_shared<Aws::S3::S3Client>(
             get_aws_credentials_provider(s3_conf), std::move(aws_config),
diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h
index 25b8e0101e6..088dab135d9 100644
--- a/cloud/src/common/config.h
+++ b/cloud/src/common/config.h
@@ -332,6 +332,8 @@ CONF_Validator(s3_client_http_scheme, [](const std::string& 
config) -> bool {
 
 // Max retry times for object storage request
 CONF_mInt64(max_s3_client_retry, "10");
+// Whether to retry on S3 SlowDown (429/503) errors
+CONF_Bool(s3_client_retry_slow_down, "false");
 
 // Max byte getting delete bitmap can return, default is 1GB
 CONF_mInt64(max_get_delete_bitmap_byte, "1073741824");
diff --git a/cloud/src/recycler/s3_accessor.cpp 
b/cloud/src/recycler/s3_accessor.cpp
index cc4384b75a5..d48c51c8469 100644
--- a/cloud/src/recycler/s3_accessor.cpp
+++ b/cloud/src/recycler/s3_accessor.cpp
@@ -393,7 +393,9 @@ int S3Accessor::init() {
     case S3Conf::AZURE: {
 #ifdef USE_AZURE
         Azure::Storage::Blobs::BlobClientOptions options;
-        
options.Retry.StatusCodes.insert(Azure::Core::Http::HttpStatusCode::TooManyRequests);
+        if (config::s3_client_retry_slow_down) {
+            
options.Retry.StatusCodes.insert(Azure::Core::Http::HttpStatusCode::TooManyRequests);
+        }
         options.Retry.MaxRetries = config::max_s3_client_retry;
         auto cred =
                 
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(conf_.ak, 
conf_.sk);
@@ -405,7 +407,7 @@ int S3Accessor::init() {
         // In Azure's HTTP requests, all policies in the vector are called in 
a chained manner following the HTTP pipeline approach.
         // Within the RetryPolicy, the nextPolicy is called multiple times 
inside a loop.
         // All policies in the PerRetryPolicies are downstream of the 
RetryPolicy.
-        // Therefore, you only need to add a policy to check if the response 
code is 429 and if the retry count meets the condition, it can record the retry 
count.
+        // Therefore, the policy can record retries after the RetryPolicy has 
handled the response.
         
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
         auto container_client = 
std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
                 uri_, cred, std::move(options));
@@ -439,7 +441,7 @@ int S3Accessor::init() {
             aws_config.scheme = Aws::Http::Scheme::HTTP;
         }
         aws_config.retryStrategy = std::make_shared<S3CustomRetryStrategy>(
-                config::max_s3_client_retry /*scaleFactor = 25*/);
+                config::max_s3_client_retry, 
config::s3_client_retry_slow_down);
 
         if (_ca_cert_file_path.empty()) {
             _ca_cert_file_path =
diff --git a/common/cpp/obj_retry_strategy.cpp 
b/common/cpp/obj_retry_strategy.cpp
index 6da4c23980a..58ca47f6a68 100644
--- a/common/cpp/obj_retry_strategy.cpp
+++ b/common/cpp/obj_retry_strategy.cpp
@@ -25,7 +25,8 @@ namespace doris {
 
 bvar::Adder<int64_t> object_request_retry_count("object_request_retry_count");
 
-S3CustomRetryStrategy::S3CustomRetryStrategy(int maxRetries) : 
DefaultRetryStrategy(maxRetries) {}
+S3CustomRetryStrategy::S3CustomRetryStrategy(int maxRetries, bool 
retry_slow_down)
+        : DefaultRetryStrategy(maxRetries), _retry_slow_down(retry_slow_down) 
{}
 
 S3CustomRetryStrategy::~S3CustomRetryStrategy() = default;
 
@@ -35,6 +36,11 @@ bool S3CustomRetryStrategy::ShouldRetry(const 
Aws::Client::AWSError<Aws::Client:
         return false;
     }
 
+    if (!_retry_slow_down && error.GetExceptionName() == "SlowDown" &&
+        error.GetResponseCode() == 
Aws::Http::HttpResponseCode::SERVICE_UNAVAILABLE) {
+        return false;
+    }
+
     if (Aws::Http::IsRetryableHttpResponseCode(error.GetResponseCode()) || 
error.ShouldRetry()) {
         object_request_retry_count << 1;
         LOG(INFO) << "retry due to error: " << error << ", attempt: " << 
attemptedRetries + 1 << "/"
@@ -74,4 +80,4 @@ std::unique_ptr<AzureRetryRecordPolicy::HttpPolicy> 
AzureRetryRecordPolicy::Clon
     return std::make_unique<AzureRetryRecordPolicy>(*this);
 }
 #endif
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/common/cpp/obj_retry_strategy.h b/common/cpp/obj_retry_strategy.h
index dd98f871716..aaf35f6b100 100644
--- a/common/cpp/obj_retry_strategy.h
+++ b/common/cpp/obj_retry_strategy.h
@@ -27,11 +27,14 @@
 namespace doris {
 class S3CustomRetryStrategy final : public Aws::Client::DefaultRetryStrategy {
 public:
-    S3CustomRetryStrategy(int maxRetries);
+    S3CustomRetryStrategy(int maxRetries, bool retry_slow_down = true);
     ~S3CustomRetryStrategy() override;
 
     bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& 
error,
                      long attemptedRetries) const override;
+
+private:
+    bool _retry_slow_down;
 };
 
 #ifdef USE_AZURE
@@ -47,4 +50,4 @@ public:
             Azure::Core::Context const& context) const override;
 };
 #endif
-} // namespace doris
\ No newline at end of file
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to