This is an automated email from the ASF dual-hosted git repository.
liaoxin01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new c8ed7602af3 [fix](s3) Avoid retrying object storage SlowDown errors
(#63776)
c8ed7602af3 is described below
commit c8ed7602af3abf242beefb2a8fc21a026b2fdfc3
Author: Yixuan Wang <[email protected]>
AuthorDate: Tue Jun 9 21:52:48 2026 +0800
[fix](s3) Avoid retrying object storage SlowDown errors (#63776)
### What problem does this PR solve?
Object storage throttling errors can be retried by the SDK retry policy.
When requests are already rate limited, these retries add **extra sleep
time** and delay the caller from entering the next processing flow.
This change disables retry for throttling responses in object storage
clients:
- S3 `SlowDown` errors are not retried.
- Azure 429 `TooManyRequests` is not added to retryable status codes.
Other retryable errors keep the existing retry behavior.
---
be/src/util/s3_util.cpp | 3 +--
cloud/src/common/config.h | 2 ++
cloud/src/recycler/s3_accessor.cpp | 8 +++++---
common/cpp/obj_retry_strategy.cpp | 10 ++++++++--
common/cpp/obj_retry_strategy.h | 7 +++++--
5 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp
index 851fbf43edf..fc4c81d4bd9 100644
--- a/be/src/util/s3_util.cpp
+++ b/be/src/util/s3_util.cpp
@@ -349,7 +349,6 @@ std::shared_ptr<io::ObjStorageClient>
S3ClientFactory::_create_azure_client(
}
Azure::Storage::Blobs::BlobClientOptions options;
-
options.Retry.StatusCodes.insert(Azure::Core::Http::HttpStatusCode::TooManyRequests);
options.Retry.MaxRetries = config::max_s3_client_retry;
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
if (_ca_cert_file_path.empty()) {
@@ -526,7 +525,7 @@ std::shared_ptr<io::ObjStorageClient>
S3ClientFactory::_create_s3_client(
}
aws_config.retryStrategy = std::make_shared<S3CustomRetryStrategy>(
- config::max_s3_client_retry /*scaleFactor = 25*/);
+ config::max_s3_client_retry /*scaleFactor = 25*/,
/*retry_slow_down=*/false);
std::shared_ptr<Aws::S3::S3Client> new_client =
std::make_shared<Aws::S3::S3Client>(
get_aws_credentials_provider(s3_conf), std::move(aws_config),
diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h
index 25b8e0101e6..088dab135d9 100644
--- a/cloud/src/common/config.h
+++ b/cloud/src/common/config.h
@@ -332,6 +332,8 @@ CONF_Validator(s3_client_http_scheme, [](const std::string&
config) -> bool {
// Max retry times for object storage request
CONF_mInt64(max_s3_client_retry, "10");
+// Whether to retry on S3 SlowDown (429/503) errors
+CONF_Bool(s3_client_retry_slow_down, "false");
// Max byte getting delete bitmap can return, default is 1GB
CONF_mInt64(max_get_delete_bitmap_byte, "1073741824");
diff --git a/cloud/src/recycler/s3_accessor.cpp
b/cloud/src/recycler/s3_accessor.cpp
index cc4384b75a5..d48c51c8469 100644
--- a/cloud/src/recycler/s3_accessor.cpp
+++ b/cloud/src/recycler/s3_accessor.cpp
@@ -393,7 +393,9 @@ int S3Accessor::init() {
case S3Conf::AZURE: {
#ifdef USE_AZURE
Azure::Storage::Blobs::BlobClientOptions options;
-
options.Retry.StatusCodes.insert(Azure::Core::Http::HttpStatusCode::TooManyRequests);
+ if (config::s3_client_retry_slow_down) {
+
options.Retry.StatusCodes.insert(Azure::Core::Http::HttpStatusCode::TooManyRequests);
+ }
options.Retry.MaxRetries = config::max_s3_client_retry;
auto cred =
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(conf_.ak,
conf_.sk);
@@ -405,7 +407,7 @@ int S3Accessor::init() {
// In Azure's HTTP requests, all policies in the vector are called in
a chained manner following the HTTP pipeline approach.
// Within the RetryPolicy, the nextPolicy is called multiple times
inside a loop.
// All policies in the PerRetryPolicies are downstream of the
RetryPolicy.
- // Therefore, you only need to add a policy to check if the response
code is 429 and if the retry count meets the condition, it can record the retry
count.
+ // Therefore, the policy can record retries after the RetryPolicy has
handled the response.
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
auto container_client =
std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
uri_, cred, std::move(options));
@@ -439,7 +441,7 @@ int S3Accessor::init() {
aws_config.scheme = Aws::Http::Scheme::HTTP;
}
aws_config.retryStrategy = std::make_shared<S3CustomRetryStrategy>(
- config::max_s3_client_retry /*scaleFactor = 25*/);
+ config::max_s3_client_retry,
config::s3_client_retry_slow_down);
if (_ca_cert_file_path.empty()) {
_ca_cert_file_path =
diff --git a/common/cpp/obj_retry_strategy.cpp
b/common/cpp/obj_retry_strategy.cpp
index 6da4c23980a..58ca47f6a68 100644
--- a/common/cpp/obj_retry_strategy.cpp
+++ b/common/cpp/obj_retry_strategy.cpp
@@ -25,7 +25,8 @@ namespace doris {
bvar::Adder<int64_t> object_request_retry_count("object_request_retry_count");
-S3CustomRetryStrategy::S3CustomRetryStrategy(int maxRetries) :
DefaultRetryStrategy(maxRetries) {}
+S3CustomRetryStrategy::S3CustomRetryStrategy(int maxRetries, bool
retry_slow_down)
+ : DefaultRetryStrategy(maxRetries), _retry_slow_down(retry_slow_down)
{}
S3CustomRetryStrategy::~S3CustomRetryStrategy() = default;
@@ -35,6 +36,11 @@ bool S3CustomRetryStrategy::ShouldRetry(const
Aws::Client::AWSError<Aws::Client:
return false;
}
+ if (!_retry_slow_down && error.GetExceptionName() == "SlowDown" &&
+ error.GetResponseCode() ==
Aws::Http::HttpResponseCode::SERVICE_UNAVAILABLE) {
+ return false;
+ }
+
if (Aws::Http::IsRetryableHttpResponseCode(error.GetResponseCode()) ||
error.ShouldRetry()) {
object_request_retry_count << 1;
LOG(INFO) << "retry due to error: " << error << ", attempt: " <<
attemptedRetries + 1 << "/"
@@ -74,4 +80,4 @@ std::unique_ptr<AzureRetryRecordPolicy::HttpPolicy>
AzureRetryRecordPolicy::Clon
return std::make_unique<AzureRetryRecordPolicy>(*this);
}
#endif
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/common/cpp/obj_retry_strategy.h b/common/cpp/obj_retry_strategy.h
index dd98f871716..aaf35f6b100 100644
--- a/common/cpp/obj_retry_strategy.h
+++ b/common/cpp/obj_retry_strategy.h
@@ -27,11 +27,14 @@
namespace doris {
class S3CustomRetryStrategy final : public Aws::Client::DefaultRetryStrategy {
public:
- S3CustomRetryStrategy(int maxRetries);
+ S3CustomRetryStrategy(int maxRetries, bool retry_slow_down = true);
~S3CustomRetryStrategy() override;
bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>&
error,
long attemptedRetries) const override;
+
+private:
+ bool _retry_slow_down;
};
#ifdef USE_AZURE
@@ -47,4 +50,4 @@ public:
Azure::Core::Context const& context) const override;
};
#endif
-} // namespace doris
\ No newline at end of file
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]