felipecrv commented on code in PR #40299: URL: https://github.com/apache/arrow/pull/40299#discussion_r1535935204
########## cpp/src/arrow/filesystem/s3fs.cc: ########## @@ -913,6 +920,134 @@ Result<std::shared_ptr<S3ClientHolder>> GetClientHolder( // ----------------------------------------------------------------------- // S3 client factory: build S3Client from S3Options +#ifdef ARROW_S3_HAS_S3CLIENT_CONFIGURATION + +// GH-40279: standard initialization of S3Client creates a new `S3EndpointProvider` +// every time. Its construction takes 1ms, which makes instantiating every S3Client +// very costly (see upstream bug report +// at https://github.com/aws/aws-sdk-cpp/issues/2880). +// To work around this, we build and cache `S3EndpointProvider` instances +// for each distinct endpoint configuration, and reuse them whenever possible. +// Since most applications tend to use a single endpoint configuration, this +// makes the 1ms setup cost a once-per-process overhead, making it much more +// bearable - if not ideal. + +struct EndpointConfigKey { + explicit EndpointConfigKey(const Aws::S3::S3ClientConfiguration& config) + : region(config.region), + scheme(config.scheme), + endpoint_override(config.endpointOverride), + use_virtual_addressing(config.useVirtualAddressing) {} + + Aws::String region; + Aws::Http::Scheme scheme; + Aws::String endpoint_override; + bool use_virtual_addressing; + + bool operator==(const EndpointConfigKey& other) const noexcept { + return region == other.region && scheme == other.scheme && + endpoint_override == other.endpoint_override && + use_virtual_addressing == other.use_virtual_addressing; + } +}; + +} // namespace +} // namespace arrow::fs + +template <> +struct std::hash<arrow::fs::EndpointConfigKey> { + std::size_t operator()(const arrow::fs::EndpointConfigKey& key) const noexcept { + // A crude hash is sufficient since we expect the cache to remain very small. + auto h = std::hash<Aws::String>{}; + return h(key.region) ^ h(key.endpoint_override); + } +}; + +namespace arrow::fs { +namespace { + +// EndpointProvider configuration happens in a non-thread-safe way, even +// when the updates are idempotent. This is a problem when trying to reuse +// a single EndpointProvider from several clients. +// To work around this, this class ensures reconfiguration of an existing +// EndpointProvider is a no-op. +class InitOnceEndpointProvider : public Aws::S3::S3EndpointProviderBase { + public: + explicit InitOnceEndpointProvider( + std::shared_ptr<Aws::S3::S3EndpointProviderBase> wrapped) + : wrapped_(std::move(wrapped)) {} + + void InitBuiltInParameters(const Aws::S3::S3ClientConfiguration& config) override {} + + void OverrideEndpoint(const Aws::String& endpoint) override { + ARROW_LOG(ERROR) << "unexpected call to InitOnceEndpointProvider::OverrideEndpoint"; + } + Aws::S3::Endpoint::S3ClientContextParameters& AccessClientContextParameters() override { + ARROW_LOG(ERROR) + << "unexpected call to InitOnceEndpointProvider::AccessClientContextParameters"; + // Need to return a reference to something... + return wrapped_->AccessClientContextParameters(); + } + + const Aws::S3::Endpoint::S3ClientContextParameters& GetClientContextParameters() + const override { + return wrapped_->GetClientContextParameters(); + } + Aws::Endpoint::ResolveEndpointOutcome ResolveEndpoint( + const Aws::Endpoint::EndpointParameters& params) const override { + return wrapped_->ResolveEndpoint(params); + } + + protected: + std::shared_ptr<Aws::S3::S3EndpointProviderBase> wrapped_; +}; + +// A class that instantiates a single EndpointProvider per distinct endpoint +// configuration and initializes it in a thread-safe way. See earlier comments +// for rationale. +class EndpointProviderBuilder { + public: + std::shared_ptr<Aws::S3::S3EndpointProviderBase> Lookup( + const Aws::S3::S3ClientConfiguration& config) { + auto key = EndpointConfigKey(config); + CacheValue* value; + { + std::unique_lock lock(cache_mutex_); + value = &cache_[std::move(key)]; + } + std::call_once(value->once, [&]() { + auto endpoint_provider = std::make_shared<Aws::S3::S3EndpointProvider>(); + endpoint_provider->InitBuiltInParameters(config); + value->endpoint_provider = + std::make_shared<InitOnceEndpointProvider>(std::move(endpoint_provider)); + }); + return value->endpoint_provider; + } + + void Reset() { + std::unique_lock lock(cache_mutex_); + cache_.clear(); + } + + static EndpointProviderBuilder* Instance() { + static EndpointProviderBuilder instance; + return &instance; + } + + protected: Review Comment: Who's extending this class? This could be `private`, right? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org