pitrou commented on code in PR #43601: URL: https://github.com/apache/arrow/pull/43601#discussion_r1793403929
########## cpp/src/arrow/filesystem/s3_test_cert.h: ########## @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace arrow { +namespace fs { + +// The below two static strings are generated according to +// https://github.com/minio/minio/tree/RELEASE.2024-09-22T00-33-43Z/docs/tls#323-generate-a-self-signed-certificate +// `openssl req -new -x509 -nodes -days 36500 -keyout private.key -out public.crt -config +// openssl.conf` +const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE KEY----- Review Comment: We should probably make this static and constexpr to avoid any linkage issues. ```suggestion static constexpr const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE KEY----- ``` ########## cpp/src/arrow/filesystem/s3_test_cert.h: ########## @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one Review Comment: Can you perhaps rename this file `s3_test_cert_internal.h`? Header files with the "internal" suffix don't get installed, so they don't risk being used as public APIs by third-party code. ########## cpp/src/arrow/filesystem/s3_test_cert.h: ########## @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace arrow { +namespace fs { Review Comment: Nit: can collapse nested namespaces ```suggestion namespace arrow::fs { ``` ########## cpp/src/arrow/filesystem/s3fs.h: ########## @@ -196,6 +196,23 @@ struct ARROW_EXPORT S3Options { /// delay between retries. std::shared_ptr<S3RetryStrategy> retry_strategy; + /// the SSE-C customized key (raw 32 bytes key). + std::string sse_customer_key; + + /// Path to a single PEM file holding all TLS CA certificates + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_file_path; + + /// Path to a directory holding TLS CA certificates in individual PEM files + /// named along the OpenSSL "hashed" format. + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_dir_path; + + /// Controls whether to verify TLS certificates. Defaults to true. Review Comment: ```suggestion /// Whether to verify the S3 endpoint's TLS certificate, if the scheme is "https". ``` ########## cpp/src/arrow/filesystem/s3_test_util.cc: ########## @@ -69,6 +77,40 @@ std::string MinioTestServer::access_key() const { return impl_->access_key_; } std::string MinioTestServer::secret_key() const { return impl_->secret_key_; } +std::string MinioTestServer::ca_path() const { + return impl_->temp_dir_ca_->path().ToString(); +} + +std::string MinioTestServer::scheme() const { return impl_->scheme_; } + +Status MinioTestServer::GenerateCertificateFile() { + // create the dedicated folder for certificate file, rather than reuse the data + // folder, since there is test case to check whether the folder is empty. + ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, TemporaryDir::Make("s3fs-test-ca-")); + + ARROW_ASSIGN_OR_RAISE(auto public_crt_file, + PlatformFilename::FromString(ca_path() + "/public.crt")); + ARROW_ASSIGN_OR_RAISE(auto public_cert_fd, FileOpenWritable(public_crt_file)); + ARROW_RETURN_NOT_OK(FileWrite(public_cert_fd.fd(), + reinterpret_cast<const uint8_t*>(kMinioCert), + strlen(kMinioCert))); + ARROW_RETURN_NOT_OK(public_cert_fd.Close()); + + ARROW_ASSIGN_OR_RAISE(auto private_key_file, + PlatformFilename::FromString(ca_path() + "/private.key")); + ARROW_ASSIGN_OR_RAISE(auto private_key_fd, FileOpenWritable(private_key_file)); + ARROW_RETURN_NOT_OK(FileWrite(private_key_fd.fd(), + reinterpret_cast<const uint8_t*>(kMinioPrivateKey), + strlen(kMinioPrivateKey))); + ARROW_RETURN_NOT_OK(private_key_fd.Close()); + + arrow::fs::FileSystemGlobalOptions global_options; + global_options.tls_verify_certificates = false; Review Comment: Why set this to `false`? If TLS is only enabled under Linux, then surely we can instead pass the right CA path or file? ########## cpp/src/arrow/filesystem/s3_internal.h: ########## @@ -291,6 +293,47 @@ class ConnectRetryStrategy : public Aws::Client::RetryStrategy { int32_t max_retry_duration_; }; +/// \brief calculate the MD5 of the input sse-c key (raw key, not base64 encoded) +/// \param sse_customer_key is the input sse key +/// \return the base64 encoded MD5 for the input key +inline Result<std::string> CalculateSSECustomerKeyMD5( + const std::string& sse_customer_key) { + // the key needs to be 256 bits (32 bytes) according to + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html#specifying-s3-c-encryption + if (sse_customer_key.length() != 32) { + return Status::Invalid("32 bytes sse-c key is expected"); + } + + // Convert the raw binary key to an Aws::String + Aws::String sse_customer_key_aws_string(sse_customer_key.data(), + sse_customer_key.length()); + + // Compute the MD5 hash of the raw binary key + Aws::Utils::ByteBuffer sse_customer_key_md5 = + Aws::Utils::HashingUtils::CalculateMD5(sse_customer_key_aws_string); + + // Base64-encode the MD5 hash + return arrow::util::base64_encode(std::string_view( + reinterpret_cast<const char*>(sse_customer_key_md5.GetUnderlyingData()), + sse_customer_key_md5.GetLength())); +} + +template <typename S3RequestType> +Status SetSSECustomerKey(S3RequestType& request, const std::string& sse_customer_key) { + if (sse_customer_key.empty()) { + return Status::OK(); // do nothing if the sse_customer_key is not configured + } +#ifdef ARROW_S3_SUPPORT_SSEC Review Comment: `ARROW_S3_SUPPORT_SSEC` should be defined in `s3_internal.h`, otherwise there's a risk that it's not defined properly. Also, can you please rename it to for example `ARROW_S3_HAS_SSE_C`? ########## cpp/src/arrow/filesystem/filesystem.h: ########## @@ -710,6 +710,9 @@ struct FileSystemGlobalOptions { /// /// If empty, the underlying TLS library's defaults will be used. std::string tls_ca_dir_path; + + /// Controls whether to verify TLS certificates. Defaults to true. Review Comment: ```suggestion /// Whether to verify TLS certificates. Defaults to true. ``` ########## cpp/src/arrow/filesystem/s3fs_test.cc: ########## @@ -443,6 +459,7 @@ class TestS3FS : public S3TestMixin { // Most tests will create buckets options_.allow_bucket_creation = true; options_.allow_bucket_deletion = true; + options_.tls_verify_certificates = false; Review Comment: I think it would be better to properly set the CA verification path here. ########## cpp/src/arrow/filesystem/s3fs.h: ########## @@ -196,6 +196,23 @@ struct ARROW_EXPORT S3Options { /// delay between retries. std::shared_ptr<S3RetryStrategy> retry_strategy; + /// the SSE-C customized key (raw 32 bytes key). + std::string sse_customer_key; + + /// Path to a single PEM file holding all TLS CA certificates + /// + /// If empty, the underlying TLS library's defaults will be used. Review Comment: Not exactly. The global filesystem options will be used. ########## cpp/src/arrow/filesystem/s3fs_test.cc: ########## @@ -1590,5 +1640,21 @@ TEST(S3GlobalOptions, DefaultsLogLevel) { } } +TEST(CalculateSSECustomerKeyMD5, Sanity) { Review Comment: Can we move this test up in this file before the `S3FileSystem` tests? ########## cpp/src/arrow/filesystem/s3fs_benchmark.cc: ########## @@ -81,7 +81,12 @@ class MinioFixture : public benchmark::Fixture { } client_config_.endpointOverride = ToAwsString(minio_->connect_string()); - client_config_.scheme = Aws::Http::Scheme::HTTP; + if (minio_->scheme() == "https") { Review Comment: We shouldn't change the benchmark configuration because it may make benchmark numbers non-comparable with previous commits. Can this benchmark always use "http"? ########## cpp/src/arrow/filesystem/s3fs_test.cc: ########## @@ -532,15 +549,17 @@ class TestS3FS : public S3TestMixin { Result<std::shared_ptr<S3FileSystem>> MakeNewFileSystem( io::IOContext io_context = io::default_io_context()) { options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key()); - options_.scheme = "http"; + options_.scheme = minio_->scheme(); options_.endpoint_override = minio_->connect_string(); if (!options_.retry_strategy) { options_.retry_strategy = std::make_shared<ShortRetryStrategy>(); } return S3FileSystem::Make(options_, io_context); } - void MakeFileSystem() { ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem()); } + void MakeFileSystem() { + ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem(io::default_io_context())); Review Comment: Not sure this change achieves anything? ########## cpp/src/arrow/filesystem/s3_internal.h: ########## @@ -291,6 +293,47 @@ class ConnectRetryStrategy : public Aws::Client::RetryStrategy { int32_t max_retry_duration_; }; +/// \brief calculate the MD5 of the input sse-c key (raw key, not base64 encoded) +/// \param sse_customer_key is the input sse key +/// \return the base64 encoded MD5 for the input key +inline Result<std::string> CalculateSSECustomerKeyMD5( + const std::string& sse_customer_key) { + // the key needs to be 256 bits (32 bytes) according to + // https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html#specifying-s3-c-encryption + if (sse_customer_key.length() != 32) { + return Status::Invalid("32 bytes sse-c key is expected"); + } + + // Convert the raw binary key to an Aws::String + Aws::String sse_customer_key_aws_string(sse_customer_key.data(), + sse_customer_key.length()); + + // Compute the MD5 hash of the raw binary key + Aws::Utils::ByteBuffer sse_customer_key_md5 = + Aws::Utils::HashingUtils::CalculateMD5(sse_customer_key_aws_string); + + // Base64-encode the MD5 hash + return arrow::util::base64_encode(std::string_view( + reinterpret_cast<const char*>(sse_customer_key_md5.GetUnderlyingData()), + sse_customer_key_md5.GetLength())); +} + +template <typename S3RequestType> +Status SetSSECustomerKey(S3RequestType& request, const std::string& sse_customer_key) { Review Comment: Nit: since `request` is mutable here, it should be passed as a pointer according to our coding guidelines. ```suggestion Status SetSSECustomerKey(S3RequestType* request, const std::string& sse_customer_key) { ``` ########## cpp/src/arrow/filesystem/s3fs_test.cc: ########## @@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, file->Seek(10)); } +#ifdef MINIO_SERVER_WITH_TLS +TEST_F(TestS3FS, SSECustomerKeyMatch) { + // normal write/read with correct SSEC key + std::shared_ptr<io::OutputStream> stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + ASSERT_OK_AND_ASSIGN(auto file, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4)); Review Comment: Let's read one more byte to make sure we hit EOF? ```suggestion ASSERT_OK_AND_ASSIGN(auto buf, file->Read(5)); ``` ########## cpp/src/arrow/filesystem/s3fs_test.cc: ########## @@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, file->Seek(10)); } +#ifdef MINIO_SERVER_WITH_TLS +TEST_F(TestS3FS, SSECustomerKeyMatch) { + // normal write/read with correct SSEC key + std::shared_ptr<io::OutputStream> stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); Review Comment: This will exercise the code path for multipart uploads. But we should also exercise the case where files are uploaded using a single request (see `S3Options::allow_delayed_open`). Perhaps you can parametrize the test like this: ```c++ TEST_F(TestS3FS, SSECustomerKeyMatch) { // normal write/read with correct SSEC key std::shared_ptr<io::OutputStream> stream; options_.sse_customer_key = "12345678123456781234567812345678"; for (options_.allow_delayed_open : {false, true}) { ARROW_SCOPED_TRACE("allow_delayed_open = ", allow_delayed_open); MakeFileSystem(); ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); // etc. ``` ########## cpp/src/arrow/filesystem/s3fs_test.cc: ########## @@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, file->Seek(10)); } +#ifdef MINIO_SERVER_WITH_TLS +TEST_F(TestS3FS, SSECustomerKeyMatch) { + // normal write/read with correct SSEC key + std::shared_ptr<io::OutputStream> stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + ASSERT_OK_AND_ASSIGN(auto file, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4)); + AssertBufferEqual(*buf, "some"); + ASSERT_OK(RestoreTestBucket()); +} + +TEST_F(TestS3FS, SSECustomerKeyMismatch) { + std::shared_ptr<io::OutputStream> stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); Review Comment: Same comment here: we should exercise both multipart uploads and single-request uploads. ########## cpp/src/arrow/filesystem/s3fs_test.cc: ########## @@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, file->Seek(10)); } +#ifdef MINIO_SERVER_WITH_TLS +TEST_F(TestS3FS, SSECustomerKeyMatch) { + // normal write/read with correct SSEC key + std::shared_ptr<io::OutputStream> stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + ASSERT_OK_AND_ASSIGN(auto file, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4)); + AssertBufferEqual(*buf, "some"); + ASSERT_OK(RestoreTestBucket()); +} + +TEST_F(TestS3FS, SSECustomerKeyMismatch) { + std::shared_ptr<io::OutputStream> stream; + options_.sse_customer_key = "12345678123456781234567812345678"; + MakeFileSystem(); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); + ASSERT_OK(stream->Write("some")); + ASSERT_OK(stream->Close()); + + options_.sse_customer_key = "87654321876543218765432187654321"; + MakeFileSystem(); + ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); + ASSERT_OK(RestoreTestBucket()); +} Review Comment: Can we also add a test with missing SSE-C key? Something like: ```c++ TEST_F(TestS3FS, SSECustomerKeyMissing) { std::shared_ptr<io::OutputStream> stream; options_.sse_customer_key = "12345678123456781234567812345678"; MakeFileSystem(); ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile_with_sse_c")); ASSERT_OK(stream->Write("some")); ASSERT_OK(stream->Close()); options_.sse_customer_key = {}; MakeFileSystem(); ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c")); } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
