pitrou commented on code in PR #43601:
URL: https://github.com/apache/arrow/pull/43601#discussion_r1793403929


##########
cpp/src/arrow/filesystem/s3_test_cert.h:
##########
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+namespace fs {
+
+// The below two static strings are generated according to
+// 
https://github.com/minio/minio/tree/RELEASE.2024-09-22T00-33-43Z/docs/tls#323-generate-a-self-signed-certificate
+// `openssl req -new -x509 -nodes -days 36500 -keyout private.key -out 
public.crt -config
+// openssl.conf`
+const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE KEY-----

Review Comment:
   We should probably make this static and constexpr to avoid any linkage 
issues.
   ```suggestion
   static constexpr const char* kMinioPrivateKey = R"(-----BEGIN PRIVATE 
KEY-----
   ```



##########
cpp/src/arrow/filesystem/s3_test_cert.h:
##########
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   Can you perhaps rename this file `s3_test_cert_internal.h`? Header files 
with the "internal" suffix don't get installed, so they don't risk being used 
as public APIs by third-party code.



##########
cpp/src/arrow/filesystem/s3_test_cert.h:
##########
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+namespace fs {

Review Comment:
   Nit: can collapse nested namespaces
   ```suggestion
   namespace arrow::fs {
   ```



##########
cpp/src/arrow/filesystem/s3fs.h:
##########
@@ -196,6 +196,23 @@ struct ARROW_EXPORT S3Options {
   /// delay between retries.
   std::shared_ptr<S3RetryStrategy> retry_strategy;
 
+  /// the SSE-C customized key (raw 32 bytes key).
+  std::string sse_customer_key;
+
+  /// Path to a single PEM file holding all TLS CA certificates
+  ///
+  /// If empty, the underlying TLS library's defaults will be used.
+  std::string tls_ca_file_path;
+
+  /// Path to a directory holding TLS CA certificates in individual PEM files
+  /// named along the OpenSSL "hashed" format.
+  ///
+  /// If empty, the underlying TLS library's defaults will be used.
+  std::string tls_ca_dir_path;
+
+  /// Controls whether to verify TLS certificates. Defaults to true.

Review Comment:
   ```suggestion
     /// Whether to verify the S3 endpoint's TLS certificate, if the scheme is 
"https".
   ```



##########
cpp/src/arrow/filesystem/s3_test_util.cc:
##########
@@ -69,6 +77,40 @@ std::string MinioTestServer::access_key() const { return 
impl_->access_key_; }
 
 std::string MinioTestServer::secret_key() const { return impl_->secret_key_; }
 
+std::string MinioTestServer::ca_path() const {
+  return impl_->temp_dir_ca_->path().ToString();
+}
+
+std::string MinioTestServer::scheme() const { return impl_->scheme_; }
+
+Status MinioTestServer::GenerateCertificateFile() {
+  // create the dedicated folder for certificate file, rather than reuse the 
data
+  // folder, since there is test case to check whether the folder is empty.
+  ARROW_ASSIGN_OR_RAISE(impl_->temp_dir_ca_, 
TemporaryDir::Make("s3fs-test-ca-"));
+
+  ARROW_ASSIGN_OR_RAISE(auto public_crt_file,
+                        PlatformFilename::FromString(ca_path() + 
"/public.crt"));
+  ARROW_ASSIGN_OR_RAISE(auto public_cert_fd, 
FileOpenWritable(public_crt_file));
+  ARROW_RETURN_NOT_OK(FileWrite(public_cert_fd.fd(),
+                                reinterpret_cast<const uint8_t*>(kMinioCert),
+                                strlen(kMinioCert)));
+  ARROW_RETURN_NOT_OK(public_cert_fd.Close());
+
+  ARROW_ASSIGN_OR_RAISE(auto private_key_file,
+                        PlatformFilename::FromString(ca_path() + 
"/private.key"));
+  ARROW_ASSIGN_OR_RAISE(auto private_key_fd, 
FileOpenWritable(private_key_file));
+  ARROW_RETURN_NOT_OK(FileWrite(private_key_fd.fd(),
+                                reinterpret_cast<const 
uint8_t*>(kMinioPrivateKey),
+                                strlen(kMinioPrivateKey)));
+  ARROW_RETURN_NOT_OK(private_key_fd.Close());
+
+  arrow::fs::FileSystemGlobalOptions global_options;
+  global_options.tls_verify_certificates = false;

Review Comment:
   Why set this to `false`? If TLS is only enabled under Linux, then surely we 
can instead pass the right CA path or file?



##########
cpp/src/arrow/filesystem/s3_internal.h:
##########
@@ -291,6 +293,47 @@ class ConnectRetryStrategy : public 
Aws::Client::RetryStrategy {
   int32_t max_retry_duration_;
 };
 
+/// \brief calculate the MD5 of the input sse-c key (raw key, not base64 
encoded)
+/// \param sse_customer_key is the input sse key
+/// \return the base64 encoded MD5 for the input key
+inline Result<std::string> CalculateSSECustomerKeyMD5(
+    const std::string& sse_customer_key) {
+  // the key needs to be 256 bits (32 bytes) according to
+  // 
https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html#specifying-s3-c-encryption
+  if (sse_customer_key.length() != 32) {
+    return Status::Invalid("32 bytes sse-c key is expected");
+  }
+
+  // Convert the raw binary key to an Aws::String
+  Aws::String sse_customer_key_aws_string(sse_customer_key.data(),
+                                          sse_customer_key.length());
+
+  // Compute the MD5 hash of the raw binary key
+  Aws::Utils::ByteBuffer sse_customer_key_md5 =
+      Aws::Utils::HashingUtils::CalculateMD5(sse_customer_key_aws_string);
+
+  // Base64-encode the MD5 hash
+  return arrow::util::base64_encode(std::string_view(
+      reinterpret_cast<const char*>(sse_customer_key_md5.GetUnderlyingData()),
+      sse_customer_key_md5.GetLength()));
+}
+
+template <typename S3RequestType>
+Status SetSSECustomerKey(S3RequestType& request, const std::string& 
sse_customer_key) {
+  if (sse_customer_key.empty()) {
+    return Status::OK();  // do nothing if the sse_customer_key is not 
configured
+  }
+#ifdef ARROW_S3_SUPPORT_SSEC

Review Comment:
   `ARROW_S3_SUPPORT_SSEC` should be defined in `s3_internal.h`, otherwise 
there's a risk that it's not defined properly.
   
   Also, can you please rename it to for example `ARROW_S3_HAS_SSE_C`?
   



##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -710,6 +710,9 @@ struct FileSystemGlobalOptions {
   ///
   /// If empty, the underlying TLS library's defaults will be used.
   std::string tls_ca_dir_path;
+
+  /// Controls whether to verify TLS certificates. Defaults to true.

Review Comment:
   ```suggestion
     /// Whether to verify TLS certificates. Defaults to true.
   ```



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -443,6 +459,7 @@ class TestS3FS : public S3TestMixin {
     // Most tests will create buckets
     options_.allow_bucket_creation = true;
     options_.allow_bucket_deletion = true;
+    options_.tls_verify_certificates = false;

Review Comment:
   I think it would be better to properly set the CA verification path here. 



##########
cpp/src/arrow/filesystem/s3fs.h:
##########
@@ -196,6 +196,23 @@ struct ARROW_EXPORT S3Options {
   /// delay between retries.
   std::shared_ptr<S3RetryStrategy> retry_strategy;
 
+  /// the SSE-C customized key (raw 32 bytes key).
+  std::string sse_customer_key;
+
+  /// Path to a single PEM file holding all TLS CA certificates
+  ///
+  /// If empty, the underlying TLS library's defaults will be used.

Review Comment:
   Not exactly. The global filesystem options will be used.



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -1590,5 +1640,21 @@ TEST(S3GlobalOptions, DefaultsLogLevel) {
   }
 }
 
+TEST(CalculateSSECustomerKeyMD5, Sanity) {

Review Comment:
   Can we move this test up in this file before the `S3FileSystem` tests?



##########
cpp/src/arrow/filesystem/s3fs_benchmark.cc:
##########
@@ -81,7 +81,12 @@ class MinioFixture : public benchmark::Fixture {
     }
 
     client_config_.endpointOverride = ToAwsString(minio_->connect_string());
-    client_config_.scheme = Aws::Http::Scheme::HTTP;
+    if (minio_->scheme() == "https") {

Review Comment:
   We shouldn't change the benchmark configuration because it may make 
benchmark numbers non-comparable with previous commits. Can this benchmark 
always use "http"?



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -532,15 +549,17 @@ class TestS3FS : public S3TestMixin {
   Result<std::shared_ptr<S3FileSystem>> MakeNewFileSystem(
       io::IOContext io_context = io::default_io_context()) {
     options_.ConfigureAccessKey(minio_->access_key(), minio_->secret_key());
-    options_.scheme = "http";
+    options_.scheme = minio_->scheme();
     options_.endpoint_override = minio_->connect_string();
     if (!options_.retry_strategy) {
       options_.retry_strategy = std::make_shared<ShortRetryStrategy>();
     }
     return S3FileSystem::Make(options_, io_context);
   }
 
-  void MakeFileSystem() { ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem()); }
+  void MakeFileSystem() {
+    ASSERT_OK_AND_ASSIGN(fs_, MakeNewFileSystem(io::default_io_context()));

Review Comment:
   Not sure this change achieves anything?



##########
cpp/src/arrow/filesystem/s3_internal.h:
##########
@@ -291,6 +293,47 @@ class ConnectRetryStrategy : public 
Aws::Client::RetryStrategy {
   int32_t max_retry_duration_;
 };
 
+/// \brief calculate the MD5 of the input sse-c key (raw key, not base64 
encoded)
+/// \param sse_customer_key is the input sse key
+/// \return the base64 encoded MD5 for the input key
+inline Result<std::string> CalculateSSECustomerKeyMD5(
+    const std::string& sse_customer_key) {
+  // the key needs to be 256 bits (32 bytes) according to
+  // 
https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html#specifying-s3-c-encryption
+  if (sse_customer_key.length() != 32) {
+    return Status::Invalid("32 bytes sse-c key is expected");
+  }
+
+  // Convert the raw binary key to an Aws::String
+  Aws::String sse_customer_key_aws_string(sse_customer_key.data(),
+                                          sse_customer_key.length());
+
+  // Compute the MD5 hash of the raw binary key
+  Aws::Utils::ByteBuffer sse_customer_key_md5 =
+      Aws::Utils::HashingUtils::CalculateMD5(sse_customer_key_aws_string);
+
+  // Base64-encode the MD5 hash
+  return arrow::util::base64_encode(std::string_view(
+      reinterpret_cast<const char*>(sse_customer_key_md5.GetUnderlyingData()),
+      sse_customer_key_md5.GetLength()));
+}
+
+template <typename S3RequestType>
+Status SetSSECustomerKey(S3RequestType& request, const std::string& 
sse_customer_key) {

Review Comment:
   Nit: since `request` is mutable here, it should be passed as a pointer 
according to our coding guidelines.
   ```suggestion
   Status SetSSECustomerKey(S3RequestType* request, const std::string& 
sse_customer_key) {
   ```



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) {
   ASSERT_RAISES(IOError, file->Seek(10));
 }
 
+#ifdef MINIO_SERVER_WITH_TLS
+TEST_F(TestS3FS, SSECustomerKeyMatch) {
+  // normal write/read with correct SSEC key
+  std::shared_ptr<io::OutputStream> stream;
+  options_.sse_customer_key = "12345678123456781234567812345678";
+  MakeFileSystem();
+  ASSERT_OK_AND_ASSIGN(stream, 
fs_->OpenOutputStream("bucket/newfile_with_sse_c"));
+  ASSERT_OK(stream->Write("some"));
+  ASSERT_OK(stream->Close());
+  ASSERT_OK_AND_ASSIGN(auto file, 
fs_->OpenInputFile("bucket/newfile_with_sse_c"));
+  ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4));

Review Comment:
   Let's read one more byte to make sure we hit EOF?
   ```suggestion
     ASSERT_OK_AND_ASSIGN(auto buf, file->Read(5));
   ```



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) {
   ASSERT_RAISES(IOError, file->Seek(10));
 }
 
+#ifdef MINIO_SERVER_WITH_TLS
+TEST_F(TestS3FS, SSECustomerKeyMatch) {
+  // normal write/read with correct SSEC key
+  std::shared_ptr<io::OutputStream> stream;
+  options_.sse_customer_key = "12345678123456781234567812345678";
+  MakeFileSystem();
+  ASSERT_OK_AND_ASSIGN(stream, 
fs_->OpenOutputStream("bucket/newfile_with_sse_c"));
+  ASSERT_OK(stream->Write("some"));

Review Comment:
   This will exercise the code path for multipart uploads. But we should also 
exercise the case where files are uploaded using a single request (see 
`S3Options::allow_delayed_open`).
   
   Perhaps you can parametrize the test like this:
   ```c++
   TEST_F(TestS3FS, SSECustomerKeyMatch) {
     // normal write/read with correct SSEC key
     std::shared_ptr<io::OutputStream> stream;
     options_.sse_customer_key = "12345678123456781234567812345678";
     for (options_.allow_delayed_open : {false, true}) {
       ARROW_SCOPED_TRACE("allow_delayed_open = ", allow_delayed_open);
       MakeFileSystem();
       ASSERT_OK_AND_ASSIGN(stream, 
fs_->OpenOutputStream("bucket/newfile_with_sse_c"));
       // etc.
   ```



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) {
   ASSERT_RAISES(IOError, file->Seek(10));
 }
 
+#ifdef MINIO_SERVER_WITH_TLS
+TEST_F(TestS3FS, SSECustomerKeyMatch) {
+  // normal write/read with correct SSEC key
+  std::shared_ptr<io::OutputStream> stream;
+  options_.sse_customer_key = "12345678123456781234567812345678";
+  MakeFileSystem();
+  ASSERT_OK_AND_ASSIGN(stream, 
fs_->OpenOutputStream("bucket/newfile_with_sse_c"));
+  ASSERT_OK(stream->Write("some"));
+  ASSERT_OK(stream->Close());
+  ASSERT_OK_AND_ASSIGN(auto file, 
fs_->OpenInputFile("bucket/newfile_with_sse_c"));
+  ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4));
+  AssertBufferEqual(*buf, "some");
+  ASSERT_OK(RestoreTestBucket());
+}
+
+TEST_F(TestS3FS, SSECustomerKeyMismatch) {
+  std::shared_ptr<io::OutputStream> stream;
+  options_.sse_customer_key = "12345678123456781234567812345678";
+  MakeFileSystem();

Review Comment:
   Same comment here: we should exercise both multipart uploads and 
single-request uploads.



##########
cpp/src/arrow/filesystem/s3fs_test.cc:
##########
@@ -1295,6 +1314,36 @@ TEST_F(TestS3FS, OpenInputFile) {
   ASSERT_RAISES(IOError, file->Seek(10));
 }
 
+#ifdef MINIO_SERVER_WITH_TLS
+TEST_F(TestS3FS, SSECustomerKeyMatch) {
+  // normal write/read with correct SSEC key
+  std::shared_ptr<io::OutputStream> stream;
+  options_.sse_customer_key = "12345678123456781234567812345678";
+  MakeFileSystem();
+  ASSERT_OK_AND_ASSIGN(stream, 
fs_->OpenOutputStream("bucket/newfile_with_sse_c"));
+  ASSERT_OK(stream->Write("some"));
+  ASSERT_OK(stream->Close());
+  ASSERT_OK_AND_ASSIGN(auto file, 
fs_->OpenInputFile("bucket/newfile_with_sse_c"));
+  ASSERT_OK_AND_ASSIGN(auto buf, file->Read(4));
+  AssertBufferEqual(*buf, "some");
+  ASSERT_OK(RestoreTestBucket());
+}
+
+TEST_F(TestS3FS, SSECustomerKeyMismatch) {
+  std::shared_ptr<io::OutputStream> stream;
+  options_.sse_customer_key = "12345678123456781234567812345678";
+  MakeFileSystem();
+  ASSERT_OK_AND_ASSIGN(stream, 
fs_->OpenOutputStream("bucket/newfile_with_sse_c"));
+  ASSERT_OK(stream->Write("some"));
+  ASSERT_OK(stream->Close());
+
+  options_.sse_customer_key = "87654321876543218765432187654321";
+  MakeFileSystem();
+  ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c"));
+  ASSERT_OK(RestoreTestBucket());
+}

Review Comment:
   Can we also add a test with missing SSE-C key? Something like:
   ```c++
   TEST_F(TestS3FS, SSECustomerKeyMissing) {
     std::shared_ptr<io::OutputStream> stream;
     options_.sse_customer_key = "12345678123456781234567812345678";
     MakeFileSystem();
     ASSERT_OK_AND_ASSIGN(stream, 
fs_->OpenOutputStream("bucket/newfile_with_sse_c"));
     ASSERT_OK(stream->Write("some"));
     ASSERT_OK(stream->Close());
   
     options_.sse_customer_key = {};
     MakeFileSystem();
     ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/newfile_with_sse_c"));
   }
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to