pitrou commented on code in PR #46017:
URL: https://github.com/apache/arrow/pull/46017#discussion_r2027252797


##########
cpp/src/parquet/encryption/secure_string.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "arrow/util/span.h"
+
+namespace parquet::encryption {
+/**
+ * A secure string that ensures the wrapped string is cleared from memory on
+ * deconstruction. This class can only be created from std::string that are 
securely
+ * erased after creation.
+ *
+ * Note: This class does not provide a constructor / assignment operator that 
copies a
+ * std::string because that would allow code to create a SecureString while 
accidentally
+ * not noticing the need to securely erasing the argument after invoking the 
constructor /
+ * calling the assignment operator.
+ */
+class SecureString {
+ public:
+  SecureString() noexcept = default;
+  SecureString(SecureString&&) noexcept;
+  SecureString(const SecureString&) noexcept = default;
+  explicit SecureString(std::string&&) noexcept;
+
+  SecureString& operator=(SecureString&&) noexcept;
+  SecureString& operator=(const SecureString&) noexcept;
+  SecureString& operator=(std::string&& secret) noexcept;
+
+  bool operator==(const SecureString&) const;
+  bool operator!=(const SecureString&) const;
+
+  ~SecureString() { dispose(); }
+
+  [[nodiscard]] bool empty() const;
+  [[nodiscard]] std::size_t size() const;
+  [[nodiscard]] std::size_t length() const;
+  [[nodiscard]] ::arrow::util::span<const uint8_t> as_span() const;
+
+  void dispose();
+
+  static void secure_clear(std::string&);

Review Comment:
   We mostly prohibit mutable ref arguments, so use a pointer.
   ```suggestion
     static void SecureClear(std::string*);
   ```



##########
cpp/src/parquet/encryption/secure_string.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "arrow/util/span.h"
+
+namespace parquet::encryption {
+/**
+ * A secure string that ensures the wrapped string is cleared from memory on
+ * deconstruction. This class can only be created from std::string that are 
securely
+ * erased after creation.
+ *
+ * Note: This class does not provide a constructor / assignment operator that 
copies a
+ * std::string because that would allow code to create a SecureString while 
accidentally
+ * not noticing the need to securely erasing the argument after invoking the 
constructor /
+ * calling the assignment operator.
+ */
+class SecureString {
+ public:
+  SecureString() noexcept = default;
+  SecureString(SecureString&&) noexcept;
+  SecureString(const SecureString&) noexcept = default;
+  explicit SecureString(std::string&&) noexcept;
+
+  SecureString& operator=(SecureString&&) noexcept;
+  SecureString& operator=(const SecureString&) noexcept;
+  SecureString& operator=(std::string&& secret) noexcept;
+
+  bool operator==(const SecureString&) const;
+  bool operator!=(const SecureString&) const;
+
+  ~SecureString() { dispose(); }
+
+  [[nodiscard]] bool empty() const;
+  [[nodiscard]] std::size_t size() const;
+  [[nodiscard]] std::size_t length() const;
+  [[nodiscard]] ::arrow::util::span<const uint8_t> as_span() const;

Review Comment:
   Also perhaps add a `string_view` accessor and a mutable `span` one?
   ```suggestion
     [[nodiscard]] ::arrow::util::span<uint8_t> as_span();
     [[nodiscard]] ::arrow::util::span<const uint8_t> as_span() const;
     [[nodiscard]] std::string_view as_view() const;
   ```



##########
cpp/src/parquet/encryption/secure_string_test.cc:
##########
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <vector>
+
+#include "parquet/encryption/secure_string.h"
+
+namespace parquet::encryption::test {
+
+void assert_securely_cleared(const std::string& string) {
+  // the entire buffer of the string is filled with zeros
+  std::vector<char> zeros(string.capacity());
+  ::arrow::util::span actual(string.data(), string.capacity());
+  ::arrow::util::span expected(zeros.data(), zeros.size());
+  ASSERT_EQ(actual, expected);
+
+  // the string is empty
+  ASSERT_TRUE(string.empty());
+}
+
+TEST(TestSecureString, SecureClearString) {
+  // short string
+  {
+    std::string tiny("abc");
+    SecureString::secure_clear(tiny);
+    assert_securely_cleared(tiny);
+  }

Review Comment:
   For more stringent testing, I think we also want to examine the string area 
_before_ it is cleared (or moved, etc.).
   
   Something like (untested):
   ```c++
   std::string_view StringArea(const std::string& string) {
     return std::string_view(string.data(), string.capacity());
   }
   
   void AssertSecurelyCleared(std::string_view area) {
     // the entire area is filled with zeros
     std::string zeros(area.size(), '\0');
     ASSERT_EQ(area, std::string_view(zeros));
   }
   
   void AssertSecurelyCleared(const std::string& string) {
     AssertSecurelyCleared(StringArea(string));
   }
   
   TEST(TestSecureString, SecureClearCheck) {
     // short string
     {
       std::string tiny("abc");
       auto old_area = StringArea(tiny);
       SecureString::SecureClear(&tiny);
       AssertSecurelyCleared(tiny);
       AssertSecurelyCleared(old_area);
     }
     // etc.
   }
   ```



##########
cpp/src/parquet/encryption/encryption.h:
##########
@@ -431,15 +424,16 @@ class PARQUET_EXPORT FileEncryptionProperties {
 
  private:
   EncryptionAlgorithm algorithm_;
-  std::string footer_key_;
+  encryption::SecureString footer_key_;
   std::string footer_key_metadata_;
   bool encrypted_footer_;
   std::string file_aad_;
   std::string aad_prefix_;
   bool store_aad_prefix_in_file_;
   ColumnPathToEncryptionPropertiesMap encrypted_columns_;
 
-  FileEncryptionProperties(ParquetCipher::type cipher, const std::string& 
footer_key,
+  FileEncryptionProperties(ParquetCipher::type cipher,
+                           const encryption::SecureString& footer_key,

Review Comment:
   Here as well, could avoid copies by taking strings by value.



##########
cpp/src/parquet/encryption/encryption.h:
##########
@@ -46,28 +47,28 @@ using ColumnPathToEncryptionPropertiesMap =
 
 class PARQUET_EXPORT DecryptionKeyRetriever {
  public:
-  virtual std::string GetKey(const std::string& key_metadata) = 0;
+  virtual encryption::SecureString GetKey(const std::string& key_metadata) = 0;
   virtual ~DecryptionKeyRetriever() {}
 };
 
 /// Simple integer key retriever
 class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
  public:
-  void PutKey(uint32_t key_id, const std::string& key);
-  std::string GetKey(const std::string& key_metadata) override;
+  void PutKey(uint32_t key_id, const encryption::SecureString& key);

Review Comment:
   Could probably take the `SecureString` by value here, since we're going to 
store it.



##########
cpp/src/parquet/encryption/internal_file_decryptor.h:
##########
@@ -39,9 +40,9 @@ class FileDecryptionProperties;
 // CAUTION: Decryptor objects are not thread-safe.
 class PARQUET_EXPORT Decryptor {
  public:
-  Decryptor(std::unique_ptr<encryption::AesDecryptor> decryptor, const 
std::string& key,
-            const std::string& file_aad, const std::string& aad,
-            ::arrow::MemoryPool* pool);
+  Decryptor(std::unique_ptr<encryption::AesDecryptor> decryptor,
+            const encryption::SecureString& key, const std::string& file_aad,
+            const std::string& aad, ::arrow::MemoryPool* pool);

Review Comment:
   Same here (take strings by value)



##########
cpp/src/parquet/encryption/file_key_wrapper.cc:
##########
@@ -113,14 +113,15 @@ KeyEncryptionKey FileKeyWrapper::CreateKeyEncryptionKey(
     const std::string& master_key_id) {
   std::string kek_bytes(kKeyEncryptionKeyLength, '\0');
   RandBytes(reinterpret_cast<uint8_t*>(kek_bytes.data()), 
kKeyEncryptionKeyLength);
+  SecureString secure_kek_bytes(std::move(kek_bytes));

Review Comment:
   Here as well, we could write directly into a pre-sized SecureString.



##########
cpp/src/parquet/encryption/encryption.h:
##########
@@ -402,21 +397,19 @@ class PARQUET_EXPORT FileEncryptionProperties {
    private:
     ParquetCipher::type parquet_cipher_;
     bool encrypted_footer_;
-    std::string footer_key_;
+    encryption::SecureString footer_key_;
     std::string footer_key_metadata_;
 
     std::string aad_prefix_;
     bool store_aad_prefix_in_file_;
     ColumnPathToEncryptionPropertiesMap encrypted_columns_;
   };
 
-  ~FileEncryptionProperties() { footer_key_.clear(); }
-
   bool encrypted_footer() const { return encrypted_footer_; }
 
   EncryptionAlgorithm algorithm() const { return algorithm_; }
 
-  std::string footer_key() const { return footer_key_; }
+  encryption::SecureString footer_key() const { return footer_key_; }

Review Comment:
   Could perhaps return a const ref, assuming the member variable doesn't 
change?



##########
cpp/src/parquet/encryption/secure_string_test.cc:
##########
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <vector>
+
+#include "parquet/encryption/secure_string.h"
+
+namespace parquet::encryption::test {
+
+void assert_securely_cleared(const std::string& string) {
+  // the entire buffer of the string is filled with zeros
+  std::vector<char> zeros(string.capacity());
+  ::arrow::util::span actual(string.data(), string.capacity());
+  ::arrow::util::span expected(zeros.data(), zeros.size());
+  ASSERT_EQ(actual, expected);
+
+  // the string is empty
+  ASSERT_TRUE(string.empty());
+}
+
+TEST(TestSecureString, SecureClearString) {
+  // short string
+  {
+    std::string tiny("abc");
+    SecureString::secure_clear(tiny);
+    assert_securely_cleared(tiny);
+  }
+
+  // long string
+  {
+    std::string large(1024, 'x');
+    large.resize(1024, 'y');
+    SecureString::secure_clear(large);
+    assert_securely_cleared(large);
+  }
+
+  // empty string
+  {
+    // this creates an empty string with some non-zero characters in the 
string buffer
+    // we test that all those characters are securely cleared
+    std::string empty("abcdef");
+    empty.resize(0);
+    SecureString::secure_clear(empty);
+    assert_securely_cleared(empty);
+  }
+}
+
+TEST(TestSecureString, Construct) {
+  // move constructing from a string securely clears that string
+  std::string string("hello world");
+  SecureString secret_from_string(std::move(string));
+  assert_securely_cleared(string);
+  ASSERT_FALSE(secret_from_string.empty());
+
+  // move constructing from a secure string securely clears that secure string
+  // Note: there is no way to test the secure clearing of the moved secure 
string
+  SecureString secret_from_move_secret(std::move(secret_from_string));
+  ASSERT_TRUE(secret_from_string.empty());
+  ASSERT_FALSE(secret_from_move_secret.empty());
+
+  // copy constructing from a secure string does not modify that secure string
+  SecureString secret_from_secret(secret_from_move_secret);
+  ASSERT_FALSE(secret_from_move_secret.empty());
+  ASSERT_FALSE(secret_from_secret.empty());
+  ASSERT_EQ(secret_from_secret, secret_from_move_secret);
+}
+
+TEST(TestSecureString, Assign) {
+  // move assigning from a string securely clears that string
+  std::string string("hello world");
+  SecureString secret_from_string;
+  secret_from_string = std::move(string);
+  assert_securely_cleared(string);
+  ASSERT_FALSE(secret_from_string.empty());
+
+  // move assigning from a secure string securely clears that secure string
+  // Note: there is no way to test the secure clearing of the moved secure 
string
+  SecureString secret_from_move_secret;
+  secret_from_move_secret = std::move(secret_from_string);
+  ASSERT_TRUE(secret_from_string.empty());
+  ASSERT_FALSE(secret_from_move_secret.empty());
+
+  // assigning from a secure string does not modify that secure string
+  SecureString secret_from_secret;
+  secret_from_secret = secret_from_move_secret;
+  ASSERT_FALSE(secret_from_move_secret.empty());
+  ASSERT_FALSE(secret_from_secret.empty());
+  ASSERT_EQ(secret_from_secret, secret_from_move_secret);
+}
+

Review Comment:
   Probably also want a test that a SecureString was cleared on destruction?



##########
cpp/src/parquet/encryption/key_toolkit.h:
##########
@@ -92,14 +92,14 @@ class PARQUET_EXPORT KeyToolkit {
 // parsing from "key material"
 class PARQUET_EXPORT KeyWithMasterId {
  public:
-  KeyWithMasterId(std::string key_bytes, std::string master_id)
+  KeyWithMasterId(SecureString key_bytes, std::string master_id)
       : key_bytes_(std::move(key_bytes)), master_id_(std::move(master_id)) {}
 
-  const std::string& data_key() const { return key_bytes_; }
+  const SecureString& data_key() const { return key_bytes_; }
   const std::string& master_id() const { return master_id_; }
 
  private:
-  const std::string key_bytes_;
+  const SecureString key_bytes_;
   const std::string master_id_;

Review Comment:
   I'm not sure it makes sense to keep the `const` qualifiers here since all 
methods are `const` already.
   
   With `const` member variables, `KeyWithMasterId` cannot be moved but will always 
be copied.
   



##########
cpp/src/parquet/encryption/internal_file_encryptor.h:
##########
@@ -36,7 +36,7 @@ class ColumnEncryptionProperties;
 
 class PARQUET_EXPORT Encryptor {
  public:
-  Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
+  Encryptor(encryption::AesEncryptor* aes_encryptor, const 
encryption::SecureString& key,
             const std::string& file_aad, const std::string& aad,
             ::arrow::MemoryPool* pool);

Review Comment:
   We could probably take those arguments by value, to avoid copying them in 
the constructor?
   ```suggestion
     Encryptor(encryption::AesEncryptor* aes_encryptor, 
encryption::SecureString key,
               std::string file_aad, std::string aad,
               ::arrow::MemoryPool* pool);
   ```



##########
cpp/src/parquet/encryption/key_toolkit_internal.cc:
##########
@@ -59,10 +59,10 @@ std::string DecryptKeyLocally(const std::string& 
encoded_encrypted_key,
   ::arrow::util::span<uint8_t> decrypted_key_span(
       reinterpret_cast<uint8_t*>(&decrypted_key[0]), decrypted_key_len);

Review Comment:
   Perhaps we could write directly into an appropriately-sized `SecureString`?



##########
cpp/src/parquet/encryption/secure_string.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "arrow/util/span.h"
+
+namespace parquet::encryption {
+/**
+ * A secure string that ensures the wrapped string is cleared from memory on
+ * deconstruction. This class can only be created from std::string that are 
securely
+ * erased after creation.
+ *
+ * Note: This class does not provide a constructor / assignment operator that 
copies a
+ * std::string because that would allow code to create a SecureString while 
accidentally
+ * not noticing the need to securely erasing the argument after invoking the 
constructor /
+ * calling the assignment operator.
+ */
+class SecureString {
+ public:
+  SecureString() noexcept = default;
+  SecureString(SecureString&&) noexcept;
+  SecureString(const SecureString&) noexcept = default;
+  explicit SecureString(std::string&&) noexcept;
+
+  SecureString& operator=(SecureString&&) noexcept;
+  SecureString& operator=(const SecureString&) noexcept;
+  SecureString& operator=(std::string&& secret) noexcept;
+
+  bool operator==(const SecureString&) const;
+  bool operator!=(const SecureString&) const;
+
+  ~SecureString() { dispose(); }
+
+  [[nodiscard]] bool empty() const;
+  [[nodiscard]] std::size_t size() const;
+  [[nodiscard]] std::size_t length() const;
+  [[nodiscard]] ::arrow::util::span<const uint8_t> as_span() const;
+
+  void dispose();
+
+  static void secure_clear(std::string&);
+  static void secure_clear(uint8_t* data, size_t size);

Review Comment:
   Nit: non-trivial methods should be `CamelCase`, so `Dispose` and 
`SecureClear`.



##########
cpp/src/parquet/encryption/secure_string.cc:
##########
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/secure_string.h"
+
+#include <openssl/crypto.h>
+#include <openssl/opensslv.h>
+#include <utility>
+#if defined(_WIN32)
+#  include <windows.h>
+#endif
+
+#include "arrow/util/span.h"
+#include "parquet/encryption/encryption.h"
+
+namespace parquet::encryption {
+SecureString::SecureString(SecureString&& secret) noexcept
+    : secret_(std::move(secret.secret_)) {}
+SecureString::SecureString(std::string&& secret) noexcept : 
secret_(std::move(secret)) {
+  secure_clear(secret);
+}
+
+SecureString& SecureString::operator=(SecureString&& secret) noexcept {
+  if (this == &secret) {
+    // self-assignment
+    return *this;
+  }
+  dispose();
+  secret_ = std::move(secret.secret_);
+  return *this;
+}
+SecureString& SecureString::operator=(const SecureString& secret) noexcept {
+  if (this == &secret) {
+    // self-assignment
+    return *this;
+  }
+  dispose();
+  secret_ = secret.secret_;
+  return *this;
+}
+SecureString& SecureString::operator=(std::string&& secret) noexcept {
+  dispose();
+  secret_ = std::move(secret);
+  secure_clear(secret);
+  return *this;
+}
+
+bool SecureString::operator==(const SecureString& other) const {
+  return secret_ == other.secret_;
+}
+
+bool SecureString::operator!=(const SecureString& other) const {
+  return secret_ != other.secret_;
+}
+
+bool SecureString::empty() const { return secret_.empty(); }
+std::size_t SecureString::size() const { return secret_.size(); }
+std::size_t SecureString::length() const { return secret_.length(); }
+::arrow::util::span<const uint8_t> SecureString::as_span() const {
+  return str2span(secret_);

Review Comment:
   `str2span` is a trivial function, we can probably copy its contents here.



##########
cpp/src/parquet/encryption/encryption.h:
##########
@@ -327,19 +322,19 @@ class PARQUET_EXPORT FileDecryptionProperties {
   }
 
  private:
-  std::string footer_key_;
+  encryption::SecureString footer_key_;
   std::string aad_prefix_;
   std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
 
-  const std::string empty_string_ = "";
+  const encryption::SecureString empty_string_ = encryption::SecureString("");
   ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
 
   std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
   bool check_plaintext_footer_integrity_;
   bool plaintext_files_allowed_;
 
   FileDecryptionProperties(
-      const std::string& footer_key,
+      const encryption::SecureString& footer_key,

Review Comment:
   Same comments as above.



##########
cpp/src/parquet/encryption/encryption.h:
##########
@@ -327,19 +322,19 @@ class PARQUET_EXPORT FileDecryptionProperties {
   }
 
  private:
-  std::string footer_key_;
+  encryption::SecureString footer_key_;
   std::string aad_prefix_;
   std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
 
-  const std::string empty_string_ = "";
+  const encryption::SecureString empty_string_ = encryption::SecureString("");

Review Comment:
   Is `empty_string_` useful at all? It looks rather dubious.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to