bkietz commented on code in PR #46017: URL: https://github.com/apache/arrow/pull/46017#discussion_r2029158514
########## cpp/src/parquet/encryption/file_key_wrapper.cc: ########## @@ -113,14 +113,15 @@ KeyEncryptionKey FileKeyWrapper::CreateKeyEncryptionKey( const std::string& master_key_id) { std::string kek_bytes(kKeyEncryptionKeyLength, '\0'); RandBytes(reinterpret_cast<uint8_t*>(kek_bytes.data()), kKeyEncryptionKeyLength); + SecureString secure_kek_bytes(std::move(kek_bytes)); Review Comment: ```suggestion SecureString secure_kek_bytes(std::string(kKeyEncryptionKeyLength, '\0')); RandBytes(secure_kek_bytes.as_span().data(), kKeyEncryptionKeyLength); ``` ########## cpp/src/parquet/encryption/read_configurations_test.cc: ########## @@ -103,9 +103,9 @@ class TestDecryptionConfiguration // This vector will hold various decryption configurations. std::vector<std::shared_ptr<parquet::FileDecryptionProperties>> vector_of_decryption_configurations_; - std::string kFooterEncryptionKey_ = std::string(kFooterEncryptionKey); - std::string kColumnEncryptionKey1_ = std::string(kColumnEncryptionKey1); - std::string kColumnEncryptionKey2_ = std::string(kColumnEncryptionKey2); + SecureString kFooterEncryptionKey_ = kFooterEncryptionKey; Review Comment: `const` seems warranted here ```suggestion const SecureString kFooterEncryptionKey_ = kFooterEncryptionKey; ``` ########## cpp/src/parquet/encryption/encryption.h: ########## @@ -46,28 +47,28 @@ using ColumnPathToEncryptionPropertiesMap = class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual std::string GetKey(const std::string& key_metadata) = 0; + virtual encryption::SecureString GetKey(const std::string& key_metadata) = 0; virtual ~DecryptionKeyRetriever() {} }; /// Simple integer key retriever class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever { public: - void PutKey(uint32_t key_id, const std::string& key); - std::string GetKey(const std::string& key_metadata) override; + void PutKey(uint32_t key_id, const encryption::SecureString& key); + encryption::SecureString GetKey(const std::string& 
key_metadata) override; Review Comment: Nit: it's not relevant to this PR, but this signature is surprising enough that I think it'd be worthwhile to inline this function and include a comment ```suggestion encryption::SecureString GetKey(const std::string& key_metadata) override { // key_metadata is string but for IntegerKeyIdRetriever it encodes // a native-endian 32 bit unsigned integer key_id uint32_t key_id; assert(key_metadata.size() == sizeof(key_id)); memcpy(&key_id, key_metadata.data(), sizeof(key_id)); return GetKey(key_id); } encryption::SecureString GetKey(uint32_t key_id) { return key_map_.at(key_id); } ``` ########## cpp/src/parquet/encryption/secure_string_test.cc: ########## @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> +#include <vector> + +#include "parquet/encryption/secure_string.h" + +namespace parquet::encryption::test { + +void assert_securely_cleared(const std::string& string) { + // the entire buffer of the string is filled with zeros + std::vector<char> zeros(string.capacity()); + ::arrow::util::span actual(string.data(), string.capacity()); + ::arrow::util::span expected(zeros.data(), zeros.size()); + ASSERT_EQ(actual, expected); + + // the string is empty + ASSERT_TRUE(string.empty()); +} + +TEST(TestSecureString, SecureClearString) { + // short string + { + std::string tiny("abc"); + SecureString::secure_clear(tiny); + assert_securely_cleared(tiny); + } + + // long string + { + std::string large(1024, 'x'); + large.resize(1024, 'y'); + SecureString::secure_clear(large); + assert_securely_cleared(large); + } + + // empty string + { + // this creates an empty string with some non-zero characters in the string buffer + // we test that all those characters are securely cleared + std::string empty("abcdef"); + empty.resize(0); + SecureString::secure_clear(empty); + assert_securely_cleared(empty); + } +} + +TEST(TestSecureString, Construct) { + // move constructing from a string securely clears that string + std::string string("hello world"); + SecureString secret_from_string(std::move(string)); + assert_securely_cleared(string); + ASSERT_FALSE(secret_from_string.empty()); + + // move constructing from a secure string securely clears that secure string + // Note: there is no way to test the secure clearing of the moved secure string + SecureString secret_from_move_secret(std::move(secret_from_string)); + ASSERT_TRUE(secret_from_string.empty()); + ASSERT_FALSE(secret_from_move_secret.empty()); + + // copy constructing from a secure string does not modify that secure string + SecureString secret_from_secret(secret_from_move_secret); + ASSERT_FALSE(secret_from_move_secret.empty()); + ASSERT_FALSE(secret_from_secret.empty()); + 
ASSERT_EQ(secret_from_secret, secret_from_move_secret); +} + +TEST(TestSecureString, Assign) { + // move assigning from a string securely clears that string + std::string string("hello world"); + SecureString secret_from_string; + secret_from_string = std::move(string); + assert_securely_cleared(string); + ASSERT_FALSE(secret_from_string.empty()); + + // move assigning from a secure string securely clears that secure string + // Note: there is no way to test the secure clearing of the moved secure string + SecureString secret_from_move_secret; + secret_from_move_secret = std::move(secret_from_string); + ASSERT_TRUE(secret_from_string.empty()); + ASSERT_FALSE(secret_from_move_secret.empty()); + + // assigning from a secure string does not modify that secure string + SecureString secret_from_secret; + secret_from_secret = secret_from_move_secret; + ASSERT_FALSE(secret_from_move_secret.empty()); + ASSERT_FALSE(secret_from_secret.empty()); + ASSERT_EQ(secret_from_secret, secret_from_move_secret); +} + Review Comment: The standardese "true" way would be to template SecureString on an allocator so that you could intercept destruction of the bytes it stores. That's entirely too much here; we could just add a debug assertion inside ~SecureString (maybe only enabled by an env var or `#if defined(ARROW_ASSERT_SECURE_STRING_ZEROED)`) ########## cpp/src/parquet/encryption/secure_string.h: ########## @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <string> + +#include "arrow/util/span.h" +#include "parquet/platform.h" + +namespace parquet::encryption { +/** + * A secure string that ensures the wrapped string is cleared from memory on + * deconstruction. This class can only be created from std::string that are securely + * erased after creation. + * + * Note: This class does not provide a constructor / assignment operator that copies a + * std::string because that would allow code to create a SecureString while accidentally + * not noticing the need to securely erasing the argument after invoking the constructor / + * calling the assignment operator. + */ +class PARQUET_EXPORT SecureString { + public: + SecureString() noexcept = default; + SecureString(SecureString&&) noexcept; + SecureString(const SecureString&) noexcept = default; Review Comment: ```suggestion SecureString(const SecureString&) = default; ``` This should not be `noexcept`; copying the string requires a new allocation which may throw ########## cpp/src/parquet/encryption/secure_string.h: ########## @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <string> + +#include "arrow/util/span.h" +#include "parquet/platform.h" + +namespace parquet::encryption { +/** + * A secure string that ensures the wrapped string is cleared from memory on + * deconstruction. This class can only be created from std::string that are securely + * erased after creation. + * + * Note: This class does not provide a constructor / assignment operator that copies a + * std::string because that would allow code to create a SecureString while accidentally + * not noticing the need to securely erasing the argument after invoking the constructor / + * calling the assignment operator. + */ +class PARQUET_EXPORT SecureString { + public: + SecureString() noexcept = default; + SecureString(SecureString&&) noexcept; + SecureString(const SecureString&) noexcept = default; + explicit SecureString(std::string&&) noexcept; Review Comment: ```suggestion explicit SecureString(std::string&&) noexcept; explicit SecureString(size_t size) : SecureString{std::string(size, '\0')} {} ``` ########## cpp/src/parquet/encryption/secure_string.h: ########## @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <string> + +#include "arrow/util/span.h" +#include "parquet/platform.h" + +namespace parquet::encryption { +/** + * A secure string that ensures the wrapped string is cleared from memory on + * deconstruction. This class can only be created from std::string that are securely + * erased after creation. + * + * Note: This class does not provide a constructor / assignment operator that copies a + * std::string because that would allow code to create a SecureString while accidentally + * not noticing the need to securely erasing the argument after invoking the constructor / + * calling the assignment operator. + */ +class PARQUET_EXPORT SecureString { + public: + SecureString() noexcept = default; + SecureString(SecureString&&) noexcept; + SecureString(const SecureString&) noexcept = default; + explicit SecureString(std::string&&) noexcept; + + SecureString& operator=(SecureString&&) noexcept; + SecureString& operator=(const SecureString&) noexcept; Review Comment: ```suggestion SecureString& operator=(const SecureString&); ``` (same for copy assignment) ########## cpp/src/parquet/encryption/secure_string.cc: ########## @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/encryption/secure_string.h" + +#include <openssl/crypto.h> +#include <openssl/opensslv.h> +#include <utility> +#if defined(_WIN32) +# include <windows.h> +#endif + +#include "arrow/util/span.h" +#include "parquet/encryption/encryption.h" + +namespace parquet::encryption { +SecureString::SecureString(SecureString&& secret) noexcept + : secret_(std::move(secret.secret_)) {} +SecureString::SecureString(std::string&& secret) noexcept : secret_(std::move(secret)) { + SecureClear(&secret); +} + +SecureString& SecureString::operator=(SecureString&& secret) noexcept { + if (this == &secret) { + // self-assignment + return *this; + } + Dispose(); + secret_ = std::move(secret.secret_); + return *this; +} +SecureString& SecureString::operator=(const SecureString& secret) noexcept { + if (this == &secret) { + // self-assignment + return *this; + } + Dispose(); + secret_ = secret.secret_; + return *this; +} +SecureString& SecureString::operator=(std::string&& secret) noexcept { + Dispose(); + secret_ = std::move(secret); + SecureClear(&secret); + return *this; +} + +bool SecureString::operator==(const SecureString& other) const { + return secret_ == other.secret_; +} + +bool SecureString::operator!=(const SecureString& other) const { + return secret_ != other.secret_; +} + +bool SecureString::empty() const { return secret_.empty(); } +std::size_t SecureString::size() const { 
return secret_.size(); } +std::size_t SecureString::length() const { return secret_.length(); } + +::arrow::util::span<uint8_t> SecureString::as_span() { + return {reinterpret_cast<uint8_t*>(secret_.data()), secret_.size()}; +} +::arrow::util::span<const uint8_t> SecureString::as_span() const { + return {reinterpret_cast<const uint8_t*>(secret_.data()), secret_.size()}; +} +std::string_view SecureString::as_view() const { + return {secret_.data(), secret_.size()}; +} + +void SecureString::Dispose() { SecureClear(&secret_); } +void SecureString::SecureClear(std::string* secret) { + secret->clear(); + SecureClear(reinterpret_cast<uint8_t*>(secret->data()), secret->capacity()); +} +inline void SecureString::SecureClear(uint8_t* data, size_t size) { + // Heavily borrowed from libb2's `secure_zero_memory` at + // https://github.com/BLAKE2/libb2/blob/master/src/blake2-impl.h +#if defined(_WIN32) + SecureZeroMemory(data, size); +#elif defined(__STDC_LIB_EXT1__) + // memset_s is meant to not be optimized away + memset_s(data, size, 0, size); +#elif defined(OPENSSL_VERSION_NUMBER) && OPENSSL_VERSION_NUMBER >= 0x30000000 + OPENSSL_cleanse(data, size); +#elif defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25)) + // glibc 2.25+ has explicit_bzero + explicit_bzero(data, size); +#else + // Try to ensure that a true library call to memset() will be generated + // by the compiler. + static const volatile auto memset_v = &memset; + memset_v(data, 0, size); + __asm__ __volatile__("" ::"r"(data) : "memory"); Review Comment: It seems questionable to me that the ultimate fallback should not be a simple loop ```suggestion for (size_t i = 0; i < size; ++i) { data[i] = 0; } ``` I don't know how cross platform this `__asm__` block is. 
Also, since this code is borrowed from another project, it needs to be mentioned in LICENSE.txt; for an example, see https://github.com/apache/arrow/blob/7df396eec4c620ddc4a4db780ea0d5ffa4cf685b/LICENSE.txt#L1541-L1543 ########## cpp/src/parquet/encryption/internal_file_decryptor.cc: ########## @@ -46,7 +46,7 @@ int32_t Decryptor::CiphertextLength(int32_t plaintext_len) const { int32_t Decryptor::Decrypt(::arrow::util::span<const uint8_t> ciphertext, ::arrow::util::span<uint8_t> plaintext) { - return aes_decryptor_->Decrypt(ciphertext, str2span(key_), str2span(aad_), plaintext); + return aes_decryptor_->Decrypt(ciphertext, key_.as_span(), str2span(aad_), plaintext); Review Comment: Since `str2span` is no longer used, we could delete it. ########## cpp/src/parquet/encryption/encryption.h: ########## @@ -327,19 +322,19 @@ class PARQUET_EXPORT FileDecryptionProperties { } private: - std::string footer_key_; + encryption::SecureString footer_key_; std::string aad_prefix_; std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_; - const std::string empty_string_ = ""; + const encryption::SecureString empty_string_ = encryption::SecureString(""); Review Comment: This looks like a holdover from the first draft of encryption; IIRC column_key() returned references to strings at that time (so `return "";` would fail to compile). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org