[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r579596186 ## File path: cpp/src/parquet/encryption/test_encryption_util.cc ## @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#include + +#include "parquet/encryption/test_encryption_util.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/properties.h" + +using FileClass = ::arrow::io::FileOutputStream; + +namespace parquet { +namespace encryption { +namespace test { + +FileEncryptor::FileEncryptor() { schema_ = SetupEncryptionSchema(); } + +std::shared_ptr FileEncryptor::SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED Review comment: I removed them. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r579596103 ## File path: cpp/src/parquet/encryption/test_encryption_util.cc ## @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#include + +#include "parquet/encryption/test_encryption_util.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/properties.h" + +using FileClass = ::arrow::io::FileOutputStream; + +namespace parquet { +namespace encryption { +namespace test { + +FileEncryptor::FileEncryptor() { schema_ = SetupEncryptionSchema(); } + +std::shared_ptr FileEncryptor::SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make(kBooleanFieldName, Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make(kInt32FieldName, Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make(kInt64FieldName, Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kInt96FieldName, Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kFloatFieldName, Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kDoubleFieldName, Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make(kByteArrayFieldName, Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make(kFixedLenByteArrayFieldName, Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} + +void FileEncryptor::EncryptFile( +std::string file, +std::shared_ptr encryption_configurations) { + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_ASSIGN_OR_THROW(auto out_file, FileClass::Open(file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + for (int r = 0; r < num_rgs; r++) { +bool buffered_mode = r % 2 == 0; +auto row_group_writer = buffered_mode ? file_writer->AppendBufferedRowGroup() + : file_writer->AppendRowGroup(); + +int column_index = 0; +// Captures i by reference; increments it by one +auto get_next_column = [&]() { + return buffered_mode ? row_group_writer->column(column_index++) + : row_group_writer->NextColumn(); +}; + +// Write the Bool column +parquet::BoolWriter* bool_writer = +
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r579596080 ## File path: cpp/src/parquet/encryption/test_encryption_util.cc ## @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#include + +#include "parquet/encryption/test_encryption_util.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/properties.h" + +using FileClass = ::arrow::io::FileOutputStream; + +namespace parquet { +namespace encryption { +namespace test { + +FileEncryptor::FileEncryptor() { schema_ = SetupEncryptionSchema(); } + +std::shared_ptr FileEncryptor::SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make(kBooleanFieldName, Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make(kInt32FieldName, Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make(kInt64FieldName, Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kInt96FieldName, Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kFloatFieldName, Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kDoubleFieldName, Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make(kByteArrayFieldName, Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make(kFixedLenByteArrayFieldName, Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} + +void FileEncryptor::EncryptFile( +std::string file, +std::shared_ptr encryption_configurations) { + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_ASSIGN_OR_THROW(auto out_file, FileClass::Open(file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + for (int r = 0; r < num_rgs; r++) { +bool buffered_mode = r % 2 == 0; +auto row_group_writer = buffered_mode ? file_writer->AppendBufferedRowGroup() + : file_writer->AppendRowGroup(); + +int column_index = 0; +// Captures i by reference; increments it by one +auto get_next_column = [&]() { + return buffered_mode ? row_group_writer->column(column_index++) + : row_group_writer->NextColumn(); +}; + +// Write the Bool column +parquet::BoolWriter* bool_writer = +
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r579595188 ## File path: cpp/src/parquet/encryption/test_encryption_util.cc ## @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#include + +#include "parquet/encryption/test_encryption_util.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/properties.h" + +using FileClass = ::arrow::io::FileOutputStream; + +namespace parquet { +namespace encryption { +namespace test { + +FileEncryptor::FileEncryptor() { schema_ = SetupEncryptionSchema(); } + +std::shared_ptr FileEncryptor::SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make(kBooleanFieldName, Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make(kInt32FieldName, Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make(kInt64FieldName, Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kInt96FieldName, Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kFloatFieldName, Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kDoubleFieldName, Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make(kByteArrayFieldName, Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make(kFixedLenByteArrayFieldName, Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); Review comment: These code are not new, I only moved it to a new file inside `encryption/` folder, and reuse it for key management test. These input/output files of this test has been used in interop test with java, so the scheme should be kept unchanged. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r579594227 ## File path: cpp/src/parquet/encryption/properties_driven_crypto_factory.h ## @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/file_key_wrapper.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = +ParquetCipher::AES_GCM_V1; +static constexpr bool kDefaultPlaintextFooter = false; +static constexpr bool kDefaultDoubleWrapping = true; +static constexpr uint64_t kDefaultCacheLifetimeSeconds = 600; // 10 minutes +static constexpr bool kDefaultInternalKeyMaterial = true; +static constexpr bool kDefaultUniformEncryption = false; +static constexpr int32_t kDefaultDataKeyLengthBits = 128; + +class PARQUET_EXPORT EncryptionConfiguration { + public: + class PARQUET_EXPORT Builder { + public: +/// footer_key: ID of the master key for footer encryption/signing +explicit Builder(const std::string& footer_key) +: footer_key_(footer_key), + encryption_algorithm_(kDefaultEncryptionAlgorithm), + plaintext_footer_(kDefaultPlaintextFooter), + double_wrapping_(kDefaultDoubleWrapping), + cache_lifetime_seconds_(kDefaultCacheLifetimeSeconds), + internal_key_material_(kDefaultInternalKeyMaterial), + uniform_encryption_(kDefaultUniformEncryption), + data_key_length_bits_(kDefaultDataKeyLengthBits) {} + +/// List of columns to encrypt, with master key IDs (see HIVE-21848). +/// Format: "masterKeyID:colName,colName;masterKeyID:colName..." +/// Either +/// column_keys(const std::string&) +/// or +/// uniform_encryption() +/// must be called. If none are called, or if both are called, an exception will be +/// thrown. +Builder* column_keys(const std::string& column_keys); + +/// encrypt footer and all columns with the same encryption key. +Builder* uniform_encryption(); + +/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1". +Builder* encryption_algorithm(ParquetCipher::type algo); + +/// Write files with plaintext footer. +/// The default is false - files are written with encrypted footer. +Builder* plaintext_footer(bool plaintext_footer); + +/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key +/// encryption keys (KEKs), which in turn are encrypted with master keys. +/// The default is true. If set to false, use single wrapping - where DEKs are +/// encrypted directly with master keys. +Builder* double_wrapping(bool double_wrapping); + +/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client +/// objects). +/// The default is 600 (10 minutes). +Builder* cache_lifetime_seconds(uint64_t cache_lifetime_seconds); + +/// Store key material inside Parquet file footers; this mode doesn’t produce +/// additional files. By default, true. If set to false, key material is stored in +/// separate files in the same folder, which enables key rotation for immutable +/// Parquet files. +Builder* internal_key_material(bool internal_key_material); + +/// Length of data encryption keys (DEKs), randomly generated by parquet key +/// management tools. Can be 128, 192 or 256 bits. +/// The default is 128 bits. +Builder* data_key_length_bits(int32_t data_key_length_bits); + +std::shared_ptr build(); + + private: +std::string footer_key_; +std::string column_keys_; +ParquetCipher::type encryption_algorithm_; +bool plaintext_footer_; +bool double_wrapping_; +uint64_t cache_lifetime_seconds_; +bool internal_key_material_; +bool uniform_encryption_; +int32_t data_key_length_bits_; + }; + + const std::string& footer_key() const { return footer_key_; } + const std::string& column_keys()
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498709142 ## File path: cpp/src/parquet/encryption/remote_kms_client.cc ## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/object_parser.h" +#include "arrow/json/object_writer.h" + +#include "parquet/encryption/key_toolkit_internal.h" +#include "parquet/encryption/remote_kms_client.h" +#include "parquet/exception.h" + +using arrow::json::ObjectParser; +using arrow::json::ObjectWriter; + +namespace parquet { +namespace encryption { + +constexpr const char RemoteKmsClient::kLocalWrapNoKeyVersion[]; + +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapKeyVersionField[]; +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapEncryptedKeyField[]; + +RemoteKmsClient::LocalKeyWrap::LocalKeyWrap(const std::string& master_key_version, +const std::string& encrypted_encoded_key) +: encrypted_encoded_key_(encrypted_encoded_key), + master_key_version_(master_key_version) {} + +std::string RemoteKmsClient::LocalKeyWrap::CreateSerialized( +const std::string& encrypted_encoded_key) { + ObjectWriter json_writer; + + json_writer.SetString(kLocalWrapKeyVersionField, kLocalWrapNoKeyVersion); + json_writer.SetString(kLocalWrapEncryptedKeyField, encrypted_encoded_key); + + return json_writer.Serialize(); +} + +RemoteKmsClient::LocalKeyWrap RemoteKmsClient::LocalKeyWrap::Parse( +const std::string& wrapped_key) { + ObjectParser json_parser; + if (!json_parser.Parse(wrapped_key)) { +throw ParquetException("Failed to parse local key wrap json " + wrapped_key); + } + std::string master_key_version; + PARQUET_ASSIGN_OR_THROW(master_key_version, + json_parser.GetString(kLocalWrapKeyVersionField)); + + std::string encrypted_encoded_key; + PARQUET_ASSIGN_OR_THROW(encrypted_encoded_key, + json_parser.GetString(kLocalWrapEncryptedKeyField)); + + return RemoteKmsClient::LocalKeyWrap(master_key_version, encrypted_encoded_key); +} + +void RemoteKmsClient::Initialize(const KmsConnectionConfig& kms_connection_config, + bool is_wrap_locally) { + kms_connection_config_ = kms_connection_config; + is_wrap_locally_ = is_wrap_locally; + if (is_wrap_locally_) { +master_key_cache_.Clear(); + } + + is_default_token_ = + kms_connection_config_.key_access_token() == KmsClient::kKeyAccessTokenDefault; + + InitializeInternal(); +} + +std::string RemoteKmsClient::WrapKey(const std::string& key_bytes, + const std::string& master_key_identifier) { + if (is_wrap_locally_) { Review comment: Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498685819 ## File path: cpp/src/parquet/encryption/remote_kms_client.cc ## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/object_parser.h" +#include "arrow/json/object_writer.h" + +#include "parquet/encryption/key_toolkit_internal.h" +#include "parquet/encryption/remote_kms_client.h" +#include "parquet/exception.h" + +using arrow::json::ObjectParser; +using arrow::json::ObjectWriter; + +namespace parquet { +namespace encryption { + +constexpr const char RemoteKmsClient::kLocalWrapNoKeyVersion[]; + +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapKeyVersionField[]; +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapEncryptedKeyField[]; + +RemoteKmsClient::LocalKeyWrap::LocalKeyWrap(const std::string& master_key_version, +const std::string& encrypted_encoded_key) +: encrypted_encoded_key_(encrypted_encoded_key), + master_key_version_(master_key_version) {} + +std::string RemoteKmsClient::LocalKeyWrap::CreateSerialized( +const std::string& encrypted_encoded_key) { + ObjectWriter json_writer; + + json_writer.SetString(kLocalWrapKeyVersionField, kLocalWrapNoKeyVersion); + json_writer.SetString(kLocalWrapEncryptedKeyField, encrypted_encoded_key); + + return json_writer.Serialize(); +} + +RemoteKmsClient::LocalKeyWrap RemoteKmsClient::LocalKeyWrap::Parse( +const std::string& wrapped_key) { + ObjectParser json_parser; + if (!json_parser.Parse(wrapped_key)) { +throw ParquetException("Failed to parse local key wrap json " + wrapped_key); + } + std::string master_key_version; + PARQUET_ASSIGN_OR_THROW(master_key_version, + json_parser.GetString(kLocalWrapKeyVersionField)); + + std::string encrypted_encoded_key; + PARQUET_ASSIGN_OR_THROW(encrypted_encoded_key, + json_parser.GetString(kLocalWrapEncryptedKeyField)); + + return RemoteKmsClient::LocalKeyWrap(master_key_version, encrypted_encoded_key); +} + +void RemoteKmsClient::Initialize(const KmsConnectionConfig& kms_connection_config, + bool is_wrap_locally) { + kms_connection_config_ = kms_connection_config; + is_wrap_locally_ = is_wrap_locally; + if (is_wrap_locally_) { +master_key_cache_.Clear(); + } + + is_default_token_ = + kms_connection_config_.key_access_token() == KmsClient::kKeyAccessTokenDefault; + + InitializeInternal(); +} + +std::string RemoteKmsClient::WrapKey(const std::string& key_bytes, + const std::string& master_key_identifier) { + if (is_wrap_locally_) { Review comment: Right @pitrou. I was over-thought. @ggershinsky Actually, as a C++ developer, when looking at `RemoteKmsClient` class implementation: * If I'm going to write in-server wrapping kms client, I can see that `RemoteKmsClient` doesn't help much (and also has a lot of local wrapping stuff), I may choose to write `MyRemoteKmsClient` which inherits from `KmsClient`, not `RemoteKmsClient`. There are 2 advantages I can see: reducing one level of inheritance and a lighter-weight class. * If I'm going to write local wrapping kms client, I still need to implement `WrapKeyInServer` and `UnwrapKeyInServer` (even I can let them empty methods). As you said to me, users are going to use either local wrapping or in-server wrapping in their system, not both at the same time. So current implementation of `RemoteKmsClient` doesn't really make sense to me. Will you reconsider at this point? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498676039 ## File path: cpp/src/parquet/encryption/test_encryption_util.h ## @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/status.h" +#include "arrow/testing/util.h" +#include "arrow/util/io_util.h" + +#include "parquet/column_page.h" +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/encoding.h" +#include "parquet/encryption/encryption.h" +#include "parquet/platform.h" +#include "parquet/test_util.h" + +namespace parquet { +namespace encryption { +namespace test { + +using arrow::internal::TemporaryDir; + +using parquet::ConvertedType; +using parquet::Repetition; +using parquet::Type; +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; + +constexpr int kFixedLength = 10; + +const char kFooterEncryptionKey[] = "0123456789012345"; // 128bit/16 +const char kColumnEncryptionKey1[] = "1234567890123450"; +const char kColumnEncryptionKey2[] = "1234567890123451"; +const char kFileName[] = "tester"; + +inline std::string data_file(const char* file) { + std::string dir_string(parquet::test::get_data_dir()); + std::stringstream ss; + ss << dir_string << "/" << file; + return ss.str(); +} + +// A temporary directory that contains the encrypted files generated in the tests. +extern std::unique_ptr temp_dir; + +inline arrow::Result> temp_data_dir() { + arrow::Result> dir; + ARROW_ASSIGN_OR_RAISE(dir, TemporaryDir::Make("parquet-encryption-test-")); + return dir; +} + +static constexpr const char kDoubleFieldName[] = "double_field"; +static constexpr const char kFloatFieldName[] = "float_field"; +static constexpr const char kBooleanFieldName[] = "boolean_field"; +static constexpr const char kInt32FieldName[] = "int32_field"; +static constexpr const char kInt64FieldName[] = "int64_field"; +static constexpr const char kInt96FieldName[] = "int96_field"; +static constexpr const char kByteArrayFieldName[] = "ba_field"; +static constexpr const char kFixedLenByteArrayFieldName[] = "flba_field"; + +const char kFooterMasterKey[] = "0123456789112345"; Review comment: No special reason. I changed all of them to `const char`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498334175 ## File path: cpp/src/parquet/encryption/remote_kms_client.cc ## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/object_parser.h" +#include "arrow/json/object_writer.h" + +#include "parquet/encryption/key_toolkit_internal.h" +#include "parquet/encryption/remote_kms_client.h" +#include "parquet/exception.h" + +using arrow::json::ObjectParser; +using arrow::json::ObjectWriter; + +namespace parquet { +namespace encryption { + +constexpr const char RemoteKmsClient::kLocalWrapNoKeyVersion[]; + +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapKeyVersionField[]; +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapEncryptedKeyField[]; + +RemoteKmsClient::LocalKeyWrap::LocalKeyWrap(const std::string& master_key_version, +const std::string& encrypted_encoded_key) +: encrypted_encoded_key_(encrypted_encoded_key), + master_key_version_(master_key_version) {} + +std::string RemoteKmsClient::LocalKeyWrap::CreateSerialized( +const std::string& encrypted_encoded_key) { + ObjectWriter json_writer; + + json_writer.SetString(kLocalWrapKeyVersionField, kLocalWrapNoKeyVersion); + json_writer.SetString(kLocalWrapEncryptedKeyField, encrypted_encoded_key); + + return json_writer.Serialize(); +} + +RemoteKmsClient::LocalKeyWrap RemoteKmsClient::LocalKeyWrap::Parse( +const std::string& wrapped_key) { + ObjectParser json_parser; + if (!json_parser.Parse(wrapped_key)) { +throw ParquetException("Failed to parse local key wrap json " + wrapped_key); + } + std::string master_key_version; + PARQUET_ASSIGN_OR_THROW(master_key_version, + json_parser.GetString(kLocalWrapKeyVersionField)); + + std::string encrypted_encoded_key; + PARQUET_ASSIGN_OR_THROW(encrypted_encoded_key, + json_parser.GetString(kLocalWrapEncryptedKeyField)); + + return RemoteKmsClient::LocalKeyWrap(master_key_version, encrypted_encoded_key); +} + +void RemoteKmsClient::Initialize(const KmsConnectionConfig& kms_connection_config, + bool is_wrap_locally) { + kms_connection_config_ = kms_connection_config; + is_wrap_locally_ = is_wrap_locally; + if (is_wrap_locally_) { +master_key_cache_.Clear(); + } + + is_default_token_ = + kms_connection_config_.key_access_token() == KmsClient::kKeyAccessTokenDefault; + + InitializeInternal(); +} + +std::string RemoteKmsClient::WrapKey(const std::string& key_bytes, + const std::string& master_key_identifier) { + if (is_wrap_locally_) { Review comment: What about keeping `RemoteKmsClient` API the same, but write more 2 internal classes, like below: ``` namespace internal { class LocalWrappingRemoteKmsClient {}; class InServerWrappingRemoteKmsClient {}; } class RemoteKmsClient { private: bool wrap_locally_; arrow::util::variant internal_kms_client_; } ``` Any idea? @pitrou @ggershinsky This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498326446 ## File path: cpp/src/arrow/util/concurrent_map.h ## @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/util/mutex.h" + +namespace arrow { +namespace util { + +template +class ConcurrentMap { Review comment: @pitrou Yes, in this case, only the map is accessed by multiple threads. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498154775 ## File path: cpp/src/parquet/encryption/properties_driven_crypto_factory.h ## @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/file_key_wrapper.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = +ParquetCipher::AES_GCM_V1; +static constexpr bool kDefaultPlaintextFooter = false; +static constexpr bool kDefaultDoubleWrapping = true; +static constexpr uint64_t kDefaultCacheLifetimeSeconds = 600; // 10 minutes +static constexpr bool kDefaultInternalKeyMaterial = true; +static constexpr bool kDefaultUniformEncryption = false; +static constexpr int32_t kDefaultDataKeyLengthBits = 128; + +class PARQUET_EXPORT EncryptionConfiguration { + public: + class PARQUET_EXPORT Builder { + public: +/// footer_key: ID of the master key for footer encryption/signing +explicit Builder(const std::string& footer_key) +: footer_key_(footer_key), + encryption_algorithm_(kDefaultEncryptionAlgorithm), + plaintext_footer_(kDefaultPlaintextFooter), + double_wrapping_(kDefaultDoubleWrapping), + cache_lifetime_seconds_(kDefaultCacheLifetimeSeconds), + internal_key_material_(kDefaultInternalKeyMaterial), + uniform_encryption_(kDefaultUniformEncryption), + data_key_length_bits_(kDefaultDataKeyLengthBits) {} + +/// List of columns to encrypt, with master key IDs (see HIVE-21848). +/// Format: "masterKeyID:colName,colName;masterKeyID:colName..." +/// Either +/// column_keys(const std::string&) +/// or +/// uniform_encryption() +/// must be called. If none are called, or if both are called, an exception will be +/// thrown. +Builder* column_keys(const std::string& column_keys); + +/// encrypt footer and all columns with the same encryption key. +Builder* uniform_encryption(); + +/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1". +Builder* encryption_algorithm(ParquetCipher::type algo); + +/// Write files with plaintext footer. +/// The default is false - files are written with encrypted footer. +Builder* plaintext_footer(bool plaintext_footer); + +/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key +/// encryption keys (KEKs), which in turn are encrypted with master keys. +/// The default is true. If set to false, use single wrapping - where DEKs are +/// encrypted directly with master keys. +Builder* double_wrapping(bool double_wrapping); + +/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client +/// objects). +/// The default is 600 (10 minutes). +Builder* cache_lifetime_seconds(uint64_t cache_lifetime_seconds); + +/// Store key material inside Parquet file footers; this mode doesn’t produce +/// additional files. By default, true. If set to false, key material is stored in +/// separate files in the same folder, which enables key rotation for immutable +/// Parquet files. +Builder* internal_key_material(bool internal_key_material); + +/// Length of data encryption keys (DEKs), randomly generated by parquet key +/// management tools. Can be 128, 192 or 256 bits. +/// The default is 128 bits. +Builder* data_key_length_bits(int32_t data_key_length_bits); + +std::shared_ptr build(); + + private: +std::string footer_key_; +std::string column_keys_; +ParquetCipher::type encryption_algorithm_; +bool plaintext_footer_; +bool double_wrapping_; +uint64_t cache_lifetime_seconds_; +bool internal_key_material_; +bool uniform_encryption_; +int32_t data_key_length_bits_; + }; + + const std::string& footer_key() const { return footer_key_; } + const std::string& column_keys()
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498153010 ## File path: cpp/src/parquet/encryption/key_metadata.h ## @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/variant.h" + +#include "parquet/encryption/key_material.h" +#include "parquet/exception.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// Parquet encryption specification defines "key metadata" as an arbitrary byte array, +// generated by file writers for each encryption key, and passed to the low level API for +// storage in the file footer. The "key metadata" field is made available to file readers +// to enable recovery of the key. This interface can be utilized for implementation +// of any key management scheme. +// +// The keytools package (PARQUET-1373) implements one approach, of many possible, to key +// management and to generation of the "key metadata" fields. This approach, based on the +// "envelope encryption" pattern, allows integration with KMS servers. It keeps the actual +// material, required to recover a key, in a "key material" object (see the KeyMaterial +// class for details). This class is implemented to support version 1 of the parquet key +// management tools specification. +// +// KeyMetadata writes (and reads) the "key metadata" field as a flat json object, +// with the following fields: +// 1. "keyMaterialType" - a String, with the type of key material. +// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside the +// "key metadata" field. If false, "key material" is kept externally (outside Parquet +// files) - in this case, "key metadata" keeps a reference to the external "key material". +// 3. "keyReference" - a String, with the reference to the external "key material". +// Written only if internalStorage is false. +// +// If internalStorage is true, "key material" is a part of "key metadata", and the json +// keeps additional fields, described in the KeyMaterial class. +class PARQUET_EXPORT KeyMetadata { + public: + static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage"; + static constexpr const char kKeyReferenceField[] = "keyReference"; + + static KeyMetadata Parse(const std::string& key_metadata_bytes); Review comment: yes, `key_metadata_bytes` is expected to be a serialized json object. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498151648 ## File path: cpp/src/parquet/encryption/key_toolkit_internal.h ## @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace parquet { +namespace encryption { +namespace internal { + +// "data encryption key" and "master key identifier" are paired together as output when +// parsing from "key material" +class KeyWithMasterId { + public: + KeyWithMasterId(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + const std::string key_bytes_; + const std::string master_id_; +}; + +/// Encrypts "key" with "master_key", using AES-GCM and the "aad" Review comment: Sure @ggershinsky. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498147171 ## File path: cpp/src/parquet/encryption/kms_client.h ## @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// This class wraps the key access token of a KMS server. If your token changes over time, +// you should keep the reference to the KeyAccessToken object and call Refresh() method +// every time you have a new token. +class PARQUET_EXPORT KeyAccessToken { + public: + KeyAccessToken() = default; + + explicit KeyAccessToken(const std::string value) : value_(value) {} + + void Refresh(const std::string& new_value) { value_ = new_value; } + + const std::string& value() const { return value_; } + + void SetDefaultIfEmpty(); Review comment: After checking again, in both case of wrapping and unwrapping, key access token is always set to "DEFAULT" if empty, so I will remove this method and set it from `KmsConnectionConfig` constructor. Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498141296 ## File path: cpp/src/parquet/encryption/kms_client.h ## @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// This class wraps the key access token of a KMS server. If your token changes over time, +// you should keep the reference to the KeyAccessToken object and call Refresh() method +// every time you have a new token. +class PARQUET_EXPORT KeyAccessToken { + public: + KeyAccessToken() = default; + + explicit KeyAccessToken(const std::string value) : value_(value) {} + + void Refresh(const std::string& new_value) { value_ = new_value; } + + const std::string& value() const { return value_; } + + void SetDefaultIfEmpty(); + + private: + std::string value_; +}; + +struct PARQUET_EXPORT KmsConnectionConfig { + std::string kms_instance_id; + std::string kms_instance_url; + std::shared_ptr refreshable_key_access_token; Review comment: After parsing this `KmsConnectionConfig` object into `PropertiesDrivenCryptoFactory`, the token can be changed in the future, so we need `shared_ptr` to change its value later. `KeyAccessToken` can be accessed by multiple threads. I will add a mutex. Sorry, I forgot it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498102558 ## File path: cpp/src/parquet/encryption/properties_driven_crypto_factory.h ## @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/file_key_wrapper.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = +ParquetCipher::AES_GCM_V1; +static constexpr bool kDefaultPlaintextFooter = false; +static constexpr bool kDefaultDoubleWrapping = true; +static constexpr uint64_t kDefaultCacheLifetimeSeconds = 600; // 10 minutes +static constexpr bool kDefaultInternalKeyMaterial = true; +static constexpr bool kDefaultUniformEncryption = false; +static constexpr int32_t kDefaultDataKeyLengthBits = 128; + +class PARQUET_EXPORT EncryptionConfiguration { + public: + class PARQUET_EXPORT Builder { + public: +/// footer_key: ID of the master key for footer encryption/signing +explicit Builder(const std::string& footer_key) +: footer_key_(footer_key), + encryption_algorithm_(kDefaultEncryptionAlgorithm), + plaintext_footer_(kDefaultPlaintextFooter), + double_wrapping_(kDefaultDoubleWrapping), + cache_lifetime_seconds_(kDefaultCacheLifetimeSeconds), + internal_key_material_(kDefaultInternalKeyMaterial), + uniform_encryption_(kDefaultUniformEncryption), + data_key_length_bits_(kDefaultDataKeyLengthBits) {} + +/// List of columns to encrypt, with master key IDs (see HIVE-21848). +/// Format: "masterKeyID:colName,colName;masterKeyID:colName..." +/// Either +/// column_keys(const std::string&) +/// or +/// uniform_encryption() +/// must be called. If none are called, or if both are called, an exception will be +/// thrown. +Builder* column_keys(const std::string& column_keys); + +/// encrypt footer and all columns with the same encryption key. +Builder* uniform_encryption(); + +/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1". +Builder* encryption_algorithm(ParquetCipher::type algo); + +/// Write files with plaintext footer. +/// The default is false - files are written with encrypted footer. +Builder* plaintext_footer(bool plaintext_footer); + +/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key +/// encryption keys (KEKs), which in turn are encrypted with master keys. +/// The default is true. If set to false, use single wrapping - where DEKs are +/// encrypted directly with master keys. +Builder* double_wrapping(bool double_wrapping); + +/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client +/// objects). +/// The default is 600 (10 minutes). +Builder* cache_lifetime_seconds(uint64_t cache_lifetime_seconds); + +/// Store key material inside Parquet file footers; this mode doesn’t produce +/// additional files. By default, true. If set to false, key material is stored in +/// separate files in the same folder, which enables key rotation for immutable +/// Parquet files. +Builder* internal_key_material(bool internal_key_material); + +/// Length of data encryption keys (DEKs), randomly generated by parquet key +/// management tools. Can be 128, 192 or 256 bits. +/// The default is 128 bits. +Builder* data_key_length_bits(int32_t data_key_length_bits); + +std::shared_ptr build(); + + private: +std::string footer_key_; +std::string column_keys_; +ParquetCipher::type encryption_algorithm_; +bool plaintext_footer_; +bool double_wrapping_; +uint64_t cache_lifetime_seconds_; +bool internal_key_material_; +bool uniform_encryption_; +int32_t data_key_length_bits_; + }; + + const std::string& footer_key() const { return footer_key_; } + const std::string& column_keys()
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498008512 ## File path: cpp/src/parquet/encryption/test_encryption_util.cc ## @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#include + +#include "parquet/encryption/test_encryption_util.h" +#include "parquet/file_reader.h" +#include "parquet/file_writer.h" +#include "parquet/properties.h" + +using FileClass = ::arrow::io::FileOutputStream; + +namespace parquet { +namespace encryption { +namespace test { + +FileEncryptor::FileEncryptor() { schema_ = SetupEncryptionSchema(); } + +std::shared_ptr FileEncryptor::SetupEncryptionSchema() { + parquet::schema::NodeVector fields; + // Create a primitive node named 'boolean_field' with type:BOOLEAN, + // repetition:REQUIRED + fields.push_back(PrimitiveNode::Make(kBooleanFieldName, Repetition::REQUIRED, + Type::BOOLEAN, ConvertedType::NONE)); + + // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED, + // logical type:TIME_MILLIS + fields.push_back(PrimitiveNode::Make(kInt32FieldName, Repetition::REQUIRED, Type::INT32, + ConvertedType::TIME_MILLIS)); + + // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED + fields.push_back(PrimitiveNode::Make(kInt64FieldName, Repetition::REPEATED, Type::INT64, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kInt96FieldName, Repetition::REQUIRED, Type::INT96, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kFloatFieldName, Repetition::REQUIRED, Type::FLOAT, + ConvertedType::NONE)); + + fields.push_back(PrimitiveNode::Make(kDoubleFieldName, Repetition::REQUIRED, + Type::DOUBLE, ConvertedType::NONE)); + + // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL + fields.push_back(PrimitiveNode::Make(kByteArrayFieldName, Repetition::OPTIONAL, + Type::BYTE_ARRAY, ConvertedType::NONE)); + + // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY, + // repetition:REQUIRED, field_length = kFixedLength + fields.push_back(PrimitiveNode::Make(kFixedLenByteArrayFieldName, Repetition::REQUIRED, + Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, + kFixedLength)); + + // Create a GroupNode named 'schema' using the primitive nodes defined above + // This GroupNode is the root node of the schema tree + return std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); +} + +void FileEncryptor::EncryptFile( +std::string file, +std::shared_ptr encryption_configurations) { + WriterProperties::Builder prop_builder; + prop_builder.compression(parquet::Compression::SNAPPY); + prop_builder.encryption(encryption_configurations); + std::shared_ptr writer_properties = prop_builder.build(); + + PARQUET_ASSIGN_OR_THROW(auto out_file, FileClass::Open(file)); + // Create a ParquetFileWriter instance + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema_, writer_properties); + + for (int r = 0; r < num_rgs; r++) { +bool buffered_mode = r % 2 == 0; +auto row_group_writer = buffered_mode ? file_writer->AppendBufferedRowGroup() + : file_writer->AppendRowGroup(); + +int column_index = 0; +// Captures i by reference; increments it by one +auto get_next_column = [&]() { + return buffered_mode ? row_group_writer->column(column_index++) + : row_group_writer->NextColumn(); +}; + +// Write the Bool column +parquet::BoolWriter* bool_writer = +
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r498005234 ## File path: cpp/src/parquet/encryption/properties_driven_crypto_factory.h ## @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/file_key_wrapper.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = +ParquetCipher::AES_GCM_V1; +static constexpr bool kDefaultPlaintextFooter = false; +static constexpr bool kDefaultDoubleWrapping = true; +static constexpr uint64_t kDefaultCacheLifetimeSeconds = 600; // 10 minutes +static constexpr bool kDefaultInternalKeyMaterial = true; +static constexpr bool kDefaultUniformEncryption = false; +static constexpr int32_t kDefaultDataKeyLengthBits = 128; + +class PARQUET_EXPORT EncryptionConfiguration { + public: + class PARQUET_EXPORT Builder { + public: +/// footer_key: ID of the master key for footer encryption/signing +explicit Builder(const std::string& footer_key) +: footer_key_(footer_key), + encryption_algorithm_(kDefaultEncryptionAlgorithm), + plaintext_footer_(kDefaultPlaintextFooter), + double_wrapping_(kDefaultDoubleWrapping), + cache_lifetime_seconds_(kDefaultCacheLifetimeSeconds), + internal_key_material_(kDefaultInternalKeyMaterial), + uniform_encryption_(kDefaultUniformEncryption), + data_key_length_bits_(kDefaultDataKeyLengthBits) {} + +/// List of columns to encrypt, with master key IDs (see HIVE-21848). +/// Format: "masterKeyID:colName,colName;masterKeyID:colName..." +/// Either +/// column_keys(const std::string&) +/// or +/// uniform_encryption() +/// must be called. If none are called, or if both are called, an exception will be +/// thrown. +Builder* column_keys(const std::string& column_keys); + +/// encrypt footer and all columns with the same encryption key. +Builder* uniform_encryption(); + +/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1". +Builder* encryption_algorithm(ParquetCipher::type algo); + +/// Write files with plaintext footer. +/// The default is false - files are written with encrypted footer. +Builder* plaintext_footer(bool plaintext_footer); + +/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key +/// encryption keys (KEKs), which in turn are encrypted with master keys. +/// The default is true. If set to false, use single wrapping - where DEKs are +/// encrypted directly with master keys. +Builder* double_wrapping(bool double_wrapping); + +/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client +/// objects). +/// The default is 600 (10 minutes). +Builder* cache_lifetime_seconds(uint64_t cache_lifetime_seconds); + +/// Store key material inside Parquet file footers; this mode doesn’t produce +/// additional files. By default, true. If set to false, key material is stored in +/// separate files in the same folder, which enables key rotation for immutable +/// Parquet files. +Builder* internal_key_material(bool internal_key_material); + +/// Length of data encryption keys (DEKs), randomly generated by parquet key +/// management tools. Can be 128, 192 or 256 bits. +/// The default is 128 bits. +Builder* data_key_length_bits(int32_t data_key_length_bits); + +std::shared_ptr build(); + + private: +std::string footer_key_; +std::string column_keys_; +ParquetCipher::type encryption_algorithm_; +bool plaintext_footer_; +bool double_wrapping_; +uint64_t cache_lifetime_seconds_; +bool internal_key_material_; +bool uniform_encryption_; +int32_t data_key_length_bits_; + }; + + const std::string& footer_key() const { return footer_key_; } + const std::string& column_keys()
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497999593 ## File path: cpp/src/parquet/encryption/key_metadata.h ## @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/variant.h" + +#include "parquet/encryption/key_material.h" +#include "parquet/exception.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// Parquet encryption specification defines "key metadata" as an arbitrary byte array, +// generated by file writers for each encryption key, and passed to the low level API for +// storage in the file footer. The "key metadata" field is made available to file readers +// to enable recovery of the key. This interface can be utilized for implementation +// of any key management scheme. +// +// The keytools package (PARQUET-1373) implements one approach, of many possible, to key +// management and to generation of the "key metadata" fields. This approach, based on the +// "envelope encryption" pattern, allows integration with KMS servers. It keeps the actual +// material, required to recover a key, in a "key material" object (see the KeyMaterial +// class for details). This class is implemented to support version 1 of the parquet key +// management tools specification. +// +// KeyMetadata writes (and reads) the "key metadata" field as a flat json object, +// with the following fields: +// 1. "keyMaterialType" - a String, with the type of key material. +// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside the +// "key metadata" field. If false, "key material" is kept externally (outside Parquet +// files) - in this case, "key metadata" keeps a reference to the external "key material". +// 3. "keyReference" - a String, with the reference to the external "key material". +// Written only if internalStorage is false. +// +// If internalStorage is true, "key material" is a part of "key metadata", and the json +// keeps additional fields, described in the KeyMaterial class. +class PARQUET_EXPORT KeyMetadata { + public: + static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage"; + static constexpr const char kKeyReferenceField[] = "keyReference"; Review comment: These are used in `key_material.cc` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497997865 ## File path: cpp/src/arrow/util/concurrent_map.h ## @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/util/mutex.h" + +namespace arrow { +namespace util { + +template +class ConcurrentMap { Review comment: @pitrou @bkietz I need a map which can be accessed by multiple threads. Please advice which is the preferred way to do it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497996333 ## File path: cpp/src/arrow/json/object_parser.h ## @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep + +#include + +#include "arrow/result.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace json { + +namespace rj = arrow::rapidjson; + +class ARROW_EXPORT ObjectParser { + public: + bool Parse(arrow::util::string_view json); + + Result GetString(const char* key) const; + Result GetBool(const char* key) const; + + private: + rj::Document _document; Review comment: @bkietz Sorry I don't understand this part "For example, the only public mention of it ObjectParser is KeyMaterial::Parse where it could be replaced by a string.". Can you explain more? Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497994581 ## File path: cpp/src/parquet/encryption/file_key_unwrapper.h ## @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/concurrent_map.h" + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/key_material.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/key_toolkit_internal.h" +#include "parquet/encryption/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// This class will retrieve the key from "key metadata", following these steps: +// 1. Parse "key metadata" (see structure in KeyMetadata class). +// 2. Retrieve "key material" which can be stored inside or outside "key metadata" +//Currently we don't support the case "key material" stores outside "key metadata" +//yet. +// 3. Unwrap the "data encryption key" from "key material". There are 2 modes: +// 3.1. single wrapping: decrypt the wrapped "data encryption key" directly with "master +// encryption key" 3.2. double wrapping: 2 steps: 3.2.1. "key encryption key" is decrypted +// with "master encryption key" 3.2.2. "data encryption key" is decrypted with the above +// "key encryption key" +class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { + public: + /// key_toolkit and kms_connection_config is to get KmsClient from cache or create + /// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of + /// KmsClient in the cache. + FileKeyUnwrapper(KeyToolkit* key_toolkit, Review comment: It references to `PropertiesDrivenCryptoFactory::key_toolkit_` which should remain alive during with `PropertiesDrivenCryptoFactory` object. `FileKeyUnwrapper` is used internally in `PropertiesDrivenCryptoFactory`, so I think it's ok to pass a raw pointer. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497982290 ## File path: cpp/src/parquet/encryption/file_key_unwrapper.h ## @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/concurrent_map.h" + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/key_material.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/key_toolkit_internal.h" +#include "parquet/encryption/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// This class will retrieve the key from "key metadata", following these steps: +// 1. Parse "key metadata" (see structure in KeyMetadata class). +// 2. Retrieve "key material" which can be stored inside or outside "key metadata" +//Currently we don't support the case "key material" stores outside "key metadata" +//yet. +// 3. Unwrap the "data encryption key" from "key material". There are 2 modes: +// 3.1. single wrapping: decrypt the wrapped "data encryption key" directly with "master +// encryption key" 3.2. double wrapping: 2 steps: 3.2.1. "key encryption key" is decrypted +// with "master encryption key" 3.2.2. "data encryption key" is decrypted with the above +// "key encryption key" +class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { + public: + /// key_toolkit and kms_connection_config is to get KmsClient from cache or create + /// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of + /// KmsClient in the cache. + FileKeyUnwrapper(KeyToolkit* key_toolkit, + const KmsConnectionConfig& kms_connection_config, + uint64_t cache_lifetime_seconds); + + std::string GetKey(const std::string& key_metadata) const override; + + private: + internal::KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material) const; + std::shared_ptr GetKmsClientFromConfigOrKeyMaterial( + const KeyMaterial& key_material) const; + + /// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token + mutable std::shared_ptr> kek_per_kek_id_; + KeyToolkit* key_toolkit_; + mutable KmsConnectionConfig kms_connection_config_; Review comment: Because it is modified in `std::string GetKey(const std::string& key_metadata) const override;` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497981750 ## File path: cpp/src/parquet/encryption/file_key_unwrapper.h ## @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/concurrent_map.h" + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/key_material.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/key_toolkit_internal.h" +#include "parquet/encryption/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// This class will retrieve the key from "key metadata", following these steps: +// 1. Parse "key metadata" (see structure in KeyMetadata class). +// 2. Retrieve "key material" which can be stored inside or outside "key metadata" +//Currently we don't support the case "key material" stores outside "key metadata" +//yet. +// 3. Unwrap the "data encryption key" from "key material". There are 2 modes: +// 3.1. single wrapping: decrypt the wrapped "data encryption key" directly with "master +// encryption key" 3.2. double wrapping: 2 steps: 3.2.1. "key encryption key" is decrypted +// with "master encryption key" 3.2.2. "data encryption key" is decrypted with the above +// "key encryption key" +class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { + public: + /// key_toolkit and kms_connection_config is to get KmsClient from cache or create + /// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of + /// KmsClient in the cache. + FileKeyUnwrapper(KeyToolkit* key_toolkit, + const KmsConnectionConfig& kms_connection_config, + uint64_t cache_lifetime_seconds); + + std::string GetKey(const std::string& key_metadata) const override; + + private: + internal::KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material) const; + std::shared_ptr GetKmsClientFromConfigOrKeyMaterial( + const KeyMaterial& key_material) const; + + /// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token + mutable std::shared_ptr> kek_per_kek_id_; Review comment: This variable references the item in cache. When the item is expired, a thread can remove it from cache, while this thread can still access it. So I use shared_ptr here to make sure this variable still alive when the item in cache is removed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497981549 ## File path: cpp/src/parquet/encryption/file_key_wrapper.h ## @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/util/concurrent_map.h" + +#include "parquet/encryption/file_key_material_store.h" +#include "parquet/encryption/key_encryption_key.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// This class will generate "key metadata" from "data encryption key" and "master key", +// following these steps: +// 1. Wrap "data encryption key". There are 2 modes: +// 1.1. single wrapping: encrypt "data encryption key" directly with "master encryption +// key" 1.2. double wrapping: 2 steps: 1.2.1. "key encryption key" is randomized (see +// structure of KeyEncryptionKey class) 1.2.2. "data encryption key" is encrypted with the +// above "key encryption key" +// 2. Create "key material" (see structure in KeyMaterial class) +// 3. Create "key metadata" with "key material" inside or a reference to outside "key +// material" (see structure in KeyMetadata class). +// Currently we don't support the case "key material" stores outside "key metadata" +// yet. +class PARQUET_EXPORT FileKeyWrapper { + public: + static constexpr int kKeyEncryptionKeyLength = 16; + static constexpr int kKeyEncryptionKeyIdLength = 16; + + /// key_toolkit and kms_connection_config is to get KmsClient from the cache or create + /// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of + /// KmsClient in the cache. key_material_store is to store "key material" outside + /// parquet file, NULL if "key material" is stored inside parquet file. + FileKeyWrapper(KeyToolkit* key_toolkit, + const KmsConnectionConfig& kms_connection_config, + std::shared_ptr key_material_store, + uint64_t cache_entry_lifetime_seconds, bool double_wrapping); + + /// Creates key_metadata field for a given data key, via wrapping the key with the + /// master key + std::string GetEncryptionKeyMetadata(const std::string& data_key, + const std::string& master_key_id, + bool is_footer_key); + + private: + KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id); + + /// A map of Master Encryption Key ID -> KeyEncryptionKey, for the current token + std::shared_ptr> kek_per_master_key_id_; Review comment: That variable references the item in cache. When the item is expired, a thread can remove it from cache, while this thread can still access it. So I use `shared_ptr` here to make sure this variable still alive when the item in cache is removed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497980986 ## File path: cpp/src/parquet/encryption/key_encryption_key.h ## @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/base64.h" + +namespace parquet { +namespace encryption { + +// In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a “key +// encryption key” (KEK), that in turn is encrypted with a "master encryption key" (MEK). +// In a writer process, a random KEK is generated for each MEK ID, and cached in a map. This allows to perform an interaction with a KMS server only once for each +// MEK, in order to wrap its KEK. "Data encryption key" (DEK) wrapping is performed Review comment: Yes. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497980844 ## File path: cpp/src/parquet/encryption/key_material.h ## @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/json/object_parser.h" + +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// KeyMaterial class represents the "key material", keeping the information that allows +// readers to recover an encryption key (see description of the KeyMetadata class). The +// keytools package (PARQUET-1373) implements the "envelope encryption" pattern, in a +// "single wrapping" or "double wrapping" mode. In the single wrapping mode, the key +// material is generated by encrypting the "data encryption key" (DEK) by a "master key". +// In the double wrapping mode, the key material is generated by encrypting the DEK by a +// "key encryption key" (KEK), that in turn is encrypted by a "master key". +// +// Key material is kept in a flat json object, with the following fields: +// 1. "keyMaterialType" - a String, with the type of key material. In the current +// version, only one value is allowed - "PKMT1" (stands +// for "parquet key management tools, version 1"). For external key material storage, +// this field is written in both "key metadata" and "key material" jsons. For internal +// key material storage, this field is written only once in the common json. +// 2. "isFooterKey" - a boolean. If true, means that the material belongs to a file footer +// key, and keeps additional information (such as +// KMS instance ID and URL). If false, means that the material belongs to a column +// key. +// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key +// material. +// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer key +// material. +// 5. "masterKeyID" - a String, with the ID of the master key used to generate the +// material. +// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding). +// 7. "doubleWrapping" - a boolean. If true, means that the material was generated in +// double wrapping mode. +// If false - in single wrapping mode. +// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the +// material. Written only in double wrapping mode. +// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in +// double wrapping mode. +class PARQUET_EXPORT KeyMaterial { + public: + // these fields are defined in a specification and should never be changed + static constexpr const char kKeyMaterialTypeField[] = "keyMaterialType"; + static constexpr const char kKeyMaterialType1[] = "PKMT1"; + + static constexpr const char kFooterKeyIdInFile[] = "footerKey"; + static constexpr const char kColumnKeyIdInFilePrefix[] = "columnKey"; + + static constexpr const char kIsFooterKeyField[] = "isFooterKey"; + static constexpr const char kDoubleWrappingField[] = "doubleWrapping"; + static constexpr const char kKmsInstanceIdField[] = "kmsInstanceID"; + static constexpr const char kKmsInstanceUrlField[] = "kmsInstanceURL"; + static constexpr const char kMasterKeyIdField[] = "masterKeyID"; + static constexpr const char kWrappedDataEncryptionKeyField[] = "wrappedDEK"; + static constexpr const char kKeyEncryptionKeyIdField[] = "keyEncryptionKeyID"; + static constexpr const char kWrappedKeyEncryptionKeyField[] = "wrappedKEK"; + + public: + KeyMaterial() = default; + + static KeyMaterial Parse(const std::string& key_material_string); + + static KeyMaterial Parse(const arrow::json::ObjectParser& key_material_json); + + /// This method returns a json string that will be stored either inside a parquet file + /// or in a key material store outside the parquet file. Review comment: No, I meant "store" (or more clear, "file store", "file storage"). Sorry if my English is not very good. This is an automated message from the Apache Git Service. To respond to the message, please log on to
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497980169 ## File path: cpp/src/parquet/encryption/key_toolkit_internal.h ## @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace parquet { +namespace encryption { +namespace internal { + +// "data encryption key" and "master key identifier" are paired together as output when +// parsing from "key material" +class KeyWithMasterId { + public: + KeyWithMasterId(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + const std::string key_bytes_; + const std::string master_id_; +}; + +/// Encrypts "key" with "master_key", using AES-GCM and the "aad" Review comment: cc @ggershinsky This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497978500 ## File path: cpp/src/parquet/encryption/kms_client.h ## @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +// This class wraps the key access token of a KMS server. If your token changes over time, +// you should keep the reference to the KeyAccessToken object and call Refresh() method +// every time you have a new token. +class PARQUET_EXPORT KeyAccessToken { + public: + KeyAccessToken() = default; + + explicit KeyAccessToken(const std::string value) : value_(value) {} + + void Refresh(const std::string& new_value) { value_ = new_value; } + + const std::string& value() const { return value_; } + + void SetDefaultIfEmpty(); Review comment: This function will set `value_` to `"DEFAULT"` if `value_` is empty. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497977436 ## File path: cpp/src/parquet/encryption/properties_driven_crypto_factory.h ## @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/file_key_wrapper.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = +ParquetCipher::AES_GCM_V1; +static constexpr bool kDefaultPlaintextFooter = false; +static constexpr bool kDefaultDoubleWrapping = true; +static constexpr uint64_t kDefaultCacheLifetimeSeconds = 600; // 10 minutes +static constexpr bool kDefaultInternalKeyMaterial = true; +static constexpr bool kDefaultUniformEncryption = false; +static constexpr int32_t kDefaultDataKeyLengthBits = 128; + +class PARQUET_EXPORT EncryptionConfiguration { + public: + class PARQUET_EXPORT Builder { + public: +/// footer_key: ID of the master key for footer encryption/signing +explicit Builder(const std::string& footer_key) +: footer_key_(footer_key), + encryption_algorithm_(kDefaultEncryptionAlgorithm), + plaintext_footer_(kDefaultPlaintextFooter), + double_wrapping_(kDefaultDoubleWrapping), + cache_lifetime_seconds_(kDefaultCacheLifetimeSeconds), + internal_key_material_(kDefaultInternalKeyMaterial), + uniform_encryption_(kDefaultUniformEncryption), + data_key_length_bits_(kDefaultDataKeyLengthBits) {} + +/// List of columns to encrypt, with master key IDs (see HIVE-21848). +/// Format: "masterKeyID:colName,colName;masterKeyID:colName..." +/// Either +/// column_keys(const std::string&) +/// or +/// uniform_encryption() +/// must be called. If none are called, or if both are called, an exception will be +/// thrown. +Builder* column_keys(const std::string& column_keys); + +/// encrypt footer and all columns with the same encryption key. +Builder* uniform_encryption(); + +/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1". +Builder* encryption_algorithm(ParquetCipher::type algo); + +/// Write files with plaintext footer. +/// The default is false - files are written with encrypted footer. +Builder* plaintext_footer(bool plaintext_footer); + +/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key +/// encryption keys (KEKs), which in turn are encrypted with master keys. +/// The default is true. If set to false, use single wrapping - where DEKs are +/// encrypted directly with master keys. +Builder* double_wrapping(bool double_wrapping); + +/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client +/// objects). +/// The default is 600 (10 minutes). +Builder* cache_lifetime_seconds(uint64_t cache_lifetime_seconds); + +/// Store key material inside Parquet file footers; this mode doesn’t produce +/// additional files. By default, true. If set to false, key material is stored in +/// separate files in the same folder, which enables key rotation for immutable +/// Parquet files. +Builder* internal_key_material(bool internal_key_material); + +/// Length of data encryption keys (DEKs), randomly generated by parquet key +/// management tools. Can be 128, 192 or 256 bits. +/// The default is 128 bits. +Builder* data_key_length_bits(int32_t data_key_length_bits); + +std::shared_ptr build(); + + private: +std::string footer_key_; +std::string column_keys_; +ParquetCipher::type encryption_algorithm_; +bool plaintext_footer_; +bool double_wrapping_; +uint64_t cache_lifetime_seconds_; +bool internal_key_material_; +bool uniform_encryption_; +int32_t data_key_length_bits_; + }; + + const std::string& footer_key() const { return footer_key_; } + const std::string& column_keys()
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497974822 ## File path: cpp/src/parquet/encryption/properties_driven_crypto_factory.h ## @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/file_key_wrapper.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = +ParquetCipher::AES_GCM_V1; +static constexpr bool kDefaultPlaintextFooter = false; +static constexpr bool kDefaultDoubleWrapping = true; +static constexpr uint64_t kDefaultCacheLifetimeSeconds = 600; // 10 minutes +static constexpr bool kDefaultInternalKeyMaterial = true; +static constexpr bool kDefaultUniformEncryption = false; +static constexpr int32_t kDefaultDataKeyLengthBits = 128; + +class PARQUET_EXPORT EncryptionConfiguration { + public: + class PARQUET_EXPORT Builder { + public: +/// footer_key: ID of the master key for footer encryption/signing +explicit Builder(const std::string& footer_key) +: footer_key_(footer_key), + encryption_algorithm_(kDefaultEncryptionAlgorithm), + plaintext_footer_(kDefaultPlaintextFooter), + double_wrapping_(kDefaultDoubleWrapping), + cache_lifetime_seconds_(kDefaultCacheLifetimeSeconds), + internal_key_material_(kDefaultInternalKeyMaterial), + uniform_encryption_(kDefaultUniformEncryption), + data_key_length_bits_(kDefaultDataKeyLengthBits) {} + +/// List of columns to encrypt, with master key IDs (see HIVE-21848). +/// Format: "masterKeyID:colName,colName;masterKeyID:colName..." +/// Either +/// column_keys(const std::string&) +/// or +/// uniform_encryption() +/// must be called. If none are called, or if both are called, an exception will be +/// thrown. +Builder* column_keys(const std::string& column_keys); + +/// encrypt footer and all columns with the same encryption key. +Builder* uniform_encryption(); + +/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1". +Builder* encryption_algorithm(ParquetCipher::type algo); + +/// Write files with plaintext footer. +/// The default is false - files are written with encrypted footer. +Builder* plaintext_footer(bool plaintext_footer); + +/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key +/// encryption keys (KEKs), which in turn are encrypted with master keys. +/// The default is true. If set to false, use single wrapping - where DEKs are +/// encrypted directly with master keys. +Builder* double_wrapping(bool double_wrapping); + +/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client +/// objects). +/// The default is 600 (10 minutes). +Builder* cache_lifetime_seconds(uint64_t cache_lifetime_seconds); + +/// Store key material inside Parquet file footers; this mode doesn’t produce +/// additional files. By default, true. If set to false, key material is stored in +/// separate files in the same folder, which enables key rotation for immutable +/// Parquet files. +Builder* internal_key_material(bool internal_key_material); + +/// Length of data encryption keys (DEKs), randomly generated by parquet key +/// management tools. Can be 128, 192 or 256 bits. +/// The default is 128 bits. +Builder* data_key_length_bits(int32_t data_key_length_bits); + +std::shared_ptr build(); + + private: +std::string footer_key_; +std::string column_keys_; +ParquetCipher::type encryption_algorithm_; +bool plaintext_footer_; +bool double_wrapping_; +uint64_t cache_lifetime_seconds_; +bool internal_key_material_; +bool uniform_encryption_; +int32_t data_key_length_bits_; + }; + + const std::string& footer_key() const { return footer_key_; } + const std::string& column_keys()
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497973635 ## File path: cpp/src/parquet/encryption/properties_driven_crypto_factory.h ## @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/encryption/encryption.h" +#include "parquet/encryption/file_key_wrapper.h" +#include "parquet/encryption/key_toolkit.h" +#include "parquet/encryption/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm = +ParquetCipher::AES_GCM_V1; +static constexpr bool kDefaultPlaintextFooter = false; +static constexpr bool kDefaultDoubleWrapping = true; +static constexpr uint64_t kDefaultCacheLifetimeSeconds = 600; // 10 minutes +static constexpr bool kDefaultInternalKeyMaterial = true; +static constexpr bool kDefaultUniformEncryption = false; +static constexpr int32_t kDefaultDataKeyLengthBits = 128; + +class PARQUET_EXPORT EncryptionConfiguration { + public: + class PARQUET_EXPORT Builder { + public: +/// footer_key: ID of the master key for footer encryption/signing +explicit Builder(const std::string& footer_key) +: footer_key_(footer_key), + encryption_algorithm_(kDefaultEncryptionAlgorithm), + plaintext_footer_(kDefaultPlaintextFooter), + double_wrapping_(kDefaultDoubleWrapping), + cache_lifetime_seconds_(kDefaultCacheLifetimeSeconds), + internal_key_material_(kDefaultInternalKeyMaterial), + uniform_encryption_(kDefaultUniformEncryption), + data_key_length_bits_(kDefaultDataKeyLengthBits) {} + +/// List of columns to encrypt, with master key IDs (see HIVE-21848). +/// Format: "masterKeyID:colName,colName;masterKeyID:colName..." +/// Either +/// column_keys(const std::string&) +/// or +/// uniform_encryption() +/// must be called. If none are called, or if both are called, an exception will be +/// thrown. +Builder* column_keys(const std::string& column_keys); + +/// encrypt footer and all columns with the same encryption key. +Builder* uniform_encryption(); + +/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1". +Builder* encryption_algorithm(ParquetCipher::type algo); + +/// Write files with plaintext footer. +/// The default is false - files are written with encrypted footer. +Builder* plaintext_footer(bool plaintext_footer); + +/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key +/// encryption keys (KEKs), which in turn are encrypted with master keys. +/// The default is true. If set to false, use single wrapping - where DEKs are +/// encrypted directly with master keys. +Builder* double_wrapping(bool double_wrapping); + +/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client +/// objects). +/// The default is 600 (10 minutes). +Builder* cache_lifetime_seconds(uint64_t cache_lifetime_seconds); + +/// Store key material inside Parquet file footers; this mode doesn’t produce +/// additional files. By default, true. If set to false, key material is stored in +/// separate files in the same folder, which enables key rotation for immutable +/// Parquet files. +Builder* internal_key_material(bool internal_key_material); + +/// Length of data encryption keys (DEKs), randomly generated by parquet key +/// management tools. Can be 128, 192 or 256 bits. +/// The default is 128 bits. +Builder* data_key_length_bits(int32_t data_key_length_bits); + +std::shared_ptr build(); + + private: +std::string footer_key_; +std::string column_keys_; +ParquetCipher::type encryption_algorithm_; +bool plaintext_footer_; +bool double_wrapping_; +uint64_t cache_lifetime_seconds_; +bool internal_key_material_; +bool uniform_encryption_; +int32_t data_key_length_bits_; + }; + + const std::string& footer_key() const { return footer_key_; } + const std::string& column_keys()
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497973376 ## File path: cpp/src/parquet/encryption/remote_kms_client.cc ## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/object_parser.h" +#include "arrow/json/object_writer.h" + +#include "parquet/encryption/key_toolkit_internal.h" +#include "parquet/encryption/remote_kms_client.h" +#include "parquet/exception.h" + +using arrow::json::ObjectParser; +using arrow::json::ObjectWriter; + +namespace parquet { +namespace encryption { + +constexpr const char RemoteKmsClient::kLocalWrapNoKeyVersion[]; + +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapKeyVersionField[]; +constexpr const char RemoteKmsClient::LocalKeyWrap::kLocalWrapEncryptedKeyField[]; + +RemoteKmsClient::LocalKeyWrap::LocalKeyWrap(const std::string& master_key_version, +const std::string& encrypted_encoded_key) +: encrypted_encoded_key_(encrypted_encoded_key), + master_key_version_(master_key_version) {} + +std::string RemoteKmsClient::LocalKeyWrap::CreateSerialized( +const std::string& encrypted_encoded_key) { + ObjectWriter json_writer; + + json_writer.SetString(kLocalWrapKeyVersionField, kLocalWrapNoKeyVersion); + json_writer.SetString(kLocalWrapEncryptedKeyField, encrypted_encoded_key); + + return json_writer.Serialize(); +} + +RemoteKmsClient::LocalKeyWrap RemoteKmsClient::LocalKeyWrap::Parse( +const std::string& wrapped_key) { + ObjectParser json_parser; + if (!json_parser.Parse(wrapped_key)) { +throw ParquetException("Failed to parse local key wrap json " + wrapped_key); + } + std::string master_key_version; + PARQUET_ASSIGN_OR_THROW(master_key_version, + json_parser.GetString(kLocalWrapKeyVersionField)); + + std::string encrypted_encoded_key; + PARQUET_ASSIGN_OR_THROW(encrypted_encoded_key, + json_parser.GetString(kLocalWrapEncryptedKeyField)); + + return RemoteKmsClient::LocalKeyWrap(master_key_version, encrypted_encoded_key); +} + +void RemoteKmsClient::Initialize(const KmsConnectionConfig& kms_connection_config, + bool is_wrap_locally) { + kms_connection_config_ = kms_connection_config; + is_wrap_locally_ = is_wrap_locally; + if (is_wrap_locally_) { +master_key_cache_.Clear(); + } + + is_default_token_ = + kms_connection_config_.key_access_token() == KmsClient::kKeyAccessTokenDefault; + + InitializeInternal(); +} + +std::string RemoteKmsClient::WrapKey(const std::string& key_bytes, + const std::string& master_key_identifier) { + if (is_wrap_locally_) { Review comment: I think it's a good idea. However, needs @ggershinsky to confirm if I can create 2 separated classes, for example: `LocalWrappingRemoteKmsClient` and `InServerWrappingRemoteKmsClient`? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497971996 ## File path: cpp/src/parquet/encryption/two_level_cache_with_expiration.h ## @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/concurrent_map.h" +#include "arrow/util/mutex.h" + +#include "parquet/encryption/key_toolkit_internal.h" Review comment: It should be removed. Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497971688 ## File path: cpp/src/parquet/encryption/two_level_cache_with_expiration.h ## @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/concurrent_map.h" +#include "arrow/util/mutex.h" + +#include "parquet/encryption/key_toolkit_internal.h" + +using arrow::util::ConcurrentMap; + +namespace parquet { +namespace encryption { + +namespace internal { + +// in miliseconds +using TimePoint = std::chrono::system_clock::time_point; + +static inline TimePoint CurrentTimePoint() { return std::chrono::system_clock::now(); } + +template +class ExpiringCacheEntry { + public: + ExpiringCacheEntry() = default; + + ExpiringCacheEntry(const E& cached_item, uint64_t expiration_interval_millis) + : cached_item_(cached_item) { +expiration_timestamp_ = +CurrentTimePoint() + std::chrono::milliseconds(expiration_interval_millis); + } + + bool IsExpired() { +auto now = CurrentTimePoint(); +return (now > expiration_timestamp_); + } + + E cached_item() { return cached_item_; } + + private: + TimePoint expiration_timestamp_; + E cached_item_; +}; + +// This class is to avoid the below warning when compiling KeyToolkit class with VS2015 +// warning C4503: decorated name length exceeded, name was truncated +template +class ExpiringCacheMapEntry { + public: + ExpiringCacheMapEntry() = default; + + explicit ExpiringCacheMapEntry(std::shared_ptr> cached_item, + uint64_t expiration_interval_millis) + : map_cache_(cached_item, expiration_interval_millis) {} + + bool IsExpired() { return map_cache_.IsExpired(); } + + std::shared_ptr> cached_item() { return map_cache_.cached_item(); } + + private: + ExpiringCacheEntry>> map_cache_; Review comment: The lock in `TwoLevelCacheWithExpiration` is for the outside map `cache_` and also `last_cache_cleanup_timestamp_`. While this `ConcurrentMap` can be accessed and modified outside `TwoLevelCacheWithExpiration`, for example: https://github.com/apache/arrow/pull/8023/files#diff-0006a54749b9fee8573f28939bab6eb0R76 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497970716 ## File path: cpp/src/parquet/encryption/two_level_cache_with_expiration.h ## @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/concurrent_map.h" +#include "arrow/util/mutex.h" + +#include "parquet/encryption/key_toolkit_internal.h" + +using arrow::util::ConcurrentMap; + +namespace parquet { +namespace encryption { + +namespace internal { + +// in miliseconds +using TimePoint = std::chrono::system_clock::time_point; + +static inline TimePoint CurrentTimePoint() { return std::chrono::system_clock::now(); } + +template +class ExpiringCacheEntry { + public: + ExpiringCacheEntry() = default; + + ExpiringCacheEntry(const E& cached_item, uint64_t expiration_interval_millis) + : cached_item_(cached_item) { +expiration_timestamp_ = +CurrentTimePoint() + std::chrono::milliseconds(expiration_interval_millis); + } + + bool IsExpired() { +auto now = CurrentTimePoint(); +return (now > expiration_timestamp_); + } + + E cached_item() { return cached_item_; } + + private: + TimePoint expiration_timestamp_; + E cached_item_; +}; + +// This class is to avoid the below warning when compiling KeyToolkit class with VS2015 +// warning C4503: decorated name length exceeded, name was truncated +template +class ExpiringCacheMapEntry { + public: + ExpiringCacheMapEntry() = default; + + explicit ExpiringCacheMapEntry(std::shared_ptr> cached_item, + uint64_t expiration_interval_millis) + : map_cache_(cached_item, expiration_interval_millis) {} + + bool IsExpired() { return map_cache_.IsExpired(); } + + std::shared_ptr> cached_item() { return map_cache_.cached_item(); } + + private: + ExpiringCacheEntry>> map_cache_; +}; + +} // namespace internal + +// Two-level cache with expiration of internal caches according to token lifetime. +// External cache is per token, internal is per string key. +// Wrapper class around: +//std::unordered_map>> +// This cache is safe to be shared between threads. +template +class TwoLevelCacheWithExpiration { + public: + TwoLevelCacheWithExpiration() { +last_cache_cleanup_timestamp_ = internal::CurrentTimePoint(); + } + + std::shared_ptr> GetOrCreateInternalCache( + const std::string& access_token, uint64_t cache_entry_lifetime_ms) { Review comment: I follow the code in Java: https://github.com/apache/parquet-mr/pull/615/files#diff-6926b4ba3bb623d4467b3752c5846f47R42. So maybe @ggershinsky can explain here. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497970111 ## File path: cpp/src/parquet/encryption/two_level_cache_with_expiration.h ## @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/concurrent_map.h" +#include "arrow/util/mutex.h" + +#include "parquet/encryption/key_toolkit_internal.h" + +using arrow::util::ConcurrentMap; + +namespace parquet { +namespace encryption { + +namespace internal { + +// in miliseconds +using TimePoint = std::chrono::system_clock::time_point; + +static inline TimePoint CurrentTimePoint() { return std::chrono::system_clock::now(); } + +template +class ExpiringCacheEntry { + public: + ExpiringCacheEntry() = default; + + ExpiringCacheEntry(const E& cached_item, uint64_t expiration_interval_millis) + : cached_item_(cached_item) { +expiration_timestamp_ = +CurrentTimePoint() + std::chrono::milliseconds(expiration_interval_millis); + } + + bool IsExpired() { +auto now = CurrentTimePoint(); +return (now > expiration_timestamp_); + } + + E cached_item() { return cached_item_; } + + private: + TimePoint expiration_timestamp_; + E cached_item_; +}; + +// This class is to avoid the below warning when compiling KeyToolkit class with VS2015 +// warning C4503: decorated name length exceeded, name was truncated +template +class ExpiringCacheMapEntry { + public: + ExpiringCacheMapEntry() = default; + + explicit ExpiringCacheMapEntry(std::shared_ptr> cached_item, + uint64_t expiration_interval_millis) + : map_cache_(cached_item, expiration_interval_millis) {} + + bool IsExpired() { return map_cache_.IsExpired(); } + + std::shared_ptr> cached_item() { return map_cache_.cached_item(); } + + private: + ExpiringCacheEntry>> map_cache_; +}; + +} // namespace internal + +// Two-level cache with expiration of internal caches according to token lifetime. +// External cache is per token, internal is per string key. +// Wrapper class around: +//std::unordered_map>> +// This cache is safe to be shared between threads. +template +class TwoLevelCacheWithExpiration { + public: + TwoLevelCacheWithExpiration() { +last_cache_cleanup_timestamp_ = internal::CurrentTimePoint(); + } + + std::shared_ptr> GetOrCreateInternalCache( + const std::string& access_token, uint64_t cache_entry_lifetime_ms) { +auto lock = mutex_.Lock(); + +auto external_cache_entry = cache_.find(access_token); +if (external_cache_entry == cache_.end() || +external_cache_entry->second.IsExpired()) { + cache_.insert( + {access_token, internal::ExpiringCacheMapEntry( + std::shared_ptr>(new ConcurrentMap()), + cache_entry_lifetime_ms)}); +} + +return cache_[access_token].cached_item(); + } + + void RemoveCacheEntriesForToken(const std::string& access_token) { +auto lock = mutex_.Lock(); +cache_.erase(access_token); + } + + void RemoveCacheEntriesForAllTokens() { +auto lock = mutex_.Lock(); +cache_.clear(); + } + + void CheckCacheForExpiredTokens(uint64_t cache_cleanup_period_ms) { Review comment: Why? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r497969857 ## File path: cpp/src/parquet/encryption/two_level_cache_with_expiration_test.cc ## @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/util/concurrent_map.h" + +#include "parquet/encryption/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { +namespace test { + +class TwoLevelCacheWithExpirationTest : public ::testing::Test { + public: + void SetUp() { +// lifetime is 1s +std::shared_ptr> lifetime_1s = +cache_.GetOrCreateInternalCache("lifetime1s", 1000); +lifetime_1s->Insert("item1", 1); +lifetime_1s->Insert("item2", 2); + +// lifetime is 3s +std::shared_ptr> lifetime_3s = +cache_.GetOrCreateInternalCache("lifetime3s", 3000); +lifetime_3s->Insert("item21", 21); +lifetime_3s->Insert("item22", 22); + } + + protected: + void TaskInsert(int thread_no) { +for (int i = 0; i < 20; i++) { + std::string token = i % 2 == 0 ? "lifetime1s" : "lifetime3s"; + uint64_t lifetime_ms = i % 2 == 0 ? 1000 : 3000; + std::shared_ptr> internal_cache = + cache_.GetOrCreateInternalCache(token, lifetime_ms); + std::stringstream ss; + ss << "item_" << thread_no << "_" << i; + internal_cache->Insert(ss.str(), i); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); +} + } + + void TaskClean() { +for (int i = 0; i < 20; i++) { + cache_.Clear(); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); +} + } + + TwoLevelCacheWithExpiration cache_; +}; + +TEST_F(TwoLevelCacheWithExpirationTest, RemoveExpiration) { + std::shared_ptr> lifetime_1s_before_expiration = + cache_.GetOrCreateInternalCache("lifetime1s", 1000); + ASSERT_EQ(lifetime_1s_before_expiration->size(), 2); + + // wait for 2s, we expect: + // lifetime_1s will be expired + // lifetime_3s will not be expired + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + // now clear expired items from the cache + cache_.RemoveExpiredEntriesFromCache(); + + // lifetime_1s (with 2 items) is expired and has been removed from the cache. + // Now the cache create a new object which has no item. + std::shared_ptr> lifetime_1s = + cache_.GetOrCreateInternalCache("lifetime1s", 1000); + ASSERT_EQ(lifetime_1s->size(), 0); + + // However, lifetime_1s_before_expiration can still access normally and independently + // from the one in cache + lifetime_1s_before_expiration->Insert("item3", 3); + ASSERT_EQ(lifetime_1s_before_expiration->size(), 3); + ASSERT_EQ(lifetime_1s->size(), 0); + + // lifetime_3s is not expired and still contains 2 items. + std::shared_ptr> lifetime_3s = + cache_.GetOrCreateInternalCache("lifetime3s", 3000); + ASSERT_EQ(lifetime_3s->size(), 2); +} + +TEST_F(TwoLevelCacheWithExpirationTest, CleanupPeriodTooBig) { + // wait for 2s, now: + // lifetime_1s is expired + // lifetime_3s isn't expired + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + + // if cleanup_period is too big (10s), the expired items may not be removed from cache. + cache_.CheckCacheForExpiredTokens(1); + + // lifetime_1s (with 2 items) is expired but not removed from the cache, still contains Review comment: I think you're right about "what's being tested is useful" which is not. In real use case, developers should pass a small enough value for `cleanup_period` param. I will delete this test. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486434226 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { Review comment: > Can you put internal-only stuff in separate header files? `KeyWithMasterId` is an internal thing, however, since it appears in public header file `FileKeyUnwrapper.h" (i.e. `KeyWithMasterId FileKeyUnwrapper::GetDataEncryptionKey(const KeyMaterial& key_material) const` - which will be used in KeyToolkit key rotation later). So it must be in a public `key_toolkit.h`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486434226 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { Review comment: > Can you put internal-only stuff in separate header files? `KeyWithMasterId` is an internal thing, however, since it appears in public header file `FileKeyUnwrapper.h"` (i.e. `KeyWithMasterId FileKeyUnwrapper::GetDataEncryptionKey(const KeyMaterial& key_material) const` - which will be used in KeyToolkit key rotation later). So it must be in a public `key_toolkit.h`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486420973 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { + public: + KeyWithMasterID(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + std::string key_bytes_; + std::string master_id_; +}; + +class PARQUET_EXPORT KeyToolkit { + public: + class KmsClientCache { + public: +static KmsClientCache& GetInstance() { + static KmsClientCache instance; + return instance; +} +TwoLevelCacheWithExpiration>& cache() { return cache_; } + + private: +TwoLevelCacheWithExpiration> cache_; + }; + + class KEKWriteCache { + public: +static KEKWriteCache& GetInstance() { + static KEKWriteCache instance; + return instance; +} +TwoLevelCacheWithExpiration& cache() { return cache_; } + + private: +TwoLevelCacheWithExpiration cache_; + }; + + class KEKReadCache { + public: +static KEKReadCache& GetInstance() { + static KEKReadCache instance; + return instance; +} +TwoLevelCacheWithExpiration& cache() { return cache_; } + + private: +TwoLevelCacheWithExpiration cache_; + }; + + // KMS client two level cache: token -> KMSInstanceId -> KmsClient + static TwoLevelCacheWithExpiration>& + kms_client_cache_per_token() { +return KmsClientCache::GetInstance().cache(); + } + + // KEK two level cache for wrapping: token -> MEK_ID -> KeyEncryptionKey + static TwoLevelCacheWithExpiration& kek_write_cache_per_token() { +return KEKWriteCache::GetInstance().cache(); + } + + // KEK two level cache for unwrapping: token -> KEK_ID -> KEK bytes + static TwoLevelCacheWithExpiration& kek_read_cache_per_token() { +return KEKReadCache::GetInstance().cache(); + } + + static std::shared_ptr GetKmsClient( + std::shared_ptr kms_client_factory, Review comment: > Why isn't this a method on KmsClientFactory? @pitrou because here we get from the cache, we create a new client from factory only if the item is not in the cache (or the corresponding token expires). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486400053 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { + public: + KeyWithMasterID(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + std::string key_bytes_; + std::string master_id_; +}; + +class PARQUET_EXPORT KeyToolkit { + public: + class KmsClientCache { + public: +static KmsClientCache& GetInstance() { + static KmsClientCache instance; Review comment: Eventually I think 2 above ways are identical. The user have to keep an object alive along the time of their program and these both needs to be well documented. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486397802 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { + public: + KeyWithMasterID(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + std::string key_bytes_; + std::string master_id_; +}; + +class PARQUET_EXPORT KeyToolkit { + public: + class KmsClientCache { + public: +static KmsClientCache& GetInstance() { + static KmsClientCache instance; + return instance; +} +TwoLevelCacheWithExpiration>& cache() { return cache_; } + + private: +TwoLevelCacheWithExpiration> cache_; + }; + + class KEKWriteCache { + public: +static KEKWriteCache& GetInstance() { + static KEKWriteCache instance; + return instance; +} +TwoLevelCacheWithExpiration& cache() { return cache_; } + + private: +TwoLevelCacheWithExpiration cache_; + }; + + class KEKReadCache { + public: +static KEKReadCache& GetInstance() { + static KEKReadCache instance; + return instance; +} +TwoLevelCacheWithExpiration& cache() { return cache_; } + + private: +TwoLevelCacheWithExpiration cache_; + }; + + // KMS client two level cache: token -> KMSInstanceId -> KmsClient + static TwoLevelCacheWithExpiration>& + kms_client_cache_per_token() { +return KmsClientCache::GetInstance().cache(); + } + + // KEK two level cache for wrapping: token -> MEK_ID -> KeyEncryptionKey + static TwoLevelCacheWithExpiration& kek_write_cache_per_token() { +return KEKWriteCache::GetInstance().cache(); + } + + // KEK two level cache for unwrapping: token -> KEK_ID -> KEK bytes + static TwoLevelCacheWithExpiration& kek_read_cache_per_token() { +return KEKReadCache::GetInstance().cache(); + } + + static std::shared_ptr GetKmsClient( + std::shared_ptr kms_client_factory, + const KmsConnectionConfig& kms_connection_config, bool is_wrap_locally, + uint64_t cache_entry_lifetime); + + // Encrypts "key" with "master_key", using AES-GCM and the "aad" + static std::string EncryptKeyLocally(const std::string& key, Review comment: I guess the purpose to put `EncryptKeyLocally`/`DecryptKeyLocally` method in `KeyToolkit` is that user can make use of this function to encrypt/decrypt the key locally without writing the code again. An example: We have used these methods in `TestOnlyInMemoryKms` class. So it's not really internal stuff. cc @ggershinsky This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486382968 ## File path: cpp/src/parquet/key_material.h ## @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include + +namespace parquet { +namespace encryption { + +// KeyMaterial class represents the "key material", keeping the information that allows +// readers to recover an encryption key (see description of the KeyMetadata class). The +// keytools package (PARQUET-1373) implements the "envelope encryption" pattern, in a +// "single wrapping" or "double wrapping" mode. In the single wrapping mode, the key +// material is generated by encrypting the "data encryption key" (DEK) by a "master key". +// In the double wrapping mode, the key material is generated by encrypting the DEK by a +// "key encryption key" (KEK), that in turn is encrypted by a "master key". +// +// Key material is kept in a flat json object, with the following fields: +// 1. "keyMaterialType" - a String, with the type of key material. In the current +// version, only one value is allowed - "PKMT1" (stands +// for "parquet key management tools, version 1"). For external key material storage, +// this field is written in both "key metadata" and "key material" jsons. For internal +// key material storage, this field is written only once in the common json. +// 2. "isFooterKey" - a boolean. If true, means that the material belongs to a file footer +// key, and keeps additional information (such as +// KMS instance ID and URL). If false, means that the material belongs to a column +// key. +// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key +// material. +// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer key +// material. +// 5. "masterKeyID" - a String, with the ID of the master key used to generate the +// material. +// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding). +// 7. "doubleWrapping" - a boolean. If true, means that the material was generated in +// double wrapping mode. +// If false - in single wrapping mode. +// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the +// material. Written only in double wrapping mode. +// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in +// double wrapping mode. +class KeyMaterial { + public: + static constexpr char KEY_MATERIAL_TYPE_FIELD[] = "keyMaterialType"; + static constexpr char KEY_MATERIAL_TYPE1[] = "PKMT1"; + + static constexpr char FOOTER_KEY_ID_IN_FILE[] = "footerKey"; + static constexpr char COLUMN_KEY_ID_IN_FILE_PREFIX[] = "columnKey"; + + static constexpr char IS_FOOTER_KEY_FIELD[] = "isFooterKey"; + static constexpr char DOUBLE_WRAPPING_FIELD[] = "doubleWrapping"; + static constexpr char KMS_INSTANCE_ID_FIELD[] = "kmsInstanceID"; + static constexpr char KMS_INSTANCE_URL_FIELD[] = "kmsInstanceURL"; + static constexpr char MASTER_KEY_ID_FIELD[] = "masterKeyID"; + static constexpr char WRAPPED_DEK_FIELD[] = "wrappedDEK"; + static constexpr char KEK_ID_FIELD[] = "keyEncryptionKeyID"; + static constexpr char WRAPPED_KEK_FIELD[] = "wrappedKEK"; + + public: + KeyMaterial() = default; + + static KeyMaterial Parse(const std::string& key_material_string); + + static KeyMaterial Parse(const rapidjson::Document& key_material_json); + + static std::string CreateSerialized(bool is_footer_key, Review comment: I updated it to `SerializeToJson` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486382035 ## File path: cpp/src/parquet/file_key_wrapper.h ## @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/file_key_material_store.h" +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" + +namespace parquet { +namespace encryption { + +class FileKeyWrapper { + public: + static constexpr int KEK_LENGTH = 16; + static constexpr int KEK_ID_LENGTH = 16; + + static constexpr int RND_MAX_BYTES = 32; + + FileKeyWrapper(std::shared_ptr kms_client_factory, + const KmsConnectionConfig& kms_connection_config, + std::shared_ptr key_material_store, + uint64_t cache_entry_lifetime, bool double_wrapping, + bool is_wrap_locally); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, + const std::string& master_key_id, + bool is_footer_key); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, + const std::string& master_key_id, + bool is_footer_key, std::string key_id_in_file); + + private: + KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id); + + // A map of MEK_ID -> KeyEncryptionKey, for the current token + std::map kek_per_master_key_id_; + + std::shared_ptr kms_client_; + KmsConnectionConfig kms_connection_config_; + std::shared_ptr key_material_store_; + uint64_t cache_entry_lifetime_; + bool double_wrapping_; + uint16_t key_counter_; Review comment: I removed it anw, since it's for external storage. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486380916 ## File path: cpp/src/parquet/key_metadata.h ## @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/key_material.h" + +namespace parquet { +namespace encryption { + +// Parquet encryption specification defines "key metadata" as an arbitrary byte array, +// generated by file writers for each encryption key, and passed to the low level API for +// storage in the file footer . The "key metadata" field is made available to file readers +// to enable recovery of the key. This simple interface can be utilized for implementation +// of any key management scheme. +// +// The keytools package (PARQUET-1373) implements one approach, of many possible, to key +// management and to generation of the "key metadata" fields. This approach, based on the +// "envelope encryption" pattern, allows to work with KMS servers. It keeps the actual +// material, required to recover a key, in a "key material" object (see the KeyMaterial +// class for details). +// +// KeyMetadata class writes (and reads) the "key metadata" field as a flat json object, +// with the following fields: +// 1. "keyMaterialType" - a String, with the type of key material. In the current Review comment: @ggershinsky any suggestion for the comment? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486372095 ## File path: cpp/src/parquet/file_key_wrapper.h ## @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/file_key_material_store.h" +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" + +namespace parquet { +namespace encryption { + +class FileKeyWrapper { + public: + static constexpr int KEK_LENGTH = 16; + static constexpr int KEK_ID_LENGTH = 16; + + static constexpr int RND_MAX_BYTES = 32; + + FileKeyWrapper(std::shared_ptr kms_client_factory, + const KmsConnectionConfig& kms_connection_config, + std::shared_ptr key_material_store, + uint64_t cache_entry_lifetime, bool double_wrapping, + bool is_wrap_locally); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, + const std::string& master_key_id, + bool is_footer_key); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, Review comment: @ggershinsky I removed the method with `key_id_in_file`. It will be added when working with external storage. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486371291 ## File path: cpp/src/parquet/kms_client_factory.h ## @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +class PARQUET_EXPORT KmsClientFactory { + public: + explicit KmsClientFactory(bool wrap_locally) : wrap_locally_(wrap_locally) {} + + KmsClientFactory() : KmsClientFactory(false) {} + + virtual ~KmsClientFactory() {} + + virtual std::shared_ptr CreateKmsClient( Review comment: > If this is only for internal use @pitrou it's for public use. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486370626 ## File path: cpp/src/parquet/kms_client_factory.h ## @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +class PARQUET_EXPORT KmsClientFactory { + public: + explicit KmsClientFactory(bool wrap_locally) : wrap_locally_(wrap_locally) {} + + KmsClientFactory() : KmsClientFactory(false) {} + + virtual ~KmsClientFactory() {} + + virtual std::shared_ptr CreateKmsClient( Review comment: > The user can't create a KmsClientFactory directly? No, he can't. `KmsClientFactory` doesn't know the actual type of `KmsClient` yet. When the user create their own kms client class (inherited from `KmsClient`), he will have to create their own `KmsClientFactory` as well. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486370626 ## File path: cpp/src/parquet/kms_client_factory.h ## @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +class PARQUET_EXPORT KmsClientFactory { + public: + explicit KmsClientFactory(bool wrap_locally) : wrap_locally_(wrap_locally) {} + + KmsClientFactory() : KmsClientFactory(false) {} + + virtual ~KmsClientFactory() {} + + virtual std::shared_ptr CreateKmsClient( Review comment: > The user can't create a KmsClientFactory directly? @pitrou No, he can't. `KmsClientFactory` doesn't know the actual type of `KmsClient` yet. When the user create their own kms client class (inherited from `KmsClient`), he will have to create their own `KmsClientFactory` as well. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r486367648 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { + public: + KeyWithMasterID(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + std::string key_bytes_; + std::string master_id_; +}; + +class PARQUET_EXPORT KeyToolkit { + public: + class KmsClientCache { + public: +static KmsClientCache& GetInstance() { + static KmsClientCache instance; Review comment: @ggershinsky I worry that if the cache is a member of `PropertiesDrivenCryptoFactory`, users of API must keep the life time of `PropertiesDrivenCryptoFactory` object after getting `FileEncryptionProperties`/`FileDecryptionProperties`. If they forget that, it will be easy to get a crash in their programs. I prefer to let users manage the life time of the cache themselves and pass it into `PropertiesDrivenCryptoFactory`. Any final decision? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r483013904 ## File path: cpp/src/parquet/file_key_wrapper.h ## @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/file_key_material_store.h" +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" + +namespace parquet { +namespace encryption { + +class FileKeyWrapper { + public: + static constexpr int KEK_LENGTH = 16; + static constexpr int KEK_ID_LENGTH = 16; + + static constexpr int RND_MAX_BYTES = 32; + + FileKeyWrapper(std::shared_ptr kms_client_factory, + const KmsConnectionConfig& kms_connection_config, + std::shared_ptr key_material_store, + uint64_t cache_entry_lifetime, bool double_wrapping, + bool is_wrap_locally); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, Review comment: Since we already have a struct KeyMetadata, so creating one more struct to contain this string doesn't make sense. We can change it to: ``` KeyMetadata FileKeyWrapper::GetEncryptionKeyMetadata(...) ``` and create a method: ``` std::string KeyMetadata::SerializeToJson() ``` But it seems not a reversed step of `std::string FileKeyWrapper::GetKey(const std::string& key_metadata) const`. So I still prefer current approach. cc @ggershinsky. ## File path: cpp/src/parquet/file_key_wrapper.h ## @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/file_key_material_store.h" +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" + +namespace parquet { +namespace encryption { + +class FileKeyWrapper { + public: + static constexpr int KEK_LENGTH = 16; + static constexpr int KEK_ID_LENGTH = 16; + + static constexpr int RND_MAX_BYTES = 32; + + FileKeyWrapper(std::shared_ptr kms_client_factory, + const KmsConnectionConfig& kms_connection_config, + std::shared_ptr key_material_store, + uint64_t cache_entry_lifetime, bool double_wrapping, + bool is_wrap_locally); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, Review comment: @emkornfield Since we already have a struct KeyMetadata, so creating one more struct to contain this string doesn't make sense. We can change it to: ``` KeyMetadata FileKeyWrapper::GetEncryptionKeyMetadata(...) ``` and create a method: ``` std::string KeyMetadata::SerializeToJson() ``` But it seems not a reversed step of `std::string FileKeyWrapper::GetKey(const std::string& key_metadata) const`. So I still prefer current approach. cc @ggershinsky. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r482995943 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { + public: + KeyWithMasterID(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + std::string key_bytes_; + std::string master_id_; +}; + +class PARQUET_EXPORT KeyToolkit { + public: + class KmsClientCache { + public: +static KmsClientCache& GetInstance() { + static KmsClientCache instance; Review comment: to @ggershinsky This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r482993016 ## File path: cpp/src/parquet/file_key_wrapper.h ## @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/file_key_material_store.h" +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" + +namespace parquet { +namespace encryption { + +class FileKeyWrapper { + public: + static constexpr int KEK_LENGTH = 16; + static constexpr int KEK_ID_LENGTH = 16; + + static constexpr int RND_MAX_BYTES = 32; + + FileKeyWrapper(std::shared_ptr kms_client_factory, + const KmsConnectionConfig& kms_connection_config, + std::shared_ptr key_material_store, + uint64_t cache_entry_lifetime, bool double_wrapping, + bool is_wrap_locally); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, + const std::string& master_key_id, + bool is_footer_key); + + std::string GetEncryptionKeyMetadata(const std::string& data_key, + const std::string& master_key_id, + bool is_footer_key, std::string key_id_in_file); + + private: + KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id); + + // A map of MEK_ID -> KeyEncryptionKey, for the current token + std::map kek_per_master_key_id_; + + std::shared_ptr kms_client_; + KmsConnectionConfig kms_connection_config_; + std::shared_ptr key_material_store_; + uint64_t cache_entry_lifetime_; + bool double_wrapping_; + uint16_t key_counter_; Review comment: Can you help with this question @ggershinsky ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479634599 ## File path: cpp/src/parquet/key_material.cc ## @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/key_material.h" +#include "parquet/key_metadata.h" + +namespace parquet { +namespace encryption { + +constexpr char KeyMaterial::KEY_MATERIAL_TYPE_FIELD[]; +constexpr char KeyMaterial::KEY_MATERIAL_TYPE1[]; + +constexpr char KeyMaterial::FOOTER_KEY_ID_IN_FILE[]; +constexpr char KeyMaterial::COLUMN_KEY_ID_IN_FILE_PREFIX[]; + +constexpr char KeyMaterial::IS_FOOTER_KEY_FIELD[]; +constexpr char KeyMaterial::DOUBLE_WRAPPING_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_ID_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_URL_FIELD[]; +constexpr char KeyMaterial::MASTER_KEY_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_DEK_FIELD[]; +constexpr char KeyMaterial::KEK_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_KEK_FIELD[]; + +KeyMaterial::KeyMaterial(bool is_footer_key, const std::string& kms_instance_id, + const std::string& kms_instance_url, + const std::string& master_key_id, bool is_double_wrapped, + const std::string& kek_id, + const std::string& encoded_wrapped_kek, + const std::string& encoded_wrapped_dek) +: is_footer_key_(is_footer_key), + kms_instance_id_(kms_instance_id), + kms_instance_url_(kms_instance_url), + master_key_id_(master_key_id), + is_double_wrapped_(is_double_wrapped), + kek_id_(kek_id), + encoded_wrapped_kek_(encoded_wrapped_kek), + encoded_wrapped_dek_(encoded_wrapped_dek) {} + +KeyMaterial KeyMaterial::Parse(const std::string& key_material_string) { + rapidjson::Document document; + document.Parse(key_material_string.c_str()); + + if (document.HasParseError() || !document.IsObject()) { +throw ParquetException("Failed to parse key metadata " + key_material_string); + } + + // External key material - extract "key material type", and make sure it is supported + std::string key_material_type = document[KEY_MATERIAL_TYPE_FIELD].GetString(); + if (KEY_MATERIAL_TYPE1 != key_material_type) { +throw ParquetException("Wrong key material type: " + key_material_type + " vs " + + KEY_MATERIAL_TYPE1); + } + // Parse other fields (common to internal and external key material) + return Parse(document); +} + +KeyMaterial KeyMaterial::Parse(const rapidjson::Document& key_material_json) { + // 2. Check if "key material" belongs to file footer key + bool is_footer_key = key_material_json[IS_FOOTER_KEY_FIELD].GetBool(); + std::string kms_instance_id; + std::string kms_instance_url; + if (is_footer_key) { +// 3. For footer key, extract KMS Instance ID +kms_instance_id = key_material_json[KMS_INSTANCE_ID_FIELD].GetString(); +// 4. For footer key, extract KMS Instance URL +kms_instance_url = key_material_json[KMS_INSTANCE_URL_FIELD].GetString(); + } + // 5. Extract master key ID + std::string master_key_id = key_material_json[MASTER_KEY_ID_FIELD].GetString(); + // 6. Extract wrapped DEK + std::string encoded_wrapped_dek = key_material_json[WRAPPED_DEK_FIELD].GetString(); + std::string kek_id; + std::string encoded_wrapped_kek; + // 7. Check if "key material" was generated in double wrapping mode + bool is_double_wrapped = key_material_json[DOUBLE_WRAPPING_FIELD].GetBool(); + if (is_double_wrapped) { +// 8. In double wrapping mode, extract KEK ID +kek_id = key_material_json[KEK_ID_FIELD].GetString(); +// 9. In double wrapping mode, extract wrapped KEK +encoded_wrapped_kek = key_material_json[WRAPPED_KEK_FIELD].GetString(); + } + + return KeyMaterial(is_footer_key, kms_instance_id, kms_instance_url, master_key_id, + is_double_wrapped, kek_id, encoded_wrapped_kek, encoded_wrapped_dek); +} + +std::string KeyMaterial::CreateSerialized( +bool is_footer_key, const std::string& kms_instance_id, Review comment: I'm not
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479633600 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { Review comment: > Does this need to be exported? No, it's used internally in FileKeyUnwrapper.cc (implemented) and KeyToolkit.cc (will be implemented later). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479633145 ## File path: cpp/src/parquet/kms_client.h ## @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +class PARQUET_EXPORT KeyAccessToken { + public: + KeyAccessToken() = default; + + explicit KeyAccessToken(const std::string value) : value_(value) {} + + void Refresh(const std::string& new_value) { value_ = new_value; } + + const std::string& value() const { return value_; } + + void SetDefaultIfEmpty(); + + private: + std::string value_; +}; + +struct PARQUET_EXPORT KmsConnectionConfig { + std::string kms_instance_id; + std::string kms_instance_url; + std::shared_ptr refreshable_key_access_token; + std::unordered_map custom_kms_conf; + + const std::string& key_access_token() const { +if (refreshable_key_access_token == NULL || +refreshable_key_access_token->value().empty()) { + throw ParquetException("key access token is not set!"); +} +return refreshable_key_access_token->value(); + } + + void SetDefaultIfEmpty(); +}; + +class PARQUET_EXPORT KmsClient { + public: + static constexpr char KMS_INSTANCE_ID_DEFAULT[] = "DEFAULT"; + static constexpr char KMS_INSTANCE_URL_DEFAULT[] = "DEFAULT"; + static constexpr char KEY_ACCESS_TOKEN_DEFAULT[] = "DEFAULT"; + + // Wraps a key - encrypts it with the master key, encodes the result + // and potentially adds a KMS-specific metadata. + virtual std::string WrapKey(const std::string& key_bytes, + const std::string& master_key_identifier) = 0; Review comment: @ggershinsky I always see `master_key_identifier` goes with a `key`, so should we pass an input param `const KeyWithMasterId& key_with_master_id` instead of 2 input string params? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479632794 ## File path: cpp/src/parquet/key_material.cc ## @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/key_material.h" +#include "parquet/key_metadata.h" + +namespace parquet { +namespace encryption { + +constexpr char KeyMaterial::KEY_MATERIAL_TYPE_FIELD[]; +constexpr char KeyMaterial::KEY_MATERIAL_TYPE1[]; + +constexpr char KeyMaterial::FOOTER_KEY_ID_IN_FILE[]; +constexpr char KeyMaterial::COLUMN_KEY_ID_IN_FILE_PREFIX[]; + +constexpr char KeyMaterial::IS_FOOTER_KEY_FIELD[]; +constexpr char KeyMaterial::DOUBLE_WRAPPING_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_ID_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_URL_FIELD[]; +constexpr char KeyMaterial::MASTER_KEY_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_DEK_FIELD[]; +constexpr char KeyMaterial::KEK_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_KEK_FIELD[]; + +KeyMaterial::KeyMaterial(bool is_footer_key, const std::string& kms_instance_id, + const std::string& kms_instance_url, + const std::string& master_key_id, bool is_double_wrapped, + const std::string& kek_id, + const std::string& encoded_wrapped_kek, + const std::string& encoded_wrapped_dek) +: is_footer_key_(is_footer_key), + kms_instance_id_(kms_instance_id), + kms_instance_url_(kms_instance_url), + master_key_id_(master_key_id), + is_double_wrapped_(is_double_wrapped), + kek_id_(kek_id), + encoded_wrapped_kek_(encoded_wrapped_kek), + encoded_wrapped_dek_(encoded_wrapped_dek) {} + +KeyMaterial KeyMaterial::Parse(const std::string& key_material_string) { + rapidjson::Document document; + document.Parse(key_material_string.c_str()); + + if (document.HasParseError() || !document.IsObject()) { +throw ParquetException("Failed to parse key metadata " + key_material_string); + } + + // External key material - extract "key material type", and make sure it is supported + std::string key_material_type = document[KEY_MATERIAL_TYPE_FIELD].GetString(); + if (KEY_MATERIAL_TYPE1 != key_material_type) { +throw ParquetException("Wrong key material type: " + key_material_type + " vs " + + KEY_MATERIAL_TYPE1); + } + // Parse other fields (common to internal and external key material) + return Parse(document); +} + +KeyMaterial KeyMaterial::Parse(const rapidjson::Document& key_material_json) { + // 2. Check if "key material" belongs to file footer key + bool is_footer_key = key_material_json[IS_FOOTER_KEY_FIELD].GetBool(); Review comment: I should add more validation before passing. ## File path: cpp/src/parquet/key_material.cc ## @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/key_material.h" +#include "parquet/key_metadata.h" + +namespace parquet { +namespace encryption { + +constexpr char KeyMaterial::KEY_MATERIAL_TYPE_FIELD[]; +constexpr char KeyMaterial::KEY_MATERIAL_TYPE1[]; + +constexpr char KeyMaterial::FOOTER_KEY_ID_IN_FILE[]; +constexpr char KeyMaterial::COLUMN_KEY_ID_IN_FILE_PREFIX[]; + +constexpr
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479632859 ## File path: cpp/src/parquet/key_metadata.h ## @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "parquet/key_material.h" + +namespace parquet { +namespace encryption { + +// Parquet encryption specification defines "key metadata" as an arbitrary byte array, +// generated by file writers for each encryption key, and passed to the low level API for +// storage in the file footer . The "key metadata" field is made available to file readers +// to enable recovery of the key. This simple interface can be utilized for implementation +// of any key management scheme. +// +// The keytools package (PARQUET-1373) implements one approach, of many possible, to key +// management and to generation of the "key metadata" fields. This approach, based on the +// "envelope encryption" pattern, allows to work with KMS servers. It keeps the actual +// material, required to recover a key, in a "key material" object (see the KeyMaterial +// class for details). +// +// KeyMetadata class writes (and reads) the "key metadata" field as a flat json object, +// with the following fields: +// 1. "keyMaterialType" - a String, with the type of key material. In the current Review comment: Sorry. I don't get this point. Can you explain? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479626558 ## File path: cpp/src/parquet/key_material.cc ## @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/key_material.h" +#include "parquet/key_metadata.h" + +namespace parquet { +namespace encryption { + +constexpr char KeyMaterial::KEY_MATERIAL_TYPE_FIELD[]; +constexpr char KeyMaterial::KEY_MATERIAL_TYPE1[]; + +constexpr char KeyMaterial::FOOTER_KEY_ID_IN_FILE[]; +constexpr char KeyMaterial::COLUMN_KEY_ID_IN_FILE_PREFIX[]; + +constexpr char KeyMaterial::IS_FOOTER_KEY_FIELD[]; +constexpr char KeyMaterial::DOUBLE_WRAPPING_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_ID_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_URL_FIELD[]; +constexpr char KeyMaterial::MASTER_KEY_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_DEK_FIELD[]; +constexpr char KeyMaterial::KEK_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_KEK_FIELD[]; + +KeyMaterial::KeyMaterial(bool is_footer_key, const std::string& kms_instance_id, + const std::string& kms_instance_url, + const std::string& master_key_id, bool is_double_wrapped, + const std::string& kek_id, + const std::string& encoded_wrapped_kek, + const std::string& encoded_wrapped_dek) +: is_footer_key_(is_footer_key), + kms_instance_id_(kms_instance_id), + kms_instance_url_(kms_instance_url), + master_key_id_(master_key_id), + is_double_wrapped_(is_double_wrapped), + kek_id_(kek_id), + encoded_wrapped_kek_(encoded_wrapped_kek), + encoded_wrapped_dek_(encoded_wrapped_dek) {} + +KeyMaterial KeyMaterial::Parse(const std::string& key_material_string) { + rapidjson::Document document; + document.Parse(key_material_string.c_str()); Review comment: > I thought there was a document parser that takes a string pointer and a size, that is probably better to use if it is available. I've seen something for json (under `arrow/json` folder), but they seems not fit to the usage here. > Is the json allowed to be utf-8 encoded? Do you mean if rapidjson supports utf-8 encodings? If so, yes. ## File path: cpp/src/parquet/key_material.cc ## @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/key_material.h" +#include "parquet/key_metadata.h" + +namespace parquet { +namespace encryption { + +constexpr char KeyMaterial::KEY_MATERIAL_TYPE_FIELD[]; +constexpr char KeyMaterial::KEY_MATERIAL_TYPE1[]; + +constexpr char KeyMaterial::FOOTER_KEY_ID_IN_FILE[]; +constexpr char KeyMaterial::COLUMN_KEY_ID_IN_FILE_PREFIX[]; + +constexpr char KeyMaterial::IS_FOOTER_KEY_FIELD[]; +constexpr char KeyMaterial::DOUBLE_WRAPPING_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_ID_FIELD[]; +constexpr char KeyMaterial::KMS_INSTANCE_URL_FIELD[]; +constexpr char KeyMaterial::MASTER_KEY_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_DEK_FIELD[]; +constexpr char KeyMaterial::KEK_ID_FIELD[]; +constexpr char KeyMaterial::WRAPPED_KEK_FIELD[]; + +KeyMaterial::KeyMaterial(bool is_footer_key, const std::string& kms_instance_id, + const
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479625144 ## File path: cpp/src/parquet/file_key_wrapper.h ## @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "parquet/file_key_material_store.h" +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" + +namespace parquet { +namespace encryption { + +class FileKeyWrapper { + public: + static constexpr int KEK_LENGTH = 16; + static constexpr int KEK_ID_LENGTH = 16; + + static constexpr int RND_MAX_BYTES = 32; Review comment: This constant is unused -> removed it. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479625018 ## File path: cpp/src/parquet/file_key_unwrapper.h ## @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/encryption.h" +#include "parquet/key_material.h" +#include "parquet/key_toolkit.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { + public: + FileKeyUnwrapper(std::shared_ptr kms_client_factory, Review comment: `kms_client_factory` is stored here in `FileKeyUnwrapper` class and `PropertiesDrivenCryptoFactory` class. We are not sure the lifetime of these 2 classes, then I use shared_ptr. One way to use unique_ptr is only storing it at one place as a static variable in `KeyToolkit` class. But because you questioned about `KeyTookit` singleton, I'll wait until we resolve that question. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479624632 ## File path: cpp/src/parquet/file_key_unwrapper.h ## @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/encryption.h" +#include "parquet/key_material.h" +#include "parquet/key_toolkit.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever { + public: + FileKeyUnwrapper(std::shared_ptr kms_client_factory, + const KmsConnectionConfig& kms_connection_config, + uint64_t cache_lifetime, bool is_wrap_locally); + + std::string GetKey(const std::string& key_metadata) override; + + private: + class KeyWithMasterID { + public: +KeyWithMasterID(const std::string& key_bytes, const std::string& master_id) +: key_bytes_(key_bytes), master_id_(master_id) {} + +const std::string& data_key() const { return key_bytes_; } Review comment: Removed it and use `KeyWithMasterId` defined in `key_toolkit.h` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r479624053 ## File path: cpp/src/parquet/key_encryption_key.h ## @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/base64.h" + +namespace parquet { +namespace encryption { + +class KeyEncryptionKey { + public: + KeyEncryptionKey() = default; Review comment: Removed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r477067068 ## File path: cpp/src/parquet/encryption_internal.h ## @@ -45,6 +45,9 @@ constexpr int8_t kOffsetIndex = 7; /// Performs AES encryption operations with GCM or CTR ciphers. class AesEncryptor { public: + /// Can serve one key length only. Possible values: 16, 24, 32 bytes. + explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata); Review comment: @emkornfield I want to create a local variable of AesEncryptor, without using smart pointers, so it will store in stack rather than in heap. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r475668043 ## File path: cpp/src/parquet/encryption.h ## @@ -47,15 +47,15 @@ using ColumnPathToEncryptionPropertiesMap = class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual std::string GetKey(const std::string& key_metadata) const = 0; Review comment: Yes, I can do it. However, I don't understand why we must let it be `const` method. Can you please explain? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r475446337 ## File path: cpp/src/parquet/file_key_material_store.h ## @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License") = 0; you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace parquet { +namespace encryption { + +class FileKeyMaterialStore { + public: + // Initializes key material store for a parquet file. + virtual void Initialize(const std::string& parquet_file_path, bool temp_store) = 0; Review comment: For simplicity, I will remove members of this class since they are not used anywhere yet and keeps only the class name. I will add later when implementing with hadoop file system. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r475442393 ## File path: cpp/src/parquet/kms_client_factory.h ## @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "parquet/kms_client.h" +#include "parquet/platform.h" + +namespace parquet { +namespace encryption { + +class PARQUET_EXPORT KmsClientFactory { + public: + explicit KmsClientFactory(bool wrap_locally) : wrap_locally_(wrap_locally) {} + + KmsClientFactory() : KmsClientFactory(false) {} + + virtual ~KmsClientFactory() {} + + virtual std::shared_ptr CreateKmsClient( Review comment: The code accesses a KmsClient instance at many places, so I think a shared client is a must. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r475440024 ## File path: cpp/src/parquet/key_toolkit.h ## @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/key_encryption_key.h" +#include "parquet/kms_client.h" +#include "parquet/kms_client_factory.h" +#include "parquet/platform.h" +#include "parquet/two_level_cache_with_expiration.h" + +namespace parquet { +namespace encryption { + +class KeyWithMasterID { + public: + KeyWithMasterID(const std::string& key_bytes, const std::string& master_id) + : key_bytes_(key_bytes), master_id_(master_id) {} + + const std::string& data_key() const { return key_bytes_; } + const std::string& master_id() const { return master_id_; } + + private: + std::string key_bytes_; + std::string master_id_; +}; + +class PARQUET_EXPORT KeyToolkit { Review comment: I based on Java version, so maybe @ggershinsky will help to answer this question. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] thamht4190 commented on a change in pull request #8023: ARROW-9318: [C++] Parquet encryption key management
thamht4190 commented on a change in pull request #8023: URL: https://github.com/apache/arrow/pull/8023#discussion_r475368729 ## File path: cpp/src/parquet/encryption.h ## @@ -47,15 +47,15 @@ using ColumnPathToEncryptionPropertiesMap = class PARQUET_EXPORT DecryptionKeyRetriever { public: - virtual std::string GetKey(const std::string& key_metadata) const = 0; Review comment: @emkornfield because inherited classes (i.e. `FileKeyUnwrapper` class in this pull) may need to modify its members inside this method. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org