adamreeve commented on code in PR #48009:
URL: https://github.com/apache/arrow/pull/48009#discussion_r2496153376
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -413,6 +425,17 @@ cdef class CryptoFactory(_Weakrefable):
encryption_config : EncryptionConfiguration
Configuration of the encryption, such as which columns to encrypt
+ parquet_file_path : str or pathlib.Path, default ""
+ Path to the parquet file to be encrypted. Only required when the
+ internal_key_material attribute of EncryptionConfiguration is set
+ to False. Used to derive the path for storing key material
+ specific to this parquet file.
+
+ filesystem : FileSystem, default None
+ Used only when internal_key_material is set to False on
+ EncryptionConfiguration. If nothinng passed, will be inferred
Review Comment:
```suggestion
EncryptionConfiguration. If None, the file system will be
inferred
```
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -473,9 +529,167 @@ cdef class CryptoFactory(_Weakrefable):
def remove_cache_entries_for_all_tokens(self):
self.factory.get().RemoveCacheEntriesForAllTokens()
+ def rotate_master_keys(
+ self,
+ KmsConnectionConfig kms_connection_config,
+ parquet_file_path,
+ FileSystem filesystem,
Review Comment:
Probably makes sense to default to None and infer the file system by default:
```suggestion
FileSystem filesystem=None,
```
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -445,6 +482,15 @@ cdef class CryptoFactory(_Weakrefable):
Configuration of the decryption, such as cache timeout.
Can be None.
+ parquet_file_path : str or pathlib.Path, default ""
+ Path to the parquet file to be decrypted. Only required when
+ the parquet file uses external key material. Used to derive
+ the path to the external key material file.
+
+ filesystem : FileSystem, default None
+ Used only when the parquet file uses external key material. If
+ nothinng passed, will be inferred based on parquet_file_path.
Review Comment:
```suggestion
None, the file system will be inferred based on
parquet_file_path.
```
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -473,9 +529,167 @@ cdef class CryptoFactory(_Weakrefable):
def remove_cache_entries_for_all_tokens(self):
self.factory.get().RemoveCacheEntriesForAllTokens()
+ def rotate_master_keys(
+ self,
+ KmsConnectionConfig kms_connection_config,
+ parquet_file_path,
+ FileSystem filesystem,
+ double_wrapping = True,
+ cache_lifetime_seconds = 600):
+ """ Rotates master encryption keys for a Parquet file that uses
+ external key material.
+
+ Parameters
+ ----------
+ kms_connection_config : KmsConnectionConfig
+ Configuration of connection to KMS
+
+ parquet_file_path : str or pathlib.Path
+ Path to a parquet file using external key material.
+
+ filesystem : FileSystem, default None
+ Used only when the parquet file uses external key material. If
+ nothinng passed, will be inferred based on parquet_file_path.
+
+ double_wrapping : bool, default True
+ In the single wrapping mode, decrypts data keys with old master
+ keys, then encrypts them with new master keys. In the double
+ wrapping mode, decrypts KEKs (key encryption keys) with old
+ master keys, generates new KEKs and encrypts them with new master
+ keys.
+
+ cache_lifetime_seconds : int or float, default 600
+ During key rotation, KMS Client and Key Encryption Keys will be
+ cached for this duration.
+
+ Returns
+ -------
+ file_decryption_properties : FileDecryptionProperties
+ File decryption properties.
Review Comment:
This doesn't return anything
```suggestion
```
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -473,9 +529,167 @@ cdef class CryptoFactory(_Weakrefable):
def remove_cache_entries_for_all_tokens(self):
self.factory.get().RemoveCacheEntriesForAllTokens()
+ def rotate_master_keys(
+ self,
+ KmsConnectionConfig kms_connection_config,
+ parquet_file_path,
+ FileSystem filesystem,
+ double_wrapping = True,
+ cache_lifetime_seconds = 600):
+ """ Rotates master encryption keys for a Parquet file that uses
+ external key material.
+
+ Parameters
+ ----------
+ kms_connection_config : KmsConnectionConfig
+ Configuration of connection to KMS
+
+ parquet_file_path : str or pathlib.Path
+ Path to a parquet file using external key material.
+
+ filesystem : FileSystem, default None
+ Used only when the parquet file uses external key material. If
+ nothinng passed, will be inferred based on parquet_file_path.
+
+ double_wrapping : bool, default True
+ In the single wrapping mode, decrypts data keys with old master
+ keys, then encrypts them with new master keys. In the double
+ wrapping mode, decrypts KEKs (key encryption keys) with old
+ master keys, generates new KEKs and encrypts them with new master
+ keys.
+
+ cache_lifetime_seconds : int or float, default 600
+ During key rotation, KMS Client and Key Encryption Keys will be
+ cached for this duration.
+
+ Returns
+ -------
+ file_decryption_properties : FileDecryptionProperties
+ File decryption properties.
+ """
+ cdef:
+ c_string c_parquet_file_path
+ shared_ptr[CFileSystem] c_filesystem
+
+ if parquet_file_path != "":
+ filesystem, parquet_file_path = _resolve_filesystem_and_path(
+ parquet_file_path, filesystem)
+
+ c_parquet_file_path = tobytes(parquet_file_path)
+ c_filesystem = _unwrap_fs(filesystem)
+
+ status = self.factory.get().SafeRotateMasterKeys(
+ deref(kms_connection_config.unwrap().get()),
+ c_parquet_file_path,
+ c_filesystem,
+ double_wrapping,
+ cache_lifetime_seconds);
+
+ return check_status(status)
Review Comment:
`check_status` returns an int, I think this should just be:
```suggestion
check_status(status)
```
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -473,9 +529,167 @@ cdef class CryptoFactory(_Weakrefable):
def remove_cache_entries_for_all_tokens(self):
self.factory.get().RemoveCacheEntriesForAllTokens()
+ def rotate_master_keys(
+ self,
+ KmsConnectionConfig kms_connection_config,
+ parquet_file_path,
+ FileSystem filesystem,
+ double_wrapping = True,
+ cache_lifetime_seconds = 600):
+ """ Rotates master encryption keys for a Parquet file that uses
+ external key material.
+
+ Parameters
+ ----------
+ kms_connection_config : KmsConnectionConfig
+ Configuration of connection to KMS
+
+ parquet_file_path : str or pathlib.Path
+ Path to a parquet file using external key material.
+
+ filesystem : FileSystem, default None
+ Used only when the parquet file uses external key material. If
+ nothinng passed, will be inferred based on parquet_file_path.
Review Comment:
```suggestion
None, the file system will be inferred based on
parquet_file_path.
```
##########
python/pyarrow/tests/parquet/test_encryption.py:
##########
@@ -514,31 +533,33 @@ def kms_factory(kms_connection_configuration):
# assert table.num_rows == result_table.num_rows
[email protected](reason="External key material not supported yet")
-def test_encrypted_parquet_write_external(tempdir, data_table):
- """Write an encrypted parquet, with external key
- material.
- Currently it's not implemented, so should throw
- an exception"""
+def test_encrypted_parquet_write_read_external(tempdir, data_table,
+ external_encryption_config):
+ """Write an encrypted parquet with external key material.
+ """
path = tempdir / PARQUET_NAME
- # Encrypt the file with the footer key
- encryption_config = pe.EncryptionConfiguration(
- footer_key=FOOTER_KEY_NAME,
- column_keys={},
- internal_key_material=False)
-
- kms_connection_config = pe.KmsConnectionConfig(
- custom_kms_conf={FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8")}
- )
-
- def kms_factory(kms_connection_configuration):
- return InMemoryKmsClient(kms_connection_configuration)
+ kms_connection_config, crypto_factory = write_encrypted_file(
+ path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
+ external_encryption_config)
- crypto_factory = pe.CryptoFactory(kms_factory)
- # Write with encryption properties
- write_encrypted_parquet(path, data_table, encryption_config,
- kms_connection_config, crypto_factory)
+ verify_file_encrypted(path)
+
+ decryption_config = pe.DecryptionConfiguration()
+ result_table = read_encrypted_parquet(
+ path, decryption_config, kms_connection_config, crypto_factory,
+ internal_key_material=False)
+
+ try:
+ store = pa._parquet_encryption.FileSystemKeyMaterialStore.for_file(
+ path)
+ assert len(key_ids := store.get_key_id_set()) == (
+ len(external_encryption_config.column_keys[COL_KEY_NAME]) + 1 )
+ assert all([store.get_key_material(k) is not None for k in key_ids ])
+ except Exception as e:
+ pytest.fail(f"Unable to read external key material store: {e}")
+
+ assert data_table.equals(result_table)
Review Comment:
Could you also please add a test of key rotation?
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -473,9 +529,167 @@ cdef class CryptoFactory(_Weakrefable):
def remove_cache_entries_for_all_tokens(self):
self.factory.get().RemoveCacheEntriesForAllTokens()
+ def rotate_master_keys(
+ self,
+ KmsConnectionConfig kms_connection_config,
+ parquet_file_path,
+ FileSystem filesystem,
+ double_wrapping = True,
+ cache_lifetime_seconds = 600):
+ """ Rotates master encryption keys for a Parquet file that uses
+ external key material.
+
+ Parameters
+ ----------
+ kms_connection_config : KmsConnectionConfig
+ Configuration of connection to KMS
+
+ parquet_file_path : str or pathlib.Path
+ Path to a parquet file using external key material.
+
+ filesystem : FileSystem, default None
+ Used only when the parquet file uses external key material. If
+ nothinng passed, will be inferred based on parquet_file_path.
+
+ double_wrapping : bool, default True
+ In the single wrapping mode, decrypts data keys with old master
+ keys, then encrypts them with new master keys. In the double
+ wrapping mode, decrypts KEKs (key encryption keys) with old
+ master keys, generates new KEKs and encrypts them with new master
+ keys.
Review Comment:
This comment sounds like it's assuming that the `double_wrapping` parameter
matches the existing state of the file, but I think it's possible to use this
method to convert from single to double wrapping or vice-versa. How about
something like:
```suggestion
In the single wrapping mode, encrypts data encryption keys with
new master keys. In the double wrapping mode, generates new
KEKs (key encryption keys) and uses these to encrypt the data
keys,
and encrypts the KEKs with the new master keys.
```
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -473,9 +529,167 @@ cdef class CryptoFactory(_Weakrefable):
def remove_cache_entries_for_all_tokens(self):
self.factory.get().RemoveCacheEntriesForAllTokens()
+ def rotate_master_keys(
+ self,
+ KmsConnectionConfig kms_connection_config,
+ parquet_file_path,
+ FileSystem filesystem,
+ double_wrapping = True,
+ cache_lifetime_seconds = 600):
+ """ Rotates master encryption keys for a Parquet file that uses
+ external key material.
+
+ Parameters
+ ----------
+ kms_connection_config : KmsConnectionConfig
+ Configuration of connection to KMS
+
+ parquet_file_path : str or pathlib.Path
+ Path to a parquet file using external key material.
+
+ filesystem : FileSystem, default None
+ Used only when the parquet file uses external key material. If
+ nothinng passed, will be inferred based on parquet_file_path.
+
+ double_wrapping : bool, default True
+ In the single wrapping mode, decrypts data keys with old master
+ keys, then encrypts them with new master keys. In the double
+ wrapping mode, decrypts KEKs (key encryption keys) with old
+ master keys, generates new KEKs and encrypts them with new master
+ keys.
+
+ cache_lifetime_seconds : int or float, default 600
+ During key rotation, KMS Client and Key Encryption Keys will be
+ cached for this duration.
+
+ Returns
+ -------
+ file_decryption_properties : FileDecryptionProperties
+ File decryption properties.
+ """
+ cdef:
+ c_string c_parquet_file_path
+ shared_ptr[CFileSystem] c_filesystem
+
+ if parquet_file_path != "":
+ filesystem, parquet_file_path = _resolve_filesystem_and_path(
+ parquet_file_path, filesystem)
+
+ c_parquet_file_path = tobytes(parquet_file_path)
+ c_filesystem = _unwrap_fs(filesystem)
+
+ status = self.factory.get().SafeRotateMasterKeys(
+ deref(kms_connection_config.unwrap().get()),
+ c_parquet_file_path,
+ c_filesystem,
+ double_wrapping,
+ cache_lifetime_seconds);
+
+ return check_status(status)
+
cdef inline shared_ptr[CPyCryptoFactory] unwrap(self):
return self.factory
+
+cdef class KeyMaterial(_Weakrefable):
+
+ @property
+ def is_footer_key(self):
+ return self.key_material.get().is_footer_key()
+
+ @property
+ def is_double_wrapped(self):
+ return self.key_material.get().is_double_wrapped()
+
+ @property
+ def master_key_id(self):
+ return frombytes(self.key_material.get().master_key_id())
+
+ @property
+ def wrapped_dek(self):
+ return frombytes(self.key_material.get().wrapped_dek())
+
+ @property
+ def kek_id(self):
+ return frombytes(self.key_material.get().kek_id())
+
+ @property
+ def wrapped_kek(self):
+ return frombytes(self.key_material.get().wrapped_kek())
+
+ @property
+ def kms_instance_id(self):
+ return frombytes(self.key_material.get().kms_instance_id())
+
+ @property
+ def kms_instance_url(self):
+ return frombytes(self.key_material.get().kms_instance_url())
+ @staticmethod
+ cdef inline KeyMaterial wrap(shared_ptr[CKeyMaterial] key_material):
+ wrapper = KeyMaterial()
+ wrapper.key_material = key_material
+ return wrapper
+
+ @staticmethod
+ def parse(
+ const c_string key_material_string):
+ cdef:
+ shared_ptr[CKeyMaterial] c_key_material
+ c_key_material = make_shared[CKeyMaterial](move(
+ CKeyMaterial.Parse(key_material_string)
+ ))
+ return KeyMaterial.wrap(c_key_material)
+
+cdef class FileSystemKeyMaterialStore(_Weakrefable):
+
+ def get_key_material(self, key_id):
+ cdef:
+ c_string c_key_id = tobytes(key_id)
+ c_string c_key_material_string
+
+ c_key_material_string = self.store.get().GetKeyMaterial(c_key_id)
+ if c_key_material_string.empty():
+ raise KeyError("Invalid key id")
+ return KeyMaterial.parse(c_key_material_string)
+
+ def get_key_id_set(self):
+ return self.store.get().GetKeyIDSet()
+
+ @classmethod
+ def for_file(cls, parquet_file_path,
+ FileSystem filesystem = None):
+ """Creates a FileSystemKeyMaterialStore for a parquet file that
+ was created with external key material.
+
+ Parameters
+ ----------
+ parquet_file_path : str or pathlib.Path
+ Path to a parquet file using external key material.
+
+ filesystem : FileSystem, default None
+ FileSystem where the parquet file is located. If nothinng passed,
Review Comment:
```suggestion
FileSystem where the parquet file is located. If None,
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]