pitrou commented on code in PR #49667:
URL: https://github.com/apache/arrow/pull/49667#discussion_r3118928422
##########
python/pyarrow/includes/libparquet.pxd:
##########
@@ -633,6 +634,28 @@ cdef extern from "parquet/encryption/encryption.h"
namespace "parquet" nogil:
" parquet::FileDecryptionProperties":
pass
+ cdef cppclass CFileDecryptionPropertiesBuilder\
+ " parquet::FileDecryptionProperties::Builder":
+ CFileDecryptionPropertiesBuilder() except +
Review Comment:
If all these APIs can raise C++ exceptions, which kind of exceptions will be
raised on the Python side?
##########
python/pyarrow/tests/parquet/test_encryption.py:
##########
@@ -722,3 +722,204 @@ def test_encrypted_parquet_read_table(tempdir,
data_table, basic_encryption_conf
result_table = pq.read_table(
tempdir, decryption_properties=file_decryption_properties)
assert data_table.equals(result_table)
+
+
+class TestDirectKeyEncryption:
+ """Tests for create_encryption_properties /
create_decryption_properties."""
+
+ KEY_128 = b"0123456789abcdef"
+ KEY_192 = b"0123456789abcdef01234567"
+ KEY_256 = b"0123456789abcdef0123456789abcdef"
+ AAD_PREFIX = b"test_aad_prefix"
+
+ @pytest.mark.parametrize("key", [
+ b"0123456789abcdef",
+ b"0123456789abcdef01234567",
+ b"0123456789abcdef0123456789abcdef",
+ ], ids=["aes128", "aes192", "aes256"])
+ def test_roundtrip_key_sizes(self, tempdir, data_table, key):
+ path = tempdir / f"direct_{len(key) * 8}.parquet"
+
+ enc_props = pe.create_encryption_properties(footer_key=key)
+ pq.write_table(data_table, path, encryption_properties=enc_props)
+
+ dec_props = pe.create_decryption_properties(footer_key=key)
+ result = pq.read_table(path, decryption_properties=dec_props)
+ assert data_table.equals(result)
+
+ def test_roundtrip_with_aad_prefix(self, tempdir, data_table):
+ path = tempdir / "direct_aad.parquet"
+
+ enc_props = pe.create_encryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ )
+ pq.write_table(data_table, path, encryption_properties=enc_props)
+
+ dec_props = pe.create_decryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ )
+ result = pq.read_table(path, decryption_properties=dec_props)
+ assert data_table.equals(result)
+
+ def test_roundtrip_aad_prefix_not_stored(self, tempdir, data_table):
+ """When store_aad_prefix=False, reader must supply aad_prefix."""
+ path = tempdir / "direct_aad_not_stored.parquet"
+
+ enc_props = pe.create_encryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ store_aad_prefix=False,
+ )
+ pq.write_table(data_table, path, encryption_properties=enc_props)
+
+ # Reading without aad_prefix should fail
+ dec_props_no_aad = pe.create_decryption_properties(
+ footer_key=self.KEY_128,
+ )
+ with pytest.raises(IOError):
Review Comment:
Can we also test something about the exception message?
```suggestion
with pytest.raises(IOError, match='XXX...'):
```
##########
python/pyarrow/tests/parquet/test_encryption.py:
##########
@@ -722,3 +722,204 @@ def test_encrypted_parquet_read_table(tempdir,
data_table, basic_encryption_conf
result_table = pq.read_table(
tempdir, decryption_properties=file_decryption_properties)
assert data_table.equals(result_table)
+
+
+class TestDirectKeyEncryption:
+ """Tests for create_encryption_properties /
create_decryption_properties."""
+
+ KEY_128 = b"0123456789abcdef"
+ KEY_192 = b"0123456789abcdef01234567"
+ KEY_256 = b"0123456789abcdef0123456789abcdef"
+ AAD_PREFIX = b"test_aad_prefix"
+
+ @pytest.mark.parametrize("key", [
+ b"0123456789abcdef",
+ b"0123456789abcdef01234567",
+ b"0123456789abcdef0123456789abcdef",
+ ], ids=["aes128", "aes192", "aes256"])
+ def test_roundtrip_key_sizes(self, tempdir, data_table, key):
+ path = tempdir / f"direct_{len(key) * 8}.parquet"
+
+ enc_props = pe.create_encryption_properties(footer_key=key)
+ pq.write_table(data_table, path, encryption_properties=enc_props)
+
+ dec_props = pe.create_decryption_properties(footer_key=key)
+ result = pq.read_table(path, decryption_properties=dec_props)
+ assert data_table.equals(result)
+
+ def test_roundtrip_with_aad_prefix(self, tempdir, data_table):
+ path = tempdir / "direct_aad.parquet"
+
+ enc_props = pe.create_encryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ )
+ pq.write_table(data_table, path, encryption_properties=enc_props)
+
+ dec_props = pe.create_decryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ )
+ result = pq.read_table(path, decryption_properties=dec_props)
+ assert data_table.equals(result)
+
+ def test_roundtrip_aad_prefix_not_stored(self, tempdir, data_table):
+ """When store_aad_prefix=False, reader must supply aad_prefix."""
+ path = tempdir / "direct_aad_not_stored.parquet"
+
+ enc_props = pe.create_encryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ store_aad_prefix=False,
+ )
+ pq.write_table(data_table, path, encryption_properties=enc_props)
+
+ # Reading without aad_prefix should fail
+ dec_props_no_aad = pe.create_decryption_properties(
+ footer_key=self.KEY_128,
+ )
+ with pytest.raises(IOError):
+ pq.read_table(path, decryption_properties=dec_props_no_aad)
+
+ # Reading with correct aad_prefix should succeed
+ dec_props = pe.create_decryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ )
+ result = pq.read_table(path, decryption_properties=dec_props)
+ assert data_table.equals(result)
+
+ def test_wrong_aad_prefix_fails(self, tempdir, data_table):
+ path = tempdir / "direct_wrong_aad.parquet"
+
+ enc_props = pe.create_encryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=self.AAD_PREFIX,
+ )
+ pq.write_table(data_table, path, encryption_properties=enc_props)
+
+ dec_props = pe.create_decryption_properties(
+ footer_key=self.KEY_128,
+ aad_prefix=b"wrong_prefix",
+ )
+ with pytest.raises(IOError):
Review Comment:
Same here: can we also test something about the exception message (e.g. with `match=...`)?
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -711,3 +711,204 @@ cdef shared_ptr[CDecryptionConfiguration]
pyarrow_unwrap_decryptionconfig(object
if isinstance(decryptionconfig, DecryptionConfiguration):
return (<DecryptionConfiguration> decryptionconfig).unwrap()
raise TypeError("Expected DecryptionConfiguration, got %s" %
type(decryptionconfig))
+
+
+def create_decryption_properties(
+ footer_key,
+ *,
+ aad_prefix=None,
+ bint check_footer_integrity=True,
+ bint allow_plaintext_files=False,
+):
+ """
+ Create FileDecryptionProperties using a direct footer key.
+
+ This is a low-level API that constructs decryption properties directly
+ from a plaintext key, bypassing the KMS-based :class:`CryptoFactory`.
+ It is intended for callers that manage key wrapping and storage
+ themselves (e.g. an application-level scheme).
+
+ For most use cases, prefer the higher-level :class:`CryptoFactory`
+ with :class:`DecryptionConfiguration`, which implements the full
+ Parquet key management specification and is interoperable with
+ other tools and frameworks.
+
+ .. note::
+ Currently only uniform encryption (single key for footer and all
+ columns) is supported with this method. Per-column keys are not
+ yet available; files encrypted with per-column keys cannot be
+ decrypted using this function.
+
+ Parameters
+ ----------
+ footer_key : bytes
+ The decryption key for the file footer and all columns (uniform
+ encryption). Must be 16, 24, or 32 bytes for AES-128, AES-192,
+ or AES-256 respectively.
+ aad_prefix : bytes, optional
+ Additional Authenticated Data prefix. Must match the AAD prefix
+ that was used during encryption. Required if the AAD prefix was
+ not stored in the file metadata during encryption.
+ check_footer_integrity : bool, default True
+ Whether to verify footer integrity using the signature stored
+ in the file. Set to False only for debugging.
+ allow_plaintext_files : bool, default False
+ Whether to allow reading plaintext (unencrypted) files with
+ these decryption properties without raising an error.
+
+ Returns
+ -------
+ FileDecryptionProperties
+ Properties that can be passed to :func:`~pyarrow.parquet.read_table`,
+ :class:`~pyarrow.parquet.ParquetFile`, or
+ :class:`~pyarrow.dataset.ParquetFragmentScanOptions`.
+
+ Examples
+ --------
+ >>> import pyarrow.parquet as pq
+ >>> import pyarrow.parquet.encryption as pe
+ >>> props = pe.create_decryption_properties(
+ ... footer_key=b'0123456789abcdef',
+ ... aad_prefix=b'table_id',
+ ... )
+ >>> table = pq.read_table('encrypted.parquet', decryption_properties=props)
+ """
+ cdef:
+ CSecureString c_footer_key
+ c_string c_aad_prefix
+ CFileDecryptionPropertiesBuilder* builder
+ shared_ptr[CFileDecryptionProperties] props
+
+ footer_key_bytes = tobytes(footer_key)
+ if len(footer_key_bytes) not in (16, 24, 32):
+ raise ValueError(
+ f"footer_key must be 16, 24, or 32 bytes, got
{len(footer_key_bytes)}"
+ )
+
+ c_footer_key = CSecureString(<c_string>footer_key_bytes)
+ builder = new CFileDecryptionPropertiesBuilder()
Review Comment:
Why are we using `new`? We can create a plain value, I think.
##########
python/pyarrow/tests/parquet/test_encryption.py:
##########
@@ -722,3 +722,204 @@ def test_encrypted_parquet_read_table(tempdir,
data_table, basic_encryption_conf
result_table = pq.read_table(
tempdir, decryption_properties=file_decryption_properties)
assert data_table.equals(result_table)
+
+
+class TestDirectKeyEncryption:
+ """Tests for create_encryption_properties /
create_decryption_properties."""
+
+ KEY_128 = b"0123456789abcdef"
+ KEY_192 = b"0123456789abcdef01234567"
+ KEY_256 = b"0123456789abcdef0123456789abcdef"
+ AAD_PREFIX = b"test_aad_prefix"
+
+ @pytest.mark.parametrize("key", [
+ b"0123456789abcdef",
+ b"0123456789abcdef01234567",
+ b"0123456789abcdef0123456789abcdef",
Review Comment:
Nit: reuse the `KEY_` constants above?
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -711,3 +711,204 @@ cdef shared_ptr[CDecryptionConfiguration]
pyarrow_unwrap_decryptionconfig(object
if isinstance(decryptionconfig, DecryptionConfiguration):
return (<DecryptionConfiguration> decryptionconfig).unwrap()
raise TypeError("Expected DecryptionConfiguration, got %s" %
type(decryptionconfig))
+
+
+def create_decryption_properties(
+ footer_key,
+ *,
+ aad_prefix=None,
+ bint check_footer_integrity=True,
+ bint allow_plaintext_files=False,
+):
+ """
+ Create FileDecryptionProperties using a direct footer key.
+
+ This is a low-level API that constructs decryption properties directly
+ from a plaintext key, bypassing the KMS-based :class:`CryptoFactory`.
+ It is intended for callers that manage key wrapping and storage
+ themselves (e.g. an application-level scheme).
+
+ For most use cases, prefer the higher-level :class:`CryptoFactory`
+ with :class:`DecryptionConfiguration`, which implements the full
+ Parquet key management specification and is interoperable with
+ other tools and frameworks.
+
+ .. note::
+ Currently only uniform encryption (single key for footer and all
+ columns) is supported with this method. Per-column keys are not
+ yet available; files encrypted with per-column keys cannot be
+ decrypted using this function.
+
+ Parameters
+ ----------
+ footer_key : bytes
+ The decryption key for the file footer and all columns (uniform
+ encryption). Must be 16, 24, or 32 bytes for AES-128, AES-192,
+ or AES-256 respectively.
+ aad_prefix : bytes, optional
+ Additional Authenticated Data prefix. Must match the AAD prefix
+ that was used during encryption. Required if the AAD prefix was
+ not stored in the file metadata during encryption.
+ check_footer_integrity : bool, default True
+ Whether to verify footer integrity using the signature stored
+ in the file. Set to False only for debugging.
+ allow_plaintext_files : bool, default False
+ Whether to allow reading plaintext (unencrypted) files with
+ these decryption properties without raising an error.
+
+ Returns
+ -------
+ FileDecryptionProperties
+ Properties that can be passed to :func:`~pyarrow.parquet.read_table`,
+ :class:`~pyarrow.parquet.ParquetFile`, or
+ :class:`~pyarrow.dataset.ParquetFragmentScanOptions`.
+
+ Examples
+ --------
+ >>> import pyarrow.parquet as pq
+ >>> import pyarrow.parquet.encryption as pe
+ >>> props = pe.create_decryption_properties(
+ ... footer_key=b'0123456789abcdef',
+ ... aad_prefix=b'table_id',
+ ... )
+ >>> table = pq.read_table('encrypted.parquet', decryption_properties=props)
+ """
+ cdef:
+ CSecureString c_footer_key
+ c_string c_aad_prefix
+ CFileDecryptionPropertiesBuilder* builder
+ shared_ptr[CFileDecryptionProperties] props
+
+ footer_key_bytes = tobytes(footer_key)
Review Comment:
We shouldn't call `tobytes` here, as it will UTF-8-encode a `str` object.
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -711,3 +711,204 @@ cdef shared_ptr[CDecryptionConfiguration]
pyarrow_unwrap_decryptionconfig(object
if isinstance(decryptionconfig, DecryptionConfiguration):
return (<DecryptionConfiguration> decryptionconfig).unwrap()
raise TypeError("Expected DecryptionConfiguration, got %s" %
type(decryptionconfig))
+
+
+def create_decryption_properties(
+ footer_key,
+ *,
+ aad_prefix=None,
+ bint check_footer_integrity=True,
+ bint allow_plaintext_files=False,
+):
+ """
+ Create FileDecryptionProperties using a direct footer key.
+
+ This is a low-level API that constructs decryption properties directly
+ from a plaintext key, bypassing the KMS-based :class:`CryptoFactory`.
+ It is intended for callers that manage key wrapping and storage
+ themselves (e.g. an application-level scheme).
+
+ For most use cases, prefer the higher-level :class:`CryptoFactory`
+ with :class:`DecryptionConfiguration`, which implements the full
+ Parquet key management specification and is interoperable with
+ other tools and frameworks.
+
+ .. note::
+ Currently only uniform encryption (single key for footer and all
+ columns) is supported with this method. Per-column keys are not
+ yet available; files encrypted with per-column keys cannot be
+ decrypted using this function.
+
+ Parameters
+ ----------
+ footer_key : bytes
+ The decryption key for the file footer and all columns (uniform
+ encryption). Must be 16, 24, or 32 bytes for AES-128, AES-192,
+ or AES-256 respectively.
+ aad_prefix : bytes, optional
+ Additional Authenticated Data prefix. Must match the AAD prefix
+ that was used during encryption. Required if the AAD prefix was
+ not stored in the file metadata during encryption.
+ check_footer_integrity : bool, default True
+ Whether to verify footer integrity using the signature stored
+ in the file. Set to False only for debugging.
+ allow_plaintext_files : bool, default False
+ Whether to allow reading plaintext (unencrypted) files with
+ these decryption properties without raising an error.
+
+ Returns
+ -------
+ FileDecryptionProperties
+ Properties that can be passed to :func:`~pyarrow.parquet.read_table`,
+ :class:`~pyarrow.parquet.ParquetFile`, or
+ :class:`~pyarrow.dataset.ParquetFragmentScanOptions`.
+
+ Examples
+ --------
+ >>> import pyarrow.parquet as pq
+ >>> import pyarrow.parquet.encryption as pe
+ >>> props = pe.create_decryption_properties(
+ ... footer_key=b'0123456789abcdef',
+ ... aad_prefix=b'table_id',
+ ... )
+ >>> table = pq.read_table('encrypted.parquet', decryption_properties=props)
+ """
+ cdef:
+ CSecureString c_footer_key
+ c_string c_aad_prefix
+ CFileDecryptionPropertiesBuilder* builder
+ shared_ptr[CFileDecryptionProperties] props
+
+ footer_key_bytes = tobytes(footer_key)
+ if len(footer_key_bytes) not in (16, 24, 32):
+ raise ValueError(
+ f"footer_key must be 16, 24, or 32 bytes, got
{len(footer_key_bytes)}"
+ )
+
+ c_footer_key = CSecureString(<c_string>footer_key_bytes)
+ builder = new CFileDecryptionPropertiesBuilder()
+
+ try:
+ builder.footer_key(c_footer_key)
+
+ if aad_prefix is not None:
+ c_aad_prefix = tobytes(aad_prefix)
Review Comment:
Same here: we shouldn't call `tobytes`, as it will UTF-8-encode a `str` object.
##########
python/pyarrow/_parquet_encryption.pyx:
##########
@@ -711,3 +711,204 @@ cdef shared_ptr[CDecryptionConfiguration]
pyarrow_unwrap_decryptionconfig(object
if isinstance(decryptionconfig, DecryptionConfiguration):
return (<DecryptionConfiguration> decryptionconfig).unwrap()
raise TypeError("Expected DecryptionConfiguration, got %s" %
type(decryptionconfig))
+
+
+def create_decryption_properties(
+ footer_key,
+ *,
+ aad_prefix=None,
+ bint check_footer_integrity=True,
+ bint allow_plaintext_files=False,
+):
+ """
+ Create FileDecryptionProperties using a direct footer key.
+
+ This is a low-level API that constructs decryption properties directly
+ from a plaintext key, bypassing the KMS-based :class:`CryptoFactory`.
+ It is intended for callers that manage key wrapping and storage
+ themselves (e.g. an application-level scheme).
+
+ For most use cases, prefer the higher-level :class:`CryptoFactory`
+ with :class:`DecryptionConfiguration`, which implements the full
+ Parquet key management specification and is interoperable with
+ other tools and frameworks.
+
+ .. note::
+ Currently only uniform encryption (single key for footer and all
+ columns) is supported with this method. Per-column keys are not
+ yet available; files encrypted with per-column keys cannot be
+ decrypted using this function.
+
+ Parameters
+ ----------
+ footer_key : bytes
+ The decryption key for the file footer and all columns (uniform
+ encryption). Must be 16, 24, or 32 bytes for AES-128, AES-192,
+ or AES-256 respectively.
+ aad_prefix : bytes, optional
+ Additional Authenticated Data prefix. Must match the AAD prefix
+ that was used during encryption. Required if the AAD prefix was
+ not stored in the file metadata during encryption.
+ check_footer_integrity : bool, default True
+ Whether to verify footer integrity using the signature stored
+ in the file. Set to False only for debugging.
+ allow_plaintext_files : bool, default False
+ Whether to allow reading plaintext (unencrypted) files with
+ these decryption properties without raising an error.
+
+ Returns
+ -------
+ FileDecryptionProperties
+ Properties that can be passed to :func:`~pyarrow.parquet.read_table`,
+ :class:`~pyarrow.parquet.ParquetFile`, or
+ :class:`~pyarrow.dataset.ParquetFragmentScanOptions`.
+
+ Examples
+ --------
+ >>> import pyarrow.parquet as pq
+ >>> import pyarrow.parquet.encryption as pe
+ >>> props = pe.create_decryption_properties(
+ ... footer_key=b'0123456789abcdef',
+ ... aad_prefix=b'table_id',
+ ... )
+ >>> table = pq.read_table('encrypted.parquet', decryption_properties=props)
+ """
+ cdef:
+ CSecureString c_footer_key
+ c_string c_aad_prefix
+ CFileDecryptionPropertiesBuilder* builder
+ shared_ptr[CFileDecryptionProperties] props
+
+ footer_key_bytes = tobytes(footer_key)
+ if len(footer_key_bytes) not in (16, 24, 32):
+ raise ValueError(
+ f"footer_key must be 16, 24, or 32 bytes, got
{len(footer_key_bytes)}"
+ )
+
+ c_footer_key = CSecureString(<c_string>footer_key_bytes)
+ builder = new CFileDecryptionPropertiesBuilder()
+
+ try:
+ builder.footer_key(c_footer_key)
+
+ if aad_prefix is not None:
+ c_aad_prefix = tobytes(aad_prefix)
+ builder.aad_prefix(c_aad_prefix)
+
+ if not check_footer_integrity:
+ builder.disable_footer_signature_verification()
+
+ if allow_plaintext_files:
+ builder.plaintext_files_allowed()
+
+ props = builder.build()
+ finally:
+ del builder
+
+ return FileDecryptionProperties.wrap(props)
+
+
+def create_encryption_properties(
+ footer_key,
+ *,
+ aad_prefix=None,
+ bint store_aad_prefix=True,
+ encryption_algorithm="AES_GCM_V1",
+ bint plaintext_footer=False,
+):
+ """
+ Create FileEncryptionProperties using a direct footer key.
+
+ This is a low-level API that constructs encryption properties directly
+ from a plaintext key, bypassing the KMS-based :class:`CryptoFactory`.
+ It is intended for callers that manage key wrapping and storage
+ themselves (e.g. an application-level scheme).
+
+ .. warning::
+ The caller is responsible for key management best practices.
+ Reusing the same key for multiple files without unique data keys
+ weakens AES-GCM security. The higher-level :class:`CryptoFactory`
+ with :class:`EncryptionConfiguration` handles this automatically
+ and is interoperable with other tools and frameworks --
+ prefer it unless you have a specific reason to manage
+ keys yourself.
+
+ .. note::
+ Currently only uniform encryption (single key for footer and all
+ columns) is supported with this method. Per-column keys are not
+ yet available; the provided key encrypts both the footer and
+ every column.
+
+ Parameters
+ ----------
+ footer_key : bytes
+ The encryption key for the file footer and all columns (uniform
+ encryption). Must be 16, 24, or 32 bytes for AES-128, AES-192,
+ or AES-256 respectively.
+ aad_prefix : bytes, optional
+ Additional Authenticated Data prefix for cryptographic binding.
+ store_aad_prefix : bool, default True
+ Whether to store the AAD prefix in the Parquet file metadata.
+ Set to False when the AAD prefix will be supplied externally
+ at read time.
+ Only meaningful when *aad_prefix* is provided.
+ encryption_algorithm : str, default "AES_GCM_V1"
+ Encryption algorithm. Either ``"AES_GCM_V1"`` or
+ ``"AES_GCM_CTR_V1"``.
+ plaintext_footer : bool, default False
+ Whether to leave the file footer unencrypted. When True, file
+ schema and metadata are readable without a key.
+
+ Returns
+ -------
+ FileEncryptionProperties
+ Properties that can be passed to :func:`~pyarrow.parquet.write_table`
or
+ :class:`~pyarrow.parquet.ParquetWriter`.
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> import pyarrow.parquet as pq
+ >>> import pyarrow.parquet.encryption as pe
+ >>> table = pa.table({'col': [1, 2, 3]})
+ >>> props = pe.create_encryption_properties(
+ ... footer_key=b'0123456789abcdef',
+ ... aad_prefix=b'table_id',
+ ... store_aad_prefix=False,
+ ... )
+ >>> pq.write_table(table, 'encrypted.parquet', encryption_properties=props)
+ """
+ cdef:
+ CSecureString c_footer_key
+ c_string c_aad_prefix
+ CFileEncryptionPropertiesBuilder* builder
+ shared_ptr[CFileEncryptionProperties] props
+ ParquetCipher cipher
+
+ footer_key_bytes = tobytes(footer_key)
Review Comment:
Same here (we shouldn't call `tobytes`, as it will UTF-8-encode a `str` object), and likewise for the other instances below.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]