This is an automated email from the ASF dual-hosted git repository.
adamreeve pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 336fdb35868 GH-47435: [Python][Parquet] Add direct key encryption/decryption API (#49667)
336fdb35868 is described below
commit 336fdb35868af7ce46001b73703dd3f7f2e39b8d
Author: Sreesh Maheshwar <[email protected]>
AuthorDate: Sun May 24 23:25:58 2026 +0100
GH-47435: [Python][Parquet] Add direct key encryption/decryption API (#49667)
### Rationale for this change
See https://github.com/apache/arrow/issues/47435.
### What changes are included in this PR?
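Adds a low-level Python API, `create_encryption_properties` and `create_decryption_properties`, that builds Parquet file encryption/decryption properties directly from a caller-supplied footer key instead of going through the KMS-based `CryptoFactory`.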
### Are these changes tested?
Yes, see PR.
### Are there any user-facing changes?
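Yes: `pyarrow.parquet.encryption` gains two new public functions, `create_encryption_properties` and `create_decryption_properties`.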
* GitHub Issue: #47435
Authored-by: Sreesh Maheshwar <[email protected]>
Signed-off-by: Adam Reeve <[email protected]>
---
docs/source/python/api/formats.rst | 2 +
python/pyarrow/_parquet_encryption.pxd | 5 +
python/pyarrow/_parquet_encryption.pyx | 209 ++++++++++++++++++++++
python/pyarrow/includes/libparquet.pxd | 25 ++-
python/pyarrow/parquet/encryption.py | 4 +-
python/pyarrow/tests/parquet/test_encryption.py | 226 ++++++++++++++++++++++++
6 files changed, 469 insertions(+), 2 deletions(-)
diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst
index a4f02084c4a..57a5e824fab 100644
--- a/docs/source/python/api/formats.rst
+++ b/docs/source/python/api/formats.rst
@@ -119,6 +119,8 @@ Encrypted Parquet Files
KmsConnectionConfig
EncryptionConfiguration
DecryptionConfiguration
+ create_encryption_properties
+ create_decryption_properties
.. _api.orc:
diff --git a/python/pyarrow/_parquet_encryption.pxd b/python/pyarrow/_parquet_encryption.pxd
index 48939fe277f..1a12a6d6785 100644
--- a/python/pyarrow/_parquet_encryption.pxd
+++ b/python/pyarrow/_parquet_encryption.pxd
@@ -20,6 +20,11 @@
from pyarrow.includes.common cimport *
from pyarrow.includes.libparquet_encryption cimport *
+from pyarrow.includes.libparquet cimport (
+ CSecureString,
+ CFileDecryptionPropertiesBuilder,
+ CFileEncryptionPropertiesBuilder,
+)
from pyarrow._parquet cimport (ParquetCipher,
CFileEncryptionProperties,
CFileDecryptionProperties,
diff --git a/python/pyarrow/_parquet_encryption.pyx b/python/pyarrow/_parquet_encryption.pyx
index db6a6b56ac4..7fe7fa7491d 100644
--- a/python/pyarrow/_parquet_encryption.pyx
+++ b/python/pyarrow/_parquet_encryption.pyx
@@ -711,3 +711,212 @@ cdef shared_ptr[CDecryptionConfiguration] pyarrow_unwrap_decryptionconfig(object
if isinstance(decryptionconfig, DecryptionConfiguration):
return (<DecryptionConfiguration> decryptionconfig).unwrap()
raise TypeError("Expected DecryptionConfiguration, got %s" %
type(decryptionconfig))
+
+
+def create_decryption_properties(
+    footer_key,
+    *,
+    aad_prefix=None,
+    bint check_footer_integrity=True,
+    bint allow_plaintext_files=False,
+):
+    """
+    Create FileDecryptionProperties using a direct footer key.
+
+    This is a low-level API that constructs decryption properties directly
+    from a plaintext key, bypassing the KMS-based :class:`CryptoFactory`.
+    It is intended for callers that manage key wrapping and storage
+    themselves (e.g. an application-level scheme).
+
+    For most use cases, prefer the higher-level :class:`CryptoFactory`
+    with :class:`DecryptionConfiguration`, which implements the full
+    Parquet key management specification and is interoperable with
+    other tools and frameworks.
+
+    .. note::
+       Currently only uniform encryption (single key for footer and all
+       columns) is supported with this method. Per-column keys are not
+       yet available; files encrypted with per-column keys cannot be
+       decrypted using this function.
+
+    Parameters
+    ----------
+    footer_key : bytes
+        The decryption key for the file footer and all columns (uniform
+        encryption). Must be 16, 24, or 32 bytes for AES-128, AES-192,
+        or AES-256 respectively.
+    aad_prefix : bytes, optional
+        Additional Authenticated Data prefix. Must match the AAD prefix
+        that was used during encryption. Required if the AAD prefix was
+        not stored in the file metadata during encryption.
+    check_footer_integrity : bool, default True
+        Whether to verify footer integrity using the signature stored
+        in the file. Set to False only for debugging.
+    allow_plaintext_files : bool, default False
+        Whether to allow reading plaintext (unencrypted) files with
+        these decryption properties without raising an error.
+
+    Returns
+    -------
+    FileDecryptionProperties
+        Properties that can be passed to :func:`~pyarrow.parquet.read_table`,
+        :class:`~pyarrow.parquet.ParquetFile`, or
+        :class:`~pyarrow.dataset.ParquetFragmentScanOptions`.
+
+    Raises
+    ------
+    TypeError
+        If *footer_key* is not ``bytes``, or if *aad_prefix* is given and
+        is not ``bytes``.
+    ValueError
+        If *footer_key* is not 16, 24, or 32 bytes long.
+
+    Examples
+    --------
+    >>> import pyarrow.parquet as pq
+    >>> import pyarrow.parquet.encryption as pe
+    >>> props = pe.create_decryption_properties(
+    ...     footer_key=b'0123456789abcdef',
+    ...     aad_prefix=b'table_id',
+    ... )
+    >>> table = pq.read_table('encrypted.parquet', decryption_properties=props)
+    """
+    cdef:
+        c_string c_footer_key_str
+        CSecureString c_footer_key
+        CFileDecryptionPropertiesBuilder builder
+        shared_ptr[CFileDecryptionProperties] props
+
+    # Validate in Python first so callers get a clear TypeError/ValueError
+    # before anything is handed to the C++ builder.
+    if not isinstance(footer_key, bytes):
+        raise TypeError(
+            f"footer_key must be bytes, not {type(footer_key).__name__}"
+        )
+    if len(footer_key) not in (16, 24, 32):
+        raise ValueError(
+            f"footer_key must be 16, 24, or 32 bytes, got {len(footer_key)}"
+        )
+
+    # Convert the key to a C++ string, then move that string into the
+    # CSecureString the builder expects.
+    c_footer_key_str = <c_string>footer_key
+    c_footer_key = CSecureString(move(c_footer_key_str))
+    builder.footer_key(c_footer_key)
+
+    if aad_prefix is not None:
+        if not isinstance(aad_prefix, bytes):
+            raise TypeError(
+                f"aad_prefix must be bytes, not {type(aad_prefix).__name__}"
+            )
+        builder.aad_prefix(<c_string>aad_prefix)
+
+    # Only the non-default choices are forwarded to the builder; the
+    # defaults are left to the C++ builder itself.
+    if not check_footer_integrity:
+        builder.disable_footer_signature_verification()
+
+    if allow_plaintext_files:
+        builder.plaintext_files_allowed()
+
+    props = builder.build()
+
+    return FileDecryptionProperties.wrap(props)
+
+
+def create_encryption_properties(
+    footer_key,
+    *,
+    aad_prefix=None,
+    bint store_aad_prefix=True,
+    encryption_algorithm="AES_GCM_V1",
+    bint plaintext_footer=False,
+):
+    """
+    Create FileEncryptionProperties using a direct footer key.
+
+    This is a low-level API that constructs encryption properties directly
+    from a plaintext key, bypassing the KMS-based :class:`CryptoFactory`.
+    It is intended for callers that manage key wrapping and storage
+    themselves (e.g. an application-level scheme).
+
+    .. warning::
+       The caller is responsible for key management best practices.
+       Reusing the same key for multiple files without unique data keys
+       weakens AES-GCM security. The higher-level :class:`CryptoFactory`
+       with :class:`EncryptionConfiguration` handles this automatically
+       and is interoperable with other tools and frameworks --
+       prefer it unless you have a specific reason to manage
+       keys yourself.
+
+    .. note::
+       Currently only uniform encryption (single key for footer and all
+       columns) is supported with this method. Per-column keys are not
+       yet available; the provided key encrypts both the footer and
+       every column.
+
+    Parameters
+    ----------
+    footer_key : bytes
+        The encryption key for the file footer and all columns (uniform
+        encryption). Must be 16, 24, or 32 bytes for AES-128, AES-192,
+        or AES-256 respectively.
+    aad_prefix : bytes, optional
+        Additional Authenticated Data prefix for cryptographic binding.
+    store_aad_prefix : bool, default True
+        Whether to store the AAD prefix in the Parquet file metadata.
+        Set to False when the AAD prefix will be supplied externally
+        at read time.
+        Only meaningful when *aad_prefix* is provided.
+    encryption_algorithm : str, default "AES_GCM_V1"
+        Encryption algorithm. Either ``"AES_GCM_V1"`` or
+        ``"AES_GCM_CTR_V1"``.
+    plaintext_footer : bool, default False
+        Whether to leave the file footer unencrypted. When True, file
+        schema and metadata are readable without a key.
+
+    Returns
+    -------
+    FileEncryptionProperties
+        Properties that can be passed to :func:`~pyarrow.parquet.write_table` or
+        :class:`~pyarrow.parquet.ParquetWriter`.
+
+    Raises
+    ------
+    TypeError
+        If *footer_key* is not ``bytes``, or if *aad_prefix* is given and
+        is not ``bytes``.
+    ValueError
+        If *footer_key* is not 16, 24, or 32 bytes long. An unrecognised
+        *encryption_algorithm* is rejected by ``cipher_from_name``.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.parquet as pq
+    >>> import pyarrow.parquet.encryption as pe
+    >>> table = pa.table({'col': [1, 2, 3]})
+    >>> props = pe.create_encryption_properties(
+    ...     footer_key=b'0123456789abcdef',
+    ...     aad_prefix=b'table_id',
+    ...     store_aad_prefix=False,
+    ... )
+    >>> pq.write_table(table, 'encrypted.parquet', encryption_properties=props)
+    """
+    cdef:
+        c_string c_footer_key_str
+        CSecureString c_footer_key
+        CFileEncryptionPropertiesBuilder* builder
+        shared_ptr[CFileEncryptionProperties] props
+        ParquetCipher cipher
+
+    # Validate in Python first so callers get a clear TypeError/ValueError
+    # before anything is handed to the C++ builder.
+    if not isinstance(footer_key, bytes):
+        raise TypeError(
+            f"footer_key must be bytes, not {type(footer_key).__name__}"
+        )
+    if len(footer_key) not in (16, 24, 32):
+        raise ValueError(
+            f"footer_key must be 16, 24, or 32 bytes, got {len(footer_key)}"
+        )
+
+    cipher = cipher_from_name(encryption_algorithm)
+    # Convert the key to a C++ string, then move that string into the
+    # CSecureString the builder expects.
+    c_footer_key_str = <c_string>footer_key
+    c_footer_key = CSecureString(move(c_footer_key_str))
+    # The builder is constructed from the key, so it is allocated with
+    # `new`; the try/finally releases it even if configuration raises.
+    builder = new CFileEncryptionPropertiesBuilder(c_footer_key)
+    try:
+        builder.algorithm(cipher)
+
+        if aad_prefix is not None:
+            if not isinstance(aad_prefix, bytes):
+                raise TypeError(
+                    f"aad_prefix must be bytes, not {type(aad_prefix).__name__}"
+                )
+            builder.aad_prefix(<c_string>aad_prefix)
+            # store_aad_prefix is only consulted when a prefix was supplied.
+            if not store_aad_prefix:
+                builder.disable_aad_prefix_storage()
+
+        if plaintext_footer:
+            builder.set_plaintext_footer()
+
+        props = builder.build()
+    finally:
+        del builder
+
+    return FileEncryptionProperties.wrap(props)
diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd
index a834bd5dfa0..df353cc7805 100644
--- a/python/pyarrow/includes/libparquet.pxd
+++ b/python/pyarrow/includes/libparquet.pxd
@@ -22,7 +22,8 @@ from pyarrow.includes.libarrow cimport (Type, CChunkedArray, CScalar, CSchema,
CStatus, CTable, CMemoryPool, CBuffer,
CKeyValueMetadata, CRandomAccessFile,
COutputStream, CCacheOptions,
- TimeUnit, CRecordBatchReader)
+ TimeUnit, CRecordBatchReader,
+ CSecureString)
cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
@@ -635,6 +636,28 @@ cdef extern from "parquet/encryption/encryption.h" namespace "parquet" nogil:
" parquet::FileDecryptionProperties":
pass
+    # Binding for the C++ decryption-properties builder. The setters are
+    # declared as returning the builder pointer; build() yields the
+    # shared FileDecryptionProperties.
+    cdef cppclass CFileDecryptionPropertiesBuilder\
+            " parquet::FileDecryptionProperties::Builder":
+        CFileDecryptionPropertiesBuilder() except +
+        CFileDecryptionPropertiesBuilder* footer_key(
+            CSecureString footer_key) except +
+        CFileDecryptionPropertiesBuilder* aad_prefix(
+            c_string aad_prefix) except +
+        CFileDecryptionPropertiesBuilder* disable_footer_signature_verification() except +
+        CFileDecryptionPropertiesBuilder* plaintext_files_allowed() except +
+        shared_ptr[CFileDecryptionProperties] build() except +
+
+
cdef cppclass CFileEncryptionProperties\
" parquet::FileEncryptionProperties":
pass
+
+    # Binding for the C++ encryption-properties builder. Only a
+    # constructor taking the footer key is declared; the setters are
+    # declared as returning the builder pointer.
+    cdef cppclass CFileEncryptionPropertiesBuilder\
+            " parquet::FileEncryptionProperties::Builder":
+        CFileEncryptionPropertiesBuilder(CSecureString footer_key) except +
+        CFileEncryptionPropertiesBuilder* set_plaintext_footer() except +
+        CFileEncryptionPropertiesBuilder* algorithm(
+            ParquetCipher parquet_cipher) except +
+        CFileEncryptionPropertiesBuilder* aad_prefix(
+            c_string aad_prefix) except +
+        CFileEncryptionPropertiesBuilder* disable_aad_prefix_storage() except +
+        shared_ptr[CFileEncryptionProperties] build() except +
diff --git a/python/pyarrow/parquet/encryption.py b/python/pyarrow/parquet/encryption.py
index df6eed913fa..ce95e5d4507 100644
--- a/python/pyarrow/parquet/encryption.py
+++ b/python/pyarrow/parquet/encryption.py
@@ -20,4 +20,6 @@ from pyarrow._parquet_encryption import (CryptoFactory,  # noqa
EncryptionConfiguration,
DecryptionConfiguration,
KmsConnectionConfig,
- KmsClient)
+ KmsClient,
+ create_encryption_properties,
+ create_decryption_properties)
diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py
index 4e2fb069bd0..6a3842f3edf 100644
--- a/python/pyarrow/tests/parquet/test_encryption.py
+++ b/python/pyarrow/tests/parquet/test_encryption.py
@@ -37,6 +37,11 @@ FOOTER_KEY_NAME = "footer_key"
COL_KEY = b"1234567890123450"
COL_KEY_NAME = "col_key"
+# Raw footer keys for the direct-key tests: 16, 24 and 32 bytes long,
+# plus an AAD prefix shared by the AAD-related tests.
+DIRECT_KEY_128 = b"0123456789abcdef"
+DIRECT_KEY_192 = b"0123456789abcdef01234567"
+DIRECT_KEY_256 = b"0123456789abcdef0123456789abcdef"
+DIRECT_AAD_PREFIX = b"test_aad_prefix"
+
# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet_encryption'
@@ -722,3 +727,224 @@ def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_conf
result_table = pq.read_table(
tempdir, decryption_properties=file_decryption_properties)
assert data_table.equals(result_table)
+
+
+class TestDirectKeyEncryption:
+    """Tests for create_encryption_properties / create_decryption_properties."""
+
+    @pytest.mark.parametrize("key", [
+        DIRECT_KEY_128, DIRECT_KEY_192, DIRECT_KEY_256,
+    ], ids=["aes128", "aes192", "aes256"])
+    def test_roundtrip_key_sizes(self, tempdir, data_table, key):
+        path = tempdir / f"direct_{len(key) * 8}.parquet"
+
+        enc_props = pe.create_encryption_properties(footer_key=key)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(footer_key=key)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_roundtrip_with_aad_prefix(self, tempdir, data_table):
+        path = tempdir / "direct_aad.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128,
+            aad_prefix=DIRECT_AAD_PREFIX,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128,
+            aad_prefix=DIRECT_AAD_PREFIX,
+        )
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_roundtrip_aad_prefix_not_stored(self, tempdir, data_table):
+        """When store_aad_prefix=False, reader must supply aad_prefix."""
+        path = tempdir / "direct_aad_not_stored.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128,
+            aad_prefix=DIRECT_AAD_PREFIX,
+            store_aad_prefix=False,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        # Reading without aad_prefix should fail
+        dec_props_no_aad = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128,
+        )
+        with pytest.raises(IOError, match="AAD"):
+            pq.read_table(path, decryption_properties=dec_props_no_aad)
+
+        # Reading with correct aad_prefix should succeed
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128,
+            aad_prefix=DIRECT_AAD_PREFIX,
+        )
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_wrong_aad_prefix_fails(self, tempdir, data_table):
+        path = tempdir / "direct_wrong_aad.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128,
+            aad_prefix=DIRECT_AAD_PREFIX,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128,
+            aad_prefix=b"wrong_prefix",
+        )
+        with pytest.raises(IOError, match="AAD"):
+            pq.read_table(path, decryption_properties=dec_props)
+
+    def test_encrypted_file_has_pare_magic(self, tempdir, data_table):
+        path = tempdir / "direct_magic.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        with open(path, "rb") as f:
+            magic = f.read(4)
+        assert magic == b"PARE"
+
+    def test_plaintext_footer(self, tempdir, data_table):
+        path = tempdir / "direct_plaintext_footer.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128,
+            plaintext_footer=True,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_aes_gcm_ctr_v1_algorithm(self, tempdir, data_table):
+        path = tempdir / "direct_ctr.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128,
+            encryption_algorithm="AES_GCM_CTR_V1",
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128)
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_wrong_key_fails(self, tempdir, data_table):
+        path = tempdir / "direct_wrong_key.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        wrong_key = b"fedcba9876543210"
+        dec_props = pe.create_decryption_properties(footer_key=wrong_key)
+        with pytest.raises(IOError, match="decrypt"):
+            pq.read_table(path, decryption_properties=dec_props)
+
+    def test_reading_without_decryption_fails(self, tempdir, data_table):
+        path = tempdir / "direct_no_decrypt.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        with pytest.raises(IOError, match="encrypted metadata"):
+            pq.read_table(path)
+
+    def test_allow_plaintext_files(self, tempdir, data_table):
+        """Plaintext file reads should work when allow_plaintext_files=True."""
+        path = tempdir / "plaintext.parquet"
+        pq.write_table(data_table, path)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128,
+            allow_plaintext_files=True,
+        )
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_plaintext_file_rejected_by_default(self, tempdir, data_table):
+        """Default allow_plaintext_files=False should reject plaintext files."""
+        path = tempdir / "plaintext_rejected.parquet"
+        pq.write_table(data_table, path)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128)
+        with pytest.raises(IOError, match="plaintext"):
+            pq.read_table(path, decryption_properties=dec_props)
+
+    def test_check_footer_integrity_false(self, tempdir, data_table):
+        """check_footer_integrity=False should still allow decryption."""
+        path = tempdir / "direct_no_footer_check.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128)
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        dec_props = pe.create_decryption_properties(
+            footer_key=DIRECT_KEY_128,
+            check_footer_integrity=False,
+        )
+        result = pq.read_table(path, decryption_properties=dec_props)
+        assert data_table.equals(result)
+
+    def test_plaintext_footer_has_par1_magic(self, tempdir, data_table):
+        """plaintext_footer=True should produce PAR1 magic, not PARE."""
+        path = tempdir / "direct_plaintext_magic.parquet"
+
+        enc_props = pe.create_encryption_properties(
+            footer_key=DIRECT_KEY_128,
+            plaintext_footer=True,
+        )
+        pq.write_table(data_table, path, encryption_properties=enc_props)
+
+        with open(path, "rb") as f:
+            magic = f.read(4)
+        assert magic == b"PAR1"
+
+    def test_invalid_key_length_raises(self):
+        with pytest.raises(ValueError, match="16, 24, or 32 bytes"):
+            pe.create_encryption_properties(footer_key=b"short")
+
+        with pytest.raises(ValueError, match="16, 24, or 32 bytes"):
+            pe.create_encryption_properties(footer_key=b"")
+
+        with pytest.raises(ValueError, match="16, 24, or 32 bytes"):
+            pe.create_decryption_properties(footer_key=b"short")
+
+    def test_invalid_algorithm_raises(self):
+        with pytest.raises(ValueError, match="Invalid cipher name"):
+            pe.create_encryption_properties(
+                footer_key=DIRECT_KEY_128,
+                encryption_algorithm="INVALID",
+            )
+
+    def test_footer_key_rejects_non_bytes(self):
+        with pytest.raises(TypeError, match="footer_key must be bytes"):
+            pe.create_encryption_properties(footer_key="0123456789abcdef")
+
+        with pytest.raises(TypeError, match="footer_key must be bytes"):
+            pe.create_decryption_properties(footer_key="0123456789abcdef")
+
+        with pytest.raises(TypeError, match="footer_key must be bytes"):
+            pe.create_encryption_properties(footer_key=None)
+
+    def test_aad_prefix_rejects_str(self):
+        # No files are written here, so no tempdir/data_table fixtures are
+        # requested. Both entry points validate aad_prefix, so check both.
+        with pytest.raises(TypeError, match="aad_prefix must be bytes"):
+            pe.create_encryption_properties(
+                footer_key=DIRECT_KEY_128,
+                aad_prefix="not_bytes",
+            )
+
+        with pytest.raises(TypeError, match="aad_prefix must be bytes"):
+            pe.create_decryption_properties(
+                footer_key=DIRECT_KEY_128,
+                aad_prefix="not_bytes",
+            )