pitrou commented on code in PR #45411:
URL: https://github.com/apache/arrow/pull/45411#discussion_r1945044630
##########
docs/source/cpp/parquet.rst:
##########
@@ -585,6 +585,82 @@ More specifically, Parquet C++ supports:
* EncryptionWithFooterKey and EncryptionWithColumnKey modes.
* Encrypted Footer and Plaintext Footer modes.
+Configuration
+~~~~~~~~~~~~~
+
+An example for writing a dataset using encrypted Parquet file format:
+
+.. code-block:: cpp
+
+ #include <arrow/util/logging.h>
+
+ #include "arrow/dataset/file_parquet.h"
+ #include "arrow/dataset/parquet_encryption_config.h"
+ #include "arrow/testing/gtest_util.h"
+ #include "parquet/encryption/crypto_factory.h"
+
+ using arrow::internal::checked_pointer_cast;
+
+ auto crypto_factory =
std::make_shared<parquet::encryption::CryptoFactory>();
+ parquet::encryption::KmsClientFactory kms_client_factory = ...;
+ crypto_factory->RegisterKmsClientFactory(std::move(kms_client_factory));
+ auto kms_connection_config =
std::make_shared<parquet::encryption::KmsConnectionConfig>();
+
+ // Set write options with encryption configuration.
+ auto encryption_config =
+ std::make_shared<parquet::encryption::EncryptionConfiguration>(
+ std::string("footer_key"));
+ encryption_config->column_keys = "col_key: a";
+ auto parquet_encryption_config =
std::make_shared<ParquetEncryptionConfig>();
+ // Directly assign shared_ptr objects to ParquetEncryptionConfig members
+ parquet_encryption_config->crypto_factory = crypto_factory;
+ parquet_encryption_config->kms_connection_config = kms_connection_config;
+ parquet_encryption_config->encryption_config = std::move(encryption_config);
+
+ auto file_format = std::make_shared<ParquetFileFormat>();
+ auto parquet_file_write_options =
+
checked_pointer_cast<ParquetFileWriteOptions>(file_format->DefaultWriteOptions());
+ parquet_file_write_options->parquet_encryption_config =
+ std::move(parquet_encryption_config);
+
+ // Write dataset.
+ arrow::Table table = ...;
+ auto dataset = std::make_shared<InMemoryDataset>(table);
+ EXPECT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
+ EXPECT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
+
+ FileSystemDatasetWriteOptions write_options;
+ write_options.file_write_options = parquet_file_write_options;
+ write_options.base_dir = "example.parquet";
+ ARROW_CHECK_OK(FileSystemDataset::Write(write_options, std::move(scanner)));
+
+Column encryption is configured by setting ``encryption_config->column_keys``
to a string
+of the format ``"masterKeyID:colName,colName;masterKeyID:colName..."``.
+
+Encrypting columns that have nested fields (for instance struct, map, or even
list data types)
+requires configuring column keys for the inner fields, not the column itself.
+Configuring a column key for the column itself causes this error (here column
name is ``col``):
+
+.. code-block::
+
+ OSError: Encrypted column col not in file schema
+
+An example encryption configuration for columns with nested fields:
+
+.. code-block:: cpp
+
+ auto table_schema = schema({
+ field("ListColumn", list(int32())),
+ field("MapColumn", map(utf8(), int32())),
+ field("StructColumn", struct_({field("f1", int32()), field("f2",
utf8())})),
+ });
+
+ encryption_config->column_keys = "column_key_name: "
+ "ListColumn.list.element, "
+ "MapColumn.key_value.key,
MapColumn.key_value.value, "
+ "StructColumn.f1, StructColumn.f2"
Review Comment:
Are the spaces embedded in the string actually supported? Also, it seems to
lack a semicolon at the end of the line.
##########
docs/source/cpp/parquet.rst:
##########
@@ -585,6 +585,82 @@ More specifically, Parquet C++ supports:
* EncryptionWithFooterKey and EncryptionWithColumnKey modes.
* Encrypted Footer and Plaintext Footer modes.
+Configuration
+~~~~~~~~~~~~~
+
+An example for writing a dataset using encrypted Parquet file format:
+
+.. code-block:: cpp
+
+ #include <arrow/util/logging.h>
+
+ #include "arrow/dataset/file_parquet.h"
+ #include "arrow/dataset/parquet_encryption_config.h"
+ #include "arrow/testing/gtest_util.h"
+ #include "parquet/encryption/crypto_factory.h"
+
+ using arrow::internal::checked_pointer_cast;
+
+ auto crypto_factory =
std::make_shared<parquet::encryption::CryptoFactory>();
+ parquet::encryption::KmsClientFactory kms_client_factory = ...;
+ crypto_factory->RegisterKmsClientFactory(std::move(kms_client_factory));
+ auto kms_connection_config =
std::make_shared<parquet::encryption::KmsConnectionConfig>();
+
+ // Set write options with encryption configuration.
+ auto encryption_config =
+ std::make_shared<parquet::encryption::EncryptionConfiguration>(
+ std::string("footer_key"));
+ encryption_config->column_keys = "col_key: a";
+ auto parquet_encryption_config =
std::make_shared<ParquetEncryptionConfig>();
+ // Directly assign shared_ptr objects to ParquetEncryptionConfig members
+ parquet_encryption_config->crypto_factory = crypto_factory;
+ parquet_encryption_config->kms_connection_config = kms_connection_config;
+ parquet_encryption_config->encryption_config = std::move(encryption_config);
+
+ auto file_format = std::make_shared<ParquetFileFormat>();
+ auto parquet_file_write_options =
+
checked_pointer_cast<ParquetFileWriteOptions>(file_format->DefaultWriteOptions());
+ parquet_file_write_options->parquet_encryption_config =
+ std::move(parquet_encryption_config);
+
+ // Write dataset.
+ arrow::Table table = ...;
+ auto dataset = std::make_shared<InMemoryDataset>(table);
+ EXPECT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
+ EXPECT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
+
+ FileSystemDatasetWriteOptions write_options;
+ write_options.file_write_options = parquet_file_write_options;
+ write_options.base_dir = "example.parquet";
+ ARROW_CHECK_OK(FileSystemDataset::Write(write_options, std::move(scanner)));
+
+Column encryption is configured by setting ``encryption_config->column_keys``
to a string
+of the format ``"masterKeyID:colName,colName;masterKeyID:colName..."``.
Review Comment:
```suggestion
of the format ``"columnKeyID:colName,colName;columnKeyID:colName..."``.
```
##########
docs/source/cpp/parquet.rst:
##########
@@ -585,6 +585,82 @@ More specifically, Parquet C++ supports:
* EncryptionWithFooterKey and EncryptionWithColumnKey modes.
* Encrypted Footer and Plaintext Footer modes.
+Configuration
+~~~~~~~~~~~~~
+
+An example for writing a dataset using encrypted Parquet file format:
+
+.. code-block:: cpp
Review Comment:
@jorisvandenbossche @AlenkaF @raulcd What is our preferred policy for code
examples? Do we put them inline in the docs? Do we use separate files?
##########
docs/source/python/parquet.rst:
##########
@@ -739,6 +739,36 @@ An example encryption configuration:
},
)
+.. note::
+ Encrypting columns that have nested fields (for instance struct, map, or
even list data types)
+ requires configuring column keys for the inner fields, not the column itself.
+ Configuring a column key for the column itself causes this error (here
column name is ``col``):
+
+ .. code-block::
+
+ OSError: Encrypted column col not in file schema
+
+An example encryption configuration for columns with nested fields:
Review Comment:
```suggestion
An example encryption configuration for columns with nested fields, where
all of the columns will be encrypted with the key identified by
``column_key_id``:
```
##########
docs/source/python/parquet.rst:
##########
@@ -739,6 +739,36 @@ An example encryption configuration:
},
)
+.. note::
+ Encrypting columns that have nested fields (for instance struct, map, or
even list data types)
+ requires configuring column keys for the inner fields, not the column itself.
+ Configuring a column key for the column itself causes this error (here
column name is ``col``):
+
+ .. code-block::
+
+ OSError: Encrypted column col not in file schema
+
+An example encryption configuration for columns with nested fields:
+
+.. code-block:: python
+
+ schema = pa.schema([
+ ("ListColumn", pa.list_(pa.int32())),
+ ("MapColumn", pa.map_(pa.string(), pa.int32())),
+ ("StructColumn", pa.struct([("f1", pa.int32()), ("f2", pa.string())])),
+ ])
+
+ encryption_config = pq.EncryptionConfiguration(
+ footer_key="footer_key_name",
+ column_keys={
+ "column_key_name": [
Review Comment:
```suggestion
"column_key_id": [
```
##########
docs/source/cpp/parquet.rst:
##########
@@ -585,6 +585,82 @@ More specifically, Parquet C++ supports:
* EncryptionWithFooterKey and EncryptionWithColumnKey modes.
* Encrypted Footer and Plaintext Footer modes.
+Configuration
+~~~~~~~~~~~~~
+
+An example for writing a dataset using encrypted Parquet file format:
+
+.. code-block:: cpp
+
+ #include <arrow/util/logging.h>
+
+ #include "arrow/dataset/file_parquet.h"
+ #include "arrow/dataset/parquet_encryption_config.h"
+ #include "arrow/testing/gtest_util.h"
+ #include "parquet/encryption/crypto_factory.h"
+
+ using arrow::internal::checked_pointer_cast;
+
+ auto crypto_factory =
std::make_shared<parquet::encryption::CryptoFactory>();
+ parquet::encryption::KmsClientFactory kms_client_factory = ...;
+ crypto_factory->RegisterKmsClientFactory(std::move(kms_client_factory));
+ auto kms_connection_config =
std::make_shared<parquet::encryption::KmsConnectionConfig>();
+
+ // Set write options with encryption configuration.
+ auto encryption_config =
+ std::make_shared<parquet::encryption::EncryptionConfiguration>(
+ std::string("footer_key"));
+ encryption_config->column_keys = "col_key: a";
+ auto parquet_encryption_config =
std::make_shared<ParquetEncryptionConfig>();
+ // Directly assign shared_ptr objects to ParquetEncryptionConfig members
+ parquet_encryption_config->crypto_factory = crypto_factory;
+ parquet_encryption_config->kms_connection_config = kms_connection_config;
+ parquet_encryption_config->encryption_config = std::move(encryption_config);
+
+ auto file_format = std::make_shared<ParquetFileFormat>();
+ auto parquet_file_write_options =
+
checked_pointer_cast<ParquetFileWriteOptions>(file_format->DefaultWriteOptions());
+ parquet_file_write_options->parquet_encryption_config =
+ std::move(parquet_encryption_config);
+
+ // Write dataset.
+ arrow::Table table = ...;
+ auto dataset = std::make_shared<InMemoryDataset>(table);
+ EXPECT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
+ EXPECT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
+
+ FileSystemDatasetWriteOptions write_options;
+ write_options.file_write_options = parquet_file_write_options;
+ write_options.base_dir = "example.parquet";
+ ARROW_CHECK_OK(FileSystemDataset::Write(write_options, std::move(scanner)));
+
+Column encryption is configured by setting ``encryption_config->column_keys``
to a string
+of the format ``"masterKeyID:colName,colName;masterKeyID:colName..."``.
+
+Encrypting columns that have nested fields (for instance struct, map, or even
list data types)
+requires configuring column keys for the inner fields, not the column itself.
+Configuring a column key for the column itself causes this error (here column
name is ``col``):
+
+.. code-block::
+
+ OSError: Encrypted column col not in file schema
+
+An example encryption configuration for columns with nested fields:
+
+.. code-block:: cpp
+
+ auto table_schema = schema({
+ field("ListColumn", list(int32())),
+ field("MapColumn", map(utf8(), int32())),
+ field("StructColumn", struct_({field("f1", int32()), field("f2",
utf8())})),
+ });
+
+ encryption_config->column_keys = "column_key_name: "
Review Comment:
```suggestion
encryption_config->column_keys = "column_key_id: "
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]