wgtmac commented on code in PR #34616:
URL: https://github.com/apache/arrow/pull/34616#discussion_r1269059268


##########
cpp/src/arrow/dataset/file_parquet.h:
##########
@@ -236,11 +252,25 @@ class ARROW_DS_EXPORT ParquetFileWriteOptions : public 
FileWriteOptions {
   /// \brief Parquet Arrow writer properties.
   std::shared_ptr<parquet::ArrowWriterProperties> arrow_writer_properties;
 
+  /// \brief A getter function to retrieve the dataset encryption configuration
+  std::shared_ptr<ParquetEncryptionConfig> GetParquetEncryptionConfig() const {
+    return parquet_encryption_config_;
+  }
+  /// \brief A setter for ParquetEncryptionConfig
+  void SetParquetEncryptionConfig(
+      std::shared_ptr<ParquetEncryptionConfig> dataset_encryption_config) {
+    parquet_encryption_config_ = std::move(dataset_encryption_config);

Review Comment:
   ```suggestion
         std::shared_ptr<ParquetEncryptionConfig> parquet_encryption_config) {
       parquet_encryption_config_ = std::move(parquet_encryption_config);
   ```



##########
cpp/src/arrow/dataset/file_parquet.h:
##########
@@ -226,6 +229,19 @@ class ARROW_DS_EXPORT ParquetFragmentScanOptions : public 
FragmentScanOptions {
   /// ScanOptions. Additionally, dictionary columns come from
   /// ParquetFileFormat::ReaderOptions::dict_columns.
   std::shared_ptr<parquet::ArrowReaderProperties> arrow_reader_properties;
+  /// \brief A getter function to retrieve the dataset decryption configuration

Review Comment:
   ```suggestion
     /// \brief A getter function to retrieve the parquet decryption 
configuration
   ```



##########
cpp/src/arrow/dataset/file_parquet.h:
##########
@@ -236,11 +252,25 @@ class ARROW_DS_EXPORT ParquetFileWriteOptions : public 
FileWriteOptions {
   /// \brief Parquet Arrow writer properties.
   std::shared_ptr<parquet::ArrowWriterProperties> arrow_writer_properties;
 
+  /// \brief A getter function to retrieve the dataset encryption configuration

Review Comment:
   ```suggestion
     /// \brief A getter function to retrieve the parquet encryption 
configuration
   ```



##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -637,10 +717,14 @@ cdef class 
ParquetFragmentScanOptions(FragmentScanOptions):
         If not None, override the maximum total size of containers allocated
         when decoding Thrift structures. The default limit should be
         sufficient for most Parquet files.
+    dataset_decryption_config : ParquetDecryptionConfig, default None

Review Comment:
   ```suggestion
       parquet_decryption_config : ParquetDecryptionConfig, default None
   ```



##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -564,6 +635,14 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             data_page_version=self._properties["data_page_version"],
         )
 
+        cdef shared_ptr[CParquetEncryptionConfig] c_config
+        if self._properties["dataset_encryption_config"]:
+            config = self._properties["dataset_encryption_config"]

Review Comment:
   ```suggestion
           if self._properties["parquet_encryption_config"]:
               config = self._properties["parquet_encryption_config"]
   ```



##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -598,6 +677,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             coerce_timestamps=None,
             allow_truncated_timestamps=False,
             use_compliant_nested_type=True,
+            dataset_encryption_config=None,

Review Comment:
   ```suggestion
               parquet_encryption_config=None,
   ```



##########
python/examples/dataset/write_dataset_encrypted.py:
##########
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.parquet.encryption as pe
+from pyarrow.tests.parquet.encryption import InMemoryKmsClient
+from datetime import timedelta
+import shutil
+import os
+
+""" A sample to demonstrate dataset encryption and decryption"""
+
+# create a list of dictionaries that will represent our dataset
+table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+                  'n_legs': [2, 2, 4, 4, 5, 100],
+                  'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+                             "Brittle stars", "Centipede"]})
+
+# create a PyArrow dataset from the table
+dataset = ds.dataset(table)
+
+FOOTER_KEY = b"0123456789112345"
+FOOTER_KEY_NAME = "footer_key"
+COL_KEY = b"1234567890123450"
+COL_KEY_NAME = "col_key"
+
+encryption_config = pe.EncryptionConfiguration(
+    footer_key=FOOTER_KEY_NAME,
+    plaintext_footer=False,
+    # Use COL_KEY_NAME to encrypt `n_legs` and `animal` columns.
+    column_keys={
+        COL_KEY_NAME: ["n_legs", "animal"],
+    },
+    encryption_algorithm="AES_GCM_V1",
+    # requires timedelta or an assertion is raised
+    cache_lifetime=timedelta(minutes=5.0),
+    data_key_length_bits=256)
+
+kms_connection_config = pe.KmsConnectionConfig(
+    custom_kms_conf={
+        FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
+        COL_KEY_NAME: COL_KEY.decode("UTF-8"),
+    }
+)
+
+decryption_config = pe.DecryptionConfiguration(cache_lifetime=300)
+
+
+def kms_factory(kms_connection_configuration):
+    return InMemoryKmsClient(kms_connection_configuration)
+
+
+crypto_factory = pe.CryptoFactory(kms_factory)
+dataset_encryption_cfg = ds.ParquetEncryptionConfig(
+    crypto_factory, kms_connection_config, encryption_config)
+dataset_decryption_cfg = ds.ParquetDecryptionConfig(crypto_factory,

Review Comment:
   ```suggestion
   parquet_encryption_cfg = ds.ParquetEncryptionConfig(
       crypto_factory, kms_connection_config, encryption_config)
   parquet_decryption_cfg = ds.ParquetDecryptionConfig(crypto_factory,
   ```



##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -670,6 +758,14 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
     cdef ArrowReaderProperties* arrow_reader_properties(self):
         return self.parquet_options.arrow_reader_properties.get()
 
+    @property
+    def dataset_decryption_config(self):
+        return self._dataset_decryption_config
+
+    @dataset_decryption_config.setter

Review Comment:
   ```suggestion
       def parquet_decryption_config(self):
           return self._parquet_decryption_config
   
       @parquet_decryption_config.setter
   ```



##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -637,10 +717,14 @@ cdef class 
ParquetFragmentScanOptions(FragmentScanOptions):
         If not None, override the maximum total size of containers allocated
         when decoding Thrift structures. The default limit should be
         sufficient for most Parquet files.
+    dataset_decryption_config : ParquetDecryptionConfig, default None
+        If not None, use the provided ParquetDecryptionConfig to decrypt the
+        Parquet file.
     """
 
     cdef:
         CParquetFragmentScanOptions* parquet_options
+        ParquetDecryptionConfig _dataset_decryption_config

Review Comment:
   ```suggestion
           ParquetDecryptionConfig _parquet_decryption_config
   ```



##########
python/pyarrow/includes/libarrow_dataset_parquet.pxd:
##########
@@ -31,6 +31,8 @@ cdef extern from "arrow/dataset/api.h" namespace 
"arrow::dataset" nogil:
             "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions):
         shared_ptr[WriterProperties] writer_properties
         shared_ptr[ArrowWriterProperties] arrow_writer_properties
+        shared_ptr[CParquetEncryptionConfig] GetParquetEncryptionConfig()
+        void SetParquetEncryptionConfig(shared_ptr[CParquetEncryptionConfig] 
dataset_encryption_config)

Review Comment:
   ```suggestion
           void SetParquetEncryptionConfig(shared_ptr[CParquetEncryptionConfig] 
parquet_encryption_config)
   ```



##########
python/examples/dataset/write_dataset_encrypted.py:
##########
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.parquet.encryption as pe
+from pyarrow.tests.parquet.encryption import InMemoryKmsClient
+from datetime import timedelta
+import shutil
+import os
+
+""" A sample to demonstrate dataset encryption and decryption"""
+
+# create a list of dictionaries that will represent our dataset
+table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+                  'n_legs': [2, 2, 4, 4, 5, 100],
+                  'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+                             "Brittle stars", "Centipede"]})
+
+# create a PyArrow dataset from the table
+dataset = ds.dataset(table)
+
+FOOTER_KEY = b"0123456789112345"
+FOOTER_KEY_NAME = "footer_key"
+COL_KEY = b"1234567890123450"
+COL_KEY_NAME = "col_key"
+
+encryption_config = pe.EncryptionConfiguration(
+    footer_key=FOOTER_KEY_NAME,
+    plaintext_footer=False,
+    # Use COL_KEY_NAME to encrypt `n_legs` and `animal` columns.
+    column_keys={
+        COL_KEY_NAME: ["n_legs", "animal"],
+    },
+    encryption_algorithm="AES_GCM_V1",
+    # requires timedelta or an assertion is raised
+    cache_lifetime=timedelta(minutes=5.0),
+    data_key_length_bits=256)
+
+kms_connection_config = pe.KmsConnectionConfig(
+    custom_kms_conf={
+        FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
+        COL_KEY_NAME: COL_KEY.decode("UTF-8"),
+    }
+)
+
+decryption_config = pe.DecryptionConfiguration(cache_lifetime=300)
+
+
+def kms_factory(kms_connection_configuration):
+    return InMemoryKmsClient(kms_connection_configuration)
+
+
+crypto_factory = pe.CryptoFactory(kms_factory)
+dataset_encryption_cfg = ds.ParquetEncryptionConfig(
+    crypto_factory, kms_connection_config, encryption_config)
+dataset_decryption_cfg = ds.ParquetDecryptionConfig(crypto_factory,
+                                                    kms_connection_config,
+                                                    decryption_config)
+
+# set encryption config for parquet fragment scan options
+pq_scan_opts = ds.ParquetFragmentScanOptions()
+pq_scan_opts.dataset_decryption_config = dataset_decryption_cfg

Review Comment:
   ```suggestion
   pq_scan_opts.parquet_decryption_config = parquet_decryption_cfg
   ```



##########
cpp/src/arrow/dataset/file_parquet.h:
##########
@@ -226,6 +229,19 @@ class ARROW_DS_EXPORT ParquetFragmentScanOptions : public 
FragmentScanOptions {
   /// ScanOptions. Additionally, dictionary columns come from
   /// ParquetFileFormat::ReaderOptions::dict_columns.
   std::shared_ptr<parquet::ArrowReaderProperties> arrow_reader_properties;
+  /// \brief A getter function to retrieve the dataset decryption configuration
+  std::shared_ptr<ParquetDecryptionConfig> GetParquetDecryptionConfig() const {
+    return parquet_decryption_config_;
+  }
+  /// \brief A setter for ParquetDecryptionConfig
+  void SetParquetDecryptionConfig(
+      std::shared_ptr<ParquetDecryptionConfig> dataset_decryption_config) {
+    parquet_decryption_config_ = std::move(dataset_decryption_config);

Review Comment:
   ```suggestion
         std::shared_ptr<ParquetDecryptionConfig> parquet_decryption_config) {
       parquet_decryption_config_ = std::move(parquet_decryption_config);
   ```



##########
python/examples/dataset/write_dataset_encrypted.py:
##########
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.parquet.encryption as pe
+from pyarrow.tests.parquet.encryption import InMemoryKmsClient
+from datetime import timedelta
+import shutil
+import os
+
+""" A sample to demonstrate dataset encryption and decryption"""

Review Comment:
   ```suggestion
   """ A sample to demonstrate parquet dataset encryption and decryption"""
   ```



##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -660,6 +745,9 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
         if thrift_container_size_limit is not None:
             self.thrift_container_size_limit = thrift_container_size_limit
 
+        if dataset_decryption_config:
+            self.SetParquetDecryptionConfig(dataset_decryption_config)

Review Comment:
   ```suggestion
           if parquet_decryption_config:
               self.SetParquetDecryptionConfig(parquet_decryption_config)
   ```



##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -649,7 +733,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
                  buffer_size=8192,
                  bint pre_buffer=False,
                  thrift_string_size_limit=None,
-                 thrift_container_size_limit=None):
+                 thrift_container_size_limit=None,
+                 dataset_decryption_config=None):

Review Comment:
   ```suggestion
                    parquet_decryption_config=None):
   ```



##########
python/examples/dataset/write_dataset_encrypted.py:
##########
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import pyarrow as pa
+import pyarrow.dataset as ds
+import pyarrow.parquet.encryption as pe
+from pyarrow.tests.parquet.encryption import InMemoryKmsClient
+from datetime import timedelta
+import shutil
+import os
+
+""" A sample to demonstrate dataset encryption and decryption"""
+
+# create a list of dictionaries that will represent our dataset
+table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+                  'n_legs': [2, 2, 4, 4, 5, 100],
+                  'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+                             "Brittle stars", "Centipede"]})
+
+# create a PyArrow dataset from the table
+dataset = ds.dataset(table)
+
+FOOTER_KEY = b"0123456789112345"
+FOOTER_KEY_NAME = "footer_key"
+COL_KEY = b"1234567890123450"
+COL_KEY_NAME = "col_key"
+
+encryption_config = pe.EncryptionConfiguration(
+    footer_key=FOOTER_KEY_NAME,
+    plaintext_footer=False,
+    # Use COL_KEY_NAME to encrypt `n_legs` and `animal` columns.
+    column_keys={
+        COL_KEY_NAME: ["n_legs", "animal"],
+    },
+    encryption_algorithm="AES_GCM_V1",
+    # requires timedelta or an assertion is raised
+    cache_lifetime=timedelta(minutes=5.0),
+    data_key_length_bits=256)
+
+kms_connection_config = pe.KmsConnectionConfig(
+    custom_kms_conf={
+        FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
+        COL_KEY_NAME: COL_KEY.decode("UTF-8"),
+    }
+)
+
+decryption_config = pe.DecryptionConfiguration(cache_lifetime=300)
+
+
+def kms_factory(kms_connection_configuration):
+    return InMemoryKmsClient(kms_connection_configuration)
+
+
+crypto_factory = pe.CryptoFactory(kms_factory)
+dataset_encryption_cfg = ds.ParquetEncryptionConfig(
+    crypto_factory, kms_connection_config, encryption_config)
+dataset_decryption_cfg = ds.ParquetDecryptionConfig(crypto_factory,
+                                                    kms_connection_config,
+                                                    decryption_config)
+
+# set encryption config for parquet fragment scan options
+pq_scan_opts = ds.ParquetFragmentScanOptions()
+pq_scan_opts.dataset_decryption_config = dataset_decryption_cfg
+pformat = 
pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts)
+
+if os.path.exists('sample_dataset'):
+    shutil.rmtree('sample_dataset')
+
+write_options = pformat.make_write_options(
+    dataset_encryption_config=dataset_encryption_cfg)

Review Comment:
   ```suggestion
       parquet_encryption_config=parquet_encryption_cfg)
   ```



##########
python/pyarrow/includes/libarrow_dataset_parquet.pxd:
##########
@@ -62,6 +64,8 @@ cdef extern from "arrow/dataset/api.h" namespace 
"arrow::dataset" nogil:
             "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions):
         shared_ptr[CReaderProperties] reader_properties
         shared_ptr[ArrowReaderProperties] arrow_reader_properties
+        shared_ptr[CParquetDecryptionConfig] GetDatasetDecryptionConfig()

Review Comment:
   ```suggestion
           shared_ptr[CParquetDecryptionConfig] GetParquetDecryptionConfig()
   ```



##########
python/pyarrow/includes/libarrow_dataset_parquet.pxd:
##########
@@ -62,6 +64,8 @@ cdef extern from "arrow/dataset/api.h" namespace 
"arrow::dataset" nogil:
             "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions):
         shared_ptr[CReaderProperties] reader_properties
         shared_ptr[ArrowReaderProperties] arrow_reader_properties
+        shared_ptr[CParquetDecryptionConfig] GetDatasetDecryptionConfig()
+        void SetParquetDecryptionConfig(shared_ptr[CParquetDecryptionConfig] 
dataset_decryption_config)

Review Comment:
   ```suggestion
           void SetParquetDecryptionConfig(shared_ptr[CParquetDecryptionConfig] 
parquet_decryption_config)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to