pitrou commented on code in PR #45462:
URL: https://github.com/apache/arrow/pull/45462#discussion_r2112222174


##########
cpp/src/parquet/encryption/encryption.cc:
##########
@@ -163,6 +163,76 @@ FileEncryptionProperties::Builder* 
FileEncryptionProperties::Builder::encrypted_
   return this;
 }
 
+void FileEncryptionProperties::encrypt_schema(const SchemaDescriptor& schema) {
+  // Check that all columns in columnEncryptionProperties exist in the schema.
+  // Copy the encrypted_columns map as we are going to modify it while 
iterating it
+  auto encrypted_columns = 
ColumnPathToEncryptionPropertiesMap(encrypted_columns_);
+  // if columnEncryptionProperties is empty, every column in file schema will 
be
+  // encrypted with footer key.
+  if (!encrypted_columns.empty()) {
+    std::vector<std::pair<std::string, std::string>> column_path_vec;
+    // First, memorize all column or schema paths of the schema as dot-strings.
+    for (int i = 0; i < schema.num_columns(); i++) {
+      auto column = schema.Column(i);
+      auto column_path = column->path()->ToDotString();
+      auto schema_path = column->schema_path()->ToDotString();
+      column_path_vec.emplace_back(column_path, column_path);
+      if (schema_path != column_path) {
+        column_path_vec.emplace_back(schema_path, column_path);
+      }
+    }
+    // Sort them alphabetically, so that we can use binary-search and look up 
parent
+    // columns.
+    std::sort(column_path_vec.begin(), column_path_vec.end());
+
+    // Check if encrypted column exists in schema, or if it is a parent field 
of a column.
+    for (const auto& elem : encrypted_columns) {
+      auto& encrypted_column = elem.first;
+      auto encrypted_column_prefix = encrypted_column + ".";
+      auto encrypted_column_prefix_len = encrypted_column_prefix.size();
+
+      // first we look up encrypted_columns as
+      // find first column that equals encrypted_column or starts with 
encrypted_column
+      auto it = std::lower_bound(
+          column_path_vec.begin(), column_path_vec.end(), encrypted_column,
+          [&](const std::pair<std::string, std::string>& item, const 
std::string& term) {
+            return item.first < term;
+          });
+      bool matches = false;
+
+      // encrypted_column encrypts column 'it' when 'it' is either equal to
+      // encrypted_column, or 'it' starts with encrypted_column_prefix,
+      // i.e. encrypted_column followed by a '.'
+      while (
+          it != column_path_vec.end() &&
+          (it->first == encrypted_column ||
+           // C++20: can be replaced with 
it->first.starts_with(encrypted_column_prefix)

Review Comment:
   We have a `StartsWith` utility in `arrow/util/string.h`, FTR



##########
cpp/src/parquet/schema.h:
##########
@@ -132,6 +134,8 @@ class PARQUET_EXPORT Node {
 
   const std::shared_ptr<ColumnPath> path() const;
 
+  const std::shared_ptr<ColumnPath> schema_path() const;

Review Comment:
   If we decide these APIs fit here, we should add docstrings to `path` and 
`schema_path` so that the distinction is clear (but I'm not sure we should 
really put them here).



##########
cpp/src/parquet/schema.cc:
##########
@@ -68,11 +68,25 @@ std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const 
std::string& dotstri
 }
 
 std::shared_ptr<ColumnPath> ColumnPath::FromNode(const Node& node) {
+  return FromNode(node, false);
+}
+
+std::shared_ptr<ColumnPath> ColumnPath::FromNode(const Node& node, bool 
schema_path) {
   // Build the path in reverse order as we traverse the nodes to the top
   std::vector<std::string> rpath_;
   const Node* cursor = &node;
   // The schema node is not part of the ColumnPath
   while (cursor->parent()) {
+    if (schema_path &&
+        (
+            // nested fields in arrow schema do not know these intermediate 
nodes
+            cursor->parent()->converted_type() == ConvertedType::MAP ||
+            cursor->parent()->converted_type() == ConvertedType::LIST ||
+            (cursor->parent()->parent() &&
+             cursor->parent()->parent()->converted_type() == 
ConvertedType::LIST))) {
+      cursor = cursor->parent();
+      continue;
+    }

Review Comment:
   I'm not sure this logic deserves to be here, as it's quite Arrow-specific.
   Furthermore, the actual conversion logic from Parquet schema to Arrow schema 
is more complicated, example here:
   
https://github.com/apache/arrow/blob/d7015bd6e610b6cd6752f6cd543509bd5f8853ff/cpp/src/parquet/arrow/arrow_schema_test.cc#L533-L556
   



##########
cpp/src/parquet/schema.h:
##########
@@ -386,6 +390,8 @@ class PARQUET_EXPORT ColumnDescriptor {
 
   const std::shared_ptr<schema::ColumnPath> path() const;
 
+  const std::shared_ptr<schema::ColumnPath> schema_path() const;

Review Comment:
   Same here.



##########
cpp/src/parquet/encryption/encryption.h:
##########
@@ -429,6 +429,16 @@ class PARQUET_EXPORT FileEncryptionProperties {
     return encrypted_columns_;
   }
 
+  /// All columns in encrypted_columns must refer to columns in the given 
schema.
+  /// They can also refer to parent fields if schema contains nested fields. 
Then
+  /// all those nested fields of a matching parent field are encrypted by the 
same key.
+  /// This method modifies encrypted_columns to reflect this.
+  ///
+  /// Columns in encrypted_columns can refer to the parquet column paths as 
well as the
+  /// schema paths of columns. Those are usually identical, except for nested 
fields of
+  /// lists and maps.
+  void encrypt_schema(const SchemaDescriptor& schema);

Review Comment:
   Since this is not a trivial accessor, let's make it camel-case: 
`EncryptSchema`



##########
cpp/src/parquet/encryption/encryption.h:
##########
@@ -429,6 +429,16 @@ class PARQUET_EXPORT FileEncryptionProperties {
     return encrypted_columns_;
   }
 
+  /// All columns in encrypted_columns must refer to columns in the given 
schema.

Review Comment:
   Add a brief summary to this? For example:
   ```suggestion
     /// \brief Modify encrypted columns to conform to the schema
     ///
     /// All columns in encrypted_columns must refer to columns in the given 
schema.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to