This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new d21a924f30 GH-42102: [C++][Parquet] Add binary that extracts a footer from a parquet file (#42174)
d21a924f30 is described below

commit d21a924f3012c1e589a3393ebae2c78ee290ba5c
Author: Alkis Evlogimenos <[email protected]>
AuthorDate: Mon Jul 22 17:55:28 2024 +0300

    GH-42102: [C++][Parquet] Add binary that extracts a footer from a parquet file (#42174)
    
    ### Rationale for this change
    
    This binary will make it a lot easier for customers to share their parquet metadata with the community so that we can build a repository of footers that can be used for advancing the state of metadata in parquet.
    
    ### What changes are included in this PR?
    
    Usage text of the new `parquet-dump-footer` binary:
    ```
    Usage: parquet-dump-footer
      -h|--help    Print help and exit
      --no-scrub   Do not scrub potentially confidential metadata
      --debug      Output text representation of footer for inspection
      --in <uri>   Input file (required): must be a URI or an absolute local path
      --out <path> Output file (optional, default stdout)
    
      Dump the footer of a Parquet file to stdout or to a file, optionally with
      potentially confidential metadata scrubbed.
    ```
    
    ### Are these changes tested?
    
    Manually on existing parquet files.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #42102
    
    Lead-authored-by: Alkis Evlogimenos <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/metadata.cc              |  65 +++++++++++++++
 cpp/src/parquet/metadata.h               |   7 ++
 cpp/tools/parquet/CMakeLists.txt         |   3 +-
 cpp/tools/parquet/parquet_dump_footer.cc | 135 +++++++++++++++++++++++++++++++
 4 files changed, 209 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index ee83918189..7bab910461 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -21,6 +21,8 @@
 #include <cinttypes>
 #include <memory>
 #include <ostream>
+#include <random>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <utility>
@@ -29,6 +31,7 @@
 #include "arrow/io/memory.h"
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/pcg_random.h"
 #include "parquet/encryption/encryption_internal.h"
 #include "parquet/encryption/internal_file_decryptor.h"
 #include "parquet/exception.h"
@@ -599,6 +602,49 @@ std::vector<SortingColumn> RowGroupMetaData::sorting_columns() const {
   return impl_->sorting_columns();
 }
 
+// Replace string data with random-generated uppercase characters
+static void Scrub(std::string* s) {
+  static ::arrow::random::pcg64 rng;
+  std::uniform_int_distribution<> caps(65, 90);
+  for (auto& c : *s) c = caps(rng);
+}
+
+// Replace potentially sensitive metadata with random data
+static void Scrub(format::FileMetaData* md) {
+  for (auto& s : md->schema) {
+    Scrub(&s.name);
+  }
+  for (auto& r : md->row_groups) {
+    for (auto& c : r.columns) {
+      Scrub(&c.file_path);
+      if (c.__isset.meta_data) {
+        auto& m = c.meta_data;
+        for (auto& p : m.path_in_schema) Scrub(&p);
+        for (auto& kv : m.key_value_metadata) {
+          Scrub(&kv.key);
+          Scrub(&kv.value);
+        }
+        Scrub(&m.statistics.max_value);
+        Scrub(&m.statistics.min_value);
+        Scrub(&m.statistics.min);
+        Scrub(&m.statistics.max);
+      }
+
+      if (c.crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+        auto& m = c.crypto_metadata.ENCRYPTION_WITH_COLUMN_KEY;
+        for (auto& p : m.path_in_schema) Scrub(&p);
+        Scrub(&m.key_metadata);
+      }
+      Scrub(&c.encrypted_column_metadata);
+    }
+  }
+  for (auto& kv : md->key_value_metadata) {
+    Scrub(&kv.key);
+    Scrub(&kv.value);
+  }
+  Scrub(&md->footer_signing_key_metadata);
+}
+
 // file metadata
 class FileMetaData::FileMetaDataImpl {
  public:
@@ -821,6 +867,21 @@ class FileMetaData::FileMetaDataImpl {
     return out;
   }
 
+  std::string SerializeUnencrypted(bool scrub, bool debug) const {
+    auto md = *metadata_;
+    if (scrub) Scrub(&md);
+    if (debug) {
+      std::ostringstream ss;
+      md.printTo(ss);
+      return ss.str();
+    } else {
+      ThriftSerializer serializer;
+      std::string out;
+      serializer.SerializeToString(&md, &out);
+      return out;
+    }
+  }
+
   void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor) {
     file_decryptor_ = std::move(file_decryptor);
   }
@@ -992,6 +1053,10 @@ std::shared_ptr<FileMetaData> FileMetaData::Subset(
   return impl_->Subset(row_groups);
 }
 
+std::string FileMetaData::SerializeUnencrypted(bool scrub, bool json) const {
+  return impl_->SerializeUnencrypted(scrub, json);
+}
+
 void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
                            const std::shared_ptr<Encryptor>& encryptor) const {
   return impl_->WriteTo(dst, encryptor);
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 9fc30df58e..e02d2e7c85 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -396,6 +396,13 @@ class PARQUET_EXPORT FileMetaData {
   /// FileMetaData.
   std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
 
+  /// \brief Serialize metadata unencrypted as string
+  ///
+  /// \param[in] scrub whether to remove sensitive information from the metadata.
+  /// \param[in] debug whether to serialize the metadata as Thrift (if false) or
+  /// debug text (if true).
+  std::string SerializeUnencrypted(bool scrub, bool debug) const;
+
  private:
   friend FileMetaDataBuilder;
   friend class SerializedFile;
diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
index 81ab49421d..e05645da28 100644
--- a/cpp/tools/parquet/CMakeLists.txt
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -16,7 +16,7 @@
 # under the License.
 
 if(PARQUET_BUILD_EXECUTABLES)
-  set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan)
+  set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader parquet-scan)
 
   foreach(TOOL ${PARQUET_TOOLS})
     string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL})
@@ -31,6 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES)
     install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL}
             RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
   endforeach(TOOL)
+  target_link_libraries(parquet-dump-footer ${ARROW_LIBRARIES})
 
   add_dependencies(parquet ${PARQUET_TOOLS})
 endif()
diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc
new file mode 100644
index 0000000000..c7a4b78fdd
--- /dev/null
+++ b/cpp/tools/parquet/parquet_dump_footer.cc
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <optional>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/metadata.h"
+
+namespace parquet {
+namespace {
+uint32_t ReadLE32(const void* p) {
+  uint32_t x = ::arrow::util::SafeLoadAs<uint32_t>(static_cast<const uint8_t*>(p));
+  return ::arrow::bit_util::FromLittleEndian(x);
+}
+
+void AppendLE32(uint32_t v, std::string* out) {
+  v = ::arrow::bit_util::ToLittleEndian(v);
+  out->append(reinterpret_cast<const char*>(&v), sizeof(v));
+}
+
+int DoIt(std::string in, bool scrub, bool json, std::string out) {
+  std::string path;
+  auto fs = ::arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie();
+  auto file = fs->OpenInputFile(path).ValueOrDie();
+  int64_t file_len = file->GetSize().ValueOrDie();
+  if (file_len < 8) {
+    std::cerr << "File too short: " << in << "\n";
+    return 3;
+  }
+  // First do an opportunistic read of up to 1 MiB to try and get the entire footer.
+  int64_t tail_len = std::min(file_len, int64_t{1} << 20);
+  std::string tail;
+  tail.resize(tail_len);
+  char* data = tail.data();
+  file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie();
+  if (auto magic = ReadLE32(data + tail_len - 4); magic != ReadLE32("PAR1")) {
+    std::cerr << "Not a Parquet file: " << in << "\n";
+    return 4;
+  }
+  uint32_t metadata_len = ReadLE32(data + tail_len - 8);
+  if (tail_len >= metadata_len + 8) {
+    // The footer is entirely in the initial read. Trim to size.
+    tail = tail.substr(tail_len - (metadata_len + 8));
+  } else {
+    // The footer is larger than the initial read, read again the exact size.
+    if (metadata_len > file_len) {
+      std::cerr << "File too short: " << in << "\n";
+      return 5;
+    }
+    tail_len = metadata_len + 8;
+    tail.resize(tail_len);
+    data = tail.data();
+    file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie();
+  }
+  auto md = FileMetaData::Make(tail.data(), &metadata_len);
+  std::string ser = md->SerializeUnencrypted(scrub, json);
+  if (!json) {
+    AppendLE32(static_cast<uint32_t>(ser.size()), &ser);
+    ser.append("PAR1", 4);
+  }
+  std::optional<std::fstream> fout;
+  if (!out.empty()) fout.emplace(out, std::ios::out);
+  std::ostream& os = fout ? *fout : std::cout;
+  if (!os.write(ser.data(), ser.size())) {
+    std::cerr << "Failed to write to output file: " << out << "\n";
+    return 6;
+  }
+
+  return 0;
+}
+}  // namespace
+}  // namespace parquet
+
+static int PrintHelp() {
+  std::cerr << R"(Usage: parquet-dump-footer
+  -h|--help    Print help and exit
+  --no-scrub   Do not scrub potentially confidential metadata
+  --debug      Output text represenation of footer for inspection
+  --in <uri>   Input file (required): must be an URI or an absolute local path
+  --out <path> Output file (optional, default stdout)
+
+  Dump the footer of a Parquet file to stdout or to a file, optionally with
+  potentially confidential metadata scrubbed.
+)";
+  return 1;
+}
+
+int main(int argc, char** argv) {
+  bool scrub = true;
+  bool json = false;
+  std::string in;
+  std::string out;
+  for (int i = 1; i < argc; i++) {
+    char* arg = argv[i];
+    if (!std::strcmp(arg, "-h") || !std::strcmp(arg, "--help")) {
+      return PrintHelp();
+    } else if (!std::strcmp(arg, "--no-scrub")) {
+      scrub = false;
+    } else if (!std::strcmp(arg, "--json")) {
+      json = true;
+    } else if (!std::strcmp(arg, "--in")) {
+      if (i + 1 >= argc) return PrintHelp();
+      in = argv[++i];
+    } else if (!std::strcmp(arg, "--out")) {
+      if (i + 1 >= argc) return PrintHelp();
+      out = argv[++i];
+    } else {
+      // Unknown option.
+      return PrintHelp();
+    }
+  }
+  if (in.empty()) return PrintHelp();
+
+  return parquet::DoIt(in, scrub, json, out);
+}

Reply via email to