This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new be8740d181 GH-44815: [C++][Parquet] Add an example to dump statistics read as `arrow::ArrayStatistics` (#44816)
be8740d181 is described below

commit be8740d18145a2297fdf090604573ecccc56f3fa
Author: Sutou Kouhei <[email protected]>
AuthorDate: Tue Nov 26 17:58:48 2024 +0900

    GH-44815: [C++][Parquet] Add an example to dump statistics read as `arrow::ArrayStatistics` (#44816)
    
    ### Rationale for this change
    
    I want to use this in the C data interface statistics documents: 
https://github.com/apache/arrow/pull/43553
    
    ### What changes are included in this PR?
    
    Add an executable that reads an Apache Parquet file and dumps statistics read as `arrow::ArrayStatistics`.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #44815
    
    Authored-by: Sutou Kouhei <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/tools/parquet/CMakeLists.txt                   |  7 ++-
 cpp/tools/parquet/parquet_dump_arrow_statistics.cc | 58 ++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
index 87c3254607..5aaa456dca 100644
--- a/cpp/tools/parquet/CMakeLists.txt
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -16,7 +16,12 @@
 # under the License.
 
 if(PARQUET_BUILD_EXECUTABLES)
-  set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader parquet-scan)
+  set(PARQUET_TOOLS
+      parquet-dump-arrow-statistics
+      parquet-dump-footer
+      parquet-dump-schema
+      parquet-reader
+      parquet-scan)
 
   foreach(TOOL ${PARQUET_TOOLS})
     string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL})
diff --git a/cpp/tools/parquet/parquet_dump_arrow_statistics.cc b/cpp/tools/parquet/parquet_dump_arrow_statistics.cc
new file mode 100644
index 0000000000..8aeced94f6
--- /dev/null
+++ b/cpp/tools/parquet/parquet_dump_arrow_statistics.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+
+#include <cstdlib>
+#include <iostream>
+
+namespace {
+/// \brief Dump the statistics of every record batch in a Parquet file.
+///
+/// Opens `path` read-only as a memory-mapped file, reads it through the
+/// parquet::arrow record batch reader and, for each record batch, prints the
+/// array returned by `MakeStatisticsArray()` to standard output.
+///
+/// \param[in] path Path of the Parquet file to read.
+/// \return `arrow::Status::OK()` once every record batch has been printed,
+/// otherwise the first error reported by one of the Arrow/Parquet calls.
+arrow::Status PrintArrowStatistics(const char* path) {
+  ARROW_ASSIGN_OR_RAISE(
+      auto input, arrow::io::MemoryMappedFile::Open(path, arrow::io::FileMode::READ));
+  ARROW_ASSIGN_OR_RAISE(auto reader,
+                        parquet::arrow::OpenFile(input, arrow::default_memory_pool()));
+  ARROW_ASSIGN_OR_RAISE(auto record_batch_reader, reader->GetRecordBatchReader());
+  while (true) {
+    ARROW_ASSIGN_OR_RAISE(auto record_batch, record_batch_reader->Next());
+    // A null record batch is treated as the end of the stream.
+    if (!record_batch) {
+      break;
+    }
+    ARROW_ASSIGN_OR_RAISE(auto statistics_array, record_batch->MakeStatisticsArray());
+    std::cout << statistics_array->ToString() << std::endl;
+  }
+  return arrow::Status::OK();
+}
+}  // namespace
+
+// Entry point: expects exactly one argument, the path of the Parquet file
+// whose statistics should be dumped.
+int main(int argc, char** argv) {
+  // Anything other than a single path argument is a usage error.
+  if (argc != 2) {
+    std::cerr << "Usage: " << argv[0] << " PARQUET_PATH" << std::endl;
+    std::cerr << " e.g.: " << argv[0] << " sample.parquet" << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  const auto outcome = PrintArrowStatistics(argv[1]);
+  if (!outcome.ok()) {
+    // Report the failure on stderr and signal it through the exit code.
+    std::cerr << outcome.ToString() << std::endl;
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}

Reply via email to