This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 03c3f8ed41 GH-43598: [C++][Parquet] Parquet Metadata Printer supports 
print sort-columns (#43599)
03c3f8ed41 is described below

commit 03c3f8ed41573b611c4c6dfaf1ec166a9b78decb
Author: mwish <[email protected]>
AuthorDate: Sun Nov 10 01:10:42 2024 +0800

    GH-43598: [C++][Parquet] Parquet Metadata Printer supports print 
sort-columns (#43599)
    
    
    
    ### Rationale for this change
    
    Now that we have "sort-columns" support in the Parquet spec, in Python
    ( https://github.com/apache/arrow/pull/37665/files ), and in C++, we can
    support printing it in the metadata printer.
    
    ### What changes are included in this PR?
    
    Add "SortingColumns" support to the Parquet printer.
    
    ### Are these changes tested?
    
    * [x] TODO after https://github.com/apache/parquet-testing/pull/56 is merged
    
    ### Are there any user-facing changes?
    
    No
    
    * GitHub Issue: #43598
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: mwish <[email protected]>
---
 cpp/src/parquet/printer.cc     | 24 ++++++++++++++++++++++++
 cpp/src/parquet/reader_test.cc | 17 +++++++++++++++--
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 3ce3e1da4b..730e1e17ab 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -142,6 +142,15 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, 
std::list<int> selecte
     stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " 
---\n";
     stream << "--- Total Compressed Bytes: " << 
group_metadata->total_compressed_size()
            << " ---\n";
+    auto sorting_columns = group_metadata->sorting_columns();
+    if (!sorting_columns.empty()) {
+      stream << "--- Sort Columns:\n";
+      for (auto column : sorting_columns) {
+        stream << "column_idx: " << column.column_idx
+               << ", descending: " << column.descending
+               << ", nulls_first: " << column.nulls_first << "\n";
+      }
+    }
     stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
 
     // Print column metadata
@@ -285,6 +294,21 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, 
std::list<int> selected
     stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << 
"\", ";
     stream << " \"TotalCompressedBytes\": \"" << 
group_metadata->total_compressed_size()
            << "\", ";
+    auto row_group_sorting_columns = group_metadata->sorting_columns();
+    if (!row_group_sorting_columns.empty()) {
+      stream << " \"SortColumns\": [\n";
+      for (size_t i = 0; i < row_group_sorting_columns.size(); i++) {
+        stream << "         {\"column_idx\": " << 
row_group_sorting_columns[i].column_idx
+               << ", \"descending\": " << 
row_group_sorting_columns[i].descending
+               << ", \"nulls_first\": " << 
row_group_sorting_columns[i].nulls_first
+               << "}";
+        if (i + 1 != row_group_sorting_columns.size()) {
+          stream << ",";
+        }
+        stream << '\n';
+      }
+      stream << "       ], ";
+    }
     stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
 
     // Print column metadata
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 688c875b9e..62a971799c 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -1180,6 +1180,16 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
   EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
 }
 
+TEST_F(TestJSONWithLocalFile, JSONOutputSortColumns) {
+  std::string json_content = ReadFromLocalFile("sort_columns.parquet");
+
+  std::string json_contains = R"###("SortColumns": [
+         {"column_idx": 0, "descending": 1, "nulls_first": 1},
+         {"column_idx": 1, "descending": 0, "nulls_first": 0}
+       ])###";
+  EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
+}
+
 // GH-44101: Test that JSON output is valid JSON
 TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
   auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
@@ -1195,8 +1205,11 @@ TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
   };
   std::vector<std::string_view> check_file_lists = {
       "data_index_bloom_encoding_with_length.parquet",
-      "data_index_bloom_encoding_stats.parquet", 
"alltypes_tiny_pages_plain.parquet",
-      "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
+      "data_index_bloom_encoding_stats.parquet",
+      "alltypes_tiny_pages_plain.parquet",
+      "concatenated_gzip_members.parquet",
+      "nulls.snappy.parquet",
+      "sort_columns.parquet"};
   for (const auto& file : check_file_lists) {
     std::string json_content = ReadFromLocalFile(file);
     ASSERT_OK(check_json_valid(json_content))

Reply via email to