This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 03c3f8ed41 GH-43598: [C++][Parquet] Parquet Metadata Printer supports
print sort-columns (#43599)
03c3f8ed41 is described below
commit 03c3f8ed41573b611c4c6dfaf1ec166a9b78decb
Author: mwish <[email protected]>
AuthorDate: Sun Nov 10 01:10:42 2024 +0800
GH-43598: [C++][Parquet] Parquet Metadata Printer supports print
sort-columns (#43599)
### Rationale for this change
Now we have "sort-columns" support in the Parquet spec, Python (
https://github.com/apache/arrow/pull/37665/files ) and C++. We can support
printing it in the metadata printer.
### What changes are included in this PR?
Add "SortColumns" support in the Parquet metadata printer (shown as "Sort Columns" in the debug output and as the "SortColumns" key in the JSON output)
### Are these changes tested?
* [x] TODO after https://github.com/apache/parquet-testing/pull/56 is merged
### Are there any user-facing changes?
No
* GitHub Issue: #43598
Authored-by: mwish <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/parquet/printer.cc | 24 ++++++++++++++++++++++++
cpp/src/parquet/reader_test.cc | 17 +++++++++++++++--
2 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 3ce3e1da4b..730e1e17ab 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -142,6 +142,15 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream,
std::list<int> selecte
stream << "--- Total Bytes: " << group_metadata->total_byte_size() << "
---\n";
stream << "--- Total Compressed Bytes: " <<
group_metadata->total_compressed_size()
<< " ---\n";
+ auto sorting_columns = group_metadata->sorting_columns();
+ if (!sorting_columns.empty()) {
+ stream << "--- Sort Columns:\n";
+ for (auto column : sorting_columns) {
+ stream << "column_idx: " << column.column_idx
+ << ", descending: " << column.descending
+ << ", nulls_first: " << column.nulls_first << "\n";
+ }
+ }
stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
// Print column metadata
@@ -285,6 +294,21 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() <<
"\", ";
stream << " \"TotalCompressedBytes\": \"" <<
group_metadata->total_compressed_size()
<< "\", ";
+ auto row_group_sorting_columns = group_metadata->sorting_columns();
+ if (!row_group_sorting_columns.empty()) {
+ stream << " \"SortColumns\": [\n";
+ for (size_t i = 0; i < row_group_sorting_columns.size(); i++) {
+ stream << " {\"column_idx\": " <<
row_group_sorting_columns[i].column_idx
+ << ", \"descending\": " <<
row_group_sorting_columns[i].descending
+ << ", \"nulls_first\": " <<
row_group_sorting_columns[i].nulls_first
+ << "}";
+ if (i + 1 != row_group_sorting_columns.size()) {
+ stream << ",";
+ }
+ stream << '\n';
+ }
+ stream << " ], ";
+ }
stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
// Print column metadata
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 688c875b9e..62a971799c 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -1180,6 +1180,16 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
}
+TEST_F(TestJSONWithLocalFile, JSONOutputSortColumns) {
+ std::string json_content = ReadFromLocalFile("sort_columns.parquet");
+
+ std::string json_contains = R"###("SortColumns": [
+ {"column_idx": 0, "descending": 1, "nulls_first": 1},
+ {"column_idx": 1, "descending": 0, "nulls_first": 0}
+ ])###";
+ EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
+}
+
// GH-44101: Test that JSON output is valid JSON
TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
@@ -1195,8 +1205,11 @@ TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
};
std::vector<std::string_view> check_file_lists = {
"data_index_bloom_encoding_with_length.parquet",
- "data_index_bloom_encoding_stats.parquet",
"alltypes_tiny_pages_plain.parquet",
- "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
+ "data_index_bloom_encoding_stats.parquet",
+ "alltypes_tiny_pages_plain.parquet",
+ "concatenated_gzip_members.parquet",
+ "nulls.snappy.parquet",
+ "sort_columns.parquet"};
for (const auto& file : check_file_lists) {
std::string json_content = ReadFromLocalFile(file);
ASSERT_OK(check_json_valid(json_content))