This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f5691d467c GH-44101: [C++][Parquet] Tools: Debug Print for Json should
be valid JSON (#44532)
f5691d467c is described below
commit f5691d467c5638c6f6a07948de82369cc0ea282e
Author: mwish <[email protected]>
AuthorDate: Thu Oct 31 14:15:20 2024 +0800
GH-44101: [C++][Parquet] Tools: Debug Print for Json should be valid JSON
(#44532)
### Rationale for this change
The JSON output produced by the printer is not valid JSON at the moment. This is fine for human reading, but when I want to analyze it with JSON tools or AI, the invalid syntax prevents them from parsing it.
### What changes are included in this PR?
Change the output to be valid JSON.
Style:
previously, the closing `\"` for a value was emitted at the start of the next
field, but this patch emits it at the end of the field it belongs to.
Before:
```
stream << "\", \"number\":\"" << number;
stream << "\"...";
```
After:
```
stream << ", \"number\":\"" << number << "\"";
```
### Are these changes tested?
Yes
### Are there any user-facing changes?
Minor format change
* GitHub Issue: #44101
Authored-by: mwish <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/parquet/CMakeLists.txt | 2 +-
cpp/src/parquet/printer.cc | 56 ++++++++++++++++++++++--------------------
cpp/src/parquet/printer.h | 2 +-
cpp/src/parquet/reader_test.cc | 32 ++++++++++++++++++++++++
4 files changed, 64 insertions(+), 28 deletions(-)
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index b984ef77ad..e43a254fb6 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -320,7 +320,7 @@ if(ARROW_TESTING)
# "link" our dependencies so that include paths are configured
# correctly
target_link_libraries(parquet_testing PUBLIC ${ARROW_GTEST_GMOCK})
- list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing)
+ list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing RapidJSON)
endif()
if(NOT ARROW_BUILD_SHARED)
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 60adfc697f..3ce3e1da4b 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -220,7 +220,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream,
std::list<int> selecte
bool hasRow;
do {
hasRow = false;
- for (auto scanner : scanners) {
+ for (const auto& scanner : scanners) {
if (scanner->HasNext()) {
hasRow = true;
scanner->PrintNext(stream, COL_WIDTH);
@@ -246,7 +246,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
<< file_metadata->schema()->group_node()->field_count() << "\",\n";
stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() <<
"\",\n";
- if (selected_columns.size() == 0) {
+ if (selected_columns.empty()) {
for (int i = 0; i < file_metadata->num_columns(); i++) {
selected_columns.push_back(i);
}
@@ -299,29 +299,30 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
<< column_chunk->num_values() << "\", "
<< "\"StatsSet\": ";
if (column_chunk->is_stats_set()) {
- stream << "\"True\", \"Stats\": {";
+ stream << R"("True", "Stats": {)";
if (stats->HasNullCount()) {
- stream << "\"NumNulls\": \"" << stats->null_count();
+ stream << R"("NumNulls": ")" << stats->null_count() << "\"";
}
if (stats->HasDistinctCount()) {
- stream << "\", "
- << "\"DistinctValues\": \"" << stats->distinct_count();
+ stream << ", "
+ << R"("DistinctValues": ")" << stats->distinct_count() <<
"\"";
}
if (stats->HasMinMax()) {
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << "\", "
- << "\"Max\": \"" << FormatStatValue(descr->physical_type(),
max)
+ stream << ", "
+ << R"("Max": ")" << FormatStatValue(descr->physical_type(),
max)
<< "\", "
- << "\"Min\": \"" << FormatStatValue(descr->physical_type(),
min);
+ << R"("Min": ")" << FormatStatValue(descr->physical_type(),
min) << "\"";
}
- stream << "\" },";
+ stream << " },";
} else {
stream << "\"False\",";
}
stream << "\n \"Compression\": \""
<< ::arrow::internal::AsciiToUpper(
Codec::GetCodecAsString(column_chunk->compression()))
- << "\", \"Encodings\": \"";
+ << R"(", "Encodings": )";
+ stream << "\"";
if (column_chunk->encoding_stats().empty()) {
for (auto encoding : column_chunk->encodings()) {
stream << EncodingToString(encoding) << " ";
@@ -329,40 +330,43 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
} else {
PrintPageEncodingStats(stream, column_chunk->encoding_stats());
}
- stream << "\", "
- << "\"UncompressedSize\": \"" <<
column_chunk->total_uncompressed_size()
- << "\", \"CompressedSize\": \"" <<
column_chunk->total_compressed_size();
+ stream << "\"";
+ stream << ", "
+ << R"("UncompressedSize": ")" <<
column_chunk->total_uncompressed_size()
+ << R"(", "CompressedSize": ")" <<
column_chunk->total_compressed_size()
+ << "\"";
if (column_chunk->bloom_filter_offset()) {
// Output BloomFilter {offset, length}
- stream << "\", BloomFilter {"
- << "\"offset\": \"" <<
column_chunk->bloom_filter_offset().value();
+ stream << ", \"BloomFilter\": {"
+ << R"("offset": ")" <<
column_chunk->bloom_filter_offset().value() << "\"";
if (column_chunk->bloom_filter_length()) {
- stream << "\", \"length\": \"" <<
column_chunk->bloom_filter_length().value();
+ stream << R"(, "length": ")" <<
column_chunk->bloom_filter_length().value()
+ << "\"";
}
- stream << "\"}";
+ stream << "}";
}
if (column_chunk->GetColumnIndexLocation()) {
auto location = column_chunk->GetColumnIndexLocation().value();
// Output ColumnIndex {offset, length}
- stream << "\", ColumnIndex {"
- << "\"offset\": \"" << location.offset;
- stream << "\", \"length\": \"" << location.length;
+ stream << ", \"ColumnIndex\": {"
+ << R"("offset": ")" << location.offset;
+ stream << R"(", "length": ")" << location.length;
stream << "\"}";
}
if (column_chunk->GetOffsetIndexLocation()) {
auto location = column_chunk->GetOffsetIndexLocation().value();
// Output OffsetIndex {offset, length}
- stream << "\", OffsetIndex {"
- << "\"offset\": \"" << location.offset;
- stream << "\", \"length\": \"" << location.length;
- stream << "\"}";
+ stream << ", \"OffsetIndex\": {"
+ << R"("offset": ")" << location.offset << "\"";
+ stream << R"(, "length": ")" << location.length << "\"";
+ stream << "}";
}
// end of a ColumnChunk
- stream << "\" }";
+ stream << " }";
c1++;
if (c1 != static_cast<int>(selected_columns.size())) {
stream << ",\n";
diff --git a/cpp/src/parquet/printer.h b/cpp/src/parquet/printer.h
index 6bdf5b456f..bb86b107f9 100644
--- a/cpp/src/parquet/printer.h
+++ b/cpp/src/parquet/printer.h
@@ -32,7 +32,7 @@ class PARQUET_EXPORT ParquetFilePrinter {
public:
explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader)
{}
- ~ParquetFilePrinter() {}
+ ~ParquetFilePrinter() = default;
void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
bool print_values = false, bool format_dump = false,
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index fb77ba6cbc..688c875b9e 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -27,6 +27,12 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
+#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
+
+#include <rapidjson/document.h>
+#include <rapidjson/error/en.h>
+#include <rapidjson/stringbuffer.h>
+
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/io/file.h"
@@ -47,6 +53,8 @@
#include "parquet/printer.h"
#include "parquet/test_util.h"
+namespace rj = arrow::rapidjson;
+
using arrow::internal::checked_pointer_cast;
using arrow::internal::Zip;
@@ -1172,6 +1180,30 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
}
+// GH-44101: Test that JSON output is valid JSON
+TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
+ auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
+ rj::Document json_doc;
+ constexpr auto kParseFlags = rj::kParseFullPrecisionFlag |
rj::kParseNanAndInfFlag;
+ json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
+ if (json_doc.HasParseError()) {
+ return ::arrow::Status::Invalid("JSON parse error at offset ",
+ json_doc.GetErrorOffset(), ": ",
+
rj::GetParseError_En(json_doc.GetParseError()));
+ }
+ return ::arrow::Status::OK();
+ };
+ std::vector<std::string_view> check_file_lists = {
+ "data_index_bloom_encoding_with_length.parquet",
+ "data_index_bloom_encoding_stats.parquet",
"alltypes_tiny_pages_plain.parquet",
+ "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
+ for (const auto& file : check_file_lists) {
+ std::string json_content = ReadFromLocalFile(file);
+ ASSERT_OK(check_json_valid(json_content))
+ << "Invalid JSON output for file: " << file << ", content:" <<
json_content;
+ }
+}
+
TEST(TestFileReader, BufferedReadsWithDictionary) {
const int num_rows = 1000;