This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f5691d467c GH-44101: [C++][Parquet] Tools: Debug Print for Json should
be valid JSON (#44532)
f5691d467c is described below
commit f5691d467c5638c6f6a07948de82369cc0ea282e
Author: mwish <[email protected]>
AuthorDate: Thu Oct 31 14:15:20 2024 +0800
GH-44101: [C++][Parquet] Tools: Debug Print for Json should be valid JSON
(#44532)
### Rationale for this change
The JSON output produced by the printer is not valid JSON at the moment. This is fine for human reading, but when I want to analyze it with JSON tools or AI, the invalid syntax prevents them from parsing it.
### What changes are included in this PR?
Change the output to be valid JSON.
Style:
previously, the closing `\"` for a value was emitted at the start of the next
field, but this patch emits it at the end of the field it belongs to.
Before:
```
stream << "\", \"number\":\"" << number;
stream << "\"...";
```
After:
```
stream << ", \"number\":\"" << number << "\"";
```
### Are these changes tested?
Yes
### Are there any user-facing changes?
Minor format change
* GitHub Issue: #44101
Authored-by: mwish <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/parquet/CMakeLists.txt | 2 +-
cpp/src/parquet/printer.cc | 56 ++++++++++++++++++++++--------------------
cpp/src/parquet/printer.h | 2 +-
cpp/src/parquet/reader_test.cc | 32 ++++++++++++++++++++++++
4 files changed, 64 insertions(+), 28 deletions(-)
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index b984ef77ad..e43a254fb6 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -320,7 +320,7 @@ if(ARROW_TESTING)
# "link" our dependencies so that include paths are configured
# correctly
target_link_libraries(parquet_testing PUBLIC ${ARROW_GTEST_GMOCK})
- list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing)
+ list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing RapidJSON)
endif()
if(NOT ARROW_BUILD_SHARED)
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 60adfc697f..3ce3e1da4b 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -220,7 +220,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream,
std::list<int> selecte
bool hasRow;
do {
hasRow = false;
- for (auto scanner : scanners) {
+ for (const auto& scanner : scanners) {
if (scanner->HasNext()) {
hasRow = true;
scanner->PrintNext(stream, COL_WIDTH);
@@ -246,7 +246,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
<< file_metadata->schema()->group_node()->field_count() << "\",\n";
stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() <<
"\",\n";
- if (selected_columns.size() == 0) {
+ if (selected_columns.empty()) {
for (int i = 0; i < file_metadata->num_columns(); i++) {
selected_columns.push_back(i);
}
@@ -299,29 +299,30 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
<< column_chunk->num_values() << "\", "
<< "\"StatsSet\": ";
if (column_chunk->is_stats_set()) {
- stream << "\"True\", \"Stats\": {";
+ stream << R"("True", "Stats": {)";
if (stats->HasNullCount()) {
- stream << "\"NumNulls\": \"" << stats->null_count();
+ stream << R"("NumNulls": ")" << stats->null_count() << "\"";
}
if (stats->HasDistinctCount()) {
- stream << "\", "
- << "\"DistinctValues\": \"" << stats->distinct_count();
+ stream << ", "
+ << R"("DistinctValues": ")" << stats->distinct_count() <<
"\"";
}
if (stats->HasMinMax()) {
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << "\", "
- << "\"Max\": \"" << FormatStatValue(descr->physical_type(),
max)
+ stream << ", "
+ << R"("Max": ")" << FormatStatValue(descr->physical_type(),
max)
<< "\", "
- << "\"Min\": \"" << FormatStatValue(descr->physical_type(),
min);
+ << R"("Min": ")" << FormatStatValue(descr->physical_type(),
min) << "\"";
}
- stream << "\" },";
+ stream << " },";
} else {
stream << "\"False\",";
}
stream << "\n \"Compression\": \""
<< ::arrow::internal::AsciiToUpper(
Codec::GetCodecAsString(column_chunk->compression()))
- << "\", \"Encodings\": \"";
+ << R"(", "Encodings": )";
+ stream << "\"";
if (column_chunk->encoding_stats().empty()) {
for (auto encoding : column_chunk->encodings()) {
stream << EncodingToString(encoding) << " ";
@@ -329,40 +330,43 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream,
std::list<int> selected
} else {
PrintPageEncodingStats(stream, column_chunk->encoding_stats());
}
- stream << "\", "
- << "\"UncompressedSize\": \"" <<
column_chunk->total_uncompressed_size()
- << "\", \"CompressedSize\": \"" <<
column_chunk->total_compressed_size();
+ stream << "\"";
+ stream << ", "
+ << R"("UncompressedSize": ")" <<
column_chunk->total_uncompressed_size()
+ << R"(", "CompressedSize": ")" <<
column_chunk->total_compressed_size()
+ << "\"";
if (column_chunk->bloom_filter_offset()) {
// Output BloomFilter {offset, length}
- stream << "\", BloomFilter {"
- << "\"offset\": \"" <<
column_chunk->bloom_filter_offset().value();
+ stream << ", \"BloomFilter\": {"
+ << R"("offset": ")" <<
column_chunk->bloom_filter_offset().value() << "\"";
if (column_chunk->bloom_filter_length()) {
- stream << "\", \"length\": \"" <<
column_chunk->bloom_filter_length().value();
+ stream << R"(, "length": ")" <<
column_chunk->bloom_filter_length().value()
+ << "\"";
}
- stream << "\"}";
+ stream << "}";
}
if (column_chunk->GetColumnIndexLocation()) {
auto location = column_chunk->GetColumnIndexLocation().value();
// Output ColumnIndex {offset, length}
- stream << "\", ColumnIndex {"
- << "\"offset\": \"" << location.offset;
- stream << "\", \"length\": \"" << location.length;
+ stream << ", \"ColumnIndex\": {"
+ << R"("offset": ")" << location.offset;
+ stream << R"(", "length": ")" << location.length;
stream << "\"}";
}
if (column_chunk->GetOffsetIndexLocation()) {
auto location = column_chunk->GetOffsetIndexLocation().value();
// Output OffsetIndex {offset, length}
- stream << "\", OffsetIndex {"
- << "\"offset\": \"" << location.offset;
- stream << "\", \"length\": \"" << location.length;
- stream << "\"}";
+ stream << ", \"OffsetIndex\": {"
+ << R"("offset": ")" << location.offset << "\"";
+ stream << R"(, "length": ")" << location.length << "\"";
+ stream << "}";
}
// end of a ColumnChunk
- stream << "\" }";
+ stream << " }";
c1++;
if (c1 != static_cast<int>(selected_columns.size())) {
stream << ",\n";
diff --git a/cpp/src/parquet/printer.h b/cpp/src/parquet/printer.h
index 6bdf5b456f..bb86b107f9 100644
--- a/cpp/src/parquet/printer.h
+++ b/cpp/src/parquet/printer.h
@@ -32,7 +32,7 @@ class PARQUET_EXPORT ParquetFilePrinter {
public:
explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader)
{}
- ~ParquetFilePrinter() {}
+ ~ParquetFilePrinter() = default;
void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
bool print_values = false, bool format_dump = false,
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index fb77ba6cbc..688c875b9e 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -27,6 +27,12 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
+#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
+
+#include <rapidjson/document.h>
+#include <rapidjson/error/en.h>
+#include <rapidjson/stringbuffer.h>
+
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/io/file.h"
@@ -47,6 +53,8 @@
#include "parquet/printer.h"
#include "parquet/test_util.h"
+namespace rj = arrow::rapidjson;
+
using arrow::internal::checked_pointer_cast;
using arrow::internal::Zip;
@@ -1172,6 +1180,30 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
}
+// GH-44101: Test that JSON output is valid JSON
+TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
+ auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
+ rj::Document json_doc;
+ constexpr auto kParseFlags = rj::kParseFullPrecisionFlag |
rj::kParseNanAndInfFlag;
+ json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
+ if (json_doc.HasParseError()) {
+ return ::arrow::Status::Invalid("JSON parse error at offset ",
+ json_doc.GetErrorOffset(), ": ",
+
rj::GetParseError_En(json_doc.GetParseError()));
+ }
+ return ::arrow::Status::OK();
+ };
+ std::vector<std::string_view> check_file_lists = {
+ "data_index_bloom_encoding_with_length.parquet",
+ "data_index_bloom_encoding_stats.parquet",
"alltypes_tiny_pages_plain.parquet",
+ "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
+ for (const auto& file : check_file_lists) {
+ std::string json_content = ReadFromLocalFile(file);
+ ASSERT_OK(check_json_valid(json_content))
+ << "Invalid JSON output for file: " << file << ", content:" <<
json_content;
+ }
+}
+
TEST(TestFileReader, BufferedReadsWithDictionary) {
const int num_rows = 1000;