parquet-cpp git commit: PARQUET-958: [C++] Print Parquet metadata in JSON format

uwe Mon, 24 Apr 2017 23:35:24 -0700

Repository: parquet-cpp
Updated Branches:
  refs/heads/master a54404ed0 -> 39ebf2afa



PARQUET-958: [C++] Print Parquet metadata in JSON format

Made minor formatting changes to DebugPrint
No support to print values. Only the metadata is JSON formatted in this patch.

Author: Deepak Majeti <[email protected]>

Closes #310 from majetideepak/PARQUET-958 and squashes the following commits:

4d9cbbd [Deepak Majeti] change DebugPrint to take filename
3c78bc0 [Deepak Majeti] use raw string
97f016a [Deepak Majeti] add test and clang format
ec12ddb [Deepak Majeti] add JSONPrint
9c697e2 [Deepak Majeti] fix CMake flag for benchmarks


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/39ebf2af
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/39ebf2af
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/39ebf2af

Branch: refs/heads/master
Commit: 39ebf2afafbf498fa5d584143248e7988a4c04dd
Parents: a54404e
Author: Deepak Majeti <[email protected]>
Authored: Tue Apr 25 08:34:58 2017 +0200
Committer: Uwe L. Korn <[email protected]>
Committed: Tue Apr 25 08:34:58 2017 +0200

----------------------------------------------------------------------
 benchmarks/CMakeLists.txt   |   2 +-
 src/parquet/file/printer.cc | 118 +++++++++++++++++++++++++++++++++++----
 src/parquet/file/printer.h  |   8 ++-
 src/parquet/reader-test.cc  |  66 ++++++++++++++++++++++
 tools/parquet_reader.cc     |  11 +++-
 5 files changed, 189 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/benchmarks/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 1df5dea..2ef8113 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -19,7 +19,7 @@ SET(LINK_LIBS
   snappystatic
   thriftstatic)
 
-if (PARQUET_BUILD_EXECUTABLES)
+if (PARQUET_BUILD_BENCHMARKS)
   add_executable(decode_benchmark decode_benchmark.cc)
 
   # This uses private APIs

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/file/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/printer.cc b/src/parquet/file/printer.cc
index 8dd9d55..4d0dad4 100644
--- a/src/parquet/file/printer.cc
+++ b/src/parquet/file/printer.cc
@@ -33,10 +33,11 @@ namespace parquet {
 #define COL_WIDTH "30"
 
 void ParquetFilePrinter::DebugPrint(
-    std::ostream& stream, std::list<int> selected_columns, bool print_values) {
+    std::ostream& stream, std::list<int> selected_columns, bool print_values,
+    const char* filename) {
   const FileMetaData* file_metadata = fileReader->metadata().get();
 
-  stream << "File statistics:\n";
+  stream << "File Name: " << filename << "\n";
   stream << "Version: " << file_metadata->version() << "\n";
   stream << "Created By: " << file_metadata->created_by() << "\n";
   stream << "Total rows: " << file_metadata->num_rows() << "\n";
@@ -71,7 +72,7 @@ void ParquetFilePrinter::DebugPrint(
     std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
 
     stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
-    stream << "  rows: " << group_metadata->num_rows() << "---\n";
+    stream << "  Rows: " << group_metadata->num_rows() << "---\n";
 
     // Print column metadata
     for (auto i : selected_columns) {
@@ -79,25 +80,25 @@ void ParquetFilePrinter::DebugPrint(
       std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
 
       const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
-      stream << "Column " << i << std::endl << ", values: " << column_chunk->num_values();
+      stream << "Column " << i << std::endl << ", Values: " << column_chunk->num_values();
       if (column_chunk->is_stats_set()) {
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
-        stream << ", null values: " << stats->null_count()
-               << ", distinct values: " << stats->distinct_count() << std::endl
-               << "  max: " << FormatStatValue(descr->physical_type(), max.c_str())
-               << ", min: " << FormatStatValue(descr->physical_type(), min.c_str());
+        stream << ", Null Values: " << stats->null_count()
+               << ", Distinct Values: " << stats->distinct_count() << std::endl
+               << "  Max: " << FormatStatValue(descr->physical_type(), max.c_str())
+               << ", Min: " << FormatStatValue(descr->physical_type(), min.c_str());
       } else {
         stream << "  Statistics Not Set";
       }
       stream << std::endl
-             << "  compression: " << CompressionToString(column_chunk->compression())
-             << ", encodings: ";
+             << "  Compression: " << CompressionToString(column_chunk->compression())
+             << ", Encodings: ";
       for (auto encoding : column_chunk->encodings()) {
         stream << EncodingToString(encoding) << " ";
       }
       stream << std::endl
-             << "  uncompressed size: " << column_chunk->total_uncompressed_size()
-             << ", compressed size: " << column_chunk->total_compressed_size()
+             << "  Uncompressed Size: " << column_chunk->total_uncompressed_size()
+             << ", Compressed Size: " << column_chunk->total_compressed_size()
              << std::endl;
     }
 
@@ -140,4 +141,97 @@ void ParquetFilePrinter::DebugPrint(
   }
 }
 
+void ParquetFilePrinter::JSONPrint(
+    std::ostream& stream, std::list<int> selected_columns,
+    const char* filename) {
+  const FileMetaData* file_metadata = fileReader->metadata().get();
+  stream << "{\n";
+  stream << "  \"FileName\": \"" << filename << "\",\n";
+  stream << "  \"Version\": \"" << file_metadata->version() << "\",\n";
+  stream << "  \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
+  stream << "  \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
+  stream << "  \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
+  stream << "  \"NumberOfRealColumns\": \""
+         << file_metadata->schema()->group_node()->field_count() << "\",\n";
+  stream << "  \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
+
+  if (selected_columns.size() == 0) {
+    for (int i = 0; i < file_metadata->num_columns(); i++) {
+      selected_columns.push_back(i);
+    }
+  } else {
+    for (auto i : selected_columns) {
+      if (i < 0 || i >= file_metadata->num_columns()) {
+        throw ParquetException("Selected column is out of range");
+      }
+    }
+  }
+
+  stream << "  \"Columns\": [\n";
+  int c = 0;
+  for (auto i : selected_columns) {
+    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+    stream << "     { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\","
+           << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
+           << " \"LogicalType\": \"" << LogicalTypeToString(descr->logical_type())
+           << "\" }";
+    c++;
+    if (c != static_cast<int>(selected_columns.size())) { stream << ",\n"; }
+  }
+
+  stream << "\n  ],\n  \"RowGroups\": [\n";
+  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+    stream << "     {\n       \"Id\": \"" << r << "\", ";
+
+    auto group_reader = fileReader->RowGroup(r);
+    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+    stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
+    stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
+
+    // Print column metadata
+    stream << "       \"ColumnChunks\": [\n";
+    int c1 = 0;
+    for (auto i : selected_columns) {
+      auto column_chunk = group_metadata->ColumnChunk(i);
+      std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
+
+      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+      stream << "          {\"Id\": \"" << i << "\", \"Values\": \""
+             << column_chunk->num_values() << "\", "
+             << "\"StatsSet\": ";
+      if (column_chunk->is_stats_set()) {
+        stream << "\"True\", \"Stats\": {";
+        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+        stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
+               << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
+               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max.c_str())
+               << "\", "
+               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min.c_str())
+               << "\" },";
+      } else {
+        stream << "\"False\",";
+      }
+      stream << "\n           \"Compression\": \""
+             << CompressionToString(column_chunk->compression())
+             << "\", \"Encodings\": \"";
+      for (auto encoding : column_chunk->encodings()) {
+        stream << EncodingToString(encoding) << " ";
+      }
+      stream << "\", "
+             << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
+             << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
+
+      // end of a ColumnChunk
+      stream << "\" }";
+      c1++;
+      if (c1 != static_cast<int>(selected_columns.size())) { stream << ",\n"; }
+    }
+
+    stream << "\n        ]\n     }";
+    if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { stream << ",\n"; }
+  }
+  stream << "\n  ]\n}\n";
+}
+
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/file/printer.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/printer.h b/src/parquet/file/printer.h
index 433f9e8..bd54e40 100644
--- a/src/parquet/file/printer.h
+++ b/src/parquet/file/printer.h
@@ -32,12 +32,18 @@ namespace parquet {
 class PARQUET_EXPORT ParquetFilePrinter {
  private:
   ParquetFileReader* fileReader;
+
  public:
   explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
   ~ParquetFilePrinter() {}
 
   void DebugPrint(
-      std::ostream& stream, std::list<int> selected_columns, bool print_values = true);
+      std::ostream& stream, std::list<int> selected_columns, bool print_values = true,
+      const char* fileame = "No Name");
+
+  void JSONPrint(
+      std::ostream& stream, std::list<int> selected_columns,
+      const char* filename = "No Name");
 };
 
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index f7c666c..71f982b 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -256,4 +256,70 @@ TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) {
   ASSERT_EQ(ss2.str(), ss.str());
 }
 
+TEST(TestJSONWithLocalFile, JSONOutput) {
+  std::string jsonOutput = R"###({
+  "FileName": "alltypes_plain.parquet",
+  "Version": "0",
+  "CreatedBy": "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)",
+  "TotalRows": "8",
+  "NumberOfRowGroups": "1",
+  "NumberOfRealColumns": "11",
+  "NumberOfColumns": "11",
+  "Columns": [
+     { "Id": "0", "Name": "id", "PhysicalType": "INT32", "LogicalType": "NONE" },
+     { "Id": "1", "Name": "bool_col", "PhysicalType": "BOOLEAN", "LogicalType": "NONE" },
+     { "Id": "2", "Name": "tinyint_col", "PhysicalType": "INT32", "LogicalType": "NONE" },
+     { "Id": "3", "Name": "smallint_col", "PhysicalType": "INT32", "LogicalType": "NONE" },
+     { "Id": "4", "Name": "int_col", "PhysicalType": "INT32", "LogicalType": "NONE" },
+     { "Id": "5", "Name": "bigint_col", "PhysicalType": "INT64", "LogicalType": "NONE" },
+     { "Id": "6", "Name": "float_col", "PhysicalType": "FLOAT", "LogicalType": "NONE" },
+     { "Id": "7", "Name": "double_col", "PhysicalType": "DOUBLE", "LogicalType": "NONE" },
+     { "Id": "8", "Name": "date_string_col", "PhysicalType": "BYTE_ARRAY", "LogicalType": "NONE" },
+     { "Id": "9", "Name": "string_col", "PhysicalType": "BYTE_ARRAY", "LogicalType": "NONE" },
+     { "Id": "10", "Name": "timestamp_col", "PhysicalType": "INT96", "LogicalType": "NONE" }
+  ],
+  "RowGroups": [
+     {
+       "Id": "0",  "TotalBytes": "671",  "Rows": "8",
+       "ColumnChunks": [
+          {"Id": "0", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "73", "CompressedSize": "73" },
+          {"Id": "1", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "24", "CompressedSize": "24" },
+          {"Id": "2", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+          {"Id": "3", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+          {"Id": "4", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+          {"Id": "5", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "55", "CompressedSize": "55" },
+          {"Id": "6", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+          {"Id": "7", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "55", "CompressedSize": "55" },
+          {"Id": "8", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "88", "CompressedSize": "88" },
+          {"Id": "9", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "49", "CompressedSize": "49" },
+          {"Id": "10", "Values": "8", "StatsSet": "False",
+           "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "139", "CompressedSize": "139" }
+        ]
+     }
+  ]
+}
+)###";
+
+  std::stringstream ss;
+  // empty list means print all
+  std::list<int> columns;
+
+  auto reader =
+      ParquetFileReader::OpenFile(alltypes_plain(), false, default_reader_properties());
+  ParquetFilePrinter printer(reader.get());
+  printer.JSONPrint(ss, columns, "alltypes_plain.parquet");
+
+  ASSERT_EQ(jsonOutput, ss.str());
+}
+
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/tools/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/tools/parquet_reader.cc b/tools/parquet_reader.cc
index 25f81c1..7ef59dc 100644
--- a/tools/parquet_reader.cc
+++ b/tools/parquet_reader.cc
@@ -23,7 +23,7 @@
 
 int main(int argc, char** argv) {
   if (argc > 5 || argc < 2) {
-    std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
+    std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]"
                  "[--columns=...] <file>"
               << std::endl;
     return -1;
@@ -32,6 +32,7 @@ int main(int argc, char** argv) {
   std::string filename;
   bool print_values = true;
   bool memory_map = true;
+  bool format_json = false;
 
   // Read command-line options
   const std::string COLUMNS_PREFIX = "--columns=";
@@ -43,6 +44,8 @@ int main(int argc, char** argv) {
       print_values = false;
     } else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
       memory_map = false;
+    } else if ((param = std::strstr(argv[i], "--json"))) {
+      format_json = true;
     } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
       value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
       while (value) {
@@ -58,7 +61,11 @@ int main(int argc, char** argv) {
     std::unique_ptr<parquet::ParquetFileReader> reader =
         parquet::ParquetFileReader::OpenFile(filename, memory_map);
     parquet::ParquetFilePrinter printer(reader.get());
-    printer.DebugPrint(std::cout, columns, print_values);
+    if (format_json) {
+      printer.JSONPrint(std::cout, columns, filename.c_str());
+    } else {
+      printer.DebugPrint(std::cout, columns, print_values, filename.c_str());
+    }
   } catch (const std::exception& e) {
     std::cerr << "Parquet error: " << e.what() << std::endl;
     return -1;

parquet-cpp git commit: PARQUET-958: [C++] Print Parquet metadata in JSON format

Reply via email to