Repository: parquet-cpp Updated Branches: refs/heads/master a54404ed0 -> 39ebf2afa
PARQUET-958: [C++] Print Parquet metadata in JSON format. Made minor formatting changes to DebugPrint. No support to print values. Only the metadata is JSON formatted in this patch. Author: Deepak Majeti <[email protected]> Closes #310 from majetideepak/PARQUET-958 and squashes the following commits: 4d9cbbd [Deepak Majeti] change DebugPrint to take filename 3c78bc0 [Deepak Majeti] use raw string 97f016a [Deepak Majeti] add test and clang format ec12ddb [Deepak Majeti] add JSONPrint 9c697e2 [Deepak Majeti] fix CMake flag for benchmarks Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/39ebf2af Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/39ebf2af Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/39ebf2af Branch: refs/heads/master Commit: 39ebf2afafbf498fa5d584143248e7988a4c04dd Parents: a54404e Author: Deepak Majeti <[email protected]> Authored: Tue Apr 25 08:34:58 2017 +0200 Committer: Uwe L. 
Korn <[email protected]> Committed: Tue Apr 25 08:34:58 2017 +0200 ---------------------------------------------------------------------- benchmarks/CMakeLists.txt | 2 +- src/parquet/file/printer.cc | 118 +++++++++++++++++++++++++++++++++++---- src/parquet/file/printer.h | 8 ++- src/parquet/reader-test.cc | 66 ++++++++++++++++++++++ tools/parquet_reader.cc | 11 +++- 5 files changed, 189 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/benchmarks/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 1df5dea..2ef8113 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -19,7 +19,7 @@ SET(LINK_LIBS snappystatic thriftstatic) -if (PARQUET_BUILD_EXECUTABLES) +if (PARQUET_BUILD_BENCHMARKS) add_executable(decode_benchmark decode_benchmark.cc) # This uses private APIs http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/file/printer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/printer.cc b/src/parquet/file/printer.cc index 8dd9d55..4d0dad4 100644 --- a/src/parquet/file/printer.cc +++ b/src/parquet/file/printer.cc @@ -33,10 +33,11 @@ namespace parquet { #define COL_WIDTH "30" void ParquetFilePrinter::DebugPrint( - std::ostream& stream, std::list<int> selected_columns, bool print_values) { + std::ostream& stream, std::list<int> selected_columns, bool print_values, + const char* filename) { const FileMetaData* file_metadata = fileReader->metadata().get(); - stream << "File statistics:\n"; + stream << "File Name: " << filename << "\n"; stream << "Version: " << file_metadata->version() << "\n"; stream << "Created By: " << file_metadata->created_by() << "\n"; stream << "Total rows: " << file_metadata->num_rows() << "\n"; @@ -71,7 +72,7 @@ void 
ParquetFilePrinter::DebugPrint( std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n"; - stream << " rows: " << group_metadata->num_rows() << "---\n"; + stream << " Rows: " << group_metadata->num_rows() << "---\n"; // Print column metadata for (auto i : selected_columns) { @@ -79,25 +80,25 @@ void ParquetFilePrinter::DebugPrint( std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics(); const ColumnDescriptor* descr = file_metadata->schema()->Column(i); - stream << "Column " << i << std::endl << ", values: " << column_chunk->num_values(); + stream << "Column " << i << std::endl << ", Values: " << column_chunk->num_values(); if (column_chunk->is_stats_set()) { std::string min = stats->EncodeMin(), max = stats->EncodeMax(); - stream << ", null values: " << stats->null_count() - << ", distinct values: " << stats->distinct_count() << std::endl - << " max: " << FormatStatValue(descr->physical_type(), max.c_str()) - << ", min: " << FormatStatValue(descr->physical_type(), min.c_str()); + stream << ", Null Values: " << stats->null_count() + << ", Distinct Values: " << stats->distinct_count() << std::endl + << " Max: " << FormatStatValue(descr->physical_type(), max.c_str()) + << ", Min: " << FormatStatValue(descr->physical_type(), min.c_str()); } else { stream << " Statistics Not Set"; } stream << std::endl - << " compression: " << CompressionToString(column_chunk->compression()) - << ", encodings: "; + << " Compression: " << CompressionToString(column_chunk->compression()) + << ", Encodings: "; for (auto encoding : column_chunk->encodings()) { stream << EncodingToString(encoding) << " "; } stream << std::endl - << " uncompressed size: " << column_chunk->total_uncompressed_size() - << ", compressed size: " << column_chunk->total_compressed_size() + << " Uncompressed Size: " << column_chunk->total_uncompressed_size() + << ", Compressed Size: " << 
column_chunk->total_compressed_size() << std::endl; } @@ -140,4 +141,97 @@ void ParquetFilePrinter::DebugPrint( } } +void ParquetFilePrinter::JSONPrint( + std::ostream& stream, std::list<int> selected_columns, + const char* filename) { + const FileMetaData* file_metadata = fileReader->metadata().get(); + stream << "{\n"; + stream << " \"FileName\": \"" << filename << "\",\n"; + stream << " \"Version\": \"" << file_metadata->version() << "\",\n"; + stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n"; + stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n"; + stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n"; + stream << " \"NumberOfRealColumns\": \"" + << file_metadata->schema()->group_node()->field_count() << "\",\n"; + stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n"; + + if (selected_columns.size() == 0) { + for (int i = 0; i < file_metadata->num_columns(); i++) { + selected_columns.push_back(i); + } + } else { + for (auto i : selected_columns) { + if (i < 0 || i >= file_metadata->num_columns()) { + throw ParquetException("Selected column is out of range"); + } + } + } + + stream << " \"Columns\": [\n"; + int c = 0; + for (auto i : selected_columns) { + const ColumnDescriptor* descr = file_metadata->schema()->Column(i); + stream << " { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\"," + << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\"," + << " \"LogicalType\": \"" << LogicalTypeToString(descr->logical_type()) + << "\" }"; + c++; + if (c != static_cast<int>(selected_columns.size())) { stream << ",\n"; } + } + + stream << "\n ],\n \"RowGroups\": [\n"; + for (int r = 0; r < file_metadata->num_row_groups(); ++r) { + stream << " {\n \"Id\": \"" << r << "\", "; + + auto group_reader = fileReader->RowGroup(r); + std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); + + stream << " \"TotalBytes\": \"" << 
group_metadata->total_byte_size() << "\", "; + stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n"; + + // Print column metadata + stream << " \"ColumnChunks\": [\n"; + int c1 = 0; + for (auto i : selected_columns) { + auto column_chunk = group_metadata->ColumnChunk(i); + std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics(); + + const ColumnDescriptor* descr = file_metadata->schema()->Column(i); + stream << " {\"Id\": \"" << i << "\", \"Values\": \"" + << column_chunk->num_values() << "\", " + << "\"StatsSet\": "; + if (column_chunk->is_stats_set()) { + stream << "\"True\", \"Stats\": {"; + std::string min = stats->EncodeMin(), max = stats->EncodeMax(); + stream << "\"NumNulls\": \"" << stats->null_count() << "\", " + << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " + << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max.c_str()) + << "\", " + << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min.c_str()) + << "\" },"; + } else { + stream << "\"False\","; + } + stream << "\n \"Compression\": \"" + << CompressionToString(column_chunk->compression()) + << "\", \"Encodings\": \""; + for (auto encoding : column_chunk->encodings()) { + stream << EncodingToString(encoding) << " "; + } + stream << "\", " + << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() + << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); + + // end of a ColumnChunk + stream << "\" }"; + c1++; + if (c1 != static_cast<int>(selected_columns.size())) { stream << ",\n"; } + } + + stream << "\n ]\n }"; + if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { stream << ",\n"; } + } + stream << "\n ]\n}\n"; +} + } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/file/printer.h ---------------------------------------------------------------------- diff --git a/src/parquet/file/printer.h b/src/parquet/file/printer.h index 433f9e8..bd54e40 
100644 --- a/src/parquet/file/printer.h +++ b/src/parquet/file/printer.h @@ -32,12 +32,18 @@ namespace parquet { class PARQUET_EXPORT ParquetFilePrinter { private: ParquetFileReader* fileReader; + public: explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {} ~ParquetFilePrinter() {} void DebugPrint( - std::ostream& stream, std::list<int> selected_columns, bool print_values = true); + std::ostream& stream, std::list<int> selected_columns, bool print_values = true, + const char* fileame = "No Name"); + + void JSONPrint( + std::ostream& stream, std::list<int> selected_columns, + const char* filename = "No Name"); }; } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/reader-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc index f7c666c..71f982b 100644 --- a/src/parquet/reader-test.cc +++ b/src/parquet/reader-test.cc @@ -256,4 +256,70 @@ TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) { ASSERT_EQ(ss2.str(), ss.str()); } +TEST(TestJSONWithLocalFile, JSONOutput) { + std::string jsonOutput = R"###({ + "FileName": "alltypes_plain.parquet", + "Version": "0", + "CreatedBy": "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)", + "TotalRows": "8", + "NumberOfRowGroups": "1", + "NumberOfRealColumns": "11", + "NumberOfColumns": "11", + "Columns": [ + { "Id": "0", "Name": "id", "PhysicalType": "INT32", "LogicalType": "NONE" }, + { "Id": "1", "Name": "bool_col", "PhysicalType": "BOOLEAN", "LogicalType": "NONE" }, + { "Id": "2", "Name": "tinyint_col", "PhysicalType": "INT32", "LogicalType": "NONE" }, + { "Id": "3", "Name": "smallint_col", "PhysicalType": "INT32", "LogicalType": "NONE" }, + { "Id": "4", "Name": "int_col", "PhysicalType": "INT32", "LogicalType": "NONE" }, + { "Id": "5", "Name": "bigint_col", "PhysicalType": "INT64", "LogicalType": "NONE" }, + { "Id": "6", "Name": 
"float_col", "PhysicalType": "FLOAT", "LogicalType": "NONE" }, + { "Id": "7", "Name": "double_col", "PhysicalType": "DOUBLE", "LogicalType": "NONE" }, + { "Id": "8", "Name": "date_string_col", "PhysicalType": "BYTE_ARRAY", "LogicalType": "NONE" }, + { "Id": "9", "Name": "string_col", "PhysicalType": "BYTE_ARRAY", "LogicalType": "NONE" }, + { "Id": "10", "Name": "timestamp_col", "PhysicalType": "INT96", "LogicalType": "NONE" } + ], + "RowGroups": [ + { + "Id": "0", "TotalBytes": "671", "Rows": "8", + "ColumnChunks": [ + {"Id": "0", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "73", "CompressedSize": "73" }, + {"Id": "1", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "24", "CompressedSize": "24" }, + {"Id": "2", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" }, + {"Id": "3", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" }, + {"Id": "4", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" }, + {"Id": "5", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "55", "CompressedSize": "55" }, + {"Id": "6", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" }, + {"Id": "7", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "55", "CompressedSize": "55" }, + {"Id": "8", "Values": "8", "StatsSet": 
"False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "88", "CompressedSize": "88" }, + {"Id": "9", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "49", "CompressedSize": "49" }, + {"Id": "10", "Values": "8", "StatsSet": "False", + "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "139", "CompressedSize": "139" } + ] + } + ] +} +)###"; + + std::stringstream ss; + // empty list means print all + std::list<int> columns; + + auto reader = + ParquetFileReader::OpenFile(alltypes_plain(), false, default_reader_properties()); + ParquetFilePrinter printer(reader.get()); + printer.JSONPrint(ss, columns, "alltypes_plain.parquet"); + + ASSERT_EQ(jsonOutput, ss.str()); +} + } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/tools/parquet_reader.cc ---------------------------------------------------------------------- diff --git a/tools/parquet_reader.cc b/tools/parquet_reader.cc index 25f81c1..7ef59dc 100644 --- a/tools/parquet_reader.cc +++ b/tools/parquet_reader.cc @@ -23,7 +23,7 @@ int main(int argc, char** argv) { if (argc > 5 || argc < 2) { - std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] " + std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]" "[--columns=...] 
<file>" << std::endl; return -1; @@ -32,6 +32,7 @@ int main(int argc, char** argv) { std::string filename; bool print_values = true; bool memory_map = true; + bool format_json = false; // Read command-line options const std::string COLUMNS_PREFIX = "--columns="; @@ -43,6 +44,8 @@ int main(int argc, char** argv) { print_values = false; } else if ((param = std::strstr(argv[i], "--no-memory-map"))) { memory_map = false; + } else if ((param = std::strstr(argv[i], "--json"))) { + format_json = true; } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) { value = std::strtok(param + COLUMNS_PREFIX.length(), ","); while (value) { @@ -58,7 +61,11 @@ int main(int argc, char** argv) { std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(filename, memory_map); parquet::ParquetFilePrinter printer(reader.get()); - printer.DebugPrint(std::cout, columns, print_values); + if (format_json) { + printer.JSONPrint(std::cout, columns, filename.c_str()); + } else { + printer.DebugPrint(std::cout, columns, print_values, filename.c_str()); + } } catch (const std::exception& e) { std::cerr << "Parquet error: " << e.what() << std::endl; return -1;
