Repository: parquet-cpp Updated Branches: refs/heads/master fa6e47616 -> c67ee3e52
PARQUET-681: Add tool to scan a parquet file Added a ReadBatchValues() API to the Column class. Added a parquet-scan tool Separated examples into benchmarks/tools added clang tidy and clang format to benchmarks and tools Author: Deepak Majeti <[email protected]> Closes #144 from majetideepak/parquetscan and squashes the following commits: cc7f183 [Deepak Majeti] Removed GetRemainingInPage API 44da480 [Deepak Majeti] add scan all in public api 20829b8 [Deepak Majeti] clang-format da62354 [Deepak Majeti] ScanAllValues API e385f61 [Deepak Majeti] put clang-* in the root directory 9ff785c [Deepak Majeti] use c++ random d854bde [Deepak Majeti] parquet scan tool Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c67ee3e5 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c67ee3e5 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c67ee3e5 Branch: refs/heads/master Commit: c67ee3e521b93531f4b59d3183a04ad5d316cbae Parents: fa6e476 Author: Deepak Majeti <[email protected]> Authored: Mon Sep 5 15:49:36 2016 -0400 Committer: Wes McKinney <[email protected]> Committed: Mon Sep 5 15:49:36 2016 -0400 ---------------------------------------------------------------------- .clang-format | 65 +++++ .clang-tidy | 13 + .clang-tidy-ignore | 1 + CMakeLists.txt | 15 +- benchmarks/CMakeLists.txt | 29 +++ benchmarks/decode_benchmark.cc | 461 ++++++++++++++++++++++++++++++++++++ example/CMakeLists.txt | 36 --- example/decode_benchmark.cc | 460 ----------------------------------- example/parquet-dump-schema.cc | 39 --- example/parquet_reader.cc | 70 ------ src/.clang-format | 65 ----- src/.clang-tidy | 13 - src/.clang-tidy-ignore | 1 - src/parquet/api/reader.h | 1 + src/parquet/column/reader.h | 1 - src/parquet/column/scan-all.h | 67 ++++++ src/parquet/column/scanner.h | 1 - src/parquet/file/reader.cc | 6 +- src/parquet/schema/types.cc | 8 +- src/parquet/types-test.cc | 62 +++-- 
src/parquet/types.cc | 33 ++- src/parquet/types.h | 10 +- tools/CMakeLists.txt | 34 +++ tools/parquet-dump-schema.cc | 36 +++ tools/parquet-scan.cc | 108 +++++++++ tools/parquet_reader.cc | 67 ++++++ 26 files changed, 962 insertions(+), 740 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-format ---------------------------------------------------------------------- diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..7d5b3cf --- /dev/null +++ b/.clang-format @@ -0,0 +1,65 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: false +AlignConsecutiveAssignments: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 90 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 
+ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1000 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-tidy ---------------------------------------------------------------------- diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..6fc3742 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,13 @@ +--- +Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*' +HeaderFilterRegex: 'parquet/.*' +AnalyzeTemporaryDtors: true +CheckOptions: + - key: google-readability-braces-around-statements.ShortStatementLines + value: '1' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-tidy-ignore ---------------------------------------------------------------------- diff --git a/.clang-tidy-ignore b/.clang-tidy-ignore new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.clang-tidy-ignore @@ -0,0 +1 @@ + http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c26e79..86d0684 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -404,7 +404,7 @@ if (UNIX) --verbose=2 --linelength=90 --filter=-whitespace/comments,-readability/todo,-build/header_guard,-runtime/references,-readability/check,-build/c++11 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`) + `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`) endif (UNIX) ############################################################ @@ -414,11 +414,11 @@ endif (UNIX) if (${CLANG_FORMAT_FOUND}) # runs clang format and updates files in place. add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) + `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) # runs clang format and exits with a non-zero exit code if any files need to be reformatted add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) + `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) endif() @@ -429,10 +429,10 @@ endif() if (${CLANG_TIDY_FOUND}) # runs clang-tidy and attempts to fix any warning automatically add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_types/g' | sed -e '/_constants/g'`) + `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools 
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc | sed -e '/_types/g' | sed -e '/_constants/g'`) # runs clang-tidy and exits with a non-zero exit code if any errors are found. add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json - 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore`) + 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy-ignore`) endif() @@ -575,11 +575,12 @@ add_subdirectory(src/parquet/schema) add_subdirectory(src/parquet/thrift) add_subdirectory(src/parquet/util) -# Ensure that thrift compilation is done befor using its generated headers +# Ensure that thrift compilation is done before using its generated headers # in parquet code. add_dependencies(parquet_objlib parquet_thrift) -add_subdirectory(example) +add_subdirectory(benchmarks) +add_subdirectory(tools) add_custom_target(clean-all COMMAND ${CMAKE_BUILD_TOOL} clean http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/benchmarks/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 0000000..1df5dea --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +SET(LINK_LIBS + snappystatic + thriftstatic) + +if (PARQUET_BUILD_EXECUTABLES) + add_executable(decode_benchmark decode_benchmark.cc) + + # This uses private APIs + target_link_libraries(decode_benchmark ${LINK_LIBS} + parquet_static) + +endif() http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/benchmarks/decode_benchmark.cc ---------------------------------------------------------------------- diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc new file mode 100644 index 0000000..d3748ca --- /dev/null +++ b/benchmarks/decode_benchmark.cc @@ -0,0 +1,461 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <stdio.h> +#include <iostream> +#include <random> + +#include "parquet/compression/codec.h" +#include "parquet/encodings/plain-encoding.h" +#include "parquet/encodings/dictionary-encoding.h" +#include "parquet/encodings/delta-bit-pack-encoding.h" +#include "parquet/encodings/delta-byte-array-encoding.h" +#include "parquet/encodings/delta-length-byte-array-encoding.h" +#include "parquet/util/stopwatch.h" + +/** + * Test bed for encodings and some utilities to measure their throughput. + * TODO: this file needs some major cleanup. + */ + +class DeltaBitPackEncoder { + public: + explicit DeltaBitPackEncoder(int mini_block_size = 8) { + mini_block_size_ = mini_block_size; + } + + void Add(int64_t v) { values_.push_back(v); } + + uint8_t* Encode(int* encoded_len) { + uint8_t* result = new uint8_t[10 * 1024 * 1024]; + int num_mini_blocks = parquet::BitUtil::Ceil(num_values() - 1, mini_block_size_); + uint8_t* mini_block_widths = NULL; + + parquet::BitWriter writer(result, 10 * 1024 * 1024); + + // Writer the size of each block. We only use 1 block currently. + writer.PutVlqInt(num_mini_blocks * mini_block_size_); + + // Write the number of mini blocks. + writer.PutVlqInt(num_mini_blocks); + + // Write the number of values. + writer.PutVlqInt(num_values() - 1); + + // Write the first value. + writer.PutZigZagVlqInt(values_[0]); + + // Compute the values as deltas and the min delta. + int64_t min_delta = std::numeric_limits<int64_t>::max(); + for (int i = values_.size() - 1; i > 0; --i) { + values_[i] -= values_[i - 1]; + min_delta = std::min(min_delta, values_[i]); + } + + // Write out the min delta. + writer.PutZigZagVlqInt(min_delta); + + // We need to save num_mini_blocks bytes to store the bit widths of the mini + // blocks. + mini_block_widths = writer.GetNextBytePtr(num_mini_blocks); + + int idx = 1; + for (int i = 0; i < num_mini_blocks; ++i) { + int n = std::min(mini_block_size_, num_values() - idx); + + // Compute the max delta in this mini block. 
+ int64_t max_delta = std::numeric_limits<int64_t>::min(); + for (int j = 0; j < n; ++j) { + max_delta = std::max(values_[idx + j], max_delta); + } + + // The bit width for this block is the number of bits needed to store + // (max_delta - min_delta). + int bit_width = parquet::BitUtil::NumRequiredBits(max_delta - min_delta); + mini_block_widths[i] = bit_width; + + // Encode this mini blocking using min_delta and bit_width + for (int j = 0; j < n; ++j) { + writer.PutValue(values_[idx + j] - min_delta, bit_width); + } + + // Pad out the last block. + for (int j = n; j < mini_block_size_; ++j) { + writer.PutValue(0, bit_width); + } + idx += n; + } + + writer.Flush(); + *encoded_len = writer.bytes_written(); + return result; + } + + int num_values() const { return values_.size(); } + + private: + int mini_block_size_; + std::vector<int64_t> values_; +}; + +class DeltaLengthByteArrayEncoder { + public: + explicit DeltaLengthByteArrayEncoder(int mini_block_size = 8) + : len_encoder_(mini_block_size), + buffer_(new uint8_t[10 * 1024 * 1024]), + offset_(0), + plain_encoded_len_(0) {} + + void Add(const std::string& s) { + Add(reinterpret_cast<const uint8_t*>(s.data()), s.size()); + } + + void Add(const uint8_t* ptr, int len) { + plain_encoded_len_ += len + sizeof(int); + len_encoder_.Add(len); + memcpy(buffer_ + offset_, ptr, len); + offset_ += len; + } + + uint8_t* Encode(int* encoded_len) { + uint8_t* encoded_lengths = len_encoder_.Encode(encoded_len); + memmove(buffer_ + *encoded_len + sizeof(int), buffer_, offset_); + memcpy(buffer_, encoded_len, sizeof(int)); + memcpy(buffer_ + sizeof(int), encoded_lengths, *encoded_len); + *encoded_len += offset_ + sizeof(int); + return buffer_; + } + + int num_values() const { return len_encoder_.num_values(); } + int plain_encoded_len() const { return plain_encoded_len_; } + + private: + DeltaBitPackEncoder len_encoder_; + uint8_t* buffer_; + int offset_; + int plain_encoded_len_; +}; + +class DeltaByteArrayEncoder { + public: + 
DeltaByteArrayEncoder() : plain_encoded_len_(0) {} + + void Add(const std::string& s) { + plain_encoded_len_ += s.size() + sizeof(int); + int min_len = std::min(s.size(), last_value_.size()); + int prefix_len = 0; + for (int i = 0; i < min_len; ++i) { + if (s[i] == last_value_[i]) { + ++prefix_len; + } else { + break; + } + } + prefix_len_encoder_.Add(prefix_len); + suffix_encoder_.Add( + reinterpret_cast<const uint8_t*>(s.data()) + prefix_len, s.size() - prefix_len); + last_value_ = s; + } + + uint8_t* Encode(int* encoded_len) { + int prefix_buffer_len; + uint8_t* prefix_buffer = prefix_len_encoder_.Encode(&prefix_buffer_len); + int suffix_buffer_len; + uint8_t* suffix_buffer = suffix_encoder_.Encode(&suffix_buffer_len); + + uint8_t* buffer = new uint8_t[10 * 1024 * 1024]; + memcpy(buffer, &prefix_buffer_len, sizeof(int)); + memcpy(buffer + sizeof(int), prefix_buffer, prefix_buffer_len); + memcpy(buffer + sizeof(int) + prefix_buffer_len, suffix_buffer, suffix_buffer_len); + *encoded_len = sizeof(int) + prefix_buffer_len + suffix_buffer_len; + return buffer; + } + + int num_values() const { return prefix_len_encoder_.num_values(); } + int plain_encoded_len() const { return plain_encoded_len_; } + + private: + DeltaBitPackEncoder prefix_len_encoder_; + DeltaLengthByteArrayEncoder suffix_encoder_; + std::string last_value_; + int plain_encoded_len_; +}; + +uint64_t TestPlainIntEncoding(const uint8_t* data, int num_values, int batch_size) { + uint64_t result = 0; + parquet::PlainDecoder<parquet::Int64Type> decoder(nullptr); + decoder.SetData(num_values, data, num_values * sizeof(int64_t)); + int64_t values[batch_size]; + for (int i = 0; i < num_values;) { + int n = decoder.Decode(values, batch_size); + for (int j = 0; j < n; ++j) { + result += values[j]; + } + i += n; + } + return result; +} + +uint64_t TestBinaryPackedEncoding(const char* name, const std::vector<int64_t>& values, + int benchmark_iters = -1, int benchmark_batch_size = 1) { + int mini_block_size; + if 
(values.size() < 8) { + mini_block_size = 8; + } else if (values.size() < 16) { + mini_block_size = 16; + } else { + mini_block_size = 32; + } + parquet::DeltaBitPackDecoder<parquet::Int64Type> decoder(nullptr); + DeltaBitPackEncoder encoder(mini_block_size); + for (size_t i = 0; i < values.size(); ++i) { + encoder.Add(values[i]); + } + + int raw_len = encoder.num_values() * sizeof(int); + int len; + uint8_t* buffer = encoder.Encode(&len); + + if (benchmark_iters == -1) { + printf("%s\n", name); + printf(" Raw len: %d\n", raw_len); + printf(" Encoded len: %d (%0.2f%%)\n", len, len * 100 / static_cast<float>(raw_len)); + decoder.SetData(encoder.num_values(), buffer, len); + for (int i = 0; i < encoder.num_values(); ++i) { + int64_t x = 0; + decoder.Decode(&x, 1); + if (values[i] != x) { + std::cerr << "Bad: " << i << std::endl; + std::cerr << " " << x << " != " << values[i] << std::endl; + break; + } + } + return 0; + } else { + printf("%s\n", name); + printf(" Raw len: %d\n", raw_len); + printf(" Encoded len: %d (%0.2f%%)\n", len, len * 100 / static_cast<float>(raw_len)); + + uint64_t result = 0; + int64_t buf[benchmark_batch_size]; + parquet::StopWatch sw; + sw.Start(); + for (int k = 0; k < benchmark_iters; ++k) { + decoder.SetData(encoder.num_values(), buffer, len); + for (size_t i = 0; i < values.size();) { + int n = decoder.Decode(buf, benchmark_batch_size); + for (int j = 0; j < n; ++j) { + result += buf[j]; + } + i += n; + } + } + uint64_t elapsed = sw.Stop(); + double num_ints = values.size() * benchmark_iters * 1000.; + printf("%s rate (batch size = %2d): %0.3fM per second.\n", name, benchmark_batch_size, + num_ints / elapsed); + return result; + } +} + +#define TEST(NAME, FN, DATA, BATCH_SIZE) \ + sw.Start(); \ + for (int i = 0; i < NUM_ITERS; ++i) { \ + FN(reinterpret_cast<uint8_t*>(&DATA[0]), NUM_VALUES, BATCH_SIZE); \ + } \ + elapsed = sw.Stop(); \ + printf("%s rate (batch size = %2d): %0.3fM per second.\n", NAME, BATCH_SIZE, \ + mult / elapsed); + 
+void TestPlainIntCompressed(parquet::Codec* codec, const std::vector<int64_t>& data, + int num_iters, int batch_size) { + const uint8_t* raw_data = reinterpret_cast<const uint8_t*>(&data[0]); + int uncompressed_len = data.size() * sizeof(int64_t); + uint8_t* decompressed_data = new uint8_t[uncompressed_len]; + + int max_compressed_size = codec->MaxCompressedLen(uncompressed_len, raw_data); + uint8_t* compressed_data = new uint8_t[max_compressed_size]; + int compressed_len = + codec->Compress(uncompressed_len, raw_data, max_compressed_size, compressed_data); + + printf("\n%s:\n Uncompressed len: %d\n Compressed len: %d\n", codec->name(), + uncompressed_len, compressed_len); + + double mult = num_iters * data.size() * 1000.; + parquet::StopWatch sw; + sw.Start(); + uint64_t r = 0; + for (int i = 0; i < num_iters; ++i) { + codec->Decompress( + compressed_len, compressed_data, uncompressed_len, decompressed_data); + r += TestPlainIntEncoding(decompressed_data, data.size(), batch_size); + } + int64_t elapsed = sw.Stop(); + printf("Compressed(%s) plain int rate (batch size = %2d): %0.3fM per second.\n", + codec->name(), batch_size, mult / elapsed); + + delete[] compressed_data; + delete[] decompressed_data; +} + +void TestBinaryPacking() { + std::vector<int64_t> values; + values.clear(); + for (int i = 0; i < 100; ++i) + values.push_back(0); + TestBinaryPackedEncoding("Zeros", values); + + values.clear(); + for (int i = 1; i <= 5; ++i) + values.push_back(i); + TestBinaryPackedEncoding("Example 1", values); + + values.clear(); + values.push_back(7); + values.push_back(5); + values.push_back(3); + values.push_back(1); + values.push_back(2); + values.push_back(3); + values.push_back(4); + values.push_back(5); + TestBinaryPackedEncoding("Example 2", values); + + // Test rand ints between 0 and 10K + values.clear(); + int seed = 0; + std::mt19937 gen(seed); + std::uniform_int_distribution<int> d(0, 10000); + for (int i = 0; i < 500000; ++i) { + values.push_back(d(gen)); + } 
+ TestBinaryPackedEncoding("Rand [0, 10000)", values); + + // Test rand ints between 0 and 100 + values.clear(); + std::uniform_int_distribution<int> d1(0, 100); + for (int i = 0; i < 500000; ++i) { + values.push_back(d1(gen)); + } + TestBinaryPackedEncoding("Rand [0, 100)", values); +} + +void TestDeltaLengthByteArray() { + parquet::DeltaLengthByteArrayDecoder decoder(nullptr); + DeltaLengthByteArrayEncoder encoder; + + std::vector<std::string> values; + values.push_back("Hello"); + values.push_back("World"); + values.push_back("Foobar"); + values.push_back("ABCDEF"); + + for (size_t i = 0; i < values.size(); ++i) { + encoder.Add(values[i]); + } + + int len = 0; + uint8_t* buffer = encoder.Encode(&len); + printf("DeltaLengthByteArray\n Raw len: %d\n Encoded len: %d\n", + encoder.plain_encoded_len(), len); + decoder.SetData(encoder.num_values(), buffer, len); + for (int i = 0; i < encoder.num_values(); ++i) { + parquet::ByteArray v = {0, NULL}; + decoder.Decode(&v, 1); + std::string r = std::string(reinterpret_cast<const char*>(v.ptr), v.len); + if (r != values[i]) { std::cout << "Bad " << r << " != " << values[i] << std::endl; } + } +} + +void TestDeltaByteArray() { + parquet::DeltaByteArrayDecoder decoder(nullptr); + DeltaByteArrayEncoder encoder; + + std::vector<std::string> values; + + // Wikipedia example + values.push_back("myxa"); + values.push_back("myxophyta"); + values.push_back("myxopod"); + values.push_back("nab"); + values.push_back("nabbed"); + values.push_back("nabbing"); + values.push_back("nabit"); + values.push_back("nabk"); + values.push_back("nabob"); + values.push_back("nacarat"); + values.push_back("nacelle"); + + for (size_t i = 0; i < values.size(); ++i) { + encoder.Add(values[i]); + } + + int len = 0; + uint8_t* buffer = encoder.Encode(&len); + printf("DeltaLengthByteArray\n Raw len: %d\n Encoded len: %d\n", + encoder.plain_encoded_len(), len); + decoder.SetData(encoder.num_values(), buffer, len); + for (int i = 0; i < encoder.num_values(); 
++i) { + parquet::ByteArray v; + decoder.Decode(&v, 1); + std::string r = std::string(reinterpret_cast<const char*>(v.ptr), v.len); + if (r != values[i]) { std::cout << "Bad " << r << " != " << values[i] << std::endl; } + } +} + +int main(int argc, char** argv) { + TestBinaryPacking(); + TestDeltaLengthByteArray(); + TestDeltaByteArray(); + + parquet::StopWatch sw; + uint64_t elapsed = 0; + + const int NUM_VALUES = 1024 * 1024; + const int NUM_ITERS = 10; + const double mult = NUM_VALUES * NUM_ITERS * 1000.; + + std::vector<int64_t> plain_int_data; + plain_int_data.resize(NUM_VALUES); + + TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 1); + TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 16); + TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 32); + TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 64); + + // Test rand ints between 0 and 10K + std::vector<int64_t> values; + int seed = 0; + std::mt19937 gen(seed); + std::uniform_int_distribution<int> d(0, 10000); + for (int i = 0; i < 1000000; ++i) { + values.push_back(d(gen)); + } + TestBinaryPackedEncoding("Rand 0-10K", values, 100, 1); + TestBinaryPackedEncoding("Rand 0-10K", values, 100, 16); + TestBinaryPackedEncoding("Rand 0-10K", values, 100, 32); + TestBinaryPackedEncoding("Rand 0-10K", values, 100, 64); + + parquet::SnappyCodec snappy_codec; + + TestPlainIntCompressed(&snappy_codec, values, 100, 1); + TestPlainIntCompressed(&snappy_codec, values, 100, 16); + TestPlainIntCompressed(&snappy_codec, values, 100, 32); + TestPlainIntCompressed(&snappy_codec, values, 100, 64); + + return 0; +} http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt deleted file mode 100644 index d622d09..0000000 --- a/example/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) 
under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -SET(LINK_LIBS - snappystatic - thriftstatic) - -if (PARQUET_BUILD_EXECUTABLES) - add_executable(decode_benchmark decode_benchmark.cc) - - # This uses private APIs - target_link_libraries(decode_benchmark ${LINK_LIBS} - parquet_static) - - add_executable(parquet-dump-schema parquet-dump-schema.cc) - target_link_libraries(parquet-dump-schema ${LINK_LIBS} - parquet_static) - - add_executable(parquet_reader parquet_reader.cc) - target_link_libraries(parquet_reader ${LINK_LIBS} - parquet_static) -endif() http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/decode_benchmark.cc ---------------------------------------------------------------------- diff --git a/example/decode_benchmark.cc b/example/decode_benchmark.cc deleted file mode 100644 index 4eb1975..0000000 --- a/example/decode_benchmark.cc +++ /dev/null @@ -1,460 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <iostream> -#include <stdio.h> - -#include "parquet/compression/codec.h" -#include "parquet/encodings/plain-encoding.h" -#include "parquet/encodings/dictionary-encoding.h" -#include "parquet/encodings/delta-bit-pack-encoding.h" -#include "parquet/encodings/delta-byte-array-encoding.h" -#include "parquet/encodings/delta-length-byte-array-encoding.h" -#include "parquet/util/stopwatch.h" - -using namespace parquet; -using namespace std; - -/** - * Test bed for encodings and some utilities to measure their throughput. - * TODO: this file needs some major cleanup. - */ - -class DeltaBitPackEncoder { - public: - DeltaBitPackEncoder(int mini_block_size = 8) { - mini_block_size_ = mini_block_size; - } - - void Add(int64_t v) { - values_.push_back(v); - } - - uint8_t* Encode(int* encoded_len) { - uint8_t* result = new uint8_t[10 * 1024 * 1024]; - int num_mini_blocks = BitUtil::Ceil(num_values() - 1, mini_block_size_); - uint8_t* mini_block_widths = NULL; - - BitWriter writer(result, 10 * 1024 * 1024); - - // Writer the size of each block. We only use 1 block currently. - writer.PutVlqInt(num_mini_blocks * mini_block_size_); - - // Write the number of mini blocks. - writer.PutVlqInt(num_mini_blocks); - - // Write the number of values. - writer.PutVlqInt(num_values() - 1); - - // Write the first value. - writer.PutZigZagVlqInt(values_[0]); - - // Compute the values as deltas and the min delta. 
- int64_t min_delta = numeric_limits<int64_t>::max(); - for (int i = values_.size() - 1; i > 0; --i) { - values_[i] -= values_[i - 1]; - min_delta = min(min_delta, values_[i]); - } - - // Write out the min delta. - writer.PutZigZagVlqInt(min_delta); - - // We need to save num_mini_blocks bytes to store the bit widths of the mini blocks. - mini_block_widths = writer.GetNextBytePtr(num_mini_blocks); - - int idx = 1; - for (int i = 0; i < num_mini_blocks; ++i) { - int n = min(mini_block_size_, num_values() - idx); - - // Compute the max delta in this mini block. - int64_t max_delta = numeric_limits<int64_t>::min(); - for (int j = 0; j < n; ++j) { - max_delta = max(values_[idx + j], max_delta); - } - - // The bit width for this block is the number of bits needed to store - // (max_delta - min_delta). - int bit_width = BitUtil::NumRequiredBits(max_delta - min_delta); - mini_block_widths[i] = bit_width; - - // Encode this mini blocking using min_delta and bit_width - for (int j = 0; j < n; ++j) { - writer.PutValue(values_[idx + j] - min_delta, bit_width); - } - - // Pad out the last block. 
- for (int j = n; j < mini_block_size_; ++j) { - writer.PutValue(0, bit_width); - } - idx += n; - } - - writer.Flush(); - *encoded_len = writer.bytes_written(); - return result; - } - - int num_values() const { return values_.size(); } - - private: - int mini_block_size_; - vector<int64_t> values_; -}; - -class DeltaLengthByteArrayEncoder { - public: - DeltaLengthByteArrayEncoder(int mini_block_size = 8) : - len_encoder_(mini_block_size), - buffer_(new uint8_t[10 * 1024 * 1024]), - offset_(0), - plain_encoded_len_(0) { - } - - void Add(const string& s) { - Add(reinterpret_cast<const uint8_t*>(s.data()), s.size()); - } - - void Add(const uint8_t* ptr, int len) { - plain_encoded_len_ += len + sizeof(int); - len_encoder_.Add(len); - memcpy(buffer_ + offset_, ptr, len); - offset_ += len; - } - - uint8_t* Encode(int* encoded_len) { - uint8_t* encoded_lengths = len_encoder_.Encode(encoded_len); - memmove(buffer_ + *encoded_len + sizeof(int), buffer_, offset_); - memcpy(buffer_, encoded_len, sizeof(int)); - memcpy(buffer_ + sizeof(int), encoded_lengths, *encoded_len); - *encoded_len += offset_ + sizeof(int); - return buffer_; - } - - int num_values() const { return len_encoder_.num_values(); } - int plain_encoded_len() const { return plain_encoded_len_; } - - private: - DeltaBitPackEncoder len_encoder_; - uint8_t* buffer_; - int offset_; - int plain_encoded_len_; -}; - -class DeltaByteArrayEncoder { - public: - DeltaByteArrayEncoder() : plain_encoded_len_(0) {} - - void Add(const string& s) { - plain_encoded_len_ += s.size() + sizeof(int); - int min_len = min(s.size(), last_value_.size()); - int prefix_len = 0; - for (int i = 0; i < min_len; ++i) { - if (s[i] == last_value_[i]) { - ++prefix_len; - } else { - break; - } - } - prefix_len_encoder_.Add(prefix_len); - suffix_encoder_.Add(reinterpret_cast<const uint8_t*>(s.data()) + prefix_len, - s.size() - prefix_len); - last_value_ = s; - } - - uint8_t* Encode(int* encoded_len) { - int prefix_buffer_len; - uint8_t* 
prefix_buffer = prefix_len_encoder_.Encode(&prefix_buffer_len); - int suffix_buffer_len; - uint8_t* suffix_buffer = suffix_encoder_.Encode(&suffix_buffer_len); - - uint8_t* buffer = new uint8_t[10 * 1024 * 1024]; - memcpy(buffer, &prefix_buffer_len, sizeof(int)); - memcpy(buffer + sizeof(int), prefix_buffer, prefix_buffer_len); - memcpy(buffer + sizeof(int) + prefix_buffer_len, suffix_buffer, suffix_buffer_len); - *encoded_len = sizeof(int) + prefix_buffer_len + suffix_buffer_len; - return buffer; - } - - int num_values() const { return prefix_len_encoder_.num_values(); } - int plain_encoded_len() const { return plain_encoded_len_; } - - private: - DeltaBitPackEncoder prefix_len_encoder_; - DeltaLengthByteArrayEncoder suffix_encoder_; - string last_value_; - int plain_encoded_len_; -}; - - -uint64_t TestPlainIntEncoding(const uint8_t* data, int num_values, int batch_size) { - uint64_t result = 0; - PlainDecoder<Int64Type> decoder(nullptr); - decoder.SetData(num_values, data, num_values * sizeof(int64_t)); - int64_t values[batch_size]; - for (int i = 0; i < num_values;) { - int n = decoder.Decode(values, batch_size); - for (int j = 0; j < n; ++j) { - result += values[j]; - } - i += n; - } - return result; -} - -uint64_t TestBinaryPackedEncoding(const char* name, const vector<int64_t>& values, - int benchmark_iters = -1, int benchmark_batch_size = 1) { - int mini_block_size; - if (values.size() < 8) { - mini_block_size = 8; - } else if (values.size() < 16) { - mini_block_size = 16; - } else { - mini_block_size = 32; - } - DeltaBitPackDecoder<Int64Type> decoder(nullptr); - DeltaBitPackEncoder encoder(mini_block_size); - for (size_t i = 0; i < values.size(); ++i) { - encoder.Add(values[i]); - } - - int raw_len = encoder.num_values() * sizeof(int); - int len; - uint8_t* buffer = encoder.Encode(&len); - - if (benchmark_iters == -1) { - printf("%s\n", name); - printf(" Raw len: %d\n", raw_len); - printf(" Encoded len: %d (%0.2f%%)\n", len, len * 100 / (float)raw_len); - 
decoder.SetData(encoder.num_values(), buffer, len); - for (int i = 0; i < encoder.num_values(); ++i) { - int64_t x = 0; - decoder.Decode(&x, 1); - if (values[i] != x) { - cerr << "Bad: " << i << endl; - cerr << " " << x << " != " << values[i] << endl; - break; - } - } - return 0; - } else { - printf("%s\n", name); - printf(" Raw len: %d\n", raw_len); - printf(" Encoded len: %d (%0.2f%%)\n", len, len * 100 / (float)raw_len); - - uint64_t result = 0; - int64_t buf[benchmark_batch_size]; - StopWatch sw; - sw.Start();\ - for (int k = 0; k < benchmark_iters; ++k) { - decoder.SetData(encoder.num_values(), buffer, len); - for (size_t i = 0; i < values.size();) { - int n = decoder.Decode(buf, benchmark_batch_size); - for (int j = 0; j < n; ++j) { - result += buf[j]; - } - i += n; - } - } - uint64_t elapsed = sw.Stop(); - double num_ints = values.size() * benchmark_iters * 1000.; - printf("%s rate (batch size = %2d): %0.3fM per second.\n", - name, benchmark_batch_size, num_ints / elapsed); - return result; - } -} - -#define TEST(NAME, FN, DATA, BATCH_SIZE)\ - sw.Start();\ - for (int i = 0; i < NUM_ITERS; ++i) {\ - FN(reinterpret_cast<uint8_t*>(&DATA[0]), NUM_VALUES, BATCH_SIZE);\ - }\ - elapsed = sw.Stop();\ - printf("%s rate (batch size = %2d): %0.3fM per second.\n",\ - NAME, BATCH_SIZE, mult / elapsed); - -void TestPlainIntCompressed(Codec* codec, const vector<int64_t>& data, int num_iters, int batch_size) { - const uint8_t* raw_data = reinterpret_cast<const uint8_t*>(&data[0]); - int uncompressed_len = data.size() * sizeof(int64_t); - uint8_t* decompressed_data = new uint8_t[uncompressed_len]; - - int max_compressed_size = codec->MaxCompressedLen(uncompressed_len, raw_data); - uint8_t* compressed_data = new uint8_t[max_compressed_size]; - int compressed_len = codec->Compress(uncompressed_len, raw_data, - max_compressed_size, compressed_data); - - printf("\n%s:\n Uncompressed len: %d\n Compressed len: %d\n", - codec->name(), uncompressed_len, compressed_len); - - double 
mult = num_iters * data.size() * 1000.; - StopWatch sw; - sw.Start(); - uint64_t r = 0; - for (int i = 0; i < num_iters; ++i) { - codec->Decompress(compressed_len, compressed_data, - uncompressed_len, decompressed_data); - r += TestPlainIntEncoding(decompressed_data, data.size(), batch_size); - } - int64_t elapsed = sw.Stop();\ - printf("Compressed(%s) plain int rate (batch size = %2d): %0.3fM per second.\n", - codec->name(), batch_size, mult / elapsed); - - delete[] compressed_data; - delete[] decompressed_data; -} - -void TestBinaryPacking() { - vector<int64_t> values; - values.clear(); - for (int i = 0; i < 100; ++i) values.push_back(0); - TestBinaryPackedEncoding("Zeros", values); - - values.clear(); - for (int i = 1; i <= 5; ++i) values.push_back(i); - TestBinaryPackedEncoding("Example 1", values); - - values.clear(); - values.push_back(7); - values.push_back(5); - values.push_back(3); - values.push_back(1); - values.push_back(2); - values.push_back(3); - values.push_back(4); - values.push_back(5); - TestBinaryPackedEncoding("Example 2", values); - - // Test rand ints between 0 and 10K - values.clear(); - for (int i = 0; i < 500000; ++i) { - values.push_back(rand() % (10000)); - } - TestBinaryPackedEncoding("Rand [0, 10000)", values); - - // Test rand ints between 0 and 100 - values.clear(); - for (int i = 0; i < 500000; ++i) { - values.push_back(rand() % 100); - } - TestBinaryPackedEncoding("Rand [0, 100)", values); -} - -void TestDeltaLengthByteArray() { - DeltaLengthByteArrayDecoder decoder(nullptr); - DeltaLengthByteArrayEncoder encoder; - - vector<string> values; - values.push_back("Hello"); - values.push_back("World"); - values.push_back("Foobar"); - values.push_back("ABCDEF"); - - for (size_t i = 0; i < values.size(); ++i) { - encoder.Add(values[i]); - } - - int len = 0; - uint8_t* buffer = encoder.Encode(&len); - printf("DeltaLengthByteArray\n Raw len: %d\n Encoded len: %d\n", - encoder.plain_encoded_len(), len); - decoder.SetData(encoder.num_values(), 
buffer, len); - for (int i = 0; i < encoder.num_values(); ++i) { - ByteArray v = {0, NULL}; - decoder.Decode(&v, 1); - string r = string((char*)v.ptr, v.len); - if (r != values[i]) { - cout << "Bad " << r << " != " << values[i] << endl; - } - } -} - -void TestDeltaByteArray() { - DeltaByteArrayDecoder decoder(nullptr); - DeltaByteArrayEncoder encoder; - - vector<string> values; - - // Wikipedia example - values.push_back("myxa"); - values.push_back("myxophyta"); - values.push_back("myxopod"); - values.push_back("nab"); - values.push_back("nabbed"); - values.push_back("nabbing"); - values.push_back("nabit"); - values.push_back("nabk"); - values.push_back("nabob"); - values.push_back("nacarat"); - values.push_back("nacelle"); - - for (size_t i = 0; i < values.size(); ++i) { - encoder.Add(values[i]); - } - - int len = 0; - uint8_t* buffer = encoder.Encode(&len); - printf("DeltaLengthByteArray\n Raw len: %d\n Encoded len: %d\n", - encoder.plain_encoded_len(), len); - decoder.SetData(encoder.num_values(), buffer, len); - for (int i = 0; i < encoder.num_values(); ++i) { - ByteArray v; - decoder.Decode(&v, 1); - string r = string((char*)v.ptr, v.len); - if (r != values[i]) { - cout << "Bad " << r << " != " << values[i] << endl; - } - } -} - -int main(int argc, char** argv) { - TestBinaryPacking(); - TestDeltaLengthByteArray(); - TestDeltaByteArray(); - - StopWatch sw; - uint64_t elapsed = 0; - - const int NUM_VALUES = 1024 * 1024; - const int NUM_ITERS = 10; - const double mult = NUM_VALUES * NUM_ITERS * 1000.; - - vector<int64_t> plain_int_data; - plain_int_data.resize(NUM_VALUES); - - TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 1); - TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 16); - TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 32); - TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 64); - - // Test rand ints between 0 and 10K - vector<int64_t> values; - for (int i = 0; i < 1000000; ++i) { - 
values.push_back(rand() % 10000); - } - TestBinaryPackedEncoding("Rand 0-10K", values, 100, 1); - TestBinaryPackedEncoding("Rand 0-10K", values, 100, 16); - TestBinaryPackedEncoding("Rand 0-10K", values, 100, 32); - TestBinaryPackedEncoding("Rand 0-10K", values, 100, 64); - - SnappyCodec snappy_codec; - - TestPlainIntCompressed(&snappy_codec, values, 100, 1); - TestPlainIntCompressed(&snappy_codec, values, 100, 16); - TestPlainIntCompressed(&snappy_codec, values, 100, 32); - TestPlainIntCompressed(&snappy_codec, values, 100, 64); - - return 0; -} http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/parquet-dump-schema.cc ---------------------------------------------------------------------- diff --git a/example/parquet-dump-schema.cc b/example/parquet-dump-schema.cc deleted file mode 100644 index ed7b570..0000000 --- a/example/parquet-dump-schema.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include <iostream> - -#include <parquet/api/reader.h> -#include <parquet/api/schema.h> - -using namespace parquet; - -int main(int argc, char** argv) { - std::string filename = argv[1]; - - try { - std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename); - PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout); - } catch (const std::exception& e) { - std::cerr << "Parquet error: " - << e.what() - << std::endl; - return -1; - } - - return 0; -} http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/parquet_reader.cc ---------------------------------------------------------------------- diff --git a/example/parquet_reader.cc b/example/parquet_reader.cc deleted file mode 100644 index 2baa2b1..0000000 --- a/example/parquet_reader.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <iostream> -#include <memory> -#include <list> - -#include <parquet/api/reader.h> - -using namespace parquet; - -int main(int argc, char** argv) { - if (argc > 5 || argc < 2) { - std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--columns=...] 
<file>" - << std::endl; - return -1; - } - - std::string filename; - bool print_values = true; - bool memory_map = true; - - // Read command-line options - const std::string COLUMNS_PREFIX = "--columns="; - std::list<int> columns; - - char *param, *value; - for (int i = 1; i < argc; i++) { - if ((param = std::strstr(argv[i], "--only-metadata"))) { - print_values = false; - } else if ((param = std::strstr(argv[i], "--no-memory-map"))) { - memory_map = false; - } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) { - value = std::strtok(param+COLUMNS_PREFIX.length(), "," ); - while (value) { - columns.push_back(std::atoi(value)); - value = std::strtok(nullptr, "," ); - } - } else { - filename = argv[i]; - } - } - - try { - std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename, - memory_map); - reader->DebugPrint(std::cout, columns, print_values); - } catch (const std::exception& e) { - std::cerr << "Parquet error: " - << e.what() - << std::endl; - return -1; - } - - return 0; -} http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-format ---------------------------------------------------------------------- diff --git a/src/.clang-format b/src/.clang-format deleted file mode 100644 index 7d5b3cf..0000000 --- a/src/.clang-format +++ /dev/null @@ -1,65 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: false -AlignConsecutiveAssignments: false -AlignEscapedNewlinesLeft: true -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: true -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Inline -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true 
-BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -ColumnLimit: 90 -CommentPragmas: '^ IWYU pragma:' -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -IndentCaseLabels: true -IndentWidth: 2 -IndentWrappedFunctionNames: false -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 1000 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -SpaceAfterCStyleCast: false -SpaceBeforeAssignmentOperators: true -SpaceBeforeParens: ControlStatements -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Cpp11 -TabWidth: 8 -UseTab: Never http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-tidy ---------------------------------------------------------------------- diff --git a/src/.clang-tidy b/src/.clang-tidy deleted file mode 100644 index 6fc3742..0000000 --- a/src/.clang-tidy +++ /dev/null @@ -1,13 +0,0 @@ ---- -Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*' -HeaderFilterRegex: 'parquet/.*' -AnalyzeTemporaryDtors: true -CheckOptions: - - key: google-readability-braces-around-statements.ShortStatementLines - value: '1' - - key: 
google-readability-function-size.StatementThreshold - value: '800' - - key: google-readability-namespace-comments.ShortNamespaceLines - value: '10' - - key: google-readability-namespace-comments.SpacesBeforeComments - value: '2' http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-tidy-ignore ---------------------------------------------------------------------- diff --git a/src/.clang-tidy-ignore b/src/.clang-tidy-ignore deleted file mode 100644 index 8b13789..0000000 --- a/src/.clang-tidy-ignore +++ /dev/null @@ -1 +0,0 @@ - http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/api/reader.h ---------------------------------------------------------------------- diff --git a/src/parquet/api/reader.h b/src/parquet/api/reader.h index 1e0c5e3..0bd4a24 100644 --- a/src/parquet/api/reader.h +++ b/src/parquet/api/reader.h @@ -20,6 +20,7 @@ // Column reader API #include "parquet/column/reader.h" +#include "parquet/column/scan-all.h" #include "parquet/exception.h" #include "parquet/file/reader.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/reader.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/reader.h b/src/parquet/column/reader.h index 25df2b4..8243e85 100644 --- a/src/parquet/column/reader.h +++ b/src/parquet/column/reader.h @@ -67,7 +67,6 @@ class PARQUET_EXPORT ColumnReader { int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels); // Read multiple repetition levels into preallocated memory - // // Returns the number of decoded repetition levels int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/scan-all.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/scan-all.h b/src/parquet/column/scan-all.h new file mode 100644 index 0000000..fd63bff --- /dev/null +++ 
b/src/parquet/column/scan-all.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_SCAN_ALL_H +#define PARQUET_SCAN_ALL_H + +#include "parquet/column/reader.h" + +template <typename RType> +int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels, + uint8_t* values, int64_t* values_buffered, parquet::ColumnReader* reader) { + typedef typename RType::T Type; + auto typed_reader = static_cast<RType*>(reader); + auto vals = reinterpret_cast<Type*>(&values[0]); + return typed_reader->ReadBatch( + batch_size, def_levels, rep_levels, vals, values_buffered); +} + +int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels, + uint8_t* values, int64_t* values_buffered, parquet::ColumnReader* reader) { + switch (reader->type()) { + case parquet::Type::BOOLEAN: + return ScanAll<parquet::BoolReader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + case parquet::Type::INT32: + return ScanAll<parquet::Int32Reader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + case parquet::Type::INT64: + return ScanAll<parquet::Int64Reader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + 
case parquet::Type::INT96: + return ScanAll<parquet::Int96Reader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + case parquet::Type::FLOAT: + return ScanAll<parquet::FloatReader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + case parquet::Type::DOUBLE: + return ScanAll<parquet::DoubleReader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + case parquet::Type::BYTE_ARRAY: + return ScanAll<parquet::ByteArrayReader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + case parquet::Type::FIXED_LEN_BYTE_ARRAY: + return ScanAll<parquet::FixedLenByteArrayReader>( + batch_size, def_levels, rep_levels, values, values_buffered, reader); + default: + parquet::ParquetException::NYI("type reader not implemented"); + } + // Unreachable code, but suppress compiler warning + return 0; +} + +#endif // PARQUET_SCAN_ALL_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/scanner.h ---------------------------------------------------------------------- diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h index bc5c5ce..4373c7a 100644 --- a/src/parquet/column/scanner.h +++ b/src/parquet/column/scanner.h @@ -48,7 +48,6 @@ class PARQUET_EXPORT Scanner { value_offset_(0), values_buffered_(0), reader_(reader) { - // TODO: don't allocate for required fields def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0); rep_levels_.resize(descr()->max_repetition_level() > 0 ? 
batch_size_ : 0); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/file/reader.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc index 50db1c0..62481f7 100644 --- a/src/parquet/file/reader.cc +++ b/src/parquet/file/reader.cc @@ -144,7 +144,7 @@ void ParquetFileReader::DebugPrint( for (auto i : selected_columns) { const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i); stream << "Column " << i << ": " << descr->name() << " (" - << type_to_string(descr->physical_type()) << ")" << std::endl; + << TypeToString(descr->physical_type()) << ")" << std::endl; } for (int r = 0; r < file_metadata->num_row_groups(); ++r) { @@ -173,10 +173,10 @@ void ParquetFileReader::DebugPrint( stream << " Statistics Not Set"; } stream << std::endl - << " compression: " << compression_to_string(column_chunk->compression()) + << " compression: " << CompressionToString(column_chunk->compression()) << ", encodings: "; for (auto encoding : column_chunk->encodings()) { - stream << encoding_to_string(encoding) << " "; + stream << EncodingToString(encoding) << " "; } stream << std::endl << " uncompressed size: " << column_chunk->total_uncompressed_size() http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/schema/types.cc ---------------------------------------------------------------------- diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc index 2e5d151..043ef00 100644 --- a/src/parquet/schema/types.cc +++ b/src/parquet/schema/types.cc @@ -101,7 +101,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio case LogicalType::JSON: case LogicalType::BSON: if (type != Type::BYTE_ARRAY) { - ss << logical_type_to_string(logical_type); + ss << LogicalTypeToString(logical_type); ss << " can only annotate BYTE_ARRAY fields"; throw ParquetException(ss.str()); } @@ -138,7 +138,7 @@ 
PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio case LogicalType::INT_16: case LogicalType::INT_32: if (type != Type::INT32) { - ss << logical_type_to_string(logical_type); + ss << LogicalTypeToString(logical_type); ss << " can only annotate INT32"; throw ParquetException(ss.str()); } @@ -149,7 +149,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio case LogicalType::UINT_64: case LogicalType::INT_64: if (type != Type::INT64) { - ss << logical_type_to_string(logical_type); + ss << LogicalTypeToString(logical_type); ss << " can only annotate INT64"; throw ParquetException(ss.str()); } @@ -167,7 +167,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio } break; default: - ss << logical_type_to_string(logical_type); + ss << LogicalTypeToString(logical_type); ss << " can not be applied to a primitive type"; throw ParquetException(ss.str()); } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc index 59ed456..06f202e 100644 --- a/src/parquet/types-test.cc +++ b/src/parquet/types-test.cc @@ -24,43 +24,41 @@ namespace parquet { TEST(TestTypeToString, PhysicalTypes) { - ASSERT_STREQ("BOOLEAN", type_to_string(Type::BOOLEAN).c_str()); - ASSERT_STREQ("INT32", type_to_string(Type::INT32).c_str()); - ASSERT_STREQ("INT64", type_to_string(Type::INT64).c_str()); - ASSERT_STREQ("INT96", type_to_string(Type::INT96).c_str()); - ASSERT_STREQ("FLOAT", type_to_string(Type::FLOAT).c_str()); - ASSERT_STREQ("DOUBLE", type_to_string(Type::DOUBLE).c_str()); - ASSERT_STREQ("BYTE_ARRAY", type_to_string(Type::BYTE_ARRAY).c_str()); - ASSERT_STREQ( - "FIXED_LEN_BYTE_ARRAY", type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str()); + ASSERT_STREQ("BOOLEAN", TypeToString(Type::BOOLEAN).c_str()); + ASSERT_STREQ("INT32", 
TypeToString(Type::INT32).c_str()); + ASSERT_STREQ("INT64", TypeToString(Type::INT64).c_str()); + ASSERT_STREQ("INT96", TypeToString(Type::INT96).c_str()); + ASSERT_STREQ("FLOAT", TypeToString(Type::FLOAT).c_str()); + ASSERT_STREQ("DOUBLE", TypeToString(Type::DOUBLE).c_str()); + ASSERT_STREQ("BYTE_ARRAY", TypeToString(Type::BYTE_ARRAY).c_str()); + ASSERT_STREQ("FIXED_LEN_BYTE_ARRAY", TypeToString(Type::FIXED_LEN_BYTE_ARRAY).c_str()); } TEST(TestLogicalTypeToString, LogicalTypes) { - ASSERT_STREQ("NONE", logical_type_to_string(LogicalType::NONE).c_str()); - ASSERT_STREQ("UTF8", logical_type_to_string(LogicalType::UTF8).c_str()); - ASSERT_STREQ( - "MAP_KEY_VALUE", logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str()); - ASSERT_STREQ("LIST", logical_type_to_string(LogicalType::LIST).c_str()); - ASSERT_STREQ("ENUM", logical_type_to_string(LogicalType::ENUM).c_str()); - ASSERT_STREQ("DECIMAL", logical_type_to_string(LogicalType::DECIMAL).c_str()); - ASSERT_STREQ("DATE", logical_type_to_string(LogicalType::DATE).c_str()); - ASSERT_STREQ("TIME_MILLIS", logical_type_to_string(LogicalType::TIME_MILLIS).c_str()); - ASSERT_STREQ("TIME_MICROS", logical_type_to_string(LogicalType::TIME_MICROS).c_str()); + ASSERT_STREQ("NONE", LogicalTypeToString(LogicalType::NONE).c_str()); + ASSERT_STREQ("UTF8", LogicalTypeToString(LogicalType::UTF8).c_str()); + ASSERT_STREQ("MAP_KEY_VALUE", LogicalTypeToString(LogicalType::MAP_KEY_VALUE).c_str()); + ASSERT_STREQ("LIST", LogicalTypeToString(LogicalType::LIST).c_str()); + ASSERT_STREQ("ENUM", LogicalTypeToString(LogicalType::ENUM).c_str()); + ASSERT_STREQ("DECIMAL", LogicalTypeToString(LogicalType::DECIMAL).c_str()); + ASSERT_STREQ("DATE", LogicalTypeToString(LogicalType::DATE).c_str()); + ASSERT_STREQ("TIME_MILLIS", LogicalTypeToString(LogicalType::TIME_MILLIS).c_str()); + ASSERT_STREQ("TIME_MICROS", LogicalTypeToString(LogicalType::TIME_MICROS).c_str()); ASSERT_STREQ( - "TIMESTAMP_MILLIS", 
logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str()); + "TIMESTAMP_MILLIS", LogicalTypeToString(LogicalType::TIMESTAMP_MILLIS).c_str()); ASSERT_STREQ( - "TIMESTAMP_MICROS", logical_type_to_string(LogicalType::TIMESTAMP_MICROS).c_str()); - ASSERT_STREQ("UINT_8", logical_type_to_string(LogicalType::UINT_8).c_str()); - ASSERT_STREQ("UINT_16", logical_type_to_string(LogicalType::UINT_16).c_str()); - ASSERT_STREQ("UINT_32", logical_type_to_string(LogicalType::UINT_32).c_str()); - ASSERT_STREQ("UINT_64", logical_type_to_string(LogicalType::UINT_64).c_str()); - ASSERT_STREQ("INT_8", logical_type_to_string(LogicalType::INT_8).c_str()); - ASSERT_STREQ("INT_16", logical_type_to_string(LogicalType::INT_16).c_str()); - ASSERT_STREQ("INT_32", logical_type_to_string(LogicalType::INT_32).c_str()); - ASSERT_STREQ("INT_64", logical_type_to_string(LogicalType::INT_64).c_str()); - ASSERT_STREQ("JSON", logical_type_to_string(LogicalType::JSON).c_str()); - ASSERT_STREQ("BSON", logical_type_to_string(LogicalType::BSON).c_str()); - ASSERT_STREQ("INTERVAL", logical_type_to_string(LogicalType::INTERVAL).c_str()); + "TIMESTAMP_MICROS", LogicalTypeToString(LogicalType::TIMESTAMP_MICROS).c_str()); + ASSERT_STREQ("UINT_8", LogicalTypeToString(LogicalType::UINT_8).c_str()); + ASSERT_STREQ("UINT_16", LogicalTypeToString(LogicalType::UINT_16).c_str()); + ASSERT_STREQ("UINT_32", LogicalTypeToString(LogicalType::UINT_32).c_str()); + ASSERT_STREQ("UINT_64", LogicalTypeToString(LogicalType::UINT_64).c_str()); + ASSERT_STREQ("INT_8", LogicalTypeToString(LogicalType::INT_8).c_str()); + ASSERT_STREQ("INT_16", LogicalTypeToString(LogicalType::INT_16).c_str()); + ASSERT_STREQ("INT_32", LogicalTypeToString(LogicalType::INT_32).c_str()); + ASSERT_STREQ("INT_64", LogicalTypeToString(LogicalType::INT_64).c_str()); + ASSERT_STREQ("JSON", LogicalTypeToString(LogicalType::JSON).c_str()); + ASSERT_STREQ("BSON", LogicalTypeToString(LogicalType::BSON).c_str()); + ASSERT_STREQ("INTERVAL", 
LogicalTypeToString(LogicalType::INTERVAL).c_str()); } TEST(TypePrinter, StatisticsTypes) { http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types.cc ---------------------------------------------------------------------- diff --git a/src/parquet/types.cc b/src/parquet/types.cc index 7fc5017..97c769b 100644 --- a/src/parquet/types.cc +++ b/src/parquet/types.cc @@ -62,7 +62,7 @@ std::string FormatStatValue(Type::type parquet_type, const char* val) { return result.str(); } -std::string encoding_to_string(Encoding::type t) { +std::string EncodingToString(Encoding::type t) { switch (t) { case Encoding::PLAIN: return "PLAIN"; @@ -94,7 +94,7 @@ std::string encoding_to_string(Encoding::type t) { } } -std::string compression_to_string(Compression::type t) { +std::string CompressionToString(Compression::type t) { switch (t) { case Compression::UNCOMPRESSED: return "UNCOMPRESSED"; @@ -114,7 +114,7 @@ std::string compression_to_string(Compression::type t) { } } -std::string type_to_string(Type::type t) { +std::string TypeToString(Type::type t) { switch (t) { case Type::BOOLEAN: return "BOOLEAN"; @@ -146,7 +146,7 @@ std::string type_to_string(Type::type t) { } } -std::string logical_type_to_string(LogicalType::type t) { +std::string LogicalTypeToString(LogicalType::type t) { switch (t) { case LogicalType::NONE: return "NONE"; @@ -196,4 +196,29 @@ std::string logical_type_to_string(LogicalType::type t) { return "UNKNOWN"; } } + +int GetTypeByteSize(Type::type parquet_type) { + switch (parquet_type) { + case Type::BOOLEAN: + return type_traits<BooleanType::type_num>::value_byte_size; + case Type::INT32: + return type_traits<Int32Type::type_num>::value_byte_size; + case Type::INT64: + return type_traits<Int64Type::type_num>::value_byte_size; + case Type::INT96: + return type_traits<Int96Type::type_num>::value_byte_size; + case Type::DOUBLE: + return type_traits<DoubleType::type_num>::value_byte_size; + case Type::FLOAT: + return 
type_traits<FloatType::type_num>::value_byte_size; + case Type::BYTE_ARRAY: + return type_traits<ByteArrayType::type_num>::value_byte_size; + case Type::FIXED_LEN_BYTE_ARRAY: + return type_traits<FLBAType::type_num>::value_byte_size; + default: + return 0; + } + return 0; +} + } // namespace parquet http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types.h ---------------------------------------------------------------------- diff --git a/src/parquet/types.h b/src/parquet/types.h index cb67820..a4285be 100644 --- a/src/parquet/types.h +++ b/src/parquet/types.h @@ -269,15 +269,17 @@ inline std::string format_fwf(int width) { return ss.str(); } -std::string compression_to_string(Compression::type t); +std::string CompressionToString(Compression::type t); -std::string encoding_to_string(Encoding::type t); +std::string EncodingToString(Encoding::type t); -std::string logical_type_to_string(LogicalType::type t); +std::string LogicalTypeToString(LogicalType::type t); -std::string type_to_string(Type::type t); +std::string TypeToString(Type::type t); std::string FormatStatValue(Type::type parquet_type, const char* val); + +int GetTypeByteSize(Type::type t); } // namespace parquet #endif // PARQUET_TYPES_H http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt new file mode 100644 index 0000000..5c4eaa8 --- /dev/null +++ b/tools/CMakeLists.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# Third-party static libraries every command-line tool links against.
set(LINK_LIBS
  snappystatic
  thriftstatic)

# The tools are only built when executables are enabled. Each tool is a
# single source file named after its target.
if (PARQUET_BUILD_EXECUTABLES)
  foreach(TOOL_NAME parquet-dump-schema parquet_reader parquet-scan)
    add_executable(${TOOL_NAME} ${TOOL_NAME}.cc)
    target_link_libraries(${TOOL_NAME} ${LINK_LIBS}
      parquet_static)
  endforeach()
endif()
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> + +#include "parquet/api/reader.h" +#include "parquet/api/schema.h" + +int main(int argc, char** argv) { + std::string filename = argv[1]; + + try { + std::unique_ptr<parquet::ParquetFileReader> reader = + parquet::ParquetFileReader::OpenFile(filename); + PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout); + } catch (const std::exception& e) { + std::cerr << "Parquet error: " << e.what() << std::endl; + return -1; + } + + return 0; +} http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet-scan.cc ---------------------------------------------------------------------- diff --git a/tools/parquet-scan.cc b/tools/parquet-scan.cc new file mode 100644 index 0000000..d146a1d --- /dev/null +++ b/tools/parquet-scan.cc @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <ctime> +#include <iostream> +#include <memory> +#include <list> + +#include "parquet/api/reader.h" + +int main(int argc, char** argv) { + if (argc > 4 || argc < 1) { + std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] 
<file>" + << std::endl; + return -1; + } + + std::string filename; + + // Read command-line options + int batch_size = 256; + const std::string COLUMNS_PREFIX = "--columns="; + const std::string BATCH_SIZE_PREFIX = "--batch-size="; + std::vector<int> columns; + int num_columns = 0; + + char *param, *value; + for (int i = 1; i < argc; i++) { + if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) { + value = std::strtok(param + COLUMNS_PREFIX.length(), ","); + while (value) { + columns.push_back(std::atoi(value)); + value = std::strtok(nullptr, ","); + num_columns++; + } + } else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) { + value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " "); + if (value) { batch_size = std::atoi(value); } + } else { + filename = argv[i]; + } + } + + std::vector<int16_t> rep_levels(batch_size); + std::vector<int16_t> def_levels(batch_size); + try { + double total_time; + std::clock_t start_time = std::clock(); + std::unique_ptr<parquet::ParquetFileReader> reader = + parquet::ParquetFileReader::OpenFile(filename); + // columns are not specified explicitly. 
Add all columns + if (num_columns == 0) { + num_columns = reader->metadata()->num_columns(); + columns.resize(num_columns); + for (int i = 0; i < num_columns; i++) { + columns[i] = i; + } + } + + int64_t total_rows[num_columns]; + + for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) { + auto group_reader = reader->RowGroup(r); + int col = 0; + for (auto i : columns) { + total_rows[col] = 0; + std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i); + size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type()); + std::vector<uint8_t> values(batch_size * value_byte_size); + + int64_t values_read = 0; + while (col_reader->HasNext()) { + total_rows[col] += ScanAllValues(batch_size, def_levels.data(), + rep_levels.data(), values.data(), &values_read, col_reader.get()); + } + col++; + } + } + + total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC); + for (int ct = 1; ct < num_columns; ++ct) { + if (total_rows[0] != total_rows[ct]) { + std::cerr << "Parquet error: Total rows among columns do not match" << std::endl; + } + } + std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds." + << std::endl; + } catch (const std::exception& e) { + std::cerr << "Parquet error: " << e.what() << std::endl; + return -1; + } + + return 0; +} http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet_reader.cc ---------------------------------------------------------------------- diff --git a/tools/parquet_reader.cc b/tools/parquet_reader.cc new file mode 100644 index 0000000..ced84d5 --- /dev/null +++ b/tools/parquet_reader.cc @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <iostream> +#include <memory> +#include <list> + +#include "parquet/api/reader.h" + +int main(int argc, char** argv) { + if (argc > 5 || argc < 2) { + std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] " + "[--columns=...] <file>" + << std::endl; + return -1; + } + + std::string filename; + bool print_values = true; + bool memory_map = true; + + // Read command-line options + const std::string COLUMNS_PREFIX = "--columns="; + std::list<int> columns; + + char *param, *value; + for (int i = 1; i < argc; i++) { + if ((param = std::strstr(argv[i], "--only-metadata"))) { + print_values = false; + } else if ((param = std::strstr(argv[i], "--no-memory-map"))) { + memory_map = false; + } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) { + value = std::strtok(param + COLUMNS_PREFIX.length(), ","); + while (value) { + columns.push_back(std::atoi(value)); + value = std::strtok(nullptr, ","); + } + } else { + filename = argv[i]; + } + } + + try { + std::unique_ptr<parquet::ParquetFileReader> reader = + parquet::ParquetFileReader::OpenFile(filename, memory_map); + reader->DebugPrint(std::cout, columns, print_values); + } catch (const std::exception& e) { + std::cerr << "Parquet error: " << e.what() << std::endl; + return -1; + } + + return 0; +}
