parquet-cpp git commit: PARQUET-681: Add tool to scan a parquet file

wesm Mon, 05 Sep 2016 12:50:18 -0700

Repository: parquet-cpp
Updated Branches:
  refs/heads/master fa6e47616 -> c67ee3e52



PARQUET-681: Add tool to scan a parquet file

Added a ReadBatchValues() API to the Column class.
Added a parquet-scan tool
Separated examples into benchmarks/tools
added clang tidy and clang format to benchmarks and tools

Author: Deepak Majeti <[email protected]>

Closes #144 from majetideepak/parquetscan and squashes the following commits:

cc7f183 [Deepak Majeti] Removed GetRemainingInPage API
44da480 [Deepak Majeti] add scan all in public api
20829b8 [Deepak Majeti] clang-format
da62354 [Deepak Majeti] ScanAllValues API
e385f61 [Deepak Majeti] put clang-* in the root directory
9ff785c [Deepak Majeti] use c++ random
d854bde [Deepak Majeti] parquet scan tool


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c67ee3e5
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c67ee3e5
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c67ee3e5

Branch: refs/heads/master
Commit: c67ee3e521b93531f4b59d3183a04ad5d316cbae
Parents: fa6e476
Author: Deepak Majeti <[email protected]>
Authored: Mon Sep 5 15:49:36 2016 -0400
Committer: Wes McKinney <[email protected]>
Committed: Mon Sep 5 15:49:36 2016 -0400

----------------------------------------------------------------------
 .clang-format                  |  65 +++++
 .clang-tidy                    |  13 +
 .clang-tidy-ignore             |   1 +
 CMakeLists.txt                 |  15 +-
 benchmarks/CMakeLists.txt      |  29 +++
 benchmarks/decode_benchmark.cc | 461 ++++++++++++++++++++++++++++++++++++
 example/CMakeLists.txt         |  36 ---
 example/decode_benchmark.cc    | 460 -----------------------------------
 example/parquet-dump-schema.cc |  39 ---
 example/parquet_reader.cc      |  70 ------
 src/.clang-format              |  65 -----
 src/.clang-tidy                |  13 -
 src/.clang-tidy-ignore         |   1 -
 src/parquet/api/reader.h       |   1 +
 src/parquet/column/reader.h    |   1 -
 src/parquet/column/scan-all.h  |  67 ++++++
 src/parquet/column/scanner.h   |   1 -
 src/parquet/file/reader.cc     |   6 +-
 src/parquet/schema/types.cc    |   8 +-
 src/parquet/types-test.cc      |  62 +++--
 src/parquet/types.cc           |  33 ++-
 src/parquet/types.h            |  10 +-
 tools/CMakeLists.txt           |  34 +++
 tools/parquet-dump-schema.cc   |  36 +++
 tools/parquet-scan.cc          | 108 +++++++++
 tools/parquet_reader.cc        |  67 ++++++
 26 files changed, 962 insertions(+), 740 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-format
----------------------------------------------------------------------
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..7d5b3cf
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,65 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: false 
+AlignConsecutiveAssignments: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true 
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false 
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true 
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit: 90 
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false 
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1000
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-tidy
----------------------------------------------------------------------
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..6fc3742
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,13 @@
+---
+Checks:          'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*'
+HeaderFilterRegex: 'parquet/.*'
+AnalyzeTemporaryDtors: true
+CheckOptions:
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '1'
+  - key:             google-readability-function-size.StatementThreshold
+    value:           '800'
+  - key:             google-readability-namespace-comments.ShortNamespaceLines
+    value:           '10'
+  - key:             google-readability-namespace-comments.SpacesBeforeComments
+    value:           '2'

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/.clang-tidy-ignore b/.clang-tidy-ignore
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/.clang-tidy-ignore
@@ -0,0 +1 @@
+

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c26e79..86d0684 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -404,7 +404,7 @@ if (UNIX)
   --verbose=2
   --linelength=90
   --filter=-whitespace/comments,-readability/todo,-build/header_guard,-runtime/references,-readability/check,-build/c++11
-    `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`)
+    `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`)
 endif (UNIX)
 
 ############################################################
@@ -414,11 +414,11 @@ endif (UNIX)
 if (${CLANG_FORMAT_FOUND})
   # runs clang format and updates files in place.
   add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1
-  `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+  `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
 
   # runs clang format and exits with a non-zero exit code if any files need to be reformatted
   add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0
-  `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+  `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
 endif()
 
 
@@ -429,10 +429,10 @@ endif()
 if (${CLANG_TIDY_FOUND})
   # runs clang-tidy and attempts to fix any warning automatically
   add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1
-  `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_types/g' | sed -e '/_constants/g'`)
+  `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc | sed -e '/_types/g' | sed -e '/_constants/g'`)
   # runs clang-tidy and exits with a non-zero exit code if any errors are found.
   add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json
-  0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore`)
+  0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy-ignore`)
 
 endif()
 
@@ -575,11 +575,12 @@ add_subdirectory(src/parquet/schema)
 add_subdirectory(src/parquet/thrift)
 add_subdirectory(src/parquet/util)
 
-# Ensure that thrift compilation is done befor using its generated headers
+# Ensure that thrift compilation is done before using its generated headers
 # in parquet code.
 add_dependencies(parquet_objlib parquet_thrift)
 
-add_subdirectory(example)
+add_subdirectory(benchmarks)
+add_subdirectory(tools)
 
 add_custom_target(clean-all
    COMMAND ${CMAKE_BUILD_TOOL} clean

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/benchmarks/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..1df5dea
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+SET(LINK_LIBS
+  snappystatic
+  thriftstatic)
+
+if (PARQUET_BUILD_EXECUTABLES)
+  add_executable(decode_benchmark decode_benchmark.cc)
+
+  # This uses private APIs
+  target_link_libraries(decode_benchmark ${LINK_LIBS}
+       parquet_static)
+
+endif()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/benchmarks/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc
new file mode 100644
index 0000000..d3748ca
--- /dev/null
+++ b/benchmarks/decode_benchmark.cc
@@ -0,0 +1,461 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <stdio.h>
+#include <iostream>
+#include <random>
+
+#include "parquet/compression/codec.h"
+#include "parquet/encodings/plain-encoding.h"
+#include "parquet/encodings/dictionary-encoding.h"
+#include "parquet/encodings/delta-bit-pack-encoding.h"
+#include "parquet/encodings/delta-byte-array-encoding.h"
+#include "parquet/encodings/delta-length-byte-array-encoding.h"
+#include "parquet/util/stopwatch.h"
+
+/**
+ * Test bed for encodings and some utilities to measure their throughput.
+ * TODO: this file needs some major cleanup.
+ */
+
+class DeltaBitPackEncoder {
+ public:
+  explicit DeltaBitPackEncoder(int mini_block_size = 8) {
+    mini_block_size_ = mini_block_size;
+  }
+
+  void Add(int64_t v) { values_.push_back(v); }
+
+  uint8_t* Encode(int* encoded_len) {
+    uint8_t* result = new uint8_t[10 * 1024 * 1024];
+    int num_mini_blocks = parquet::BitUtil::Ceil(num_values() - 1, mini_block_size_);
+    uint8_t* mini_block_widths = NULL;
+
+    parquet::BitWriter writer(result, 10 * 1024 * 1024);
+
+    // Writer the size of each block. We only use 1 block currently.
+    writer.PutVlqInt(num_mini_blocks * mini_block_size_);
+
+    // Write the number of mini blocks.
+    writer.PutVlqInt(num_mini_blocks);
+
+    // Write the number of values.
+    writer.PutVlqInt(num_values() - 1);
+
+    // Write the first value.
+    writer.PutZigZagVlqInt(values_[0]);
+
+    // Compute the values as deltas and the min delta.
+    int64_t min_delta = std::numeric_limits<int64_t>::max();
+    for (int i = values_.size() - 1; i > 0; --i) {
+      values_[i] -= values_[i - 1];
+      min_delta = std::min(min_delta, values_[i]);
+    }
+
+    // Write out the min delta.
+    writer.PutZigZagVlqInt(min_delta);
+
+    // We need to save num_mini_blocks bytes to store the bit widths of the mini
+    // blocks.
+    mini_block_widths = writer.GetNextBytePtr(num_mini_blocks);
+
+    int idx = 1;
+    for (int i = 0; i < num_mini_blocks; ++i) {
+      int n = std::min(mini_block_size_, num_values() - idx);
+
+      // Compute the max delta in this mini block.
+      int64_t max_delta = std::numeric_limits<int64_t>::min();
+      for (int j = 0; j < n; ++j) {
+        max_delta = std::max(values_[idx + j], max_delta);
+      }
+
+      // The bit width for this block is the number of bits needed to store
+      // (max_delta - min_delta).
+      int bit_width = parquet::BitUtil::NumRequiredBits(max_delta - min_delta);
+      mini_block_widths[i] = bit_width;
+
+      // Encode this mini blocking using min_delta and bit_width
+      for (int j = 0; j < n; ++j) {
+        writer.PutValue(values_[idx + j] - min_delta, bit_width);
+      }
+
+      // Pad out the last block.
+      for (int j = n; j < mini_block_size_; ++j) {
+        writer.PutValue(0, bit_width);
+      }
+      idx += n;
+    }
+
+    writer.Flush();
+    *encoded_len = writer.bytes_written();
+    return result;
+  }
+
+  int num_values() const { return values_.size(); }
+
+ private:
+  int mini_block_size_;
+  std::vector<int64_t> values_;
+};
+
+class DeltaLengthByteArrayEncoder {
+ public:
+  explicit DeltaLengthByteArrayEncoder(int mini_block_size = 8)
+      : len_encoder_(mini_block_size),
+        buffer_(new uint8_t[10 * 1024 * 1024]),
+        offset_(0),
+        plain_encoded_len_(0) {}
+
+  void Add(const std::string& s) {
+    Add(reinterpret_cast<const uint8_t*>(s.data()), s.size());
+  }
+
+  void Add(const uint8_t* ptr, int len) {
+    plain_encoded_len_ += len + sizeof(int);
+    len_encoder_.Add(len);
+    memcpy(buffer_ + offset_, ptr, len);
+    offset_ += len;
+  }
+
+  uint8_t* Encode(int* encoded_len) {
+    uint8_t* encoded_lengths = len_encoder_.Encode(encoded_len);
+    memmove(buffer_ + *encoded_len + sizeof(int), buffer_, offset_);
+    memcpy(buffer_, encoded_len, sizeof(int));
+    memcpy(buffer_ + sizeof(int), encoded_lengths, *encoded_len);
+    *encoded_len += offset_ + sizeof(int);
+    return buffer_;
+  }
+
+  int num_values() const { return len_encoder_.num_values(); }
+  int plain_encoded_len() const { return plain_encoded_len_; }
+
+ private:
+  DeltaBitPackEncoder len_encoder_;
+  uint8_t* buffer_;
+  int offset_;
+  int plain_encoded_len_;
+};
+
+class DeltaByteArrayEncoder {
+ public:
+  DeltaByteArrayEncoder() : plain_encoded_len_(0) {}
+
+  void Add(const std::string& s) {
+    plain_encoded_len_ += s.size() + sizeof(int);
+    int min_len = std::min(s.size(), last_value_.size());
+    int prefix_len = 0;
+    for (int i = 0; i < min_len; ++i) {
+      if (s[i] == last_value_[i]) {
+        ++prefix_len;
+      } else {
+        break;
+      }
+    }
+    prefix_len_encoder_.Add(prefix_len);
+    suffix_encoder_.Add(
+        reinterpret_cast<const uint8_t*>(s.data()) + prefix_len, s.size() - prefix_len);
+    last_value_ = s;
+  }
+
+  uint8_t* Encode(int* encoded_len) {
+    int prefix_buffer_len;
+    uint8_t* prefix_buffer = prefix_len_encoder_.Encode(&prefix_buffer_len);
+    int suffix_buffer_len;
+    uint8_t* suffix_buffer = suffix_encoder_.Encode(&suffix_buffer_len);
+
+    uint8_t* buffer = new uint8_t[10 * 1024 * 1024];
+    memcpy(buffer, &prefix_buffer_len, sizeof(int));
+    memcpy(buffer + sizeof(int), prefix_buffer, prefix_buffer_len);
+    memcpy(buffer + sizeof(int) + prefix_buffer_len, suffix_buffer, suffix_buffer_len);
+    *encoded_len = sizeof(int) + prefix_buffer_len + suffix_buffer_len;
+    return buffer;
+  }
+
+  int num_values() const { return prefix_len_encoder_.num_values(); }
+  int plain_encoded_len() const { return plain_encoded_len_; }
+
+ private:
+  DeltaBitPackEncoder prefix_len_encoder_;
+  DeltaLengthByteArrayEncoder suffix_encoder_;
+  std::string last_value_;
+  int plain_encoded_len_;
+};
+
+uint64_t TestPlainIntEncoding(const uint8_t* data, int num_values, int batch_size) {
+  uint64_t result = 0;
+  parquet::PlainDecoder<parquet::Int64Type> decoder(nullptr);
+  decoder.SetData(num_values, data, num_values * sizeof(int64_t));
+  int64_t values[batch_size];
+  for (int i = 0; i < num_values;) {
+    int n = decoder.Decode(values, batch_size);
+    for (int j = 0; j < n; ++j) {
+      result += values[j];
+    }
+    i += n;
+  }
+  return result;
+}
+
+uint64_t TestBinaryPackedEncoding(const char* name, const std::vector<int64_t>& values,
+    int benchmark_iters = -1, int benchmark_batch_size = 1) {
+  int mini_block_size;
+  if (values.size() < 8) {
+    mini_block_size = 8;
+  } else if (values.size() < 16) {
+    mini_block_size = 16;
+  } else {
+    mini_block_size = 32;
+  }
+  parquet::DeltaBitPackDecoder<parquet::Int64Type> decoder(nullptr);
+  DeltaBitPackEncoder encoder(mini_block_size);
+  for (size_t i = 0; i < values.size(); ++i) {
+    encoder.Add(values[i]);
+  }
+
+  int raw_len = encoder.num_values() * sizeof(int);
+  int len;
+  uint8_t* buffer = encoder.Encode(&len);
+
+  if (benchmark_iters == -1) {
+    printf("%s\n", name);
+    printf("  Raw len: %d\n", raw_len);
+    printf("  Encoded len: %d (%0.2f%%)\n", len, len * 100 / static_cast<float>(raw_len));
+    decoder.SetData(encoder.num_values(), buffer, len);
+    for (int i = 0; i < encoder.num_values(); ++i) {
+      int64_t x = 0;
+      decoder.Decode(&x, 1);
+      if (values[i] != x) {
+        std::cerr << "Bad: " << i << std::endl;
+        std::cerr << "  " << x << " != " << values[i] << std::endl;
+        break;
+      }
+    }
+    return 0;
+  } else {
+    printf("%s\n", name);
+    printf("  Raw len: %d\n", raw_len);
+    printf("  Encoded len: %d (%0.2f%%)\n", len, len * 100 / static_cast<float>(raw_len));
+
+    uint64_t result = 0;
+    int64_t buf[benchmark_batch_size];
+    parquet::StopWatch sw;
+    sw.Start();
+    for (int k = 0; k < benchmark_iters; ++k) {
+      decoder.SetData(encoder.num_values(), buffer, len);
+      for (size_t i = 0; i < values.size();) {
+        int n = decoder.Decode(buf, benchmark_batch_size);
+        for (int j = 0; j < n; ++j) {
+          result += buf[j];
+        }
+        i += n;
+      }
+    }
+    uint64_t elapsed = sw.Stop();
+    double num_ints = values.size() * benchmark_iters * 1000.;
+    printf("%s rate (batch size = %2d): %0.3fM per second.\n", name, benchmark_batch_size,
+        num_ints / elapsed);
+    return result;
+  }
+}
+
+#define TEST(NAME, FN, DATA, BATCH_SIZE)                                       \
+  sw.Start();                                                                  \
+  for (int i = 0; i < NUM_ITERS; ++i) {                                        \
+    FN(reinterpret_cast<uint8_t*>(&DATA[0]), NUM_VALUES, BATCH_SIZE);          \
+  }                                                                            \
+  elapsed = sw.Stop();                                                         \
+  printf("%s rate (batch size = %2d): %0.3fM per second.\n", NAME, BATCH_SIZE, \
+      mult / elapsed);
+
+void TestPlainIntCompressed(parquet::Codec* codec, const std::vector<int64_t>& data,
+    int num_iters, int batch_size) {
+  const uint8_t* raw_data = reinterpret_cast<const uint8_t*>(&data[0]);
+  int uncompressed_len = data.size() * sizeof(int64_t);
+  uint8_t* decompressed_data = new uint8_t[uncompressed_len];
+
+  int max_compressed_size = codec->MaxCompressedLen(uncompressed_len, raw_data);
+  uint8_t* compressed_data = new uint8_t[max_compressed_size];
+  int compressed_len =
+      codec->Compress(uncompressed_len, raw_data, max_compressed_size, compressed_data);
+
+  printf("\n%s:\n  Uncompressed len: %d\n  Compressed len:   %d\n", codec->name(),
+      uncompressed_len, compressed_len);
+
+  double mult = num_iters * data.size() * 1000.;
+  parquet::StopWatch sw;
+  sw.Start();
+  uint64_t r = 0;
+  for (int i = 0; i < num_iters; ++i) {
+    codec->Decompress(
+        compressed_len, compressed_data, uncompressed_len, decompressed_data);
+    r += TestPlainIntEncoding(decompressed_data, data.size(), batch_size);
+  }
+  int64_t elapsed = sw.Stop();
+  printf("Compressed(%s) plain int rate (batch size = %2d): %0.3fM per second.\n",
+      codec->name(), batch_size, mult / elapsed);
+
+  delete[] compressed_data;
+  delete[] decompressed_data;
+}
+
+void TestBinaryPacking() {
+  std::vector<int64_t> values;
+  values.clear();
+  for (int i = 0; i < 100; ++i)
+    values.push_back(0);
+  TestBinaryPackedEncoding("Zeros", values);
+
+  values.clear();
+  for (int i = 1; i <= 5; ++i)
+    values.push_back(i);
+  TestBinaryPackedEncoding("Example 1", values);
+
+  values.clear();
+  values.push_back(7);
+  values.push_back(5);
+  values.push_back(3);
+  values.push_back(1);
+  values.push_back(2);
+  values.push_back(3);
+  values.push_back(4);
+  values.push_back(5);
+  TestBinaryPackedEncoding("Example 2", values);
+
+  // Test rand ints between 0 and 10K
+  values.clear();
+  int seed = 0;
+  std::mt19937 gen(seed);
+  std::uniform_int_distribution<int> d(0, 10000);
+  for (int i = 0; i < 500000; ++i) {
+    values.push_back(d(gen));
+  }
+  TestBinaryPackedEncoding("Rand [0, 10000)", values);
+
+  // Test rand ints between 0 and 100
+  values.clear();
+  std::uniform_int_distribution<int> d1(0, 100);
+  for (int i = 0; i < 500000; ++i) {
+    values.push_back(d1(gen));
+  }
+  TestBinaryPackedEncoding("Rand [0, 100)", values);
+}
+
+void TestDeltaLengthByteArray() {
+  parquet::DeltaLengthByteArrayDecoder decoder(nullptr);
+  DeltaLengthByteArrayEncoder encoder;
+
+  std::vector<std::string> values;
+  values.push_back("Hello");
+  values.push_back("World");
+  values.push_back("Foobar");
+  values.push_back("ABCDEF");
+
+  for (size_t i = 0; i < values.size(); ++i) {
+    encoder.Add(values[i]);
+  }
+
+  int len = 0;
+  uint8_t* buffer = encoder.Encode(&len);
+  printf("DeltaLengthByteArray\n  Raw len: %d\n  Encoded len: %d\n",
+      encoder.plain_encoded_len(), len);
+  decoder.SetData(encoder.num_values(), buffer, len);
+  for (int i = 0; i < encoder.num_values(); ++i) {
+    parquet::ByteArray v = {0, NULL};
+    decoder.Decode(&v, 1);
+    std::string r = std::string(reinterpret_cast<const char*>(v.ptr), v.len);
+    if (r != values[i]) { std::cout << "Bad " << r << " != " << values[i] << std::endl; }
+  }
+}
+
+void TestDeltaByteArray() {
+  parquet::DeltaByteArrayDecoder decoder(nullptr);
+  DeltaByteArrayEncoder encoder;
+
+  std::vector<std::string> values;
+
+  // Wikipedia example
+  values.push_back("myxa");
+  values.push_back("myxophyta");
+  values.push_back("myxopod");
+  values.push_back("nab");
+  values.push_back("nabbed");
+  values.push_back("nabbing");
+  values.push_back("nabit");
+  values.push_back("nabk");
+  values.push_back("nabob");
+  values.push_back("nacarat");
+  values.push_back("nacelle");
+
+  for (size_t i = 0; i < values.size(); ++i) {
+    encoder.Add(values[i]);
+  }
+
+  int len = 0;
+  uint8_t* buffer = encoder.Encode(&len);
+  printf("DeltaLengthByteArray\n  Raw len: %d\n  Encoded len: %d\n",
+      encoder.plain_encoded_len(), len);
+  decoder.SetData(encoder.num_values(), buffer, len);
+  for (int i = 0; i < encoder.num_values(); ++i) {
+    parquet::ByteArray v;
+    decoder.Decode(&v, 1);
+    std::string r = std::string(reinterpret_cast<const char*>(v.ptr), v.len);
+    if (r != values[i]) { std::cout << "Bad " << r << " != " << values[i] << std::endl; }
+  }
+}
+
+int main(int argc, char** argv) {
+  TestBinaryPacking();
+  TestDeltaLengthByteArray();
+  TestDeltaByteArray();
+
+  parquet::StopWatch sw;
+  uint64_t elapsed = 0;
+
+  const int NUM_VALUES = 1024 * 1024;
+  const int NUM_ITERS = 10;
+  const double mult = NUM_VALUES * NUM_ITERS * 1000.;
+
+  std::vector<int64_t> plain_int_data;
+  plain_int_data.resize(NUM_VALUES);
+
+  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 1);
+  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 16);
+  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 32);
+  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 64);
+
+  // Test rand ints between 0 and 10K
+  std::vector<int64_t> values;
+  int seed = 0;
+  std::mt19937 gen(seed);
+  std::uniform_int_distribution<int> d(0, 10000);
+  for (int i = 0; i < 1000000; ++i) {
+    values.push_back(d(gen));
+  }
+  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 1);
+  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 16);
+  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 32);
+  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 64);
+
+  parquet::SnappyCodec snappy_codec;
+
+  TestPlainIntCompressed(&snappy_codec, values, 100, 1);
+  TestPlainIntCompressed(&snappy_codec, values, 100, 16);
+  TestPlainIntCompressed(&snappy_codec, values, 100, 32);
+  TestPlainIntCompressed(&snappy_codec, values, 100, 64);
+
+  return 0;
+}

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
deleted file mode 100644
index d622d09..0000000
--- a/example/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-SET(LINK_LIBS
-  snappystatic
-  thriftstatic)
-
-if (PARQUET_BUILD_EXECUTABLES)
-  add_executable(decode_benchmark decode_benchmark.cc)
-
-  # This uses private APIs
-  target_link_libraries(decode_benchmark ${LINK_LIBS}
-       parquet_static)
-
-  add_executable(parquet-dump-schema parquet-dump-schema.cc)
-  target_link_libraries(parquet-dump-schema ${LINK_LIBS}
-       parquet_static)
-
-  add_executable(parquet_reader parquet_reader.cc)
-  target_link_libraries(parquet_reader ${LINK_LIBS}
-       parquet_static)
-endif()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/example/decode_benchmark.cc b/example/decode_benchmark.cc
deleted file mode 100644
index 4eb1975..0000000
--- a/example/decode_benchmark.cc
+++ /dev/null
@@ -1,460 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <iostream>
-#include <stdio.h>
-
-#include "parquet/compression/codec.h"
-#include "parquet/encodings/plain-encoding.h"
-#include "parquet/encodings/dictionary-encoding.h"
-#include "parquet/encodings/delta-bit-pack-encoding.h"
-#include "parquet/encodings/delta-byte-array-encoding.h"
-#include "parquet/encodings/delta-length-byte-array-encoding.h"
-#include "parquet/util/stopwatch.h"
-
-using namespace parquet;
-using namespace std;
-
-/**
- * Test bed for encodings and some utilities to measure their throughput.
- * TODO: this file needs some major cleanup.
- */
-
-class DeltaBitPackEncoder {
- public:
-  DeltaBitPackEncoder(int mini_block_size = 8) {
-    mini_block_size_ = mini_block_size;
-  }
-
-  void Add(int64_t v) {
-    values_.push_back(v);
-  }
-
-  uint8_t* Encode(int* encoded_len) {
-    uint8_t* result = new uint8_t[10 * 1024 * 1024];
-    int num_mini_blocks = BitUtil::Ceil(num_values() - 1, mini_block_size_);
-    uint8_t* mini_block_widths = NULL;
-
-    BitWriter writer(result, 10 * 1024 * 1024);
-
-    // Writer the size of each block. We only use 1 block currently.
-    writer.PutVlqInt(num_mini_blocks * mini_block_size_);
-
-    // Write the number of mini blocks.
-    writer.PutVlqInt(num_mini_blocks);
-
-    // Write the number of values.
-    writer.PutVlqInt(num_values() - 1);
-
-    // Write the first value.
-    writer.PutZigZagVlqInt(values_[0]);
-
-    // Compute the values as deltas and the min delta.
-    int64_t min_delta = numeric_limits<int64_t>::max();
-    for (int i = values_.size() - 1; i > 0; --i) {
-      values_[i] -= values_[i - 1];
-      min_delta = min(min_delta, values_[i]);
-    }
-
-    // Write out the min delta.
-    writer.PutZigZagVlqInt(min_delta);
-
-    // We need to save num_mini_blocks bytes to store the bit widths of the mini blocks.
-    mini_block_widths = writer.GetNextBytePtr(num_mini_blocks);
-
-    int idx = 1;
-    for (int i = 0; i < num_mini_blocks; ++i) {
-      int n = min(mini_block_size_, num_values() - idx);
-
-      // Compute the max delta in this mini block.
-      int64_t max_delta = numeric_limits<int64_t>::min();
-      for (int j = 0; j < n; ++j) {
-        max_delta = max(values_[idx + j], max_delta);
-      }
-
-      // The bit width for this block is the number of bits needed to store
-      // (max_delta - min_delta).
-      int bit_width = BitUtil::NumRequiredBits(max_delta - min_delta);
-      mini_block_widths[i] = bit_width;
-
-      // Encode this mini blocking using min_delta and bit_width
-      for (int j = 0; j < n; ++j) {
-        writer.PutValue(values_[idx + j] - min_delta, bit_width);
-      }
-
-      // Pad out the last block.
-      for (int j = n; j < mini_block_size_; ++j) {
-        writer.PutValue(0, bit_width);
-      }
-      idx += n;
-    }
-
-    writer.Flush();
-    *encoded_len = writer.bytes_written();
-    return result;
-  }
-
-  int num_values() const { return values_.size(); }
-
- private:
-  int mini_block_size_;
-  vector<int64_t> values_;
-};
-
-class DeltaLengthByteArrayEncoder {
- public:
-  DeltaLengthByteArrayEncoder(int mini_block_size = 8) :
-    len_encoder_(mini_block_size),
-    buffer_(new uint8_t[10 * 1024 * 1024]),
-    offset_(0),
-    plain_encoded_len_(0) {
-  }
-
-  void Add(const string& s) {
-    Add(reinterpret_cast<const uint8_t*>(s.data()), s.size());
-  }
-
-  void Add(const uint8_t* ptr, int len) {
-    plain_encoded_len_ += len + sizeof(int);
-    len_encoder_.Add(len);
-    memcpy(buffer_ + offset_, ptr, len);
-    offset_ += len;
-  }
-
-  uint8_t* Encode(int* encoded_len) {
-    uint8_t* encoded_lengths = len_encoder_.Encode(encoded_len);
-    memmove(buffer_ + *encoded_len + sizeof(int), buffer_, offset_);
-    memcpy(buffer_, encoded_len, sizeof(int));
-    memcpy(buffer_ + sizeof(int), encoded_lengths, *encoded_len);
-    *encoded_len += offset_ + sizeof(int);
-    return buffer_;
-  }
-
-  int num_values() const { return len_encoder_.num_values(); }
-  int plain_encoded_len() const { return plain_encoded_len_; }
-
- private:
-  DeltaBitPackEncoder len_encoder_;
-  uint8_t* buffer_;
-  int offset_;
-  int plain_encoded_len_;
-};
-
-class DeltaByteArrayEncoder {
- public:
-  DeltaByteArrayEncoder() : plain_encoded_len_(0) {}
-
-  void Add(const string& s) {
-    plain_encoded_len_ += s.size() + sizeof(int);
-    int min_len = min(s.size(), last_value_.size());
-    int prefix_len = 0;
-    for (int i = 0; i < min_len; ++i) {
-      if (s[i] == last_value_[i]) {
-        ++prefix_len;
-      } else {
-        break;
-      }
-    }
-    prefix_len_encoder_.Add(prefix_len);
-    suffix_encoder_.Add(reinterpret_cast<const uint8_t*>(s.data()) + prefix_len,
-        s.size() - prefix_len);
-    last_value_ = s;
-  }
-
-  uint8_t* Encode(int* encoded_len) {
-    int prefix_buffer_len;
-    uint8_t* prefix_buffer = prefix_len_encoder_.Encode(&prefix_buffer_len);
-    int suffix_buffer_len;
-    uint8_t* suffix_buffer = suffix_encoder_.Encode(&suffix_buffer_len);
-
-    uint8_t* buffer = new uint8_t[10 * 1024 * 1024];
-    memcpy(buffer, &prefix_buffer_len, sizeof(int));
-    memcpy(buffer + sizeof(int), prefix_buffer, prefix_buffer_len);
-    memcpy(buffer + sizeof(int) + prefix_buffer_len, suffix_buffer, suffix_buffer_len);
-    *encoded_len = sizeof(int) + prefix_buffer_len + suffix_buffer_len;
-    return buffer;
-  }
-
-  int num_values() const { return prefix_len_encoder_.num_values(); }
-  int plain_encoded_len() const { return plain_encoded_len_; }
-
- private:
-  DeltaBitPackEncoder prefix_len_encoder_;
-  DeltaLengthByteArrayEncoder suffix_encoder_;
-  string last_value_;
-  int plain_encoded_len_;
-};
-
-
-uint64_t TestPlainIntEncoding(const uint8_t* data, int num_values, int batch_size) {
-  uint64_t result = 0;
-  PlainDecoder<Int64Type> decoder(nullptr);
-  decoder.SetData(num_values, data, num_values * sizeof(int64_t));
-  int64_t values[batch_size];
-  for (int i = 0; i < num_values;) {
-    int n = decoder.Decode(values, batch_size);
-    for (int j = 0; j < n; ++j) {
-      result += values[j];
-    }
-    i += n;
-  }
-  return result;
-}
-
-uint64_t TestBinaryPackedEncoding(const char* name, const vector<int64_t>& values,
-    int benchmark_iters = -1, int benchmark_batch_size = 1) {
-  int mini_block_size;
-  if (values.size() < 8) {
-    mini_block_size = 8;
-  } else if (values.size() < 16) {
-    mini_block_size = 16;
-  } else {
-    mini_block_size = 32;
-  }
-  DeltaBitPackDecoder<Int64Type> decoder(nullptr);
-  DeltaBitPackEncoder encoder(mini_block_size);
-  for (size_t i = 0; i < values.size(); ++i) {
-    encoder.Add(values[i]);
-  }
-
-  int raw_len = encoder.num_values() * sizeof(int);
-  int len;
-  uint8_t* buffer = encoder.Encode(&len);
-
-  if (benchmark_iters == -1) {
-    printf("%s\n", name);
-    printf("  Raw len: %d\n", raw_len);
-    printf("  Encoded len: %d (%0.2f%%)\n", len, len * 100 / (float)raw_len);
-    decoder.SetData(encoder.num_values(), buffer, len);
-    for (int i = 0; i < encoder.num_values(); ++i) {
-      int64_t x = 0;
-      decoder.Decode(&x, 1);
-      if (values[i] != x) {
-        cerr << "Bad: " << i << endl;
-        cerr << "  " << x << " != " << values[i] << endl;
-        break;
-      }
-    }
-    return 0;
-  } else {
-    printf("%s\n", name);
-    printf("  Raw len: %d\n", raw_len);
-    printf("  Encoded len: %d (%0.2f%%)\n", len, len * 100 / (float)raw_len);
-
-    uint64_t result = 0;
-    int64_t buf[benchmark_batch_size];
-    StopWatch sw;
-    sw.Start();\
-    for (int k = 0; k < benchmark_iters; ++k) {
-      decoder.SetData(encoder.num_values(), buffer, len);
-      for (size_t i = 0; i < values.size();) {
-        int n = decoder.Decode(buf, benchmark_batch_size);
-        for (int j = 0; j < n; ++j) {
-          result += buf[j];
-        }
-        i += n;
-      }
-    }
-    uint64_t elapsed = sw.Stop();
-    double num_ints = values.size() * benchmark_iters * 1000.;
-    printf("%s rate (batch size = %2d): %0.3fM per second.\n",
-        name, benchmark_batch_size, num_ints / elapsed);
-    return result;
-  }
-}
-
-#define TEST(NAME, FN, DATA, BATCH_SIZE)\
-  sw.Start();\
-  for (int i = 0; i < NUM_ITERS; ++i) {\
-    FN(reinterpret_cast<uint8_t*>(&DATA[0]), NUM_VALUES, BATCH_SIZE);\
-  }\
-  elapsed = sw.Stop();\
-  printf("%s rate (batch size = %2d): %0.3fM per second.\n",\
-      NAME, BATCH_SIZE, mult / elapsed);
-
-void TestPlainIntCompressed(Codec* codec, const vector<int64_t>& data, int num_iters, int batch_size) {
-  const uint8_t* raw_data = reinterpret_cast<const uint8_t*>(&data[0]);
-  int uncompressed_len = data.size() * sizeof(int64_t);
-  uint8_t* decompressed_data = new uint8_t[uncompressed_len];
-
-  int max_compressed_size = codec->MaxCompressedLen(uncompressed_len, raw_data);
-  uint8_t* compressed_data = new uint8_t[max_compressed_size];
-  int compressed_len = codec->Compress(uncompressed_len, raw_data,
-      max_compressed_size, compressed_data);
-
-  printf("\n%s:\n  Uncompressed len: %d\n  Compressed len:   %d\n",
-      codec->name(), uncompressed_len, compressed_len);
-
-  double mult = num_iters * data.size() * 1000.;
-  StopWatch sw;
-  sw.Start();
-  uint64_t r = 0;
-  for (int i = 0; i < num_iters; ++i) {
-    codec->Decompress(compressed_len, compressed_data,
-        uncompressed_len, decompressed_data);
-    r += TestPlainIntEncoding(decompressed_data, data.size(), batch_size);
-  }
-  int64_t elapsed = sw.Stop();\
-  printf("Compressed(%s) plain int rate (batch size = %2d): %0.3fM per second.\n",
-      codec->name(), batch_size, mult / elapsed);
-
-  delete[] compressed_data;
-  delete[] decompressed_data;
-}
-
-void TestBinaryPacking() {
-  vector<int64_t> values;
-  values.clear();
-  for (int i = 0; i < 100; ++i) values.push_back(0);
-  TestBinaryPackedEncoding("Zeros", values);
-
-  values.clear();
-  for (int i = 1; i <= 5; ++i) values.push_back(i);
-  TestBinaryPackedEncoding("Example 1", values);
-
-  values.clear();
-  values.push_back(7);
-  values.push_back(5);
-  values.push_back(3);
-  values.push_back(1);
-  values.push_back(2);
-  values.push_back(3);
-  values.push_back(4);
-  values.push_back(5);
-  TestBinaryPackedEncoding("Example 2", values);
-
-  // Test rand ints between 0 and 10K
-  values.clear();
-  for (int i = 0; i < 500000; ++i) {
-    values.push_back(rand() % (10000));
-  }
-  TestBinaryPackedEncoding("Rand [0, 10000)", values);
-
-  // Test rand ints between 0 and 100
-  values.clear();
-  for (int i = 0; i < 500000; ++i) {
-    values.push_back(rand() % 100);
-  }
-  TestBinaryPackedEncoding("Rand [0, 100)", values);
-}
-
-void TestDeltaLengthByteArray() {
-  DeltaLengthByteArrayDecoder decoder(nullptr);
-  DeltaLengthByteArrayEncoder encoder;
-
-  vector<string> values;
-  values.push_back("Hello");
-  values.push_back("World");
-  values.push_back("Foobar");
-  values.push_back("ABCDEF");
-
-  for (size_t i = 0; i < values.size(); ++i) {
-    encoder.Add(values[i]);
-  }
-
-  int len = 0;
-  uint8_t* buffer = encoder.Encode(&len);
-  printf("DeltaLengthByteArray\n  Raw len: %d\n  Encoded len: %d\n",
-      encoder.plain_encoded_len(), len);
-  decoder.SetData(encoder.num_values(), buffer, len);
-  for (int i = 0; i < encoder.num_values(); ++i) {
-    ByteArray v = {0, NULL};
-    decoder.Decode(&v, 1);
-    string r = string((char*)v.ptr, v.len);
-    if (r != values[i]) {
-      cout << "Bad " << r << " != " << values[i] << endl;
-    }
-  }
-}
-
-void TestDeltaByteArray() {
-  DeltaByteArrayDecoder decoder(nullptr);
-  DeltaByteArrayEncoder encoder;
-
-  vector<string> values;
-
-  // Wikipedia example
-  values.push_back("myxa");
-  values.push_back("myxophyta");
-  values.push_back("myxopod");
-  values.push_back("nab");
-  values.push_back("nabbed");
-  values.push_back("nabbing");
-  values.push_back("nabit");
-  values.push_back("nabk");
-  values.push_back("nabob");
-  values.push_back("nacarat");
-  values.push_back("nacelle");
-
-  for (size_t i = 0; i < values.size(); ++i) {
-    encoder.Add(values[i]);
-  }
-
-  int len = 0;
-  uint8_t* buffer = encoder.Encode(&len);
-  printf("DeltaLengthByteArray\n  Raw len: %d\n  Encoded len: %d\n",
-      encoder.plain_encoded_len(), len);
-  decoder.SetData(encoder.num_values(), buffer, len);
-  for (int i = 0; i < encoder.num_values(); ++i) {
-    ByteArray v;
-    decoder.Decode(&v, 1);
-    string r = string((char*)v.ptr, v.len);
-    if (r != values[i]) {
-      cout << "Bad " << r << " != " << values[i] << endl;
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  TestBinaryPacking();
-  TestDeltaLengthByteArray();
-  TestDeltaByteArray();
-
-  StopWatch sw;
-  uint64_t elapsed = 0;
-
-  const int NUM_VALUES = 1024 * 1024;
-  const int NUM_ITERS = 10;
-  const double mult = NUM_VALUES * NUM_ITERS * 1000.;
-
-  vector<int64_t> plain_int_data;
-  plain_int_data.resize(NUM_VALUES);
-
-  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 1);
-  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 16);
-  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 32);
-  TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 64);
-
-  // Test rand ints between 0 and 10K
-  vector<int64_t> values;
-  for (int i = 0; i < 1000000; ++i) {
-    values.push_back(rand() % 10000);
-  }
-  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 1);
-  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 16);
-  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 32);
-  TestBinaryPackedEncoding("Rand 0-10K", values, 100, 64);
-
-  SnappyCodec snappy_codec;
-
-  TestPlainIntCompressed(&snappy_codec, values, 100, 1);
-  TestPlainIntCompressed(&snappy_codec, values, 100, 16);
-  TestPlainIntCompressed(&snappy_codec, values, 100, 32);
-  TestPlainIntCompressed(&snappy_codec, values, 100, 64);
-
-  return 0;
-}

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/parquet-dump-schema.cc
----------------------------------------------------------------------
diff --git a/example/parquet-dump-schema.cc b/example/parquet-dump-schema.cc
deleted file mode 100644
index ed7b570..0000000
--- a/example/parquet-dump-schema.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <iostream>
-
-#include <parquet/api/reader.h>
-#include <parquet/api/schema.h>
-
-using namespace parquet;
-
-int main(int argc, char** argv) {
-  std::string filename = argv[1];
-
-  try {
-    std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename);
-    PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
-  } catch (const std::exception& e) {
-    std::cerr << "Parquet error: "
-              << e.what()
-              << std::endl;
-    return -1;
-  }
-
-  return 0;
-}

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/example/parquet_reader.cc b/example/parquet_reader.cc
deleted file mode 100644
index 2baa2b1..0000000
--- a/example/parquet_reader.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <iostream>
-#include <memory>
-#include <list>
-
-#include <parquet/api/reader.h>
-
-using namespace parquet;
-
-int main(int argc, char** argv) {
-  if (argc > 5 || argc < 2) {
-    std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--columns=...] <file>"
-              << std::endl;
-    return -1;
-  }
-
-  std::string filename;
-  bool print_values = true;
-  bool memory_map = true;
-
-  // Read command-line options
-  const std::string COLUMNS_PREFIX = "--columns=";
-  std::list<int> columns;
-
-  char *param, *value;
-  for (int i = 1; i < argc; i++) {
-    if ((param = std::strstr(argv[i], "--only-metadata"))) {
-      print_values = false;
-    } else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
-      memory_map = false;
-    } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
-      value = std::strtok(param+COLUMNS_PREFIX.length(), "," );
-      while (value) {
-        columns.push_back(std::atoi(value));
-        value = std::strtok(nullptr, "," );
-      }
-    } else {
-      filename = argv[i];
-    }
-  }
-
-  try {
-    std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename,
-        memory_map);
-    reader->DebugPrint(std::cout, columns, print_values);
-  } catch (const std::exception& e) {
-    std::cerr << "Parquet error: "
-              << e.what()
-              << std::endl;
-    return -1;
-  }
-
-  return 0;
-}

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-format
----------------------------------------------------------------------
diff --git a/src/.clang-format b/src/.clang-format
deleted file mode 100644
index 7d5b3cf..0000000
--- a/src/.clang-format
+++ /dev/null
@@ -1,65 +0,0 @@
----
-Language:        Cpp
-# BasedOnStyle:  Google
-AccessModifierOffset: -1
-AlignAfterOpenBracket: false 
-AlignConsecutiveAssignments: false
-AlignEscapedNewlinesLeft: true
-AlignOperands:   true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: true 
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: false 
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true 
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-ColumnLimit: 90 
-CommentPragmas:  '^ IWYU pragma:'
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false 
-DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
-IndentCaseLabels: true
-IndentWidth:     2
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakBeforeFirstCallParameter: 1000
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles:  false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard:        Cpp11
-TabWidth:        8
-UseTab:          Never

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-tidy
----------------------------------------------------------------------
diff --git a/src/.clang-tidy b/src/.clang-tidy
deleted file mode 100644
index 6fc3742..0000000
--- a/src/.clang-tidy
+++ /dev/null
@@ -1,13 +0,0 @@
----
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*'
-HeaderFilterRegex: 'parquet/.*'
-AnalyzeTemporaryDtors: true
-CheckOptions:
-  - key:             google-readability-braces-around-statements.ShortStatementLines
-    value:           '1'
-  - key:             google-readability-function-size.StatementThreshold
-    value:           '800'
-  - key:             google-readability-namespace-comments.ShortNamespaceLines
-    value:           '10'
-  - key:             google-readability-namespace-comments.SpacesBeforeComments
-    value:           '2'

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/src/.clang-tidy-ignore b/src/.clang-tidy-ignore
deleted file mode 100644
index 8b13789..0000000
--- a/src/.clang-tidy-ignore
+++ /dev/null
@@ -1 +0,0 @@
-

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/api/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/api/reader.h b/src/parquet/api/reader.h
index 1e0c5e3..0bd4a24 100644
--- a/src/parquet/api/reader.h
+++ b/src/parquet/api/reader.h
@@ -20,6 +20,7 @@
 
 // Column reader API
 #include "parquet/column/reader.h"
+#include "parquet/column/scan-all.h"
 #include "parquet/exception.h"
 #include "parquet/file/reader.h"
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/reader.h b/src/parquet/column/reader.h
index 25df2b4..8243e85 100644
--- a/src/parquet/column/reader.h
+++ b/src/parquet/column/reader.h
@@ -67,7 +67,6 @@ class PARQUET_EXPORT ColumnReader {
   int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels);
 
   // Read multiple repetition levels into preallocated memory
-  //
   // Returns the number of decoded repetition levels
   int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels);
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/scan-all.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scan-all.h b/src/parquet/column/scan-all.h
new file mode 100644
index 0000000..fd63bff
--- /dev/null
+++ b/src/parquet/column/scan-all.h
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_SCAN_ALL_H
+#define PARQUET_SCAN_ALL_H
+
+#include "parquet/column/reader.h"
+
+template <typename RType>
+int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+    uint8_t* values, int64_t* values_buffered, parquet::ColumnReader* reader) {
+  typedef typename RType::T Type;
+  auto typed_reader = static_cast<RType*>(reader);
+  auto vals = reinterpret_cast<Type*>(&values[0]);
+  return typed_reader->ReadBatch(
+      batch_size, def_levels, rep_levels, vals, values_buffered);
+}
+
+int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+    uint8_t* values, int64_t* values_buffered, parquet::ColumnReader* reader) {
+  switch (reader->type()) {
+    case parquet::Type::BOOLEAN:
+      return ScanAll<parquet::BoolReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::INT32:
+      return ScanAll<parquet::Int32Reader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::INT64:
+      return ScanAll<parquet::Int64Reader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::INT96:
+      return ScanAll<parquet::Int96Reader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::FLOAT:
+      return ScanAll<parquet::FloatReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::DOUBLE:
+      return ScanAll<parquet::DoubleReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::BYTE_ARRAY:
+      return ScanAll<parquet::ByteArrayReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::FIXED_LEN_BYTE_ARRAY:
+      return ScanAll<parquet::FixedLenByteArrayReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    default:
+      parquet::ParquetException::NYI("type reader not implemented");
+  }
+  // Unreachable code, but supress compiler warning
+  return 0;
+}
+
+#endif  // PARQUET_SCAN_ALL_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/scanner.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h
index bc5c5ce..4373c7a 100644
--- a/src/parquet/column/scanner.h
+++ b/src/parquet/column/scanner.h
@@ -48,7 +48,6 @@ class PARQUET_EXPORT Scanner {
         value_offset_(0),
         values_buffered_(0),
         reader_(reader) {
-    // TODO: don't allocate for required fields
     def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
     rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
   }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index 50db1c0..62481f7 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -144,7 +144,7 @@ void ParquetFileReader::DebugPrint(
   for (auto i : selected_columns) {
     const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i);
     stream << "Column " << i << ": " << descr->name() << " ("
-           << type_to_string(descr->physical_type()) << ")" << std::endl;
+           << TypeToString(descr->physical_type()) << ")" << std::endl;
   }
 
   for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
@@ -173,10 +173,10 @@ void ParquetFileReader::DebugPrint(
         stream << "  Statistics Not Set";
       }
       stream << std::endl
-             << "  compression: " << compression_to_string(column_chunk->compression())
+             << "  compression: " << CompressionToString(column_chunk->compression())
              << ", encodings: ";
       for (auto encoding : column_chunk->encodings()) {
-        stream << encoding_to_string(encoding) << " ";
+        stream << EncodingToString(encoding) << " ";
       }
       stream << std::endl
              << "  uncompressed size: " << column_chunk->total_uncompressed_size()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
index 2e5d151..043ef00 100644
--- a/src/parquet/schema/types.cc
+++ b/src/parquet/schema/types.cc
@@ -101,7 +101,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
     case LogicalType::JSON:
     case LogicalType::BSON:
       if (type != Type::BYTE_ARRAY) {
-        ss << logical_type_to_string(logical_type);
+        ss << LogicalTypeToString(logical_type);
         ss << " can only annotate BYTE_ARRAY fields";
         throw ParquetException(ss.str());
       }
@@ -138,7 +138,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
     case LogicalType::INT_16:
     case LogicalType::INT_32:
       if (type != Type::INT32) {
-        ss << logical_type_to_string(logical_type);
+        ss << LogicalTypeToString(logical_type);
         ss << " can only annotate INT32";
         throw ParquetException(ss.str());
       }
@@ -149,7 +149,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
     case LogicalType::UINT_64:
     case LogicalType::INT_64:
       if (type != Type::INT64) {
-        ss << logical_type_to_string(logical_type);
+        ss << LogicalTypeToString(logical_type);
         ss << " can only annotate INT64";
         throw ParquetException(ss.str());
       }
@@ -167,7 +167,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
       }
       break;
     default:
-      ss << logical_type_to_string(logical_type);
+      ss << LogicalTypeToString(logical_type);
       ss << " can not be applied to a primitive type";
       throw ParquetException(ss.str());
   }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc
index 59ed456..06f202e 100644
--- a/src/parquet/types-test.cc
+++ b/src/parquet/types-test.cc
@@ -24,43 +24,41 @@
 namespace parquet {
 
 TEST(TestTypeToString, PhysicalTypes) {
-  ASSERT_STREQ("BOOLEAN", type_to_string(Type::BOOLEAN).c_str());
-  ASSERT_STREQ("INT32", type_to_string(Type::INT32).c_str());
-  ASSERT_STREQ("INT64", type_to_string(Type::INT64).c_str());
-  ASSERT_STREQ("INT96", type_to_string(Type::INT96).c_str());
-  ASSERT_STREQ("FLOAT", type_to_string(Type::FLOAT).c_str());
-  ASSERT_STREQ("DOUBLE", type_to_string(Type::DOUBLE).c_str());
-  ASSERT_STREQ("BYTE_ARRAY", type_to_string(Type::BYTE_ARRAY).c_str());
-  ASSERT_STREQ(
-      "FIXED_LEN_BYTE_ARRAY", 
type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str());
+  ASSERT_STREQ("BOOLEAN", TypeToString(Type::BOOLEAN).c_str());
+  ASSERT_STREQ("INT32", TypeToString(Type::INT32).c_str());
+  ASSERT_STREQ("INT64", TypeToString(Type::INT64).c_str());
+  ASSERT_STREQ("INT96", TypeToString(Type::INT96).c_str());
+  ASSERT_STREQ("FLOAT", TypeToString(Type::FLOAT).c_str());
+  ASSERT_STREQ("DOUBLE", TypeToString(Type::DOUBLE).c_str());
+  ASSERT_STREQ("BYTE_ARRAY", TypeToString(Type::BYTE_ARRAY).c_str());
+  ASSERT_STREQ("FIXED_LEN_BYTE_ARRAY", 
TypeToString(Type::FIXED_LEN_BYTE_ARRAY).c_str());
 }
 
 TEST(TestLogicalTypeToString, LogicalTypes) {
-  ASSERT_STREQ("NONE", logical_type_to_string(LogicalType::NONE).c_str());
-  ASSERT_STREQ("UTF8", logical_type_to_string(LogicalType::UTF8).c_str());
-  ASSERT_STREQ(
-      "MAP_KEY_VALUE", 
logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str());
-  ASSERT_STREQ("LIST", logical_type_to_string(LogicalType::LIST).c_str());
-  ASSERT_STREQ("ENUM", logical_type_to_string(LogicalType::ENUM).c_str());
-  ASSERT_STREQ("DECIMAL", 
logical_type_to_string(LogicalType::DECIMAL).c_str());
-  ASSERT_STREQ("DATE", logical_type_to_string(LogicalType::DATE).c_str());
-  ASSERT_STREQ("TIME_MILLIS", 
logical_type_to_string(LogicalType::TIME_MILLIS).c_str());
-  ASSERT_STREQ("TIME_MICROS", 
logical_type_to_string(LogicalType::TIME_MICROS).c_str());
+  ASSERT_STREQ("NONE", LogicalTypeToString(LogicalType::NONE).c_str());
+  ASSERT_STREQ("UTF8", LogicalTypeToString(LogicalType::UTF8).c_str());
+  ASSERT_STREQ("MAP_KEY_VALUE", 
LogicalTypeToString(LogicalType::MAP_KEY_VALUE).c_str());
+  ASSERT_STREQ("LIST", LogicalTypeToString(LogicalType::LIST).c_str());
+  ASSERT_STREQ("ENUM", LogicalTypeToString(LogicalType::ENUM).c_str());
+  ASSERT_STREQ("DECIMAL", LogicalTypeToString(LogicalType::DECIMAL).c_str());
+  ASSERT_STREQ("DATE", LogicalTypeToString(LogicalType::DATE).c_str());
+  ASSERT_STREQ("TIME_MILLIS", 
LogicalTypeToString(LogicalType::TIME_MILLIS).c_str());
+  ASSERT_STREQ("TIME_MICROS", 
LogicalTypeToString(LogicalType::TIME_MICROS).c_str());
   ASSERT_STREQ(
-      "TIMESTAMP_MILLIS", 
logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str());
+      "TIMESTAMP_MILLIS", 
LogicalTypeToString(LogicalType::TIMESTAMP_MILLIS).c_str());
   ASSERT_STREQ(
-      "TIMESTAMP_MICROS", 
logical_type_to_string(LogicalType::TIMESTAMP_MICROS).c_str());
-  ASSERT_STREQ("UINT_8", logical_type_to_string(LogicalType::UINT_8).c_str());
-  ASSERT_STREQ("UINT_16", 
logical_type_to_string(LogicalType::UINT_16).c_str());
-  ASSERT_STREQ("UINT_32", 
logical_type_to_string(LogicalType::UINT_32).c_str());
-  ASSERT_STREQ("UINT_64", 
logical_type_to_string(LogicalType::UINT_64).c_str());
-  ASSERT_STREQ("INT_8", logical_type_to_string(LogicalType::INT_8).c_str());
-  ASSERT_STREQ("INT_16", logical_type_to_string(LogicalType::INT_16).c_str());
-  ASSERT_STREQ("INT_32", logical_type_to_string(LogicalType::INT_32).c_str());
-  ASSERT_STREQ("INT_64", logical_type_to_string(LogicalType::INT_64).c_str());
-  ASSERT_STREQ("JSON", logical_type_to_string(LogicalType::JSON).c_str());
-  ASSERT_STREQ("BSON", logical_type_to_string(LogicalType::BSON).c_str());
-  ASSERT_STREQ("INTERVAL", 
logical_type_to_string(LogicalType::INTERVAL).c_str());
+      "TIMESTAMP_MICROS", 
LogicalTypeToString(LogicalType::TIMESTAMP_MICROS).c_str());
+  ASSERT_STREQ("UINT_8", LogicalTypeToString(LogicalType::UINT_8).c_str());
+  ASSERT_STREQ("UINT_16", LogicalTypeToString(LogicalType::UINT_16).c_str());
+  ASSERT_STREQ("UINT_32", LogicalTypeToString(LogicalType::UINT_32).c_str());
+  ASSERT_STREQ("UINT_64", LogicalTypeToString(LogicalType::UINT_64).c_str());
+  ASSERT_STREQ("INT_8", LogicalTypeToString(LogicalType::INT_8).c_str());
+  ASSERT_STREQ("INT_16", LogicalTypeToString(LogicalType::INT_16).c_str());
+  ASSERT_STREQ("INT_32", LogicalTypeToString(LogicalType::INT_32).c_str());
+  ASSERT_STREQ("INT_64", LogicalTypeToString(LogicalType::INT_64).c_str());
+  ASSERT_STREQ("JSON", LogicalTypeToString(LogicalType::JSON).c_str());
+  ASSERT_STREQ("BSON", LogicalTypeToString(LogicalType::BSON).c_str());
+  ASSERT_STREQ("INTERVAL", LogicalTypeToString(LogicalType::INTERVAL).c_str());
 }
 
 TEST(TypePrinter, StatisticsTypes) {

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/types.cc b/src/parquet/types.cc
index 7fc5017..97c769b 100644
--- a/src/parquet/types.cc
+++ b/src/parquet/types.cc
@@ -62,7 +62,7 @@ std::string FormatStatValue(Type::type parquet_type, const 
char* val) {
   return result.str();
 }
 
-std::string encoding_to_string(Encoding::type t) {
+std::string EncodingToString(Encoding::type t) {
   switch (t) {
     case Encoding::PLAIN:
       return "PLAIN";
@@ -94,7 +94,7 @@ std::string encoding_to_string(Encoding::type t) {
   }
 }
 
-std::string compression_to_string(Compression::type t) {
+std::string CompressionToString(Compression::type t) {
   switch (t) {
     case Compression::UNCOMPRESSED:
       return "UNCOMPRESSED";
@@ -114,7 +114,7 @@ std::string compression_to_string(Compression::type t) {
   }
 }
 
-std::string type_to_string(Type::type t) {
+std::string TypeToString(Type::type t) {
   switch (t) {
     case Type::BOOLEAN:
       return "BOOLEAN";
@@ -146,7 +146,7 @@ std::string type_to_string(Type::type t) {
   }
 }
 
-std::string logical_type_to_string(LogicalType::type t) {
+std::string LogicalTypeToString(LogicalType::type t) {
   switch (t) {
     case LogicalType::NONE:
       return "NONE";
@@ -196,4 +196,29 @@ std::string logical_type_to_string(LogicalType::type t) {
       return "UNKNOWN";
   }
 }
+
+// Maps a physical Parquet type to the value_byte_size declared by the
+// matching type_traits specialization; any other value yields 0.
+int GetTypeByteSize(Type::type parquet_type) {
+  int byte_size = 0;
+  switch (parquet_type) {
+    case Type::BOOLEAN:
+      byte_size = type_traits<BooleanType::type_num>::value_byte_size;
+      break;
+    case Type::INT32:
+      byte_size = type_traits<Int32Type::type_num>::value_byte_size;
+      break;
+    case Type::INT64:
+      byte_size = type_traits<Int64Type::type_num>::value_byte_size;
+      break;
+    case Type::INT96:
+      byte_size = type_traits<Int96Type::type_num>::value_byte_size;
+      break;
+    case Type::FLOAT:
+      byte_size = type_traits<FloatType::type_num>::value_byte_size;
+      break;
+    case Type::DOUBLE:
+      byte_size = type_traits<DoubleType::type_num>::value_byte_size;
+      break;
+    case Type::BYTE_ARRAY:
+      byte_size = type_traits<ByteArrayType::type_num>::value_byte_size;
+      break;
+    case Type::FIXED_LEN_BYTE_ARRAY:
+      byte_size = type_traits<FLBAType::type_num>::value_byte_size;
+      break;
+    default:
+      // Unrecognized type: keep the 0 initialized above.
+      break;
+  }
+  return byte_size;
+}
+
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index cb67820..a4285be 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -269,15 +269,17 @@ inline std::string format_fwf(int width) {
   return ss.str();
 }
 
-std::string compression_to_string(Compression::type t);
+std::string CompressionToString(Compression::type t);
 
-std::string encoding_to_string(Encoding::type t);
+std::string EncodingToString(Encoding::type t);
 
-std::string logical_type_to_string(LogicalType::type t);
+std::string LogicalTypeToString(LogicalType::type t);
 
-std::string type_to_string(Type::type t);
+std::string TypeToString(Type::type t);
 
 std::string FormatStatValue(Type::type parquet_type, const char* val);
+
+int GetTypeByteSize(Type::type t);
 }  // namespace parquet
 
 #endif  // PARQUET_TYPES_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
new file mode 100644
index 0000000..5c4eaa8
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Libraries every tool links against in addition to parquet_static.
+SET(LINK_LIBS
+  snappystatic
+  thriftstatic)
+
+# Each tool is built from the single source file <tool>.cc, and only when
+# PARQUET_BUILD_EXECUTABLES is enabled.
+if (PARQUET_BUILD_EXECUTABLES)
+  foreach(TOOL_NAME parquet-dump-schema parquet_reader parquet-scan)
+    add_executable(${TOOL_NAME} ${TOOL_NAME}.cc)
+    target_link_libraries(${TOOL_NAME} ${LINK_LIBS}
+         parquet_static)
+  endforeach()
+endif()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet-dump-schema.cc
----------------------------------------------------------------------
diff --git a/tools/parquet-dump-schema.cc b/tools/parquet-dump-schema.cc
new file mode 100644
index 0000000..deef2fd
--- /dev/null
+++ b/tools/parquet-dump-schema.cc
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+
+#include "parquet/api/reader.h"
+#include "parquet/api/schema.h"
+
+// Prints the schema of the Parquet file named by the first command-line
+// argument to standard output.
+//
+// Returns 0 on success and -1 when no file is given or when opening or
+// printing the file throws a std::exception.
+int main(int argc, char** argv) {
+  // argv[1] is a null pointer when the tool is started without arguments,
+  // and building a std::string from a null pointer is undefined behavior,
+  // so reject that case up front. Extra arguments are still ignored.
+  if (argc < 2) {
+    std::cerr << "Usage: parquet-dump-schema <file>" << std::endl;
+    return -1;
+  }
+
+  std::string filename = argv[1];
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+    PrintSchema(
+        reader->metadata()->schema_descriptor()->schema().get(), std::cout);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet-scan.cc
----------------------------------------------------------------------
diff --git a/tools/parquet-scan.cc b/tools/parquet-scan.cc
new file mode 100644
index 0000000..d146a1d
--- /dev/null
+++ b/tools/parquet-scan.cc
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ctime>
+#include <iostream>
+#include <memory>
+#include <list>
+
+#include "parquet/api/reader.h"
+
+// Scans the selected columns of a Parquet file batch by batch and reports
+// how many rows were scanned together with the time measured by std::clock.
+//
+// Command line: parquet-scan [--batch-size=] [--columns=...] <file>
+//   --batch-size=N   number of values requested per ScanAllValues call
+//                    (defaults to 256)
+//   --columns=a,b,c  comma-separated column indices; all columns when absent
+//
+// Returns 0 on success and -1 on a usage error or when a std::exception is
+// thrown while reading the file.
+int main(int argc, char** argv) {
+  auto print_usage = []() {
+    std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] <file>"
+              << std::endl;
+  };
+
+  // argv[0] is the program name, so at least one further argument (the
+  // file) is required and at most two options may accompany it.
+  if (argc > 4 || argc < 2) {
+    print_usage();
+    return -1;
+  }
+
+  std::string filename;
+
+  // Read command-line options
+  int batch_size = 256;
+  const std::string COLUMNS_PREFIX = "--columns=";
+  const std::string BATCH_SIZE_PREFIX = "--batch-size=";
+  std::vector<int> columns;
+  int num_columns = 0;
+
+  char *param, *value;
+  for (int i = 1; i < argc; i++) {
+    if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
+      value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
+      while (value) {
+        columns.push_back(std::atoi(value));
+        value = std::strtok(nullptr, ",");
+        num_columns++;
+      }
+    } else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) {
+      value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " ");
+      if (value) { batch_size = std::atoi(value); }
+    } else {
+      filename = argv[i];
+    }
+  }
+
+  // Only options were supplied; there is nothing to scan.
+  if (filename.empty()) {
+    print_usage();
+    return -1;
+  }
+
+  // The batch size is used below as a buffer length, so it must be positive
+  // (std::atoi also yields 0 for non-numeric input).
+  if (batch_size <= 0) {
+    std::cerr << "Error: --batch-size must be a positive integer" << std::endl;
+    return -1;
+  }
+
+  std::vector<int16_t> rep_levels(batch_size);
+  std::vector<int16_t> def_levels(batch_size);
+  try {
+    double total_time;
+    std::clock_t start_time = std::clock();
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+    // columns are not specified explicitly. Add all columns
+    if (num_columns == 0) {
+      num_columns = reader->metadata()->num_columns();
+      columns.resize(num_columns);
+      for (int i = 0; i < num_columns; i++) {
+        columns[i] = i;
+      }
+    }
+
+    // One running total per selected column, zero-initialized once so the
+    // counts accumulate across all row groups instead of being reset for
+    // each one.
+    std::vector<int64_t> total_rows(num_columns, 0);
+
+    for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+      auto group_reader = reader->RowGroup(r);
+      int col = 0;
+      for (auto i : columns) {
+        std::shared_ptr<parquet::ColumnReader> col_reader =
+            group_reader->Column(i);
+        size_t value_byte_size =
+            GetTypeByteSize(col_reader->descr()->physical_type());
+        // Raw byte buffer sized for one batch of this column's value type.
+        std::vector<uint8_t> values(batch_size * value_byte_size);
+
+        int64_t values_read = 0;
+        while (col_reader->HasNext()) {
+          total_rows[col] += ScanAllValues(batch_size, def_levels.data(),
+              rep_levels.data(), values.data(), &values_read,
+              col_reader.get());
+        }
+        col++;
+      }
+    }
+
+    total_time =
+        (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
+    for (int ct = 1; ct < num_columns; ++ct) {
+      if (total_rows[0] != total_rows[ct]) {
+        std::cerr << "Parquet error: Total rows among columns do not match"
+                  << std::endl;
+      }
+    }
+    // A file without columns leaves total_rows empty; report 0 rather than
+    // reading past the end of the vector.
+    const int64_t rows_scanned = total_rows.empty() ? 0 : total_rows[0];
+    std::cout << rows_scanned << " rows scanned in " << total_time
+              << " seconds." << std::endl;
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/tools/parquet_reader.cc b/tools/parquet_reader.cc
new file mode 100644
index 0000000..ced84d5
--- /dev/null
+++ b/tools/parquet_reader.cc
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <memory>
+#include <list>
+
+#include "parquet/api/reader.h"
+
+// Prints a Parquet file to standard output via
+// ParquetFileReader::DebugPrint.
+//
+// Command line:
+//   parquet_reader [--only-metadata] [--no-memory-map] [--columns=...] <file>
+//
+// Returns 0 on success and -1 on a usage error or when a std::exception is
+// thrown while reading the file.
+int main(int argc, char** argv) {
+  // Between one and four arguments are accepted after the program name.
+  if (argc < 2 || argc > 5) {
+    std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
+                 "[--columns=...] <file>"
+              << std::endl;
+    return -1;
+  }
+
+  const std::string COLUMNS_PREFIX = "--columns=";
+
+  std::string filename;
+  std::list<int> columns;
+  bool print_values = true;
+  bool memory_map = true;
+
+  // Classify each argument; anything that is not a known option is taken
+  // as the file name (a later one overrides an earlier one).
+  for (int idx = 1; idx < argc; ++idx) {
+    char* arg = argv[idx];
+
+    if (std::strstr(arg, "--only-metadata") != nullptr) {
+      print_values = false;
+      continue;
+    }
+    if (std::strstr(arg, "--no-memory-map") != nullptr) {
+      memory_map = false;
+      continue;
+    }
+
+    char* columns_opt = std::strstr(arg, COLUMNS_PREFIX.c_str());
+    if (columns_opt == nullptr) {
+      filename = arg;
+      continue;
+    }
+
+    // Split the text after the prefix on commas, one column index per token.
+    for (char* token = std::strtok(columns_opt + COLUMNS_PREFIX.length(), ",");
+         token != nullptr; token = std::strtok(nullptr, ",")) {
+      columns.push_back(std::atoi(token));
+    }
+  }
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename, memory_map);
+    reader->DebugPrint(std::cout, columns, print_values);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}

parquet-cpp git commit: PARQUET-681: Add tool to scan a parquet file

Reply via email to