[1/2] parquet-cpp git commit: PARQUET-494: Implement DictionaryEncoder and test dictionary decoding

julien Fri, 26 Feb 2016 09:53:23 -0800

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 1df5a26d6 -> c6e069297



http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/mem-pool.cc
----------------------------------------------------------------------
diff --git a/src/parquet/util/mem-pool.cc b/src/parquet/util/mem-pool.cc
new file mode 100644
index 0000000..6e56c28
--- /dev/null
+++ b/src/parquet/util/mem-pool.cc
@@ -0,0 +1,234 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Initially imported from Apache Impala on 2016-02-23, and has been modified
+// since for parquet-cpp
+
+#include "parquet/util/mem-pool.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <sstream>
+#include <string>
+
+#include "parquet/util/bit-util.h"
+
+namespace parquet_cpp {
+
+// Out-of-class definitions for the static constants that are declared (and given
+// their values) inside the MemPool class body. They give the constants storage
+// should they ever be odr-used, e.g. bound to a const reference.
+const int MemPool::INITIAL_CHUNK_SIZE;
+const int MemPool::MAX_CHUNK_SIZE;
+
+MemPool::MemPool()
+  : current_chunk_idx_(-1),
+    next_chunk_size_(INITIAL_CHUNK_SIZE),
+    total_allocated_bytes_(0),
+    peak_allocated_bytes_(0),
+    total_reserved_bytes_(0) {}
+
+MemPool::ChunkInfo::ChunkInfo(int64_t size, uint8_t* buf)
+  : data(buf),
+    size(size),
+    allocated_bytes(0) {
+}
+
+// Releases every chunk buffer the pool still holds.
+MemPool::~MemPool() {
+  // Free whatever is left so that nothing leaks even if the owner never released
+  // the pool's chunks explicitly.
+  for (const ChunkInfo& chunk : chunks_) {
+    free(chunk.data);
+  }
+
+  // The loop above does not clear chunks_, so this check still reports (where
+  // DCHECKs are active) a pool that was destroyed while it held chunks.
+  DCHECK(chunks_.empty()) << "Must call FreeAll() or AcquireData() for this pool";
+}
+
+// Marks every chunk as unused so that its memory can be handed out again. No
+// chunk is released and the reserved/peak counters are left as they are.
+void MemPool::Clear() {
+  for (ChunkInfo& chunk : chunks_) {
+    chunk.allocated_bytes = 0;
+  }
+  total_allocated_bytes_ = 0;
+  // With nothing allocated there is no current chunk any more.
+  current_chunk_idx_ = -1;
+  DCHECK(CheckIntegrity(false));
+}
+
+// Releases every chunk buffer and resets the pool's bookkeeping to the values
+// the constructor sets. peak_allocated_bytes_ is deliberately not touched here.
+void MemPool::FreeAll() {
+  for (const ChunkInfo& chunk : chunks_) {
+    free(chunk.data);
+  }
+  // Drop the (now dangling) chunk descriptors before resetting the counters.
+  chunks_.clear();
+  next_chunk_size_ = INITIAL_CHUNK_SIZE;
+  current_chunk_idx_ = -1;
+  total_allocated_bytes_ = 0;
+  total_reserved_bytes_ = 0;
+}
+
+// Makes current_chunk_idx_ refer to an empty chunk of at least 'min_size' bytes:
+// a free chunk is reused when one is large enough, otherwise a new chunk is
+// malloc()ed. Returns true on success. Returns false if a new chunk was needed
+// and malloc() failed; in that case current_chunk_idx_ is put back to the value
+// it had on entry and no other state is changed.
+bool MemPool::FindChunk(int64_t min_size) {
+  // Try to allocate from a free chunk. The first free chunk, if any, will be
+  // immediately after the current chunk.
+  const int first_free_idx = current_chunk_idx_ + 1;
+  // (cast size() to signed int in order to avoid everything else being cast to
+  // unsigned long, in particular -1)
+  while (++current_chunk_idx_  < static_cast<int>(chunks_.size())) {
+    // we found a free chunk
+    DCHECK_EQ(chunks_[current_chunk_idx_].allocated_bytes, 0);
+
+    if (chunks_[current_chunk_idx_].size >= min_size) {
+      // This chunk is big enough.  Move it before the other free chunks.
+      if (current_chunk_idx_ != first_free_idx) {
+        std::swap(chunks_[current_chunk_idx_], chunks_[first_free_idx]);
+        current_chunk_idx_ = first_free_idx;
+      }
+      break;
+    }
+  }
+
+  if (current_chunk_idx_ == static_cast<int>(chunks_.size())) {
+    // need to allocate new chunk.
+    DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE);
+    DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE);
+
+    const int64_t chunk_size = std::max<int64_t>(min_size, next_chunk_size_);
+
+    // Allocate a new chunk. Return early if malloc fails.
+    uint8_t* buf = reinterpret_cast<uint8_t*>(malloc(chunk_size));
+    if (UNLIKELY(buf == NULL)) {
+      DCHECK_EQ(current_chunk_idx_, static_cast<int>(chunks_.size()));
+      // Undo the scan: go back to the chunk that was current on entry (-1 if
+      // there was none). Pointing at the last chunk instead could leave the
+      // index on a free chunk, behind other free chunks, which breaks the
+      // "last chunk with allocated data" invariant CheckIntegrity() verifies.
+      current_chunk_idx_ = first_free_idx - 1;
+      return false;
+    }
+
+    // If there are no free chunks put it at the end, otherwise before the
+    // first free.
+    if (first_free_idx == static_cast<int>(chunks_.size())) {
+      chunks_.push_back(ChunkInfo(chunk_size, buf));
+    } else {
+      current_chunk_idx_ = first_free_idx;
+      auto insert_chunk = chunks_.begin() + current_chunk_idx_;
+      chunks_.insert(insert_chunk, ChunkInfo(chunk_size, buf));
+    }
+    total_reserved_bytes_ += chunk_size;
+    // Don't increment the chunk size until the allocation succeeds: if an
+    // attempted large allocation fails we don't want to increase the chunk size
+    // further.
+    next_chunk_size_ = static_cast<int>(std::min<int64_t>(
+            chunk_size * 2, MAX_CHUNK_SIZE));
+  }
+
+  DCHECK_LT(current_chunk_idx_, static_cast<int>(chunks_.size()));
+  DCHECK(CheckIntegrity(true));
+  return true;
+}
+
+// Moves the chunks of 'src' that hold data into this pool, placing them right
+// after this pool's current chunk, and transfers the matching reserved and
+// allocated byte counts. When 'keep_current' is true, src keeps its current
+// chunk (and the bytes allocated in it); otherwise src ends up with no chunks
+// at all because FreeAll() is called on it.
+void MemPool::AcquireData(MemPool* src, bool keep_current) {
+  DCHECK(src->CheckIntegrity(false));
+  // Number of leading chunks of src to take over.
+  int num_acquired_chunks;
+  if (keep_current) {
+    // Everything before src's current chunk.
+    num_acquired_chunks = src->current_chunk_idx_;
+  } else if (src->GetFreeOffset() == 0) {
+    // nothing in the last chunk
+    num_acquired_chunks = src->current_chunk_idx_;
+  } else {
+    // Everything up to and including src's current chunk.
+    num_acquired_chunks = src->current_chunk_idx_ + 1;
+  }
+
+  // Nothing to transfer; still release src's chunks unless it keeps them.
+  if (num_acquired_chunks <= 0) {
+    if (!keep_current) src->FreeAll();
+    return;
+  }
+
+  // Move the reserved-byte accounting for the transferred chunks.
+  auto end_chunk = src->chunks_.begin() + num_acquired_chunks;
+  int64_t total_transfered_bytes = 0;
+  for (auto i = src->chunks_.begin(); i != end_chunk; ++i) {
+    total_transfered_bytes += i->size;
+  }
+  src->total_reserved_bytes_ -= total_transfered_bytes;
+  total_reserved_bytes_ += total_transfered_bytes;
+
+  // insert new chunks after current_chunk_idx_
+  auto insert_chunk = chunks_.begin() + current_chunk_idx_ + 1;
+  chunks_.insert(insert_chunk, src->chunks_.begin(), end_chunk);
+  // The descriptors were copied above; erasing them from src leaves this pool
+  // as the only one referring to (and later freeing) those buffers.
+  src->chunks_.erase(src->chunks_.begin(), end_chunk);
+  current_chunk_idx_ += num_acquired_chunks;
+
+  if (keep_current) {
+    // src's former current chunk is now its first chunk; only the bytes
+    // allocated in it stay accounted to src.
+    src->current_chunk_idx_ = 0;
+    DCHECK(src->chunks_.size() == 1 || src->chunks_[1].allocated_bytes == 0);
+    total_allocated_bytes_ += src->total_allocated_bytes_ - 
src->GetFreeOffset();
+    src->total_allocated_bytes_ = src->GetFreeOffset();
+  } else {
+    src->current_chunk_idx_ = -1;
+    total_allocated_bytes_ += src->total_allocated_bytes_;
+    src->total_allocated_bytes_ = 0;
+  }
+  peak_allocated_bytes_ = std::max(total_allocated_bytes_, 
peak_allocated_bytes_);
+
+  // Release whatever chunks src has left (those that were not transferred).
+  if (!keep_current) src->FreeAll();
+  DCHECK(CheckIntegrity(false));
+}
+
+// Returns a human-readable summary of the pool: for every chunk its buffer
+// address (hex), size and allocated byte count, followed by the current chunk
+// index and the pool-wide totals.
+std::string MemPool::DebugString() {
+  std::stringstream out;
+  out << "MemPool(#chunks=" << chunks_.size() << " [";
+  for (size_t i = 0; i < chunks_.size(); ++i) {
+    // The address is streamed rather than sprintf()-ed into a fixed 16-byte
+    // buffer: "0x" + 16 hex digits + "=" + NUL needs 20 bytes for a full-width
+    // 64-bit address. std::dec restores decimal output for the counters.
+    out << (i > 0 ? " " : "")
+        << "0x" << std::hex << reinterpret_cast<uintptr_t>(chunks_[i].data)
+        << std::dec << "="
+        << chunks_[i].size
+        << "/" << chunks_[i].allocated_bytes;
+  }
+  out << "] current_chunk=" << current_chunk_idx_
+      << " total_sizes=" << GetTotalChunkSizes()
+      << " total_alloc=" << total_allocated_bytes_
+      << ")";
+  return out.str();
+}
+
+// Adds up the size, in bytes, of every chunk currently held in chunks_.
+int64_t MemPool::GetTotalChunkSizes() const {
+  int64_t sum_of_sizes = 0;
+  for (const ChunkInfo& chunk : chunks_) {
+    sum_of_sizes += chunk.size;
+  }
+  return sum_of_sizes;
+}
+
+// DCHECKs the pool's bookkeeping and always returns true, so that the call can
+// itself be wrapped in a DCHECK. 'current_chunk_empty' states whether the
+// current chunk is expected to hold no data at the time of the call.
+bool MemPool::CheckIntegrity(bool current_chunk_empty) {
+  const int num_chunks = static_cast<int>(chunks_.size());
+  // check that current_chunk_idx_ points to the last chunk with allocated data
+  DCHECK_LT(current_chunk_idx_, num_chunks);
+  int64_t allocated_sum = 0;
+  for (int idx = 0; idx < num_chunks; ++idx) {
+    const ChunkInfo& chunk = chunks_[idx];
+    DCHECK_GT(chunk.size, 0);
+    // Chunks after the current one must be free; the current one is empty only
+    // when the caller says so; every chunk before it must hold data.
+    const bool expect_empty = idx > current_chunk_idx_
+        || (idx == current_chunk_idx_ && current_chunk_empty);
+    if (expect_empty) {
+      DCHECK_EQ(chunk.allocated_bytes, 0);
+    } else {
+      DCHECK_GT(chunk.allocated_bytes, 0);
+    }
+    allocated_sum += chunk.allocated_bytes;
+  }
+  // The per-chunk counters must add up to the pool-wide counter.
+  DCHECK_EQ(allocated_sum, total_allocated_bytes_);
+  return true;
+}
+
+} // namespace parquet_cpp

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/mem-pool.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/mem-pool.h b/src/parquet/util/mem-pool.h
new file mode 100644
index 0000000..88a8715
--- /dev/null
+++ b/src/parquet/util/mem-pool.h
@@ -0,0 +1,208 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Initially imported from Apache Impala on 2016-02-23, and has been modified
+// since for parquet-cpp
+
+#ifndef PARQUET_UTIL_MEM_POOL_H
+#define PARQUET_UTIL_MEM_POOL_H
+
+#include <stdio.h>
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+#include <string>
+
+#include "parquet/util/logging.h"
+#include "parquet/util/bit-util.h"
+
+namespace parquet_cpp {
+
+/// A MemPool maintains a list of memory chunks from which it allocates memory in
+/// response to Allocate() calls.
+/// Chunks stay around for the lifetime of the mempool or until they are passed on
+/// to another mempool.
+//
+/// An Allocate() call will attempt to allocate memory from the chunk that was most
+/// recently added; if that chunk doesn't have enough memory to
+/// satisfy the allocation request, the free chunks are searched for one that is
+/// big enough; otherwise a new chunk is added to the list.
+/// The current_chunk_idx_ always points to the last chunk with allocated memory.
+/// In order to keep allocation overhead low, chunk sizes double with each new one
+/// added, until they hit a maximum size.
+//
+///     Example:
+///     MemPool* p = new MemPool();
+///     for (int i = 0; i < 1024; ++i) {
+/// returns 8-byte aligned memory (effectively 24 bytes):
+///       .. = p->Allocate(17);
+///     }
+/// at this point, 17K have been handed out in response to Allocate() calls and
+/// 28K of chunks have been allocated (chunk sizes: 4K, 8K, 16K)
+/// We track total and peak allocated bytes. At this point they would be the same:
+/// 24K bytes (1024 allocations of 24 bytes each; the 28K of chunks is what
+/// total_reserved_bytes_ reports).  A call to Clear will return the allocated
+/// memory so total_allocated_bytes_
+/// becomes 0 while peak_allocated_bytes_ remains at 24K.
+///     p->Clear();
+/// the entire 1st chunk is returned:
+///     .. = p->Allocate(4 * 1024);
+/// 4K of the 2nd chunk are returned:
+///     .. = p->Allocate(4 * 1024);
+/// a new 20K chunk is created
+///     .. = p->Allocate(20 * 1024);
+//
+///      MemPool* p2 = new MemPool();
+/// the new mempool receives all chunks containing data from p
+///      p2->AcquireData(p, false);
+/// At this point p.total_allocated_bytes_ would be 0 while p.peak_allocated_bytes_
+/// remains unchanged.
+/// The one remaining (empty) chunk is released:
+///    delete p;
+
+class MemPool {
+ public:
+  /// Creates an empty pool; the first chunk is allocated on demand.
+  MemPool();
+
+  /// Frees all chunks of memory still held by the pool.
+  ~MemPool();
+
+  /// The pool owns the malloc()ed chunk buffers and free()s them in FreeAll() and
+  /// in the destructor, so a memberwise copy would free every buffer twice.
+  /// Copying is therefore disabled.
+  MemPool(const MemPool&) = delete;
+  MemPool& operator=(const MemPool&) = delete;
+
+  /// Allocates 8-byte aligned section of memory of 'size' bytes at the end
+  /// of the current chunk. Creates a new chunk if there aren't any chunks
+  /// with enough capacity. Returns NULL if 'size' is 0 or if a new chunk was
+  /// needed and could not be allocated.
+  uint8_t* Allocate(int size) {
+    return Allocate<false>(size);
+  }
+
+  /// Returns 'byte_size' to the current chunk back to the mem pool. This can
+  /// only be used to return either all or part of the previous allocation returned
+  /// by Allocate().
+  void ReturnPartialAllocation(int byte_size) {
+    DCHECK_GE(byte_size, 0);
+    DCHECK(current_chunk_idx_ != -1);
+    ChunkInfo& info = chunks_[current_chunk_idx_];
+    DCHECK_GE(info.allocated_bytes, byte_size);
+    info.allocated_bytes -= byte_size;
+    total_allocated_bytes_ -= byte_size;
+  }
+
+  /// Makes all allocated chunks available for re-use, but doesn't delete any chunks.
+  void Clear();
+
+  /// Deletes all allocated chunks. FreeAll() or AcquireData() must be called for
+  /// each mem pool
+  void FreeAll();
+
+  /// Absorb all chunks that hold data from src. If keep_current is true, let src
+  /// hold on to its last allocated chunk that contains data.
+  /// All offsets handed out by calls to GetCurrentOffset() for 'src' become invalid.
+  /// NOTE(review): this class declares no GetCurrentOffset(); the sentence above
+  /// looks like a leftover from the Impala original -- confirm and drop if so.
+  void AcquireData(MemPool* src, bool keep_current);
+
+  /// Returns a human-readable description of the pool and its chunks.
+  std::string DebugString();
+
+  int64_t total_allocated_bytes() const { return total_allocated_bytes_; }
+  int64_t peak_allocated_bytes() const { return peak_allocated_bytes_; }
+  int64_t total_reserved_bytes() const { return total_reserved_bytes_; }
+
+  /// Return the sum of the sizes of all chunks in chunks_.
+  int64_t GetTotalChunkSizes() const;
+
+ private:
+  friend class MemPoolTest;
+
+  /// Size of the first chunk a pool allocates.
+  static const int INITIAL_CHUNK_SIZE = 4 * 1024;
+
+  /// The maximum size of chunk that should be allocated. Allocations larger than
+  /// this size will get their own individual chunk.
+  static const int MAX_CHUNK_SIZE = 1024 * 1024;
+
+  struct ChunkInfo {
+    uint8_t* data; // Owned by the ChunkInfo.
+    int64_t size;  // in bytes
+
+    /// bytes allocated via Allocate() in this chunk
+    int64_t allocated_bytes;
+
+    explicit ChunkInfo(int64_t size, uint8_t* buf);
+
+    ChunkInfo()
+      : data(NULL),
+        size(0),
+        allocated_bytes(0) {}
+  };
+
+  /// chunk from which we served the last Allocate() call;
+  /// always points to the last chunk that contains allocated data;
+  /// chunks 0..current_chunk_idx_ are guaranteed to contain data
+  /// (chunks_[i].allocated_bytes > 0 for i: 0..current_chunk_idx_);
+  /// -1 if no chunks present
+  int current_chunk_idx_;
+
+  /// The size of the next chunk to allocate.
+  int64_t next_chunk_size_;
+
+  /// sum of the allocated_bytes of all chunks
+  int64_t total_allocated_bytes_;
+
+  /// Maximum number of bytes allocated from this pool at one time.
+  int64_t peak_allocated_bytes_;
+
+  /// sum of all bytes allocated in chunks_
+  int64_t total_reserved_bytes_;
+
+  std::vector<ChunkInfo> chunks_;
+
+  /// Find or allocate a chunk with at least min_size spare capacity and update
+  /// current_chunk_idx_. Also updates chunks_ and total_reserved_bytes_
+  /// if a new chunk needs to be created. Returns false if a new chunk was needed
+  /// but could not be allocated.
+  bool FindChunk(int64_t min_size);
+
+  /// Check integrity of the supporting data structures; always returns true but
+  /// DCHECKs all invariants.
+  /// If 'current_chunk_empty' is false, checks that the current chunk contains data.
+  bool CheckIntegrity(bool current_chunk_empty);
+
+  /// Return offset to unoccupied space in current chunk (0 if there is no current
+  /// chunk). Returned as int64_t to match ChunkInfo::allocated_bytes.
+  int64_t GetFreeOffset() const {
+    if (current_chunk_idx_ == -1) return 0;
+    return chunks_[current_chunk_idx_].allocated_bytes;
+  }
+
+  /// Implementation of Allocate(). The CHECK_LIMIT_FIRST parameter is not used by
+  /// the body; it is kept so that existing Allocate<...>() calls keep compiling.
+  template <bool CHECK_LIMIT_FIRST>
+  uint8_t* Allocate(int size) {
+    // A negative size would shrink allocated_bytes below and corrupt the counters.
+    DCHECK_GE(size, 0);
+    if (size == 0) return NULL;
+
+    // Every allocation is padded to a multiple of 8 bytes.
+    int64_t num_bytes = BitUtil::RoundUp(size, 8);
+    if (current_chunk_idx_ == -1
+        || num_bytes + chunks_[current_chunk_idx_].allocated_bytes
+          > chunks_[current_chunk_idx_].size) {
+      // If we couldn't allocate a new chunk, return NULL.
+      if (UNLIKELY(!FindChunk(num_bytes))) return NULL;
+    }
+    ChunkInfo& info = chunks_[current_chunk_idx_];
+    uint8_t* result = info.data + info.allocated_bytes;
+    DCHECK_LE(info.allocated_bytes + num_bytes, info.size);
+    info.allocated_bytes += num_bytes;
+    total_allocated_bytes_ += num_bytes;
+    // Compare as signed ints: 'chunks_.size() - 1' is unsigned arithmetic.
+    DCHECK_LT(current_chunk_idx_, static_cast<int>(chunks_.size()));
+    peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_);
+    return result;
+  }
+};
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_UTIL_MEM_POOL_H

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/output.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/output.h b/src/parquet/util/output.h
index 2a43a36..b466e0e 100644
--- a/src/parquet/util/output.h
+++ b/src/parquet/util/output.h
@@ -20,7 +20,6 @@
 
 #include <cstdint>
 #include <memory>
-#include <vector>
 
 #include "parquet/util/macros.h"
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h
index 22b2c2f..77749f5 100644
--- a/src/parquet/util/rle-encoding.h
+++ b/src/parquet/util/rle-encoding.h
@@ -20,8 +20,8 @@
 #ifndef PARQUET_UTIL_RLE_ENCODING_H
 #define PARQUET_UTIL_RLE_ENCODING_H
 
-#include <algorithm>
 #include <math.h>
+#include <algorithm>
 
 #include "parquet/util/compiler-util.h"
 #include "parquet/util/bit-stream-utils.inline.h"

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/rle-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/util/rle-test.cc b/src/parquet/util/rle-test.cc
index df020f5..5f18a6f 100644
--- a/src/parquet/util/rle-test.cc
+++ b/src/parquet/util/rle-test.cc
@@ -17,17 +17,18 @@
 
 // From Apache Impala as of 2016-01-29
 
+#include <gtest/gtest.h>
+#include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
+
+#include <boost/utility.hpp>
+
 #include <cstdint>
 #include <iostream>
 #include <random>
 #include <vector>
 
-#include <boost/utility.hpp>
-#include <gtest/gtest.h>
-#include <math.h>
-
 #include "parquet/util/rle-encoding.h"
 #include "parquet/util/bit-stream-utils.inline.h"
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/sse-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/sse-util.h b/src/parquet/util/sse-util.h
index 588c30a..29bf2f9 100644
--- a/src/parquet/util/sse-util.h
+++ b/src/parquet/util/sse-util.h
@@ -25,6 +25,7 @@
 
 namespace parquet_cpp {
 
+
 /// This class contains constants useful for text processing with SSE4.2 intrinsics.
 namespace SSEUtil {
   /// Number of characters that fit in 64/128 bit register.  SSE provides 
instructions
@@ -93,11 +94,17 @@ namespace SSEUtil {
 
+/// Issues the pcmpestrm instruction on (str1, len1) and (str2, len2), with MODE
+/// passed as the instruction's immediate operand, and returns the value the asm
+/// statement leaves in 'result'.
+/// NOTE(review): both branches appear to bind 'result' to xmm0 (explicitly under
+/// clang, through the "Yz" constraint otherwise) -- confirm against the GCC
+/// machine-constraint documentation.
 template<int MODE>
 static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int 
len2) {
+#ifdef __clang__
   /// Use asm reg rather than Yz output constraint to workaround LLVM bug 
13199 -
   /// clang doesn't support Y-prefixed asm constraints.
   register volatile __m128i result asm("xmm0");
   __asm__ volatile ("pcmpestrm %5, %2, %1"
       : "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) 
: "cc");
+#else
+  __m128i result;
+  __asm__ volatile ("pcmpestrm %5, %2, %1"
+      : "=Yz"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) 
: "cc");
+#endif
   return result;
 }
 
@@ -114,11 +121,22 @@ static inline uint32_t SSE4_crc32_u8(uint32_t crc, 
uint8_t v) {
   return crc;
 }
 
+/// Feeds the 16-bit value 'v' into the running checksum 'crc' with the crc32w
+/// instruction and returns the updated checksum.
+static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
+  // "+r": crc is both the input accumulator and the output of the instruction.
+  __asm__("crc32w %1, %0" : "+r"(crc) : "rm"(v));
+  return crc;
+}
+
 static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
   __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v));
   return crc;
 }
 
+/// Feeds the 64-bit value 'v' into the running checksum 'crc' with the crc32q
+/// instruction and returns the updated checksum.
+static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
+  // The asm operand is a 64-bit accumulator seeded with crc; the return
+  // statement narrows it back to 32 bits.
+  uint64_t result = crc;
+  __asm__("crc32q %1, %0" : "+r"(result) : "rm"(v));
+  return result;
+}
+
 static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
   int64_t result;
   __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc");
@@ -148,7 +166,9 @@ static inline int SSE4_cmpestri(
 }
 
 #define SSE4_crc32_u8 _mm_crc32_u8
+#define SSE4_crc32_u16 _mm_crc32_u16
 #define SSE4_crc32_u32 _mm_crc32_u32
+#define SSE4_crc32_u64 _mm_crc32_u64
 #define POPCNT_popcnt_u64 _mm_popcnt_u64
 
 #else  // IR_COMPILE without SSE 4.2.
@@ -174,11 +194,21 @@ static inline uint32_t SSE4_crc32_u8(uint32_t crc, 
uint8_t v) {
   return 0;
 }
 
+/// Stub for the branch compiled without SSE 4.2: trips a DCHECK and returns 0
+/// without looking at its arguments.
+static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
 static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
   DCHECK(false) << "CPU doesn't support SSE 4.2";
   return 0;
 }
 
+/// Stub for the branch compiled without SSE 4.2: trips a DCHECK and returns 0
+/// without looking at its arguments.
+static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
 static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
   DCHECK(false) << "CPU doesn't support SSE 4.2";
   return 0;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/stopwatch.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/stopwatch.h b/src/parquet/util/stopwatch.h
index 076cfc8..14da2c4 100644
--- a/src/parquet/util/stopwatch.h
+++ b/src/parquet/util/stopwatch.h
@@ -18,11 +18,12 @@
 #ifndef PARQUET_UTIL_STOPWATCH_H
 #define PARQUET_UTIL_STOPWATCH_H
 
-#include <iostream>
 #include <stdio.h>
-#include <ctime>
 #include <sys/time.h>
 
+#include <iostream>
+#include <ctime>
+
 namespace parquet_cpp {
 
 class StopWatch {

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c6e06929/src/parquet/util/test-common.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/test-common.h b/src/parquet/util/test-common.h
index 49c4131..9975ed9 100644
--- a/src/parquet/util/test-common.h
+++ b/src/parquet/util/test-common.h
@@ -31,6 +31,10 @@ namespace parquet_cpp {
 
 namespace test {
 
+// The eight Parquet type descriptors bundled into a single ::testing::Types list.
+// NOTE(review): presumably consumed by typed test suites -- confirm against the
+// tests that use ParquetTypes.
+typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type,
+                         FloatType, DoubleType, ByteArrayType,
+                         FLBAType> ParquetTypes;
+
 template <typename T>
 static inline void assert_vector_equal(const vector<T>& left,
     const vector<T>& right) {
@@ -167,9 +171,9 @@ void random_fixed_byte_array(int n, uint32_t seed, uint8_t 
*buf, int len,
 }
 
 void random_byte_array(int n, uint32_t seed, uint8_t *buf,
-    ByteArray* out, int max_size) {
+    ByteArray* out, int min_size, int max_size) {
   std::mt19937 gen(seed);
-  std::uniform_int_distribution<int> d1(0, max_size);
+  std::uniform_int_distribution<int> d1(min_size, max_size);
   std::uniform_int_distribution<int> d2(0, 255);
   for (int i = 0; i < n; ++i) {
     out[i].len = d1(gen);
@@ -181,6 +185,11 @@ void random_byte_array(int n, uint32_t seed, uint8_t *buf,
   }
 }
 
+// Convenience overload without a lower bound: forwards to the min_size/max_size
+// overload with a minimum length of 0.
+void random_byte_array(int n, uint32_t seed, uint8_t *buf,
+    ByteArray* out, int max_size) {
+  const int no_minimum = 0;
+  random_byte_array(n, seed, buf, out, no_minimum, max_size);
+}
+
 } // namespace test
 } // namespace parquet_cpp

[1/2] parquet-cpp git commit: PARQUET-494: Implement DictionaryEncoder and test dictionary decoding

Reply via email to