This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2d4fe01 PARQUET-1583: [C++] Remove superfluous parquet::Vector class
2d4fe01 is described below
commit 2d4fe0153cd3edec6ddc8ca742d05c8e3a98f768
Author: Wes McKinney <[email protected]>
AuthorDate: Tue May 21 08:38:09 2019 +0200
PARQUET-1583: [C++] Remove superfluous parquet::Vector class
This class, which is a syntactic-sugar layer on `arrow::ResizableBuffer`, was
only being used in one place, for dictionary decoding. I don't think we need to
maintain this, and it also risks symbol issues from its explicitly instantiated templates.
Author: Wes McKinney <[email protected]>
Closes #4354 from wesm/PARQUET-1583 and squashes the following commits:
24535175 <Wes McKinney> lint
29af0799 <Wes McKinney> Remove superfluous parquet::Vector class
---
cpp/src/parquet/encoding.cc | 55 +++++++++++++++++++++++++-----------------
cpp/src/parquet/util/memory.cc | 50 --------------------------------------
cpp/src/parquet/util/memory.h | 22 -----------------
3 files changed, 33 insertions(+), 94 deletions(-)
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 3a4b342..ebb7aea 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -781,7 +781,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public
DictDecoder<Type> {
explicit DictDecoderImpl(const ColumnDescriptor* descr,
::arrow::MemoryPool* pool =
::arrow::default_memory_pool())
: DecoderImpl(descr, Encoding::RLE_DICTIONARY),
- dictionary_(0, pool),
+ dictionary_(AllocateBuffer(pool, 0)),
byte_array_data_(AllocateBuffer(pool, 0)) {}
// Perform type-specific initiatialization
@@ -798,8 +798,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public
DictDecoder<Type> {
int Decode(T* buffer, int max_values) override {
max_values = std::min(max_values, num_values_);
- int decoded_values =
- idx_decoder_.GetBatchWithDict(dictionary_.data(), buffer, max_values);
+ int decoded_values = idx_decoder_.GetBatchWithDict(
+ reinterpret_cast<const T*>(dictionary_->data()), buffer, max_values);
if (decoded_values != max_values) {
ParquetException::EofException();
}
@@ -809,9 +809,9 @@ class DictDecoderImpl : public DecoderImpl, virtual public
DictDecoder<Type> {
int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t*
valid_bits,
int64_t valid_bits_offset) override {
- int decoded_values =
- idx_decoder_.GetBatchWithDictSpaced(dictionary_.data(), buffer,
num_values,
- null_count, valid_bits,
valid_bits_offset);
+ int decoded_values = idx_decoder_.GetBatchWithDictSpaced(
+ reinterpret_cast<const T*>(dictionary_->data()), buffer, num_values,
null_count,
+ valid_bits, valid_bits_offset);
if (decoded_values != num_values) {
ParquetException::EofException();
}
@@ -819,8 +819,15 @@ class DictDecoderImpl : public DecoderImpl, virtual public
DictDecoder<Type> {
}
protected:
+ inline void DecodeDict(TypedDecoder<Type>* dictionary) {
+ int num_dictionary_values = dictionary->values_left();
+ PARQUET_THROW_NOT_OK(dictionary_->Resize(num_dictionary_values *
sizeof(T)));
+ dictionary->Decode(reinterpret_cast<T*>(dictionary_->mutable_data()),
+ num_dictionary_values);
+ }
+
// Only one is set.
- Vector<T> dictionary_;
+ std::shared_ptr<ResizableBuffer> dictionary_;
// Data that contains the byte array data (byte_array_dictionary_ just has
the
// pointers).
@@ -831,9 +838,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public
DictDecoder<Type> {
template <typename Type>
inline void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {
- int num_dictionary_values = dictionary->values_left();
- dictionary_.Resize(num_dictionary_values);
- dictionary->Decode(dictionary_.data(), num_dictionary_values);
+ DecodeDict(dictionary);
}
template <>
@@ -845,12 +850,13 @@ template <>
inline void DictDecoderImpl<ByteArrayType>::SetDict(
TypedDecoder<ByteArrayType>* dictionary) {
int num_dictionary_values = dictionary->values_left();
- dictionary_.Resize(num_dictionary_values);
- dictionary->Decode(dictionary_.data(), num_dictionary_values);
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<ByteArray*>(dictionary_->mutable_data());
int total_size = 0;
for (int i = 0; i < num_dictionary_values; ++i) {
- total_size += dictionary_[i].len;
+ total_size += dict_values[i].len;
}
if (total_size > 0) {
PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false));
@@ -859,17 +865,18 @@ inline void DictDecoderImpl<ByteArrayType>::SetDict(
int offset = 0;
uint8_t* bytes_data = byte_array_data_->mutable_data();
for (int i = 0; i < num_dictionary_values; ++i) {
- memcpy(bytes_data + offset, dictionary_[i].ptr, dictionary_[i].len);
- dictionary_[i].ptr = bytes_data + offset;
- offset += dictionary_[i].len;
+ memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
+ dict_values[i].ptr = bytes_data + offset;
+ offset += dict_values[i].len;
}
}
template <>
inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>*
dictionary) {
int num_dictionary_values = dictionary->values_left();
- dictionary_.Resize(num_dictionary_values);
- dictionary->Decode(&dictionary_[0], num_dictionary_values);
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<FLBA*>(dictionary_->mutable_data());
int fixed_len = descr_->type_length();
int total_size = num_dictionary_values * fixed_len;
@@ -877,8 +884,8 @@ inline void
DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionar
PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false));
uint8_t* bytes_data = byte_array_data_->mutable_data();
for (int32_t i = 0, offset = 0; i < num_dictionary_values; ++i, offset +=
fixed_len) {
- memcpy(bytes_data + offset, dictionary_[i].ptr, fixed_len);
- dictionary_[i].ptr = bytes_data + offset;
+ memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
+ dict_values[i].ptr = bytes_data + offset;
}
}
@@ -897,6 +904,8 @@ class DictByteArrayDecoder : public
DictDecoderImpl<ByteArrayType>,
builder->Reserve(num_values);
::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset,
num_values);
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
int values_decoded = 0;
while (values_decoded < num_values) {
bool is_valid = bit_reader.IsSet();
@@ -911,7 +920,7 @@ class DictByteArrayDecoder : public
DictDecoderImpl<ByteArrayType>,
while (true) {
// Consume all indices
if (is_valid) {
- const auto& val = dictionary_[indices_buffer[i]];
+ const auto& val = dict_values[indices_buffer[i]];
builder->Append(val.ptr, val.len);
++i;
} else {
@@ -948,12 +957,14 @@ class DictByteArrayDecoder : public
DictDecoderImpl<ByteArrayType>,
int values_decoded = 0;
builder->Reserve(num_values);
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
while (values_decoded < num_values) {
int32_t batch_size = std::min<int32_t>(buffer_size, num_values -
values_decoded);
int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size);
if (num_indices == 0) break;
for (int i = 0; i < num_indices; ++i) {
- const auto& val = dictionary_[indices_buffer[i]];
+ const auto& val = dict_values[indices_buffer[i]];
builder->Append(val.ptr, val.len);
}
values_decoded += num_indices;
diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc
index 9289601..9640f88 100644
--- a/cpp/src/parquet/util/memory.cc
+++ b/cpp/src/parquet/util/memory.cc
@@ -65,56 +65,6 @@ std::unique_ptr<Codec> GetCodecFromArrow(Compression::type
codec) {
return result;
}
-template <class T>
-Vector<T>::Vector(int64_t size, MemoryPool* pool)
- : buffer_(AllocateBuffer(pool, size * sizeof(T))), size_(size),
capacity_(size) {
- if (size > 0) {
- data_ = reinterpret_cast<T*>(buffer_->mutable_data());
- } else {
- data_ = nullptr;
- }
-}
-
-template <class T>
-void Vector<T>::Reserve(int64_t new_capacity) {
- if (new_capacity > capacity_) {
- PARQUET_THROW_NOT_OK(buffer_->Resize(new_capacity * sizeof(T)));
- data_ = reinterpret_cast<T*>(buffer_->mutable_data());
- capacity_ = new_capacity;
- }
-}
-
-template <class T>
-void Vector<T>::Resize(int64_t new_size) {
- Reserve(new_size);
- size_ = new_size;
-}
-
-template <class T>
-void Vector<T>::Assign(int64_t size, const T val) {
- Resize(size);
- for (int64_t i = 0; i < size_; i++) {
- data_[i] = val;
- }
-}
-
-template <class T>
-void Vector<T>::Swap(Vector<T>& v) {
- buffer_.swap(v.buffer_);
- std::swap(size_, v.size_);
- std::swap(capacity_, v.capacity_);
- std::swap(data_, v.data_);
-}
-
-template class Vector<int32_t>;
-template class Vector<int64_t>;
-template class Vector<bool>;
-template class Vector<float>;
-template class Vector<double>;
-template class Vector<Int96>;
-template class Vector<ByteArray>;
-template class Vector<FixedLenByteArray>;
-
// ----------------------------------------------------------------------
// Arrow IO wrappers
diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h
index 0d23c31..0a8cd71 100644
--- a/cpp/src/parquet/util/memory.h
+++ b/cpp/src/parquet/util/memory.h
@@ -56,28 +56,6 @@ using MutableBuffer = ::arrow::MutableBuffer;
using ResizableBuffer = ::arrow::ResizableBuffer;
using ResizableBuffer = ::arrow::ResizableBuffer;
-template <class T>
-class PARQUET_EXPORT Vector {
- public:
- explicit Vector(int64_t size, ::arrow::MemoryPool* pool);
- void Resize(int64_t new_size);
- void Reserve(int64_t new_capacity);
- void Assign(int64_t size, const T val);
- void Swap(Vector<T>& v);
- inline T& operator[](int64_t i) const { return data_[i]; }
-
- T* data() { return data_; }
- const T* data() const { return data_; }
-
- private:
- std::shared_ptr<ResizableBuffer> buffer_;
- int64_t size_;
- int64_t capacity_;
- T* data_;
-
- PARQUET_DISALLOW_COPY_AND_ASSIGN(Vector);
-};
-
// File input and output interfaces that translate arrow::Status to exceptions
class PARQUET_EXPORT FileInterface {