Repository: arrow Updated Branches: refs/heads/master a45b04712 -> 94f6247c4
ARROW-1508: C++: Add support for FixedSizeBinaryType in DictionaryBuilder Author: Uwe L. Korn <uw...@xhochy.com> Closes #1073 from xhochy/ARROW-1508 and squashes the following commits: cefee2c [Uwe L. Korn] ninja format c471bfa [Uwe L. Korn] Review comments 4bf354e [Uwe L. Korn] Use non-deprecated interfaces 2538926 [Uwe L. Korn] ARROW-1508: C++: Add support for FixedSizeBinaryType in DictionaryBuilder Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/94f6247c Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/94f6247c Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/94f6247c Branch: refs/heads/master Commit: 94f6247c4ef49b9b4fe1cccdc172810c55c34a10 Parents: a45b047 Author: Uwe L. Korn <uw...@xhochy.com> Authored: Mon Sep 11 12:46:37 2017 +0200 Committer: Uwe L. Korn <uw...@xhochy.com> Committed: Mon Sep 11 12:46:37 2017 +0200 ---------------------------------------------------------------------- cpp/src/arrow/array-test.cc | 84 ++++++++++++++++++++++++++++++++++++++++ cpp/src/arrow/array.h | 1 + cpp/src/arrow/builder.cc | 71 ++++++++++++++++++++++++++++++++- cpp/src/arrow/builder.h | 11 ++++++ 4 files changed, 165 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index c92c23d..a9596c5 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1764,6 +1764,90 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) { ASSERT_TRUE(expected.Equals(result)); } +TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) { + // Build the dictionary Array + DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), + default_memory_pool()); + std::vector<uint8_t> test{12, 12, 11, 12}; + std::vector<uint8_t> test2{12, 12, 11, 11}; + ASSERT_OK(builder.Append(test.data())); + ASSERT_OK(builder.Append(test2.data())); + ASSERT_OK(builder.Append(test.data())); + + std::shared_ptr<Array> result; + ASSERT_OK(builder.Finish(&result)); + + // Build expected data + FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4)); + ASSERT_OK(fsb_builder.Append(test.data())); + ASSERT_OK(fsb_builder.Append(test2.data())); + std::shared_ptr<Array> fsb_array; + ASSERT_OK(fsb_builder.Finish(&fsb_array)); + auto dtype = std::make_shared<DictionaryType>(int8(), fsb_array); + + Int8Builder int_builder; + ASSERT_OK(int_builder.Append(0)); + ASSERT_OK(int_builder.Append(1)); + ASSERT_OK(int_builder.Append(0)); + std::shared_ptr<Array> int_array; + ASSERT_OK(int_builder.Finish(&int_array)); + + DictionaryArray expected(dtype, int_array); + ASSERT_TRUE(expected.Equals(result)); +} + +TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) { + // Build the dictionary Array + DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), + default_memory_pool()); + // Build expected data + FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4)); + Int16Builder int_builder; + + // Fill with 1024 different values + for (int64_t i = 0; i < 1024; i++) { + std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128), + static_cast<uint8_t>(i % 128)}; + ASSERT_OK(builder.Append(value.data())); + ASSERT_OK(fsb_builder.Append(value.data())); + ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i))); + } + // Fill with an already existing value + std::vector<uint8_t> known_value{12, 12, 0, 1}; + for (int64_t i = 0; i < 1024; i++) { + ASSERT_OK(builder.Append(known_value.data())); + ASSERT_OK(int_builder.Append(1)); + } + + // Finalize result + std::shared_ptr<Array> result; + ASSERT_OK(builder.Finish(&result)); + + // Finalize expected data + std::shared_ptr<Array> fsb_array; + ASSERT_OK(fsb_builder.Finish(&fsb_array)); + auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array); + std::shared_ptr<Array> int_array; + ASSERT_OK(int_builder.Finish(&int_array)); + + DictionaryArray expected(dtype, int_array); + ASSERT_TRUE(expected.Equals(result)); +} + +TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { + // Build the dictionary Array + DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), + default_memory_pool()); + // Build an array with different byte width + FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5)); + std::vector<uint8_t> value{100, 1, 1, 1, 1}; + ASSERT_OK(fsb_builder.Append(value.data())); + std::shared_ptr<Array> fsb_array; + ASSERT_OK(fsb_builder.Finish(&fsb_array)); + + ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array)); +} + // ---------------------------------------------------------------------- // List tests http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index bfeedd2..994270d 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -512,6 +512,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { int64_t null_count = 0, int64_t offset = 0); const uint8_t* GetValue(int64_t i) const; + const uint8_t* Value(int64_t i) const { return GetValue(i); } int32_t byte_width() const { return byte_width_; } http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 7966241..5945677 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/compare.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" @@ -818,10 +819,25 @@ DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type, hash_table_(new PoolBuffer(pool)), hash_slots_(nullptr), dict_builder_(type, pool), + values_builder_(pool), + byte_width_(-1) { + if (!::arrow::CpuInfo::initialized()) { + ::arrow::CpuInfo::Init(); + } +} + +template <> +DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder( + const std::shared_ptr<DataType>& type, MemoryPool* pool) + : ArrayBuilder(type, pool), + hash_table_(new PoolBuffer(pool)), + hash_slots_(nullptr), + dict_builder_(type, pool), values_builder_(pool) { if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); } + byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width(); } #ifndef ARROW_NO_DEPRECATED_API @@ -918,6 +934,24 @@ Status DictionaryBuilder<T>::AppendArray(const Array& array) { return Status::OK(); } +template <> +Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); + } + + const FixedSizeBinaryArray& numeric_array = + static_cast<const FixedSizeBinaryArray&>(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(numeric_array.Value(i))); + } + } + return Status::OK(); +} + template <typename T> Status DictionaryBuilder<T>::AppendNull() { return values_builder_.AppendNull(); @@ -975,17 +1009,35 @@ typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue( return data[index]; } +template <> +const uint8_t* DictionaryBuilder<FixedSizeBinaryType>::GetDictionaryValue(int64_t index) { + return dict_builder_.GetValue(index); +} + template <typename T> int DictionaryBuilder<T>::HashValue(const Scalar& value) { return HashUtil::Hash(&value, sizeof(Scalar), 0); } +template <> +int DictionaryBuilder<FixedSizeBinaryType>::HashValue(const Scalar& value) { + return HashUtil::Hash(value, byte_width_, 0); +} + template <typename T> bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& value) { const Scalar other = GetDictionaryValue(static_cast<int64_t>(index)); return other != value; } +template <> +bool DictionaryBuilder<FixedSizeBinaryType>::SlotDifferent(hash_slot_t index, + const Scalar& value) { + int32_t width = static_cast<const FixedSizeBinaryType&>(*type_).byte_width(); + const Scalar other = GetDictionaryValue(static_cast<int64_t>(index)); + return memcmp(other, value, width) != 0; +} + template <typename T> Status DictionaryBuilder<T>::AppendDictionary(const Scalar& value) { return dict_builder_.Append(value); @@ -1052,6 +1104,7 @@ template class DictionaryBuilder<Time64Type>; template class DictionaryBuilder<TimestampType>; template class DictionaryBuilder<FloatType>; template class DictionaryBuilder<DoubleType>; +template class DictionaryBuilder<FixedSizeBinaryType>; template class DictionaryBuilder<BinaryType>; template class DictionaryBuilder<StringType>; @@ -1315,6 +1368,11 @@ Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>* out) { return Status::OK(); } +const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { + const uint8_t* data_ptr = byte_builder_.data(); + return data_ptr + i * byte_width_; +} + // ---------------------------------------------------------------------- // Struct @@ -1438,6 +1496,7 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder<DoubleType>); DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder); DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder); + DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>); default: return Status::NotImplemented(type->ToString()); } @@ -1472,8 +1531,12 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool, DICTIONARY_ARRAY_CASE(DOUBLE, DictionaryBuilder<DoubleType>); DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder); DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder); + DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>); default: - return Status::NotImplemented(type->ToString()); + std::stringstream ss; + ss << "Cannot encode array of type " << type->ToString(); + ss << " to dictionary"; + return Status::NotImplemented(ss.str()); } } #define DICTIONARY_COLUMN_CASE(ENUM, BuilderType) \ @@ -1516,8 +1579,12 @@ Status EncodeColumnToDictionary(const Column& input, MemoryPool* pool, DICTIONARY_COLUMN_CASE(DOUBLE, DictionaryBuilder<DoubleType>); DICTIONARY_COLUMN_CASE(STRING, StringDictionaryBuilder); DICTIONARY_COLUMN_CASE(BINARY, BinaryDictionaryBuilder); + DICTIONARY_COLUMN_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>); default: - return Status::NotImplemented(type->ToString()); + std::stringstream ss; + ss << "Cannot encode column of type " << type->ToString(); + ss << " to dictionary"; + return Status::NotImplemented(ss.str()); } } http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 3e8289f..bf7b317 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -756,6 +756,11 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { /// \return size of values buffer so far int64_t value_data_length() const { return byte_builder_.length(); } + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i) const; + protected: int32_t byte_width_; BufferBuilder byte_builder_; @@ -866,6 +871,11 @@ struct DictionaryScalar<StringType> { using type = WrappedBinary; }; +template <> +struct DictionaryScalar<FixedSizeBinaryType> { + using type = uint8_t const*; +}; + } // namespace internal /// \brief Array builder for created encoded DictionaryArray from dense array @@ -921,6 +931,7 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { typename TypeTraits<T>::BuilderType dict_builder_; AdaptiveIntBuilder values_builder_; + int32_t byte_width_; }; class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {