This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 65d0e19 ARROW-4776: [C++] Add DictionaryBuilder constructor which
takes a dictionary array
65d0e19 is described below
commit 65d0e1959a3a2a52acae85e9f5b465f345585476
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Wed Mar 13 16:53:05 2019 +0100
ARROW-4776: [C++] Add DictionaryBuilder constructor which takes a
dictionary array
DictionaryBuilder can now be constructed with a user provided dictionary.
This is also available through MakeBuilder
Author: Benjamin Kietzman <[email protected]>
Author: Uwe L. Korn <[email protected]>
Closes #3867 from
bkietz/ARROW-4776-dictionarybuilder-should-support-bootstrapping-from-an-existing-dict-type
and squashes the following commits:
d2551c99 <Benjamin Kietzman> Use {String,Binary}DictionaryBuilder for those
types
91da93b9 <Uwe L. Korn> remove unused std::stringstream
4514860b <Benjamin Kietzman> Add tests for MakeBuilder(dictionary type)
5fcbe263 <Benjamin Kietzman> Add dictionary constructor to DictionaryBuilder
---
cpp/src/arrow/array-dict-test.cc | 136 +++++++++++++++++++++++++++++++++++-
cpp/src/arrow/array/builder_dict.cc | 38 ++++++++--
cpp/src/arrow/array/builder_dict.h | 4 ++
cpp/src/arrow/builder.cc | 41 +++++++++++
4 files changed, 210 insertions(+), 9 deletions(-)
diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index fcc9fea..8289aee 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -63,17 +63,67 @@ TYPED_TEST(TestDictionaryBuilder, Basic) {
ASSERT_EQ(builder.length(), 4);
ASSERT_EQ(builder.null_count(), 1);
+ // Build expected data
+ auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
+ auto dict_type = dictionary(int8(), dict_array);
+
std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));
+ auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
+ DictionaryArray expected(dict_type, int_array);
+
+ ASSERT_TRUE(expected.Equals(result));
+}
+
+TYPED_TEST(TestDictionaryBuilder, ArrayInit) {
+ auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
+ auto dict_type = dictionary(int8(), dict_array);
+
+ DictionaryBuilder<TypeParam> builder(dict_array, default_memory_pool());
+ ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+ ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+ ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+ ASSERT_OK(builder.AppendNull());
+
+ ASSERT_EQ(builder.length(), 4);
+ ASSERT_EQ(builder.null_count(), 1);
+
// Build expected data
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
+ DictionaryArray expected(dict_type, int_array);
+
+ AssertArraysEqual(expected, *result);
+}
+
+TYPED_TEST(TestDictionaryBuilder, MakeBuilder) {
auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
- auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array);
+ auto dict_type = dictionary(int8(), dict_array);
+ std::unique_ptr<ArrayBuilder> boxed_builder;
+ ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
+ auto& builder = checked_cast<DictionaryBuilder<TypeParam>&>(*boxed_builder);
+
+ ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+ ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+ ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+ ASSERT_OK(builder.AppendNull());
+
+ ASSERT_EQ(builder.length(), 4);
+ ASSERT_EQ(builder.null_count(), 1);
+
+ // Build expected data
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
DictionaryArray expected(dict_type, int_array);
- ASSERT_TRUE(expected.Equals(result));
+ AssertArraysEqual(expected, *result);
}
TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
@@ -87,7 +137,7 @@ TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
// Build expected data
auto dict_array = ArrayFromJSON(type, "[1, 2]");
- auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array);
+ auto dict_type = dictionary(int8(), dict_array);
auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
DictionaryArray expected(dict_type, int_array);
@@ -239,6 +289,47 @@ TEST(TestStringDictionaryBuilder, Basic) {
ASSERT_TRUE(expected.Equals(result));
}
+TEST(TestStringDictionaryBuilder, ArrayInit) {
+ auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
+ auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
+
+ // Build the dictionary Array
+ StringDictionaryBuilder builder(dict_array, default_memory_pool());
+ ASSERT_OK(builder.Append("test"));
+ ASSERT_OK(builder.Append("test2"));
+ ASSERT_OK(builder.Append("test"));
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ // Build expected data
+ DictionaryArray expected(dictionary(int8(), dict_array), int_array);
+
+ AssertArraysEqual(expected, *result);
+}
+
+TEST(TestStringDictionaryBuilder, MakeBuilder) {
+ auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
+ auto dict_type = dictionary(int8(), dict_array);
+ auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
+ std::unique_ptr<ArrayBuilder> boxed_builder;
+ ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
+ auto& builder = checked_cast<StringDictionaryBuilder&>(*boxed_builder);
+
+ // Build the dictionary Array
+ ASSERT_OK(builder.Append("test"));
+ ASSERT_OK(builder.Append("test2"));
+ ASSERT_OK(builder.Append("test"));
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ // Build expected data
+ DictionaryArray expected(dict_type, int_array);
+
+ AssertArraysEqual(expected, *result);
+}
+
// ARROW-4367
TEST(TestStringDictionaryBuilder, OnlyNull) {
// Build the dictionary Array
@@ -443,6 +534,45 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
ASSERT_TRUE(expected.Equals(result));
}
+TEST(TestFixedSizeBinaryDictionaryBuilder, ArrayInit) {
+ // Build the dictionary Array
+ auto dict_array = ArrayFromJSON(fixed_size_binary(4), R"(["abcd", "wxyz"])");
+ util::string_view test = "abcd", test2 = "wxyz";
+ DictionaryBuilder<FixedSizeBinaryType> builder(dict_array,
default_memory_pool());
+ ASSERT_OK(builder.Append(test));
+ ASSERT_OK(builder.Append(test2));
+ ASSERT_OK(builder.Append(test));
+
+ std::shared_ptr<Array> result;
+ FinishAndCheckPadding(&builder, &result);
+
+ // Build expected data
+ auto indices = ArrayFromJSON(int8(), "[0, 1, 0]");
+ DictionaryArray expected(dictionary(int8(), dict_array), indices);
+ AssertArraysEqual(expected, *result);
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, MakeBuilder) {
+ // Build the dictionary Array
+ auto dict_array = ArrayFromJSON(fixed_size_binary(4), R"(["abcd", "wxyz"])");
+ auto dict_type = dictionary(int8(), dict_array);
+ std::unique_ptr<ArrayBuilder> boxed_builder;
+ ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
+ auto& builder =
checked_cast<DictionaryBuilder<FixedSizeBinaryType>&>(*boxed_builder);
+ util::string_view test = "abcd", test2 = "wxyz";
+ ASSERT_OK(builder.Append(test));
+ ASSERT_OK(builder.Append(test2));
+ ASSERT_OK(builder.Append(test));
+
+ std::shared_ptr<Array> result;
+ FinishAndCheckPadding(&builder, &result);
+
+ // Build expected data
+ auto indices = ArrayFromJSON(int8(), "[0, 1, 0]");
+ DictionaryArray expected(dict_type, indices);
+ AssertArraysEqual(expected, *result);
+}
+
TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
// Build the dictionary Array
DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
diff --git a/cpp/src/arrow/array/builder_dict.cc
b/cpp/src/arrow/array/builder_dict.cc
index cfc3d3d..b5fa0d6 100644
--- a/cpp/src/arrow/array/builder_dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -51,10 +51,8 @@ struct UnifyDictionaryValues {
Status Visit(const DataType&, void* = nullptr) {
// Default implementation for non-dictionary-supported datatypes
- std::stringstream ss;
- ss << "Unification of " << value_type_->ToString()
- << " dictionaries is not implemented";
- return Status::NotImplemented(ss.str());
+ return Status::NotImplemented("Unification of ", value_type_,
+ " dictionaries is not implemented");
}
template <typename T>
@@ -153,12 +151,32 @@ class DictionaryBuilder<T>::MemoTableImpl
public:
using MemoTableType = typename internal::HashTraits<T>::MemoTableType;
using MemoTableType::MemoTableType;
+
+ MemoTableImpl(const std::shared_ptr<Array>& dictionary)
+ : MemoTableImpl(dictionary->length()) {
+ const auto& values =
+ static_cast<const typename TypeTraits<T>::ArrayType&>(*dictionary);
+ for (int64_t i = 0; i < values.length(); ++i) {
+ ARROW_IGNORE_EXPR(this->GetOrInsert(values.GetView(i)));
+ }
+ }
};
template <typename T>
DictionaryBuilder<T>::~DictionaryBuilder() {}
template <typename T>
+DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<Array>&
dictionary,
+ MemoryPool* pool)
+ : ArrayBuilder(dictionary->type(), pool),
+ memo_table_(new MemoTableImpl(dictionary)),
+ delta_offset_(0),
+ byte_width_(-1),
+ values_builder_(pool) {
+ DCHECK_EQ(T::type_id, type_->id()) << "inconsistent type passed to
DictionaryBuilder";
+}
+
+template <typename T>
DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool)
: ArrayBuilder(type, pool),
@@ -175,6 +193,12 @@ DictionaryBuilder<NullType>::DictionaryBuilder(const
std::shared_ptr<DataType>&
DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to
DictionaryBuilder";
}
+DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<Array>&
dictionary,
+ MemoryPool* pool)
+ : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {
+ DCHECK_EQ(Type::NA, type_->id()) << "inconsistent type passed to
DictionaryBuilder";
+}
+
template <>
DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
const std::shared_ptr<DataType>& type, MemoryPool* pool)
@@ -202,7 +226,8 @@ Status DictionaryBuilder<T>::Resize(int64_t capacity) {
delta_offset_ = 0;
}
RETURN_NOT_OK(values_builder_.Resize(capacity));
- return ArrayBuilder::Resize(capacity);
+ capacity_ = values_builder_.capacity();
+ return Status::OK();
}
Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
@@ -210,7 +235,8 @@ Status DictionaryBuilder<NullType>::Resize(int64_t
capacity) {
capacity = std::max(capacity, kMinBuilderCapacity);
RETURN_NOT_OK(values_builder_.Resize(capacity));
- return ArrayBuilder::Resize(capacity);
+ capacity_ = values_builder_.capacity();
+ return Status::OK();
}
template <typename T>
diff --git a/cpp/src/arrow/array/builder_dict.h
b/cpp/src/arrow/array/builder_dict.h
index 6f02716..204d609 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -68,6 +68,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
// The DictionaryType is instantiated on the Finish() call.
DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+ DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool*
pool);
+
template <typename T1 = T>
explicit DictionaryBuilder(
typename std::enable_if<TypeTraits<T1>::is_parameter_free,
MemoryPool*>::type pool)
@@ -122,6 +124,8 @@ class ARROW_EXPORT DictionaryBuilder<NullType> : public
ArrayBuilder {
DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
explicit DictionaryBuilder(MemoryPool* pool);
+ DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool*
pool);
+
/// \brief Append a scalar null value
Status AppendNull();
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 8d0ab19..9669c08 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -25,6 +25,8 @@
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
@@ -38,6 +40,40 @@ class MemoryPool;
out->reset(new BuilderType(type, pool)); \
return Status::OK();
+struct DictionaryBuilderCase {
+ template <typename ValueType>
+ Status Visit(const ValueType&, typename ValueType::c_type* = nullptr) {
+ return CreateFor<ValueType>();
+ }
+
+ Status Visit(const BinaryType&) { return Create<BinaryDictionaryBuilder>(); }
+ Status Visit(const StringType&) { return Create<StringDictionaryBuilder>(); }
+ Status Visit(const FixedSizeBinaryType&) { return
CreateFor<FixedSizeBinaryType>(); }
+
+ Status Visit(const DataType& value_type) { return
NotImplemented(value_type); }
+ Status Visit(const HalfFloatType& value_type) { return
NotImplemented(value_type); }
+ Status NotImplemented(const DataType& value_type) {
+ return Status::NotImplemented(
+ "MakeBuilder: cannot construct builder for dictionaries with value
type ",
+ value_type);
+ }
+
+ template <typename ValueType>
+ Status CreateFor() {
+ return Create<DictionaryBuilder<ValueType>>();
+ }
+
+ template <typename BuilderType>
+ Status Create() {
+ out->reset(new BuilderType(dict_type.dictionary(), pool));
+ return Status::OK();
+ }
+
+ MemoryPool* pool;
+ const DictionaryType& dict_type;
+ std::unique_ptr<ArrayBuilder>* out;
+};
+
// Initially looked at doing this with vtables, but shared pointers makes it
// difficult
//
@@ -70,6 +106,11 @@ Status MakeBuilder(MemoryPool* pool, const
std::shared_ptr<DataType>& type,
BUILDER_CASE(BINARY, BinaryBuilder);
BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder);
BUILDER_CASE(DECIMAL, Decimal128Builder);
+ case Type::DICTIONARY: {
+ const auto& dict_type = static_cast<const DictionaryType&>(*type);
+ DictionaryBuilderCase visitor = {pool, dict_type, out};
+ return VisitTypeInline(*dict_type.dictionary()->type(), &visitor);
+ }
case Type::LIST: {
std::unique_ptr<ArrayBuilder> value_builder;
std::shared_ptr<DataType> value_type =