[ 
https://issues.apache.org/jira/browse/ARROW-2176?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16383632#comment-16383632
 ] 

ASF GitHub Bot commented on ARROW-2176:
---------------------------------------

xhochy closed pull request #1629: ARROW-2176: [C++] Extend DictionaryBuilder to 
support delta dictionaries
URL: https://github.com/apache/arrow/pull/1629
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 1d321e677..4aaf18256 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1770,6 +1770,154 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
   }
 }
 
+TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) {
+  DictionaryBuilder<TypeParam> builder(default_memory_pool());
+
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data for the initial dictionary
+  NumericBuilder<TypeParam> dict_builder1;
+  ASSERT_OK(dict_builder1.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder1.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array1;
+  ASSERT_OK(dict_builder1.Finish(&dict_array1));
+  auto dtype1 = std::make_shared<DictionaryType>(int8(), dict_array1);
+
+  Int8Builder int_builder1;
+  ASSERT_OK(int_builder1.Append(0));
+  ASSERT_OK(int_builder1.Append(1));
+  ASSERT_OK(int_builder1.Append(0));
+  ASSERT_OK(int_builder1.Append(1));
+  std::shared_ptr<Array> int_array1;
+  ASSERT_OK(int_builder1.Finish(&int_array1));
+
+  DictionaryArray expected(dtype1, int_array1);
+  ASSERT_TRUE(expected.Equals(result));
+
+  // extend the dictionary builder with new data
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
+
+  std::shared_ptr<Array> result_delta;
+  ASSERT_OK(builder.Finish(&result_delta));
+
+  // Build expected data for the delta dictionary
+  NumericBuilder<TypeParam> dict_builder2;
+  ASSERT_OK(dict_builder2.Append(static_cast<typename TypeParam::c_type>(3)));
+  std::shared_ptr<Array> dict_array2;
+  ASSERT_OK(dict_builder2.Finish(&dict_array2));
+  auto dtype2 = std::make_shared<DictionaryType>(int8(), dict_array2);
+
+  Int8Builder int_builder2;
+  ASSERT_OK(int_builder2.Append(1));
+  ASSERT_OK(int_builder2.Append(2));
+  ASSERT_OK(int_builder2.Append(2));
+  ASSERT_OK(int_builder2.Append(0));
+  ASSERT_OK(int_builder2.Append(2));
+  std::shared_ptr<Array> int_array2;
+  ASSERT_OK(int_builder2.Finish(&int_array2));
+
+  DictionaryArray expected_delta(dtype2, int_array2);
+  ASSERT_TRUE(expected_delta.Equals(result_delta));
+}
+
+TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
+  DictionaryBuilder<TypeParam> builder(default_memory_pool());
+
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data for the initial dictionary
+  NumericBuilder<TypeParam> dict_builder1;
+  ASSERT_OK(dict_builder1.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder1.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array1;
+  ASSERT_OK(dict_builder1.Finish(&dict_array1));
+  auto dtype1 = std::make_shared<DictionaryType>(int8(), dict_array1);
+
+  Int8Builder int_builder1;
+  ASSERT_OK(int_builder1.Append(0));
+  ASSERT_OK(int_builder1.Append(1));
+  ASSERT_OK(int_builder1.Append(0));
+  ASSERT_OK(int_builder1.Append(1));
+  std::shared_ptr<Array> int_array1;
+  ASSERT_OK(int_builder1.Finish(&int_array1));
+
+  DictionaryArray expected(dtype1, int_array1);
+  ASSERT_TRUE(expected.Equals(result));
+
+  // extend the dictionary builder with new data
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
+
+  std::shared_ptr<Array> result_delta1;
+  ASSERT_OK(builder.Finish(&result_delta1));
+
+  // Build expected data for the delta dictionary
+  NumericBuilder<TypeParam> dict_builder2;
+  ASSERT_OK(dict_builder2.Append(static_cast<typename TypeParam::c_type>(3)));
+  std::shared_ptr<Array> dict_array2;
+  ASSERT_OK(dict_builder2.Finish(&dict_array2));
+  auto dtype2 = std::make_shared<DictionaryType>(int8(), dict_array2);
+
+  Int8Builder int_builder2;
+  ASSERT_OK(int_builder2.Append(1));
+  ASSERT_OK(int_builder2.Append(2));
+  ASSERT_OK(int_builder2.Append(2));
+  ASSERT_OK(int_builder2.Append(0));
+  ASSERT_OK(int_builder2.Append(2));
+  std::shared_ptr<Array> int_array2;
+  ASSERT_OK(int_builder2.Finish(&int_array2));
+
+  DictionaryArray expected_delta1(dtype2, int_array2);
+  ASSERT_TRUE(expected_delta1.Equals(result_delta1));
+
+  // extend the dictionary builder with new data again
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(3)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(4)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(5)));
+
+  std::shared_ptr<Array> result_delta2;
+  ASSERT_OK(builder.Finish(&result_delta2));
+
+  // Build expected data for the delta dictionary again
+  NumericBuilder<TypeParam> dict_builder3;
+  ASSERT_OK(dict_builder3.Append(static_cast<typename TypeParam::c_type>(4)));
+  ASSERT_OK(dict_builder3.Append(static_cast<typename TypeParam::c_type>(5)));
+  std::shared_ptr<Array> dict_array3;
+  ASSERT_OK(dict_builder3.Finish(&dict_array3));
+  auto dtype3 = std::make_shared<DictionaryType>(int8(), dict_array3);
+
+  Int8Builder int_builder3;
+  ASSERT_OK(int_builder3.Append(0));
+  ASSERT_OK(int_builder3.Append(1));
+  ASSERT_OK(int_builder3.Append(2));
+  ASSERT_OK(int_builder3.Append(3));
+  ASSERT_OK(int_builder3.Append(4));
+  std::shared_ptr<Array> int_array3;
+  ASSERT_OK(int_builder3.Finish(&int_array3));
+
+  DictionaryArray expected_delta2(dtype3, int_array3);
+  ASSERT_TRUE(expected_delta2.Equals(result_delta2));
+}
+
 TEST(TestStringDictionaryBuilder, Basic) {
   // Build the dictionary Array
   StringDictionaryBuilder builder(default_memory_pool());
@@ -1835,6 +1983,146 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) {
   ASSERT_TRUE(expected.Equals(result));
 }
 
+TEST(TestStringDictionaryBuilder, DeltaDictionary) {
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+  ASSERT_OK(builder.Append("test"));
+  ASSERT_OK(builder.Append("test2"));
+  ASSERT_OK(builder.Append("test"));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  StringBuilder str_builder1;
+  ASSERT_OK(str_builder1.Append("test"));
+  ASSERT_OK(str_builder1.Append("test2"));
+  std::shared_ptr<Array> str_array1;
+  ASSERT_OK(str_builder1.Finish(&str_array1));
+  auto dtype1 = std::make_shared<DictionaryType>(int8(), str_array1);
+
+  Int8Builder int_builder1;
+  ASSERT_OK(int_builder1.Append(0));
+  ASSERT_OK(int_builder1.Append(1));
+  ASSERT_OK(int_builder1.Append(0));
+  std::shared_ptr<Array> int_array1;
+  ASSERT_OK(int_builder1.Finish(&int_array1));
+
+  DictionaryArray expected(dtype1, int_array1);
+  ASSERT_TRUE(expected.Equals(result));
+
+  // build a delta dictionary
+  ASSERT_OK(builder.Append("test2"));
+  ASSERT_OK(builder.Append("test3"));
+  ASSERT_OK(builder.Append("test2"));
+
+  std::shared_ptr<Array> result_delta;
+  ASSERT_OK(builder.Finish(&result_delta));
+
+  // Build expected data
+  StringBuilder str_builder2;
+  ASSERT_OK(str_builder2.Append("test3"));
+  std::shared_ptr<Array> str_array2;
+  ASSERT_OK(str_builder2.Finish(&str_array2));
+  auto dtype2 = std::make_shared<DictionaryType>(int8(), str_array2);
+
+  Int8Builder int_builder2;
+  ASSERT_OK(int_builder2.Append(1));
+  ASSERT_OK(int_builder2.Append(2));
+  ASSERT_OK(int_builder2.Append(1));
+  std::shared_ptr<Array> int_array2;
+  ASSERT_OK(int_builder2.Finish(&int_array2));
+
+  DictionaryArray expected_delta(dtype2, int_array2);
+  ASSERT_TRUE(expected_delta.Equals(result_delta));
+}
+
+TEST(TestStringDictionaryBuilder, BigDeltaDictionary) {
+  constexpr int16_t kTestLength = 2048;
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+
+  StringBuilder str_builder1;
+  Int16Builder int_builder1;
+
+  for (int16_t idx = 0; idx < kTestLength; ++idx) {
+    std::stringstream sstream;
+    sstream << "test" << idx;
+    ASSERT_OK(builder.Append(sstream.str()));
+    ASSERT_OK(str_builder1.Append(sstream.str()));
+    ASSERT_OK(int_builder1.Append(idx));
+  }
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  std::shared_ptr<Array> str_array1;
+  ASSERT_OK(str_builder1.Finish(&str_array1));
+  auto dtype1 = std::make_shared<DictionaryType>(int16(), str_array1);
+
+  std::shared_ptr<Array> int_array1;
+  ASSERT_OK(int_builder1.Finish(&int_array1));
+
+  DictionaryArray expected(dtype1, int_array1);
+  ASSERT_TRUE(expected.Equals(result));
+
+  // build delta 1
+  StringBuilder str_builder2;
+  Int16Builder int_builder2;
+
+  for (int16_t idx = 0; idx < kTestLength; ++idx) {
+    ASSERT_OK(builder.Append("test1"));
+    ASSERT_OK(int_builder2.Append(1));
+  }
+
+  for (int16_t idx = 0; idx < kTestLength; ++idx) {
+    ASSERT_OK(builder.Append("test_new_value1"));
+    ASSERT_OK(int_builder2.Append(kTestLength));
+  }
+  ASSERT_OK(str_builder2.Append("test_new_value1"));
+
+  std::shared_ptr<Array> result2;
+  ASSERT_OK(builder.Finish(&result2));
+
+  std::shared_ptr<Array> str_array2;
+  ASSERT_OK(str_builder2.Finish(&str_array2));
+  auto dtype2 = std::make_shared<DictionaryType>(int16(), str_array2);
+
+  std::shared_ptr<Array> int_array2;
+  ASSERT_OK(int_builder2.Finish(&int_array2));
+
+  DictionaryArray expected2(dtype2, int_array2);
+  ASSERT_TRUE(expected2.Equals(result2));
+
+  // build delta 2
+  StringBuilder str_builder3;
+  Int16Builder int_builder3;
+
+  for (int16_t idx = 0; idx < kTestLength; ++idx) {
+    ASSERT_OK(builder.Append("test2"));
+    ASSERT_OK(int_builder3.Append(2));
+  }
+
+  for (int16_t idx = 0; idx < kTestLength; ++idx) {
+    ASSERT_OK(builder.Append("test_new_value2"));
+    ASSERT_OK(int_builder3.Append(kTestLength + 1));
+  }
+  ASSERT_OK(str_builder3.Append("test_new_value2"));
+
+  std::shared_ptr<Array> result3;
+  ASSERT_OK(builder.Finish(&result3));
+
+  std::shared_ptr<Array> str_array3;
+  ASSERT_OK(str_builder3.Finish(&str_array3));
+  auto dtype3 = std::make_shared<DictionaryType>(int16(), str_array3);
+
+  std::shared_ptr<Array> int_array3;
+  ASSERT_OK(int_builder3.Finish(&int_array3));
+
+  DictionaryArray expected3(dtype3, int_array3);
+  ASSERT_TRUE(expected3.Equals(result3));
+}
+
 TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
   // Build the dictionary Array
   DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
@@ -1867,6 +2155,65 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
   ASSERT_TRUE(expected.Equals(result));
 }
 
+TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  std::vector<uint8_t> test{12, 12, 11, 12};
+  std::vector<uint8_t> test2{12, 12, 11, 11};
+  std::vector<uint8_t> test3{12, 12, 11, 10};
+
+  ASSERT_OK(builder.Append(test.data()));
+  ASSERT_OK(builder.Append(test2.data()));
+  ASSERT_OK(builder.Append(test.data()));
+
+  std::shared_ptr<Array> result1;
+  ASSERT_OK(builder.Finish(&result1));
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder1(arrow::fixed_size_binary(4));
+  ASSERT_OK(fsb_builder1.Append(test.data()));
+  ASSERT_OK(fsb_builder1.Append(test2.data()));
+  std::shared_ptr<Array> fsb_array1;
+  ASSERT_OK(fsb_builder1.Finish(&fsb_array1));
+  auto dtype1 = std::make_shared<DictionaryType>(int8(), fsb_array1);
+
+  Int8Builder int_builder1;
+  ASSERT_OK(int_builder1.Append(0));
+  ASSERT_OK(int_builder1.Append(1));
+  ASSERT_OK(int_builder1.Append(0));
+  std::shared_ptr<Array> int_array1;
+  ASSERT_OK(int_builder1.Finish(&int_array1));
+
+  DictionaryArray expected1(dtype1, int_array1);
+  ASSERT_TRUE(expected1.Equals(result1));
+
+  // build delta dictionary
+  ASSERT_OK(builder.Append(test.data()));
+  ASSERT_OK(builder.Append(test2.data()));
+  ASSERT_OK(builder.Append(test3.data()));
+
+  std::shared_ptr<Array> result2;
+  ASSERT_OK(builder.Finish(&result2));
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder2(arrow::fixed_size_binary(4));
+  ASSERT_OK(fsb_builder2.Append(test3.data()));
+  std::shared_ptr<Array> fsb_array2;
+  ASSERT_OK(fsb_builder2.Finish(&fsb_array2));
+  auto dtype2 = std::make_shared<DictionaryType>(int8(), fsb_array2);
+
+  Int8Builder int_builder2;
+  ASSERT_OK(int_builder2.Append(0));
+  ASSERT_OK(int_builder2.Append(1));
+  ASSERT_OK(int_builder2.Append(2));
+  std::shared_ptr<Array> int_array2;
+  ASSERT_OK(int_builder2.Finish(&int_array2));
+
+  DictionaryArray expected2(dtype2, int_array2);
+  ASSERT_TRUE(expected2.Equals(result2));
+}
+
 TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
   // Build the dictionary Array
   DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 6f9749d19..ef4e7fde9 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -818,6 +818,7 @@ DictionaryBuilder<T>::DictionaryBuilder(const 
std::shared_ptr<DataType>& type,
     : ArrayBuilder(type, pool),
       hash_slots_(nullptr),
       dict_builder_(type, pool),
+      overflow_dict_builder_(type, pool),
       values_builder_(pool),
       byte_width_(-1) {
   if (!::arrow::CpuInfo::initialized()) {
@@ -841,6 +842,7 @@ DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
     : ArrayBuilder(type, pool),
       hash_slots_(nullptr),
       dict_builder_(type, pool),
+      overflow_dict_builder_(type, pool),
       values_builder_(pool),
       byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width()) 
{
   if (!::arrow::CpuInfo::initialized()) {
@@ -856,6 +858,7 @@ Status DictionaryBuilder<T>::Init(int64_t elements) {
   RETURN_NOT_OK(internal::NewHashTable(kInitialHashTableSize, pool_, 
&hash_table_));
   hash_slots_ = reinterpret_cast<int32_t*>(hash_table_->mutable_data());
   hash_table_size_ = kInitialHashTableSize;
+  entry_id_offset_ = 0;
   mod_bitmask_ = kInitialHashTableSize - 1;
   hash_table_load_threshold_ =
       static_cast<int64_t>(static_cast<double>(elements) * kMaxHashTableLoad);
@@ -893,24 +896,6 @@ Status DictionaryBuilder<NullType>::Resize(int64_t 
capacity) {
   }
 }
 
-template <typename T>
-Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  std::shared_ptr<Array> dictionary;
-  RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
-
-  RETURN_NOT_OK(values_builder_.FinishInternal(out));
-  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
-  return Status::OK();
-}
-
-Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* 
out) {
-  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
-
-  RETURN_NOT_OK(values_builder_.FinishInternal(out));
-  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
-  return Status::OK();
-}
-
 template <typename T>
 Status DictionaryBuilder<T>::Append(const Scalar& value) {
   RETURN_NOT_OK(Reserve(1));
@@ -930,7 +915,7 @@ Status DictionaryBuilder<T>::Append(const Scalar& value) {
 
   if (index == kHashSlotEmpty) {
     // Not in the hash table, so we insert it now
-    index = static_cast<hash_slot_t>(dict_builder_.length());
+    index = static_cast<hash_slot_t>(dict_builder_.length() + 
entry_id_offset_);
     hash_slots_[j] = index;
     RETURN_NOT_OK(AppendDictionary(value));
 
@@ -991,8 +976,8 @@ Status DictionaryBuilder<NullType>::AppendNull() { return 
values_builder_.Append
 
 template <typename T>
 Status DictionaryBuilder<T>::DoubleTableSize() {
-#define INNER_LOOP                                                \
-  Scalar value = GetDictionaryValue(static_cast<int64_t>(index)); \
+#define INNER_LOOP                                                             
  \
+  Scalar value = GetDictionaryValue(dict_builder_, 
static_cast<int64_t>(index)); \
   int64_t j = HashValue(value) & new_mod_bitmask;
 
   DOUBLE_TABLE_SIZE(, INNER_LOOP);
@@ -1002,14 +987,64 @@ Status DictionaryBuilder<T>::DoubleTableSize() {
 
 template <typename T>
 typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue(
-    int64_t index) {
-  const Scalar* data = reinterpret_cast<const 
Scalar*>(dict_builder_.data()->data());
+    typename TypeTraits<T>::BuilderType& dictionary_builder, int64_t index) {
+  const Scalar* data = reinterpret_cast<const 
Scalar*>(dictionary_builder.data()->data());
   return data[index];
 }
 
+template <typename T>
+Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  entry_id_offset_ += dict_builder_.length();
+  RETURN_NOT_OK(overflow_dict_builder_.Append(
+      reinterpret_cast<const 
DictionaryBuilder<T>::Scalar*>(dict_builder_.data()->data()),
+      dict_builder_.length(), nullptr));
+
+  std::shared_ptr<Array> dictionary;
+  RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+
+  RETURN_NOT_OK(dict_builder_.Init(capacity_));
+  RETURN_NOT_OK(values_builder_.Init(capacity_));
+  return Status::OK();
+}
+
+Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* 
out) {
+  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
 template <>
-const uint8_t* 
DictionaryBuilder<FixedSizeBinaryType>::GetDictionaryValue(int64_t index) {
-  return dict_builder_.GetValue(index);
+const uint8_t* DictionaryBuilder<FixedSizeBinaryType>::GetDictionaryValue(
+    typename TypeTraits<FixedSizeBinaryType>::BuilderType& dictionary_builder,
+    int64_t index) {
+  return dictionary_builder.GetValue(index);
+}
+
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::FinishInternal(
+    std::shared_ptr<ArrayData>* out) {
+  entry_id_offset_ += dict_builder_.length();
+
+  for (uint64_t index = 0, limit = dict_builder_.length(); index < limit; 
++index) {
+    const Scalar value = GetDictionaryValue(dict_builder_, index);
+    RETURN_NOT_OK(overflow_dict_builder_.Append(value));
+  }
+
+  std::shared_ptr<Array> dictionary;
+  RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+
+  RETURN_NOT_OK(dict_builder_.Init(capacity_));
+  RETURN_NOT_OK(values_builder_.Init(capacity_));
+
+  return Status::OK();
 }
 
 template <typename T>
@@ -1024,16 +1059,34 @@ int64_t 
DictionaryBuilder<FixedSizeBinaryType>::HashValue(const Scalar& value) {
 
 template <typename T>
 bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& 
value) {
-  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
-  return other != value;
+  const bool value_found =
+      index >= entry_id_offset_ &&
+      GetDictionaryValue(dict_builder_, static_cast<int64_t>(index - 
entry_id_offset_)) ==
+          value;
+  const bool value_found_overflow =
+      entry_id_offset_ > 0 &&
+      GetDictionaryValue(overflow_dict_builder_, static_cast<int64_t>(index)) 
== value;
+  return !(value_found || value_found_overflow);
 }
 
 template <>
 bool DictionaryBuilder<FixedSizeBinaryType>::SlotDifferent(hash_slot_t index,
                                                            const Scalar& 
value) {
   int32_t width = static_cast<const FixedSizeBinaryType&>(*type_).byte_width();
-  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
-  return memcmp(other, value, width) != 0;
+  bool value_found = false;
+  if (index >= entry_id_offset_) {
+    const Scalar other =
+        GetDictionaryValue(dict_builder_, static_cast<int64_t>(index - 
entry_id_offset_));
+    value_found = memcmp(other, value, width) == 0;
+  }
+
+  bool value_found_overflow = false;
+  if (entry_id_offset_ > 0) {
+    const Scalar other_overflow =
+        GetDictionaryValue(overflow_dict_builder_, 
static_cast<int64_t>(index));
+    value_found_overflow = memcmp(other_overflow, value, width) == 0;
+  }
+  return !(value_found || value_found_overflow);
 }
 
 template <typename T>
@@ -1041,47 +1094,82 @@ Status DictionaryBuilder<T>::AppendDictionary(const 
Scalar& value) {
   return dict_builder_.Append(value);
 }
 
-#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                                
     \
-  template <>                                                                  
     \
-  WrappedBinary DictionaryBuilder<Type>::GetDictionaryValue(int64_t index) {   
     \
-    int32_t v_len;                                                             
     \
-    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), 
&v_len); \
-    return WrappedBinary(v, v_len);                                            
     \
-  }                                                                            
     \
-                                                                               
     \
-  template <>                                                                  
     \
-  Status DictionaryBuilder<Type>::AppendDictionary(const WrappedBinary& value) 
{    \
-    return dict_builder_.Append(value.ptr_, value.length_);                    
     \
-  }                                                                            
     \
-                                                                               
     \
-  template <>                                                                  
     \
-  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {            
     \
-    const BinaryArray& binary_array = static_cast<const BinaryArray&>(array);  
     \
-    WrappedBinary value(nullptr, 0);                                           
     \
-    for (int64_t i = 0; i < array.length(); i++) {                             
     \
-      if (array.IsNull(i)) {                                                   
     \
-        RETURN_NOT_OK(AppendNull());                                           
     \
-      } else {                                                                 
     \
-        value.ptr_ = binary_array.GetValue(i, &value.length_);                 
     \
-        RETURN_NOT_OK(Append(value));                                          
     \
-      }                                                                        
     \
-    }                                                                          
     \
-    return Status::OK();                                                       
     \
-  }                                                                            
     \
-                                                                               
     \
-  template <>                                                                  
     \
-  int64_t DictionaryBuilder<Type>::HashValue(const WrappedBinary& value) {     
     \
-    return HashUtil::Hash(value.ptr_, value.length_, 0);                       
     \
-  }                                                                            
     \
-                                                                               
     \
-  template <>                                                                  
     \
-  bool DictionaryBuilder<Type>::SlotDifferent(hash_slot_t index,               
     \
-                                              const WrappedBinary& value) {    
     \
-    int32_t other_length;                                                      
     \
-    const uint8_t* other_value =                                               
     \
-        dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);    
     \
-    return !(other_length == value.length_ &&                                  
     \
-             0 == memcmp(other_value, value.ptr_, value.length_));             
     \
+#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                                
        \
+  template <>                                                                  
        \
+  WrappedBinary DictionaryBuilder<Type>::GetDictionaryValue(                   
        \
+      typename TypeTraits<Type>::BuilderType& dictionary_builder, int64_t 
index) {     \
+    int32_t v_len;                                                             
        \
+    const uint8_t* v = dictionary_builder.GetValue(                            
        \
+        static_cast<int64_t>(index - entry_id_offset_), &v_len);               
        \
+    return WrappedBinary(v, v_len);                                            
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  Status DictionaryBuilder<Type>::AppendDictionary(const WrappedBinary& value) 
{       \
+    return dict_builder_.Append(value.ptr_, value.length_);                    
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {            
        \
+    const BinaryArray& binary_array = static_cast<const BinaryArray&>(array);  
        \
+    WrappedBinary value(nullptr, 0);                                           
        \
+    for (int64_t i = 0; i < array.length(); i++) {                             
        \
+      if (array.IsNull(i)) {                                                   
        \
+        RETURN_NOT_OK(AppendNull());                                           
        \
+      } else {                                                                 
        \
+        value.ptr_ = binary_array.GetValue(i, &value.length_);                 
        \
+        RETURN_NOT_OK(Append(value));                                          
        \
+      }                                                                        
        \
+    }                                                                          
        \
+    return Status::OK();                                                       
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  int64_t DictionaryBuilder<Type>::HashValue(const WrappedBinary& value) {     
        \
+    return HashUtil::Hash(value.ptr_, value.length_, 0);                       
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  bool DictionaryBuilder<Type>::SlotDifferent(hash_slot_t index,               
        \
+                                              const WrappedBinary& value) {    
        \
+    int32_t other_length;                                                      
        \
+    bool value_found = false;                                                  
        \
+    if (index >= entry_id_offset_) {                                           
        \
+      const uint8_t* other_value = dict_builder_.GetValue(                     
        \
+          static_cast<int64_t>(index - entry_id_offset_), &other_length);      
        \
+      value_found = other_length == value.length_ &&                           
        \
+                    memcmp(other_value, value.ptr_, value.length_) == 0;       
        \
+    }                                                                          
        \
+                                                                               
        \
+    bool value_found_overflow = false;                                         
        \
+    if (entry_id_offset_ > 0) {                                                
        \
+      const uint8_t* other_value_overflow =                                    
        \
+          overflow_dict_builder_.GetValue(static_cast<int64_t>(index), 
&other_length); \
+      value_found_overflow =                                                   
        \
+          other_length == value.length_ &&                                     
        \
+          memcmp(other_value_overflow, value.ptr_, value.length_) == 0;        
        \
+    }                                                                          
        \
+    return !(value_found || value_found_overflow);                             
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  Status DictionaryBuilder<Type>::FinishInternal(std::shared_ptr<ArrayData>* 
out) {    \
+    entry_id_offset_ += dict_builder_.length();                                
        \
+    for (uint64_t index = 0, limit = dict_builder_.length(); index < limit; 
++index) { \
+      int32_t out_length;                                                      
        \
+      const uint8_t* value = dict_builder_.GetValue(index, &out_length);       
        \
+      RETURN_NOT_OK(overflow_dict_builder_.Append(value, out_length));         
        \
+    }                                                                          
        \
+                                                                               
        \
+    std::shared_ptr<Array> dictionary;                                         
        \
+    RETURN_NOT_OK(dict_builder_.Finish(&dictionary));                          
        \
+                                                                               
        \
+    RETURN_NOT_OK(values_builder_.FinishInternal(out));                        
        \
+    (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary); 
        \
+                                                                               
        \
+    RETURN_NOT_OK(dict_builder_.Init(capacity_));                              
        \
+    RETURN_NOT_OK(values_builder_.Init(capacity_));                            
        \
+    return Status::OK();                                                       
        \
   }
 
 BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
@@ -1344,6 +1432,9 @@ Status 
FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   RETURN_NOT_OK(byte_builder_.Finish(&data));
 
   *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_);
+
+  null_bitmap_ = nullptr;
+  capacity_ = length_ = null_count_ = 0;
   return Status::OK();
 }
 
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 9826a6c83..dabfb7506 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -109,13 +109,14 @@ class ARROW_EXPORT ArrayBuilder {
   std::shared_ptr<PoolBuffer> null_bitmap() const { return null_bitmap_; }
 
   /// \brief Return result of builder as an internal generic ArrayData
-  /// object. Resets builder
+  /// object. Resets builder except for dictionary builder
   ///
   /// \param[out] out the finalized ArrayData object
   /// \return Status
   virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
 
-  /// \brief Return result of builder as an Array object. Resets builder
+  /// \brief Return result of builder as an Array object.
+  ///        Resets the builder except for DictionaryBuilder
   ///
   /// \param[out] out the finalized Array object
   /// \return Status
@@ -851,6 +852,12 @@ struct DictionaryScalar<FixedSizeBinaryType> {
 }  // namespace internal
 
 /// \brief Array builder for created encoded DictionaryArray from dense array
+///
+/// Unlike other builders, dictionary builder does not completely reset the 
state
+/// on Finish calls. The arrays built after the initial Finish call will reuse
+/// the previously created encoding and build a delta dictionary when new terms
+/// occur.
+///
 /// data
 template <typename T>
 class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
@@ -879,9 +886,13 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder 
{
   Status Resize(int64_t capacity) override;
   Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
 
+  /// is the dictionary builder in the delta building mode
+  bool is_building_delta() { return entry_id_offset_ > 0; }
+
  protected:
   Status DoubleTableSize();
-  Scalar GetDictionaryValue(int64_t index);
+  Scalar GetDictionaryValue(typename TypeTraits<T>::BuilderType& 
dictionary_builder,
+                            int64_t index);
   int64_t HashValue(const Scalar& value);
   bool SlotDifferent(hash_slot_t slot, const Scalar& value);
   Status AppendDictionary(const Scalar& value);
@@ -892,11 +903,18 @@ class ARROW_EXPORT DictionaryBuilder : public 
ArrayBuilder {
   /// Size of the table. Must be a power of 2.
   int64_t hash_table_size_;
 
+  // offset for the entry ids. Used to build delta dictionaries,
+  // increased on every InternalFinish by the number of current entries
+  // in the dictionary
+  int64_t entry_id_offset_;
+
   // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j %
   // hash_table_size_, but uses far fewer CPU cycles
   int64_t mod_bitmask_;
 
   typename TypeTraits<T>::BuilderType dict_builder_;
+  typename TypeTraits<T>::BuilderType overflow_dict_builder_;
+
   AdaptiveIntBuilder values_builder_;
   int32_t byte_width_;
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Extend DictionaryBuilder to support delta dictionaries
> ------------------------------------------------------------
>
>                 Key: ARROW-2176
>                 URL: https://issues.apache.org/jira/browse/ARROW-2176
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: C++
>            Reporter: Dimitri Vorona
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> [The IPC format|https://arrow.apache.org/docs/ipc.html] specifies a 
> possibility of sending additional dictionary batches with a previously seen 
> id and a isDelta flag to extend the existing dictionaries with new entries. 
> Right now, the DictioniaryBuilder (as well as IPC writer and reader) do not 
> support generation of delta dictionaries.
> This pull request contains a basic implementation of the DictionaryBuilder 
> with delta dictionaries support. The use API can be seen in the dictionary 
> tests (i.e. 
> [here|https://github.com/alendit/arrow/blob/delta_dictionary_builder/cpp/src/arrow/array-test.cc#L1773]).
>  The basic idea is that the user just reuses the builder object after calling 
> Finish(Array*) for the first time. Subsequent calls to Append will create new 
> entries only for the unseen element and reuse id from previous dictionaries 
> for the seen ones.
> Some considerations:
>  # The API is pretty implicit, and additional flag for Finish, which 
> explicitly indicates a desire to use the builder for delta dictionary 
> generation might be expedient from the error avoidance point of view.
>  # Right now the implementation uses an additional "overflow dictionary" to 
> store the seen items. This adds a copy on each Finish call and an additional 
> lookup at each GetItem or Append call. I assume, we might get away with 
> returning Array slices at Finish, which would remove the need for an 
> additional overflow dictionary. If the gist of the PR is approved, I can look 
> into further optimizations.
> The Writer and Reader extensions would be pretty simple, since the 
> DictionaryBuilder API remains basically the same. 



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to