rok commented on code in PR #14341:
URL: https://github.com/apache/arrow/pull/14341#discussion_r1257304151


##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1910,5 +1909,310 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut) 
{
   CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(), 
values));
 }
 
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY encode/decode tests.
+
+template <typename Type>
+class TestDeltaByteArrayEncoding : public TestEncodingBase<Type> {
+ public:
+  using c_type = typename Type::c_type;
+  static constexpr int TYPE = Type::type_num;
+
+  void InitData(int nvalues, double null_probability) {
+    const int seed = 42;
+    auto rand = ::arrow::random::RandomArrayGenerator(seed);
+    const int min_prefix_length = 0;
+    const int max_prefix_length = 100;
+    const int max_element_length = 1000;
+    const double prefixed_probability = 0.5;
+
+    const auto prefix_array = std::dynamic_pointer_cast<::arrow::StringArray>(
+        rand.String(nvalues, min_prefix_length, max_prefix_length, 
null_probability));
+    const auto do_prefix = std::dynamic_pointer_cast<::arrow::BooleanArray>(
+        rand.Boolean(nvalues,
+                     /*true_probability=*/prefixed_probability,
+                     /*null_probability=*/0.0));
+    ::arrow::StringBuilder builder(::arrow::default_memory_pool());
+
+    std::string prefix = "";
+    for (int i = 0; i < nvalues; i++) {
+      if (prefix_array->IsNull(i)) {
+        PARQUET_THROW_NOT_OK(builder.AppendNull());
+      } else {
+        const std::string element = prefix_array->GetString(i);
+        if (do_prefix->Value(i) && prefix.length() < max_element_length) {
+          prefix = prefix.append(element);
+        } else {
+          prefix = element;
+        }
+        PARQUET_THROW_NOT_OK(builder.Append(prefix));
+      }
+    }
+
+    std::shared_ptr<::arrow::StringArray> array;
+    ASSERT_OK(builder.Finish(&array));
+    num_values_ = static_cast<int>(array->length() - array->null_count());
+    draws_ = reinterpret_cast<c_type*>(array->value_data()->mutable_data());
+  }
+
+  void Execute(int nvalues, double null_probability) {
+    InitData(nvalues, null_probability);
+    CheckRoundtrip();
+  }
+
+  void ExecuteSpaced(int nvalues, int repeats, int64_t valid_bits_offset,
+                     double null_probability) {
+    InitData(nvalues, null_probability);
+
+    int64_t size = num_values_ + valid_bits_offset;
+    auto rand = ::arrow::random::RandomArrayGenerator(1923);
+    const auto array = rand.UInt8(size, 0, 100, null_probability);
+    const auto valid_bits = array->null_bitmap_data();
+    if (valid_bits) {
+      CheckRoundtripSpaced(valid_bits, valid_bits_offset);
+    }
+  }
+
+  void CheckRoundtrip() override {
+    auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY,
+                                          /*use_dictionary=*/false, 
descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, 
descr_.get());
+
+    encoder->Put(draws_, num_values_);
+    encode_buffer_ = encoder->FlushValues();
+
+    decoder->SetData(num_values_, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    int values_decoded = decoder->Decode(decode_buf_, num_values_);
+    ASSERT_EQ(num_values_, values_decoded);
+    ASSERT_NO_FATAL_FAILURE(VerifyResults<c_type>(decode_buf_, draws_, 
num_values_));
+  }
+
+  void CheckRoundtripSpaced(const uint8_t* valid_bits,
+                            int64_t valid_bits_offset) override {
+    auto encoder = MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY,
+                                          /*use_dictionary=*/false, 
descr_.get());
+    auto decoder = MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY, 
descr_.get());
+    int null_count = 0;
+    for (auto i = 0; i < num_values_; i++) {
+      if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) {
+        null_count++;
+      }
+    }
+
+    encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
+    encode_buffer_ = encoder->FlushValues();
+    decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+                     static_cast<int>(encode_buffer_->size()));
+    auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, 
null_count,
+                                                valid_bits, valid_bits_offset);
+    ASSERT_EQ(num_values_, values_decoded);
+    ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_, 
num_values_,
+                                                        valid_bits, 
valid_bits_offset));
+  }
+
+ protected:
+  USING_BASE_MEMBERS();
+};
+
+using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType, 
FLBAType>;
+TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes);
+
+TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) {
+  ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0));
+  // TODO
+
+  //  ASSERT_NO_FATAL_FAILURE(this->Execute(250, /*null_probability*/ 0));
+  //  ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+  //      /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64, 
/*null_probability*/
+  //      0));
+  //
+  //  ASSERT_NO_FATAL_FAILURE(this->Execute(2000, /*null_probability*/ 0.1));
+  //  ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+  //      /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64,
+  //      /*null_probability*/ 0.5));

Review Comment:
   Enabling these commented-out cases currently segfaults at encoding time in
   `ByteArrayVisitor` (I assume due to the way the test data is generated). I'm
   looking into it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to