rok commented on code in PR #14341:
URL: https://github.com/apache/arrow/pull/14341#discussion_r1297745970
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1908,4 +1909,249 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowDirectPut)
{
CheckDecode(encoded, ::arrow::ArrayFromJSON(::arrow::large_binary(),
values));
}
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY encode/decode tests.
+
+template <typename Type>  // Type supplies the c_type and type_num read below
+class TestDeltaByteArrayEncoding : public
TestDeltaLengthByteArrayEncoding<Type> {
+ public:
+ using c_type = typename Type::c_type;
+ static constexpr int TYPE = Type::type_num;
+ static constexpr double prefixed_probability = 0.5;  // forwarded to GeneratePrefixedData in InitData
+ // Size both byte buffers for nvalues * repeats elements, fill the first nvalues, then replicate them.
+ void InitData(int nvalues, int repeats) {
+ num_values_ = nvalues * repeats;
+ input_bytes_.resize(num_values_ * sizeof(c_type));
+ output_bytes_.resize(num_values_ * sizeof(c_type));
+ draws_ = reinterpret_cast<c_type*>(input_bytes_.data());  // draws_ views input_bytes_ as c_type
+ decode_buf_ = reinterpret_cast<c_type*>(output_bytes_.data());  // decode_buf_ views output_bytes_
+ GeneratePrefixedData<c_type>(nvalues, draws_, &data_buffer_,
prefixed_probability);
+ // NOTE(review): GeneratePrefixedData is defined outside this hunk; presumably it fills draws_[0..nvalues).
+ // Copy the first nvalues entries into each of the remaining (repeats - 1) runs.
+ for (int j = 1; j < repeats; ++j) {
+ for (int i = 0; i < nvalues; ++i) {
+ draws_[nvalues * j + i] = draws_[i];
+ }
+ }
+ }
+
+ Encoding::type GetEncoding() override { return Encoding::DELTA_BYTE_ARRAY; }
+ // Backing storage for the draws_ / decode_buf_ pointers assigned in InitData.
+ protected:
+ USING_BASE_MEMBERS();
+ std::vector<uint8_t> input_bytes_;
+ std::vector<uint8_t> output_bytes_;
+};
+
+using TestDeltaByteArrayEncodingTypes = ::testing::Types<ByteArrayType,
FLBAType>;  // the two Parquet types the typed suite is registered with
+TYPED_TEST_SUITE(TestDeltaByteArrayEncoding, TestDeltaByteArrayEncodingTypes);
+
+TYPED_TEST(TestDeltaByteArrayEncoding, BasicRoundTrip) {  // Execute/ExecuteSpaced come from a base fixture outside this hunk
+ ASSERT_NO_FATAL_FAILURE(this->Execute(0, /*repeats=*/0));  // empty case: both arguments are zero
+ ASSERT_NO_FATAL_FAILURE(this->Execute(250, 5));  // presumably (nvalues, repeats), matching the call above
+ ASSERT_NO_FATAL_FAILURE(this->Execute(2000, 1));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+ /*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64,
/*null_probability*/
+ 0));  // spaced variant with null probability 0
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
+ /*nvalues*/ 1234, /*repeats*/ 10, /*valid_bits_offset*/ 64,
+ /*null_probability*/ 0.5));  // spaced variant with null probability 0.5
+}
+
+template <typename Type>  // typed fixture: Put() Arrow arrays into a DELTA_BYTE_ARRAY encoder and decode them back
+class DeltaByteArrayEncodingDirectPut : public TestEncodingBase<Type> {
+ public:
+ std::unique_ptr<TypedEncoder<Type>> encoder =
+ MakeTypedEncoder<Type>(Encoding::DELTA_BYTE_ARRAY);
+ std::unique_ptr<TypedDecoder<Type>> decoder =
+ MakeTypedDecoder<Type>(Encoding::DELTA_BYTE_ARRAY);
+ // Encode `array`, decode it via DecodeArrow with the array's own validity bitmap, and assert equality.
+ void CheckDirectPut(std::shared_ptr<::arrow::Array> array) {
+ ASSERT_NO_THROW(encoder->Put(*array));
+ auto buf = encoder->FlushValues();
+ // The decoder is handed the non-null count (length - null_count), not the full array length.
+ int num_values = static_cast<int>(array->length() - array->null_count());
+ decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size()));
+ // The accumulator's builder is created with the input array's type so the result can be compared directly.
+ typename EncodingTraits<Type>::Accumulator acc;
+ using BuilderType = typename EncodingTraits<Type>::BuilderType;
+ acc.builder = std::make_unique<BuilderType>(array->type(),
default_memory_pool());
+ // DecodeArrow receives the full length and null count; its return value must equal the non-null count.
+ ASSERT_EQ(num_values,
+ decoder->DecodeArrow(static_cast<int>(array->length()),
+ static_cast<int>(array->null_count()),
+ array->null_bitmap_data(), array->offset(),
&acc));
+ // Finish the builder, then check length, full validation and array equality.
+ std::shared_ptr<::arrow::Array> result;
+ ASSERT_OK(acc.builder->Finish(&result));
+ ASSERT_EQ(array->length(), result->length());
+ ASSERT_OK(result->ValidateFull());
+
+ ::arrow::AssertArraysEqual(*array, *result);
+ }
+ // Round-trips one empty FixedSizeBinary array, then ten arrays of kSize + 0..9 elements.
+ void CheckRoundtripFLBA() {
+ constexpr int64_t kSize = 50;
+ constexpr int kSeed = 42;
+ constexpr int kByteWidth = 4;
+ ::arrow::random::RandomArrayGenerator rag{kSeed};
+ std::shared_ptr<::arrow::Array> values =
+ rag.FixedSizeBinary(/*size=*/0, /*byte_width=*/kByteWidth);
+ CheckDirectPut(values);
+ // Note: `seed` below is added to the array size; the generator itself is seeded once with kSeed.
+ for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
+ values = rag.FixedSizeBinary(kSize + seed, kByteWidth);
+ CheckDirectPut(values);
+ }
+ }
+ // Round-trips one single-element array, then ten kSize-element arrays from the same generator.
+ void CheckRoundtripByteArray() {
+ constexpr int64_t kSize = 500;
+ constexpr int32_t kMinLength = 0;
+ constexpr int32_t kMaxLength = 10;
+ constexpr int32_t kNumUnique = 10;
+ constexpr double kNullProbability = 0.25;
+ constexpr int kSeed = 42;
+ ::arrow::random::RandomArrayGenerator rag{kSeed};
+ std::shared_ptr<::arrow::Array> values = rag.BinaryWithRepeats(
+ /*size=*/1, /*unique=*/1, kMinLength, kMaxLength, kNullProbability);
+ CheckDirectPut(values);
+ // The generator is reused across iterations, so each pass draws a fresh array.
+ for (int i = 0; i < 10; ++i) {
+ values = rag.BinaryWithRepeats(kSize, kNumUnique, kMinLength, kMaxLength,
+ kNullProbability);
+ CheckDirectPut(values);
+ }
+ }
+ // Compile-time dispatch: fixed-size-binary Arrow types use the FLBA path, all others the byte-array path.
+ void CheckRoundtrip() override {
+ using ArrowType = typename EncodingTraits<Type>::ArrowType;
+ using IsFixedSizeBinary = ::arrow::is_fixed_size_binary_type<ArrowType>;
+
+ if constexpr (IsFixedSizeBinary::value) {
+ CheckRoundtripFLBA();
+ } else {
+ CheckRoundtripByteArray();
+ }
+ }
+
+ protected:
+ USING_BASE_MEMBERS();
+};
+
+TYPED_TEST_SUITE(DeltaByteArrayEncodingDirectPut,
TestDeltaByteArrayEncodingTypes);  // registers the direct-put fixture with the type list named here
+
+TYPED_TEST(DeltaByteArrayEncodingDirectPut, DirectPut) {  // single entry point; the fixture's CheckRoundtrip does the work
+ ASSERT_NO_FATAL_FAILURE(this->CheckRoundtrip());
+}
+
+TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) {
+ auto CheckEncode = [](const std::shared_ptr<::arrow::Array>& values,
+ const std::shared_ptr<Buffer>& encoded) {
+ auto encoder = MakeTypedEncoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
+ ASSERT_NO_THROW(encoder->Put(*values));
+ auto buf = encoder->FlushValues();
+ ASSERT_TRUE(encoded->Equals(*buf));
+ };
+
+ auto ArrayToInt32Vector = [](const std::shared_ptr<::arrow::Array>& lengths)
{
+ std::vector<int32_t> vector;
+ auto data_ptr = checked_cast<::arrow::Int32Array*>(lengths.get());
+ for (int i = 0; i < lengths->length(); ++i) {
+ vector.push_back(data_ptr->GetView(i));
+ }
+ return vector;
+ };
+
+ auto CheckDecode = [](std::shared_ptr<Buffer> buf,
+ std::shared_ptr<::arrow::Array> values) {
+ int num_values = static_cast<int>(values->length());
+ auto decoder = MakeTypedDecoder<ByteArrayType>(Encoding::DELTA_BYTE_ARRAY);
+ decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size()));
+
+ typename EncodingTraits<ByteArrayType>::Accumulator acc;
+ if (::arrow::is_string(values->type()->id())) {
+ acc.builder = std::make_unique<::arrow::StringBuilder>();
+ } else {
+ acc.builder = std::make_unique<::arrow::BinaryBuilder>();
+ }
+
+ ASSERT_EQ(num_values,
+ decoder->DecodeArrow(static_cast<int>(values->length()),
+ static_cast<int>(values->null_count()),
+ values->null_bitmap_data(),
values->offset(), &acc));
+
+ std::shared_ptr<::arrow::Array> result;
+ ASSERT_OK(acc.builder->Finish(&result));
+ ASSERT_EQ(num_values, result->length());
+ ASSERT_OK(result->ValidateFull());
+
+ auto upcast_result = CastBinaryTypesHelper(result, values->type());
+ ::arrow::AssertArraysEqual(*values, *upcast_result);
+ };
+
+ auto CheckEncodeDecode =
+ [&](std::string_view values, std::shared_ptr<::arrow::Array>
prefix_lengths,
+ std::shared_ptr<::arrow::Array> suffix_lengths, std::string_view
suffix_data) {
+ auto encoded =
+
::arrow::ConcatenateBuffers({DeltaEncode(ArrayToInt32Vector(prefix_lengths)),
Review Comment:
Is this approximately what you had in mind?
https://github.com/apache/arrow/pull/14341/commits/d808c5a8df6fa520064872760e0ec224248efe68
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]