wgtmac commented on code in PR #14341:
URL: https://github.com/apache/arrow/pull/14341#discussion_r1157921371


##########
cpp/src/parquet/encoding.cc:
##########
@@ -3149,9 +3160,14 @@ class DeltaByteArrayEncoder : public EncoderImpl, 
virtual public TypedEncoder<DT
           previous_len = src.len;
           prefix_length_encoder_.Put({static_cast<int32_t>(j)}, 1);
 
-          const uint8_t* suffix_ptr = src.ptr + j;
-          const uint32_t suffix_length = static_cast<uint32_t>(src.len - j);
           last_value_view = view;
+          const auto suffix_length = static_cast<uint32_t>(src.len - j);
+          const uint8_t* suffix_ptr;
+          if (suffix_length == 0) {
+            suffix_ptr = reinterpret_cast<const uint8_t*>("");

Review Comment:
   `""` is a temporary variable on stack which only lives in the current scope. 
What about `suffix_ptr = nullptr` or using a
`static const std::string kEmpty = ""`?



##########
cpp/src/parquet/encoding.cc:
##########
@@ -3187,21 +3203,74 @@ void DeltaByteArrayEncoder<DType>::Put(const T* src, 
int num_values) {
 
     auto view = string_view{reinterpret_cast<const char*>(value->ptr), 
value->len};
     uint32_t j = 0;
-    while (j < std::min(value->len, 
static_cast<uint32_t>(last_value_view.length()))) {
+    const uint32_t common_length =
+        std::min(value->len, static_cast<uint32_t>(last_value_view.length()));
+    while (j < common_length) {
       if (last_value_view[j] != view[j]) {
         break;
       }
       j++;
     }
 
+    last_value_view = view;
     prefix_lengths[i] = j;
-    const uint8_t* suffix_ptr = value->ptr + j;
-    const uint32_t suffix_length = static_cast<uint32_t>(value->len - j);
+    const auto suffix_length = static_cast<uint32_t>(value->len - j);
+    const uint8_t* suffix_ptr;
+    if (suffix_length == 0) {
+      suffix_ptr = reinterpret_cast<const uint8_t*>("");
+    } else {
+      suffix_ptr = value->ptr + j;
+    }
+    // Convert suffix to ByteArray so it can be passed to the suffix_encoder_.
+    const ByteArray suffix(suffix_length, suffix_ptr);
+    suffix_encoder_.Put(&suffix, 1);
+  }
+  prefix_length_encoder_.Put(prefix_lengths.data(), num_values);
+  last_value_ = last_value_view;
+}
+
+template <>
+void DeltaByteArrayEncoder<FLBAType>::Put(const FLBA* src, int num_values) {
+  if (num_values == 0) {
+    return;
+  }
+  ArrowPoolVector<int32_t> prefix_lengths(num_values,
+                                          
::arrow::stl::allocator<int32_t>(pool_));
+  std::string_view last_value_view = last_value_;
+  const int32_t len = descr_->type_length();
+
+  if (ARROW_PREDICT_FALSE(len >= static_cast<int32_t>(kMaxByteArraySize))) {
+    throw Status::Invalid("Parquet cannot store strings with size 2GB or 
more");
+  }
+
+  for (int i = 0; i < num_values; i++) {
+    // Convert to ByteArray so we can pass to the suffix_encoder_.
+    const FLBA* value = reinterpret_cast<const FLBA*>(&src[i].ptr);
+
+    auto view = string_view{reinterpret_cast<const char*>(value->ptr),
+                            static_cast<uint32_t>(len)};
+    int32_t j = 0;
+    const int32_t common_length =
+        std::min(len, static_cast<int32_t>(last_value_view.length()));
+    while (j < common_length) {
+      if (last_value_view[j] != view[j]) {
+        break;
+      }
+      j++;
+    }
+
     last_value_view = view;
+    prefix_lengths[i] = j;
+    const auto suffix_length = static_cast<uint32_t>(len - j);
+    const uint8_t* suffix_ptr;
+    if (suffix_length == 0) {
+      suffix_ptr = reinterpret_cast<const uint8_t*>("");

Review Comment:
   ditto



##########
cpp/src/parquet/encoding.cc:
##########
@@ -3177,8 +3194,7 @@ void DeltaByteArrayEncoder<DType>::Put(const T* src, int 
num_values) {
                                           
::arrow::stl::allocator<int32_t>(pool_));
   std::string_view last_value_view = last_value_;
 
-  int i = 0;
-  while (i < num_values) {
+  for (int i = 0; i < num_values; i++) {

Review Comment:
   nit: what about making the current function a specialization for
ByteArrayType and throwing an unimplemented exception in the generic one? This
may make line 3199 less confusing.



##########
cpp/src/parquet/encoding.cc:
##########
@@ -3187,21 +3203,74 @@ void DeltaByteArrayEncoder<DType>::Put(const T* src, 
int num_values) {
 
     auto view = string_view{reinterpret_cast<const char*>(value->ptr), 
value->len};
     uint32_t j = 0;
-    while (j < std::min(value->len, 
static_cast<uint32_t>(last_value_view.length()))) {
+    const uint32_t common_length =
+        std::min(value->len, static_cast<uint32_t>(last_value_view.length()));
+    while (j < common_length) {
       if (last_value_view[j] != view[j]) {
         break;
       }
       j++;
     }
 
+    last_value_view = view;
     prefix_lengths[i] = j;
-    const uint8_t* suffix_ptr = value->ptr + j;
-    const uint32_t suffix_length = static_cast<uint32_t>(value->len - j);
+    const auto suffix_length = static_cast<uint32_t>(value->len - j);
+    const uint8_t* suffix_ptr;
+    if (suffix_length == 0) {
+      suffix_ptr = reinterpret_cast<const uint8_t*>("");

Review Comment:
   Ditto. `suffix_ptr = nullptr` would be the simplest fix.



##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -2224,15 +2223,24 @@ TEST(DeltaByteArrayEncodingAdHoc, ArrowDirectPut) {
     auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([6, 0, 
0, 0])");
 
     constexpr std::string_view suffix_data = "axisba";
-    checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data);
+    CheckEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data);
   }
+
   {
     auto values = R"(["baaxis", "axis", "axis", "axis"])";
     auto prefix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([0, 0, 
4, 4])");
     auto suffix_lengths = ::arrow::ArrayFromJSON(::arrow::int32(), R"([6, 4, 
0, 0])");
 
     constexpr std::string_view suffix_data = "baaxisaxis";
-    checkEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data);
+    CheckEncodeDecode(values, prefix_lengths, suffix_lengths, suffix_data);
+  }
+
+  {
+    auto values = R"(["καλημέρα", "καμηλιέρη", "καμηλιέρη", "καλημέρα"])";

Review Comment:
   Thanks for adding this!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to