pitrou commented on code in PR #44053:
URL: https://github.com/apache/arrow/pull/44053#discussion_r1753658921


##########
cpp/src/arrow/compute/row/grouper.cc:
##########
@@ -170,42 +169,137 @@ struct SimpleKeySegmenter : public BaseRowSegmenter {
     return extends;
   }
 
-  Result<Segment> GetNextSegment(const Scalar& scalar, int64_t offset, int64_t 
length) {
+  ARROW_DEPRECATED("Deprecated in 18.0.0 along with GetSegments.")
+  Result<Segment> GetNextSegmentDeprecated(const Scalar& scalar, int64_t 
offset,
+                                           int64_t length) {
+    ARROW_SUPPRESS_DEPRECATION_WARNING
     ARROW_RETURN_NOT_OK(CheckType(*scalar.type));
     if (!scalar.is_valid) {
       return Status::Invalid("segmenting an invalid scalar");
     }
     auto data = checked_cast<const PrimitiveScalarBase&>(scalar).data();
-    bool extends = length > 0 ? Extend(data) : kEmptyExtends;
+    bool extends = length > 0 ? ExtendDeprecated(data) : kEmptyExtends;
     return MakeSegment(length, offset, length, extends);
+    ARROW_UNSUPPRESS_DEPRECATION_WARNING
   }
 
-  Result<Segment> GetNextSegment(const DataType& array_type, const uint8_t* 
array_bytes,
-                                 int64_t offset, int64_t length) {
+  ARROW_DEPRECATED("Deprecated in 18.0.0 along with GetSegments.")
+  Result<Segment> GetNextSegmentDeprecated(const DataType& array_type,
+                                           const uint8_t* array_bytes, int64_t 
offset,
+                                           int64_t length) {
+    ARROW_SUPPRESS_DEPRECATION_WARNING
     RETURN_NOT_OK(CheckType(array_type));
     DCHECK_LE(offset, length);
     int64_t byte_width = array_type.byte_width();
     int64_t match_length = GetMatchLength(array_bytes + offset * byte_width, 
byte_width,
                                           array_bytes, offset, length);
-    bool extends = length > 0 ? Extend(array_bytes + offset * byte_width) : 
kEmptyExtends;
+    bool extends =
+        length > 0 ? ExtendDeprecated(array_bytes + offset * byte_width) : 
kEmptyExtends;
     return MakeSegment(length, offset, match_length, extends);
+    ARROW_UNSUPPRESS_DEPRECATION_WARNING
   }
 
+  ARROW_DEPRECATED("Deprecated in 18.0.0. Use GetSegments instead.")
   Result<Segment> GetNextSegment(const ExecSpan& batch, int64_t offset) 
override {
+    ARROW_SUPPRESS_DEPRECATION_WARNING
     ARROW_RETURN_NOT_OK(CheckForGetNextSegment(batch, offset, {key_type_}));
     if (offset == batch.length) {
       return MakeSegment(batch.length, offset, 0, kEmptyExtends);
     }
     const auto& value = batch.values[0];
     if (value.is_scalar()) {
-      return GetNextSegment(*value.scalar, offset, batch.length);
+      return GetNextSegmentDeprecated(*value.scalar, offset, batch.length);
     }
     ARROW_DCHECK(value.is_array());
     const auto& array = value.array;
     if (array.GetNullCount() > 0) {
       return Status::NotImplemented("segmenting a nullable array");
     }
-    return GetNextSegment(*array.type, GetValuesAsBytes(array), offset, 
batch.length);
+    return GetNextSegmentDeprecated(*array.type, GetValuesAsBytes(array), 
offset,
+                                    batch.length);
+    ARROW_UNSUPPRESS_DEPRECATION_WARNING
+  }
+
+  Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) override {
+    RETURN_NOT_OK(CheckForGetSegments(batch, {key_type_}));
+
+    if (batch.length == 0) {
+      return std::vector<Segment>{};
+    }
+
+    const auto& value = batch.values[0];
+    RETURN_NOT_OK(CheckType(*value.type()));
+
+    std::vector<Segment> segments;
+    const void* key_data;
+    if (value.is_scalar()) {
+      const auto& scalar = *value.scalar;
+      DCHECK(scalar.is_valid);
+      key_data = checked_cast<const PrimitiveScalarBase&>(scalar).data();
+      bool extends = Extend(key_data);
+      segments.push_back(MakeSegment(batch.length, 0, batch.length, extends));
+    } else {
+      DCHECK(value.is_array());
+      const auto& array = value.array;
+      DCHECK_EQ(array.GetNullCount(), 0);
+      auto data = GetValuesAsBytes(array);
+      int64_t byte_width = array.type->byte_width();
+      int64_t offset = 0;
+      bool extends = Extend(data);
+      while (offset < array.length) {
+        int64_t match_length = GetMatchLength(data + offset * byte_width, 
byte_width,
+                                              data, offset, array.length);
+        segments.push_back(MakeSegment(array.length, offset, match_length,
+                                       offset == 0 ? extends : false));
+        offset += match_length;
+      }
+      key_data = data + (array.length - 1) * byte_width;
+    }
+
+    SaveKeyData(key_data);
+
+    return segments;
+  }
+
+ private:
+  static Status CheckType(const DataType& type) {
+    if (!is_fixed_width(type)) {
+      return Status::Invalid("SimpleKeySegmenter does not support type ", 
type);
+    }
+    return Status::OK();
+  }
+
+  static const uint8_t* GetValuesAsBytes(const ArraySpan& data, int64_t offset 
= 0) {
+    DCHECK_GT(data.type->byte_width(), 0);
+    int64_t absolute_byte_offset = (data.offset + offset) * 
data.type->byte_width();
+    return data.GetValues<uint8_t>(1, absolute_byte_offset);
+  }
+
+  // Find the match-length of a value within a fixed-width buffer
+  static int64_t GetMatchLength(const uint8_t* match_bytes, int64_t 
match_width,
+                                const uint8_t* array_bytes, int64_t offset,
+                                int64_t length) {
+    int64_t cursor, byte_cursor;
+    for (cursor = offset, byte_cursor = match_width * cursor; cursor < length;
+         cursor++, byte_cursor += match_width) {
+      if (memcmp(match_bytes, array_bytes + byte_cursor,
+                 static_cast<size_t>(match_width)) != 0) {
+        break;
+      }
+    }
+    return std::min(cursor, length) - offset;
+  }
+
+  bool Extend(const void* data) {
+    if ARROW_PREDICT_FALSE (!extend_was_called_) {

Review Comment:
   Please let's put parentheses here
   ```suggestion
       if (ARROW_PREDICT_FALSE(!extend_was_called_)) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to