zanmato1984 commented on code in PR #44053:
URL: https://github.com/apache/arrow/pull/44053#discussion_r1753032974
##########
cpp/src/arrow/acero/aggregate_internal.h:
##########
@@ -131,17 +131,14 @@ void AggregatesToString(std::stringstream* ss, const
Schema& input_schema,
template <typename BatchHandler>
Status HandleSegments(RowSegmenter* segmenter, const ExecBatch& batch,
const std::vector<int>& ids, const BatchHandler&
handle_batch) {
- int64_t offset = 0;
ARROW_ASSIGN_OR_RAISE(auto segment_exec_batch, batch.SelectValues(ids));
ExecSpan segment_batch(segment_exec_batch);
- while (true) {
Review Comment:
This is the only call-site in non-testing codes.
##########
cpp/src/arrow/compute/row/grouper.cc:
##########
@@ -241,24 +336,9 @@ struct AnyKeysSegmenter : public BaseRowSegmenter {
return extends;
}
- // Runs the grouper on a single row. This is used to determine the group id
of the
- // first row of a new segment to see if it extends the previous segment.
- template <typename Batch>
- Result<group_id_t> MapGroupIdAt(const Batch& batch, int64_t offset) {
Review Comment:
Moved w/o touching the code inside.
##########
cpp/src/arrow/acero/hash_aggregate_test.cc:
##########
@@ -585,19 +585,12 @@ void TestGroupClassSupportedKeys(
void TestSegments(std::unique_ptr<RowSegmenter>& segmenter, const ExecSpan&
batch,
std::vector<Segment> expected_segments) {
- int64_t offset = 0, segment_num = 0;
- for (auto expected_segment : expected_segments) {
- SCOPED_TRACE("segment #" + ToChars(segment_num++));
- ASSERT_OK_AND_ASSIGN(auto segment, segmenter->GetNextSegment(batch,
offset));
- ASSERT_EQ(expected_segment, segment);
- offset = segment.offset + segment.length;
+ ASSERT_OK_AND_ASSIGN(auto actual_segments, segmenter->GetSegments(batch));
+ ASSERT_EQ(actual_segments.size(), expected_segments.size());
+ for (size_t i = 0; i < actual_segments.size(); ++i) {
+ SCOPED_TRACE("segment #" + ToChars(i));
+ ASSERT_EQ(actual_segments[i], expected_segments[i]);
}
- // Assert next is the last (empty) segment.
Review Comment:
The new API generates fewer segments than before - there is no meaningless
(`length == 0`) tailing segment.
##########
cpp/src/arrow/compute/row/grouper.h:
##########
@@ -97,7 +97,12 @@ class ARROW_EXPORT RowSegmenter {
virtual Status Reset() = 0;
/// \brief Get the next segment for the given batch starting from the given
offset
+ /// DEPRECATED: Due to its inefficiency, use GetSegments instead.
+ ARROW_DEPRECATED("Deprecated in 18.0.0. Use GetSegments instead.")
Review Comment:
Deprecate instead of removing it because I assume this is a public API (due
to `grouper.h` is a public header).
##########
cpp/src/arrow/compute/row/grouper.cc:
##########
@@ -102,21 +109,6 @@ Segment MakeSegment(int64_t batch_length, int64_t offset,
int64_t length, bool e
return Segment{offset, length, offset + length >= batch_length, extends};
}
-// Used by SimpleKeySegmenter::GetNextSegment to find the match-length of a
value within a
-// fixed-width buffer
-int64_t GetMatchLength(const uint8_t* match_bytes, int64_t match_width,
Review Comment:
Moved w/o touching the code inside.
##########
cpp/src/arrow/acero/hash_aggregate_test.cc:
##########
@@ -629,61 +622,47 @@ TEST(RowSegmenter, Basics) {
auto batch2 = ExecBatchFromJSON(types2, "[[1, 1], [1, 2], [2, 2]]");
auto batch1 = ExecBatchFromJSON(types1, "[[1], [1], [2]]");
ExecBatch batch0({}, 3);
- {
Review Comment:
The new API doesn't accept `offset` argument so no need for such testing.
##########
cpp/src/arrow/acero/hash_aggregate_test.cc:
##########
@@ -629,61 +622,47 @@ TEST(RowSegmenter, Basics) {
auto batch2 = ExecBatchFromJSON(types2, "[[1, 1], [1, 2], [2, 2]]");
auto batch1 = ExecBatchFromJSON(types1, "[[1], [1], [2]]");
ExecBatch batch0({}, 3);
- {
- SCOPED_TRACE("offset");
- ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types0));
- ExecSpan span0(batch0);
- for (int64_t offset : {-1, 4}) {
- EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
- HasSubstr("invalid grouping segmenter
offset"),
- segmenter->GetNextSegment(span0,
offset));
- }
- }
{
SCOPED_TRACE("types0 segmenting of batch2");
ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types0));
ExecSpan span2(batch2);
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("expected batch size 0
"),
- segmenter->GetNextSegment(span2, 0));
+ segmenter->GetSegments(span2));
ExecSpan span0(batch0);
- TestSegments(segmenter, span0, {{0, 3, true, true}, {3, 0, true, true}});
Review Comment:
Here and below are just because the new API doesn't generate the zero-length
ending segment.
##########
cpp/src/arrow/compute/row/grouper.cc:
##########
@@ -147,21 +152,15 @@ struct SimpleKeySegmenter : public BaseRowSegmenter {
save_key_data_(static_cast<size_t>(key_type_.type->byte_width())),
extend_was_called_(false) {}
- Status CheckType(const DataType& type) {
Review Comment:
Moved w/o touching the code inside.
##########
cpp/src/arrow/compute/row/grouper.cc:
##########
@@ -54,13 +55,8 @@ using group_id_t =
std::remove_const<decltype(kNoGroupId)>::type;
using GroupIdType = CTypeTraits<group_id_t>::ArrowType;
auto g_group_id_type = std::make_shared<GroupIdType>();
-inline const uint8_t* GetValuesAsBytes(const ArraySpan& data, int64_t offset =
0) {
Review Comment:
Moved w/o touching the code inside.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]