kou commented on code in PR #40484:
URL: https://github.com/apache/arrow/pull/40484#discussion_r1537044488


##########
cpp/src/arrow/compute/key_hash.cc:
##########
@@ -472,6 +483,7 @@ Status Hashing32::HashBatch(const ExecBatch& key_batch, 
uint32_t* hashes,
   LightContext ctx;
   ctx.hardware_flags = hardware_flags;
   ctx.stack = temp_stack;
+

Review Comment:
   Could you revert a needless change?



##########
cpp/src/arrow/compute/key_hash.cc:
##########
@@ -383,19 +383,30 @@ void Hashing32::HashMultiColumn(const 
std::vector<KeyColumnArray>& cols,
   uint32_t num_rows = static_cast<uint32_t>(cols[0].length());
 
   constexpr uint32_t max_batch_size = util::MiniBatch::kMiniBatchLength;
+  const uint32_t alloc_batch_size = std::min(num_rows, max_batch_size);
+  const int64_t estimate_alloc_size = 
EstimateBatchStackSize<uint32_t>(alloc_batch_size);
 
-  auto hash_temp_buf = util::TempVectorHolder<uint32_t>(ctx->stack, 
max_batch_size);
+  util::TempVectorStack temp_stack;
+  if (!ctx->stack) {
+    ARROW_CHECK_OK(temp_stack.Init(default_memory_pool(), 
estimate_alloc_size));
+    ctx->stack = &temp_stack;

Review Comment:
   Could you set `nullptr` to `ctx->stack` before this function is exited?



##########
cpp/src/arrow/compute/util.cc:
##########
@@ -58,6 +58,13 @@ void TempVectorStack::release(int id, uint32_t num_bytes) {
   --num_vectors_;
 }
 
+void TempVectorStack::CheckAllocSizeValid(int64_t estimate_alloc_size) {

Review Comment:
   `CheckOverflow()`, `CheckEnoughSize()` or something?



##########
cpp/src/arrow/compute/util.cc:
##########
@@ -58,6 +58,13 @@ void TempVectorStack::release(int id, uint32_t num_bytes) {
   --num_vectors_;
 }
 
+void TempVectorStack::CheckAllocSizeValid(int64_t estimate_alloc_size) {

Review Comment:
   Could you return `arrow::Status` instead of `void` here?
   
   `TempVectorStack::alloc()` will not be able to use it for now but 
`Hashing32::HashBatch()` can use it.



##########
cpp/src/arrow/compute/util.cc:
##########
@@ -58,6 +58,13 @@ void TempVectorStack::release(int id, uint32_t num_bytes) {
   --num_vectors_;
 }
 
+void TempVectorStack::CheckAllocSizeValid(int64_t estimate_alloc_size) {

Review Comment:
   ```suggestion
   void TempVectorStack::CheckAllocSizeValid(int64_t alloc_size) {
   ```



##########
cpp/src/arrow/compute/util.cc:
##########
@@ -58,6 +58,13 @@ void TempVectorStack::release(int id, uint32_t num_bytes) {
   --num_vectors_;
 }
 
+void TempVectorStack::CheckAllocSizeValid(int64_t estimate_alloc_size) {
+  ARROW_DCHECK_LE(estimate_alloc_size, buffer_size_)
+      << "TempVectorStack alloc overflow."

Review Comment:
   ```suggestion
         << "TempVectorStack alloc overflow. "
   ```



##########
cpp/src/arrow/compute/util.h:
##########
@@ -89,16 +89,23 @@ class ARROW_EXPORT TempVectorStack {
   Status Init(MemoryPool* pool, int64_t size) {
     num_vectors_ = 0;
     top_ = 0;
-    buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * 
sizeof(uint64_t);
+    buffer_size_ = PaddedAllocationSize(size) + 2 * sizeof(uint64_t);

Review Comment:
   Can we use `EstimateAllocSize()` here?



##########
cpp/src/arrow/compute/key_hash.h:
##########
@@ -219,5 +219,24 @@ class ARROW_EXPORT Hashing64 {
                       const uint8_t* keys, uint64_t* hashes);
 };
 
+template <typename T = uint32_t>
+static int64_t EstimateBatchStackSize(int32_t batch_size) {

Review Comment:
   Do we need to export this?



##########
cpp/src/arrow/compute/util.h:
##########
@@ -89,16 +89,23 @@ class ARROW_EXPORT TempVectorStack {
   Status Init(MemoryPool* pool, int64_t size) {
     num_vectors_ = 0;
     top_ = 0;
-    buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * 
sizeof(uint64_t);
+    buffer_size_ = PaddedAllocationSize(size) + 2 * sizeof(uint64_t);
     ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
     // Ensure later operations don't accidentally read uninitialized memory.
     std::memset(buffer->mutable_data(), 0xFF, size);
     buffer_ = std::move(buffer);
     return Status::OK();
   }
 
+  static int64_t EstimateAllocSize(int64_t size) {
+    return PaddedAllocationSize(size) + 2 * sizeof(uint64_t);
+  }
+
+  int64_t StackBufferSize() const { return buffer_size_; }

Review Comment:
   Do we need this?



##########
cpp/src/arrow/compute/key_hash.cc:
##########
@@ -383,19 +383,30 @@ void Hashing32::HashMultiColumn(const 
std::vector<KeyColumnArray>& cols,
   uint32_t num_rows = static_cast<uint32_t>(cols[0].length());
 
   constexpr uint32_t max_batch_size = util::MiniBatch::kMiniBatchLength;
+  const uint32_t alloc_batch_size = std::min(num_rows, max_batch_size);
+  const int64_t estimate_alloc_size = 
EstimateBatchStackSize<uint32_t>(alloc_batch_size);

Review Comment:
   Can we use `auto` here?
   
   ```suggestion
     const auto alloc_batch_size = std::min(num_rows, max_batch_size);
     const auto estimate_alloc_size = 
EstimateBatchStackSize<uint32_t>(alloc_batch_size);
   ```



##########
cpp/src/arrow/compute/util.cc:
##########
@@ -35,7 +35,7 @@ void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** 
data, int* id) {
   int64_t new_top = top_ + PaddedAllocationSize(num_bytes) + 2 * 
sizeof(uint64_t);
   // Stack overflow check (see GH-39582).

Review Comment:
   Could you move this comment to `CheckAllocSizeValid()`?



##########
cpp/src/arrow/compute/util.cc:
##########
@@ -58,6 +58,13 @@ void TempVectorStack::release(int id, uint32_t num_bytes) {
   --num_vectors_;
 }
 
+void TempVectorStack::CheckAllocSizeValid(int64_t estimate_alloc_size) {
+  ARROW_DCHECK_LE(estimate_alloc_size, buffer_size_)

Review Comment:
   I think that we should receive additional allocation size instead of total 
new allocation size here: 
   
   ```suggestion
     ARROW_DCHECK_LE(top_ + alloc_size, buffer_size_)
   ```



##########
cpp/src/arrow/compute/key_hash_test.cc:
##########
@@ -311,5 +311,32 @@ TEST(VectorHash, FixedLengthTailByteSafety) {
   HashFixedLengthFrom(/*key_length=*/19, /*num_rows=*/64, /*start_row=*/63);
 }
 
+TEST(HashBatch, AllocTempStackAsNeeded) {
+  auto arr = arrow::ArrayFromJSON(arrow::int32(), "[9,2,6]");
+  const int32_t batch_size = static_cast<int32_t>(arr->length());

Review Comment:
   ```suggestion
     const auto batch_size = static_cast<int32_t>(arr->length());
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to