zanmato1984 commented on code in PR #48108:
URL: https://github.com/apache/arrow/pull/48108#discussion_r2524866619


##########
cpp/src/arrow/util/fuzz_internal.h:
##########
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+
+namespace arrow::internal {
+
+// The default rss_limit_mb on OSS-Fuzz is 2560 MB and we want to fail 
allocations
+// before that limit is reached, otherwise the fuzz target gets killed 
(GH-48105).
+constexpr int64_t kFuzzingMemoryLimit = 2200LL * 1000 * 1000;
+
+/// Return a memory pool that will not allocate more than kFuzzingMemoryLimit 
bytes.
+ARROW_EXPORT MemoryPool* fuzzing_memory_pool();
+
+// Optionally log the outcome of fuzzing an input
+ARROW_EXPORT void NoteFuzzStatus(const Status&, const uint8_t* data, int64_t 
size);

Review Comment:
   Do you think a name like `LogFuzzStatus` can be more straightforward?



##########
cpp/src/arrow/util/fuzz_internal.cc:
##########
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/fuzz_internal.h"
+
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging_internal.h"
+
+namespace arrow::internal {
+
+MemoryPool* fuzzing_memory_pool() {
+  static auto pool = std::make_shared<::arrow::CappedMemoryPool>(
+      ::arrow::default_memory_pool(), 
/*bytes_allocated_limit=*/kFuzzingMemoryLimit);

Review Comment:
   Do we want to expose the wrapped memory pool as a parameter and default it 
to `::arrow::default_memory_pool()`?



##########
cpp/src/arrow/memory_pool.h:
##########
@@ -245,6 +246,73 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
   std::unique_ptr<ProxyMemoryPoolImpl> impl_;
 };
 
+/// EXPERIMENTAL MemoryPool wrapper with an upper limit
+///
+/// Checking for limits is not done in a fully thread-safe way, therefore
+/// multi-threaded allocations might be able to go successfully above the
+/// configured limit.
+class ARROW_EXPORT CappedMemoryPool : public MemoryPool {
+ public:
+  CappedMemoryPool(MemoryPool* wrapped_pool, int64_t bytes_allocated_limit)
+      : wrapped_(wrapped_pool), bytes_allocated_limit_(bytes_allocated_limit) 
{}
+
+  using MemoryPool::Allocate;
+  using MemoryPool::Reallocate;
+
+  Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override {
+    // XXX Another thread may allocate memory between the limit check and
+    // the `Allocate` call. It is possible for the two allocations to be 
successful
+    // while going above the limit.
+    // Solving this issue would require refactoring the `MemoryPool` 
implementation
+    // to delegate the limit check to `MemoryPoolStats`.
+    const auto attempted = size + wrapped_->bytes_allocated();
+    if (ARROW_PREDICT_FALSE(attempted > bytes_allocated_limit_)) {
+      return OutOfMemory(attempted);
+    }
+    return wrapped_->Allocate(size, alignment, out);
+  }
+
+  Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment,
+                    uint8_t** ptr) override {
+    const auto attempted = new_size - old_size + wrapped_->bytes_allocated();
+    if (ARROW_PREDICT_FALSE(attempted > bytes_allocated_limit_)) {
+      return OutOfMemory(attempted);
+    }
+    return wrapped_->Reallocate(old_size, new_size, alignment, ptr);
+  }
+
+  void Free(uint8_t* buffer, int64_t size, int64_t alignment) override {
+    return wrapped_->Free(buffer, size, alignment);
+  }
+
+  void ReleaseUnused() override { wrapped_->ReleaseUnused(); }
+
+  void PrintStats() override { wrapped_->PrintStats(); }
+
+  int64_t bytes_allocated() const override { return 
wrapped_->bytes_allocated(); }
+
+  int64_t max_memory() const override { return wrapped_->max_memory(); }
+
+  int64_t total_bytes_allocated() const override {
+    return wrapped_->total_bytes_allocated();
+  }
+
+  int64_t num_allocations() const override { return 
wrapped_->num_allocations(); }
+
+  std::string backend_name() const override { return wrapped_->backend_name(); 
}
+
+ private:
+  Status OutOfMemory(int64_t value) {
+    return Status::OutOfMemory(
+        "MemoryPool bytes_allocated cap exceeded: "
+        "limit=",
+        bytes_allocated_limit_, ", attempted=", value);
+  }
+
+  MemoryPool* wrapped_;
+  int64_t bytes_allocated_limit_;

Review Comment:
   Maybe this can be `const`?



##########
cpp/src/arrow/util/fuzz_internal.cc:
##########
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/fuzz_internal.h"
+
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging_internal.h"
+
+namespace arrow::internal {
+
+MemoryPool* fuzzing_memory_pool() {
+  static auto pool = std::make_shared<::arrow::CappedMemoryPool>(
+      ::arrow::default_memory_pool(), 
/*bytes_allocated_limit=*/kFuzzingMemoryLimit);
+  return pool.get();
+}
+
+void NoteFuzzStatus(const Status& st, const uint8_t* data, int64_t size) {
+  if (st.IsOutOfMemory()) {
+    ARROW_LOG(WARNING) << "Fuzzing input with size=" << size

Review Comment:
   If this is for general purpose of status logging, maybe we can simply do:
   ```
   if (!st.ok()) {
     ARROW_LOG(WARNING) << "Fuzzing input with size=" << size << " hit: " << 
st.ToString();
   }
   ```
   Because the information of OOM (or any other non-OK status) is already 
encoded within the `st`.



##########
cpp/src/arrow/memory_pool.h:
##########
@@ -245,6 +246,73 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
   std::unique_ptr<ProxyMemoryPoolImpl> impl_;
 };
 
+/// EXPERIMENTAL MemoryPool wrapper with an upper limit
+///
+/// Checking for limits is not done in a fully thread-safe way, therefore
+/// multi-threaded allocations might be able to go successfully above the
+/// configured limit.
+class ARROW_EXPORT CappedMemoryPool : public MemoryPool {
+ public:
+  CappedMemoryPool(MemoryPool* wrapped_pool, int64_t bytes_allocated_limit)
+      : wrapped_(wrapped_pool), bytes_allocated_limit_(bytes_allocated_limit) 
{}
+
+  using MemoryPool::Allocate;
+  using MemoryPool::Reallocate;

Review Comment:
   Are these necessary?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to