felipecrv commented on code in PR #40647:
URL: https://github.com/apache/arrow/pull/40647#discussion_r1538062288
##########
cpp/src/arrow/memory_pool.h:
##########
@@ -35,44 +36,68 @@ namespace internal {
///////////////////////////////////////////////////////////////////////
// Helper tracking memory statistics
-class MemoryPoolStats {
- public:
- MemoryPoolStats() : bytes_allocated_(0), max_memory_(0) {}
-
- int64_t max_memory() const { return max_memory_.load(); }
-
- int64_t bytes_allocated() const { return bytes_allocated_.load(); }
+/// \brief Memory pool statistics
+///
+/// 64-byte aligned so that all atomic values are on the same cache line.
+class alignas(64) MemoryPoolStats {
+ private:
+ // All atomics are updated according to Acquire-Release ordering.
+ //
https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering
+ //
+ // max_memory_, total_allocated_bytes_, and num_allocs_ only go up (they are
+ // monotonically increasing) which can allow some optimizations.
+ std::atomic<int64_t> max_memory_{0};
+ std::atomic<int64_t> bytes_allocated_{0};
+ std::atomic<int64_t> total_allocated_bytes_{0};
+ std::atomic<int64_t> num_allocs_{0};
- int64_t total_bytes_allocated() const { return
total_allocated_bytes_.load(); }
+ public:
+ int64_t max_memory() const { return
max_memory_.load(std::memory_order_acquire); }
- int64_t num_allocations() const { return num_allocs_.load(); }
+ int64_t bytes_allocated() const {
+ return bytes_allocated_.load(std::memory_order_acquire);
+ }
- inline void UpdateAllocatedBytes(int64_t diff, bool is_free = false) {
- auto allocated = bytes_allocated_.fetch_add(diff) + diff;
- // "maximum" allocated memory is ill-defined in multi-threaded code,
- // so don't try to be too rigorous here
- if (diff > 0 && allocated > max_memory_) {
- max_memory_ = allocated;
- }
+ int64_t total_bytes_allocated() const {
+ return total_allocated_bytes_.load(std::memory_order_acquire);
+ }
- // Reallocations might just expand/contract the allocation in place or
might
- // copy to a new location. We can't really know, so we just represent the
- // optimistic case.
- if (diff > 0) {
- total_allocated_bytes_ += diff;
+ int64_t num_allocations() const { return
num_allocs_.load(std::memory_order_acquire); }
+
+ inline void DidAllocateBytes(int64_t size) {
+ // Issue the load before everything else. max_memory_ is monotonically
increasing,
+ // so we can use a relaxed load before the read-modify-write.
+ auto max_memory = max_memory_.load(std::memory_order_relaxed);
+ const auto old_bytes_allocated =
+ bytes_allocated_.fetch_add(size, std::memory_order_acq_rel);
+ // Issue store operations on values that we don't depend on to proceed
+ // with execution. When done, max_memory and old_bytes_allocated have
+ // a higher chance of being available on CPU registers. This also has the
+ // nice side-effect of putting 3 atomic stores close to each other in the
+ // instruction stream.
Review Comment:
`fetch_add` is not a store, it's a load+store. The ALU needs the latest
value to correctly calculate the sum, that then gets stored in the memory
position.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]