Diff
Modified: trunk/Source/WTF/ChangeLog (229435 => 229436)
--- trunk/Source/WTF/ChangeLog 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/WTF/ChangeLog 2018-03-08 22:58:23 UTC (rev 229436)
@@ -1,3 +1,19 @@
+2018-03-08 Filip Pizlo <fpi...@apple.com>
+
+ bmalloc mutex should be adaptive
+ https://bugs.webkit.org/show_bug.cgi?id=177839
+
+ Reviewed by Michael Saboff.
+
+ Add some comments that I thought of while copy-pasting this code.
+
+ Reland after failing to reproduce the WasmBench crash that caused it to get rolled out. Maybe that fixed
+ itself somehow?
+
+ * wtf/LockAlgorithmInlines.h:
+ * wtf/WordLock.cpp:
+ (WTF::WordLock::unlockSlow):
+
2018-03-08 Keith Miller <keith_mil...@apple.com>
Disable JIT on Cocoa 32-bit ARM.
Modified: trunk/Source/WTF/wtf/LockAlgorithmInlines.h (229435 => 229436)
--- trunk/Source/WTF/wtf/LockAlgorithmInlines.h 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/WTF/wtf/LockAlgorithmInlines.h 2018-03-08 22:58:23 UTC (rev 229436)
@@ -34,6 +34,10 @@
// the lock algorithm slow path without recompiling the world. Right now this should be included in two
// places (Lock.cpp and JSCell.cpp).
+// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
+// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
+// reward.
+
namespace WTF {
template<typename LockType, LockType isHeldBit, LockType hasParkedBit, typename Hooks>
Modified: trunk/Source/WTF/wtf/WordLock.cpp (229435 => 229436)
--- trunk/Source/WTF/wtf/WordLock.cpp 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/WTF/wtf/WordLock.cpp 2018-03-08 22:58:23 UTC (rev 229436)
@@ -76,6 +76,10 @@
} // anonymous namespace
+// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
+// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
+// reward.
+
NEVER_INLINE void WordLock::lockSlow()
{
unsigned spinCount = 0;
@@ -223,7 +227,8 @@
ASSERT(currentWordValue & isLockedBit);
ASSERT(currentWordValue & isQueueLockedBit);
ThreadData* queueHead = bitwise_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
- ASSERT(queueHead);
+ RELEASE_ASSERT(queueHead);
+ RELEASE_ASSERT(queueHead->shouldPark); // This would have been set before the thread was enqueued, so it must still be set now.
ThreadData* newQueueHead = queueHead->nextInQueue;
// Either this was the only thread on the queue, in which case we delete the queue, or there
@@ -256,10 +261,12 @@
{
std::lock_guard<std::mutex> locker(queueHead->parkingLock);
queueHead->shouldPark = false;
+ // We have to do the notify while still holding the lock, since otherwise, we could try to
+ // do it after the queueHead has destructed. It's impossible for queueHead to destruct
+ // while we hold the lock, since it is either currently in the wait loop or it's before it
+ // so it has to grab the lock before destructing.
+ queueHead->parkingCondition.notify_one();
}
- // Doesn't matter if we notify_all() or notify_one() here since the only thread that could be
- // waiting is queueHead.
- queueHead->parkingCondition.notify_one();
// The old queue head can now contend for the lock again. We're done!
}
Modified: trunk/Source/bmalloc/ChangeLog (229435 => 229436)
--- trunk/Source/bmalloc/ChangeLog 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/bmalloc/ChangeLog 2018-03-08 22:58:23 UTC (rev 229436)
@@ -1,3 +1,35 @@
+2018-03-08 Filip Pizlo <fpi...@apple.com>
+
+ bmalloc mutex should be adaptive
+ https://bugs.webkit.org/show_bug.cgi?id=177839
+
+ Reviewed by Michael Saboff.
+
+ This pulls the WordLock algorithm into bmalloc, mostly by copy-pasting the code. We need to
+ copy paste because sometimes we build WTF without bmalloc, so WTF cannot rely on bmalloc for
+ anything other than malloc.
+
+ Reland after failing to reproduce the WasmBench crash that caused it to get rolled out. Maybe that fixed
+ itself somehow?
+
+ * bmalloc/Algorithm.h:
+ (bmalloc::compareExchangeWeak):
+ (bmalloc::compareExchangeStrong):
+ * bmalloc/PerThread.h:
+ * bmalloc/StaticMutex.cpp:
+ (bmalloc::StaticMutex::lockSlow):
+ (bmalloc::StaticMutex::unlockSlow):
+ (bmalloc::StaticMutex::lockSlowCase): Deleted.
+ * bmalloc/StaticMutex.h:
+ (bmalloc::StaticMutex::try_lock):
+ (bmalloc::StaticMutex::isLocked const):
+ (bmalloc::StaticMutex::init):
+ (bmalloc::StaticMutex::tryLock):
+ (bmalloc::StaticMutex::lock):
+ (bmalloc::StaticMutex::unlock):
+ (bmalloc::sleep): Deleted.
+ (bmalloc::waitUntilFalse): Deleted.
+
2018-02-16 Filip Pizlo <fpi...@apple.com>
Unreviewed, roll out r228306 (custom memcpy/memset) because the bots say that it was not a
Modified: trunk/Source/bmalloc/bmalloc/Algorithm.h (229435 => 229436)
--- trunk/Source/bmalloc/bmalloc/Algorithm.h 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/bmalloc/bmalloc/Algorithm.h 2018-03-08 22:58:23 UTC (rev 229436)
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2014 Apple Inc. All rights reserved.
+ * Copyright (C) 2014-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -28,6 +28,7 @@
#include "BAssert.h"
#include <algorithm>
+#include <atomic>
#include <cstdint>
#include <cstddef>
#include <limits>
@@ -161,6 +162,24 @@
return bitCount<unsigned long>() - 1 - __builtin_clzl(value);
}
+// We need a CAS API that isn't the badly designed one from C++.
+template<typename T>
+bool compareExchangeWeak(std::atomic<T>& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst)
+{
+ // They could have made a sensible CAS API, but they didn't. Sneaky fact: for no good reason, the
+ // C++ API will mutate expected. I think that apologists will say something about how it saves you
+ // reloading the value around the back of your CAS loop, but that's nonsense. The delay along the
+ // back edge of a CAS loop has almost no impact on CAS loop performance. More often than not, we
+ // want to *add* delays to the back edge, not remove them.
+ return word.compare_exchange_weak(expected, desired, order, std::memory_order_relaxed);
+}
+
+template<typename T>
+bool compareExchangeStrong(std::atomic<T>& word, T expected, T desired, std::memory_order order = std::memory_order_seq_cst)
+{
+ return word.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed);
+}
+
#define BOFFSETOF(class, field) (reinterpret_cast<ptrdiff_t>(&(reinterpret_cast<class*>(0x4000)->field)) - 0x4000)
template<typename T>
Modified: trunk/Source/bmalloc/bmalloc/PerThread.h (229435 => 229436)
--- trunk/Source/bmalloc/bmalloc/PerThread.h 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/bmalloc/bmalloc/PerThread.h 2018-03-08 22:58:23 UTC (rev 229436)
@@ -60,11 +60,11 @@
static void destructor(void*);
};
-#if HAVE_PTHREAD_MACHDEP_H
-
class Cache;
template<typename T> struct PerThreadStorage;
+#if HAVE_PTHREAD_MACHDEP_H
+
// For now, we only support PerThread<PerHeapKind<Cache>>. We can expand to other types by
// using more keys.
template<> struct PerThreadStorage<PerHeapKind<Cache>> {
@@ -82,7 +82,7 @@
}
};
-#else
+#endif
template<typename T> struct PerThreadStorage {
static bool s_didInitialize;
@@ -112,8 +112,6 @@
template<typename T> pthread_key_t PerThreadStorage<T>::s_key;
template<typename T> std::once_flag PerThreadStorage<T>::s_onceFlag;
-#endif
-
template<typename T>
BINLINE T* PerThread<T>::getFastCase()
{
Modified: trunk/Source/bmalloc/bmalloc/StaticMutex.cpp (229435 => 229436)
--- trunk/Source/bmalloc/bmalloc/StaticMutex.cpp 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/bmalloc/bmalloc/StaticMutex.cpp 2018-03-08 22:58:23 UTC (rev 229436)
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2014 Apple Inc. All rights reserved.
+ * Copyright (C) 2014-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -23,31 +23,245 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include "StaticMutex.h"
+
+#include "PerThread.h"
#include "ScopeExit.h"
-#include "StaticMutex.h"
+#include <condition_variable>
+#include <mutex>
#include <thread>
namespace bmalloc {
-void StaticMutex::lockSlowCase()
+// FIXME: This duplicates code from WTF::WordLock.
+// https://bugs.webkit.org/show_bug.cgi?id=177719
+
+namespace {
+
+// This data structure serves three purposes:
+//
+// 1) A parking mechanism for threads that go to sleep. That involves just a system mutex and
+// condition variable.
+//
+// 2) A queue node for when a thread is on some lock's queue.
+//
+// 3) The queue head. This is kind of funky. When a thread is the head of a queue, it also serves as
+// the basic queue bookkeeping data structure. When a thread is dequeued, the next thread in the
+// queue takes on the queue head duties.
+struct ThreadData {
+ // The parking mechanism.
+ bool shouldPark { false };
+ std::mutex parkingLock;
+ std::condition_variable parkingCondition;
+
+ // The queue node.
+ ThreadData* nextInQueue { nullptr };
+
+ // The queue itself.
+ ThreadData* queueTail { nullptr };
+};
+
+ThreadData* myThreadData()
{
- // The longest critical section in bmalloc is much shorter than the
- // time it takes to make a system call to yield to the OS scheduler.
- // So, we try again a lot before we yield.
- static const size_t aLot = 256;
+ return PerThread<ThreadData>::get();
+}
+
+} // anonymous namespace
+
+// NOTE: It's a bug to use any memory order other than seq_cst in this code. The cost of seq_cst
+// fencing is negligible on slow paths, so any use of a more relaxed memory model is all risk and no
+// reward.
+
+BNO_INLINE void StaticMutex::lockSlow()
+{
+ unsigned spinCount = 0;
+
+ // This magic number turns out to be optimal based on past JikesRVM experiments.
+ const unsigned spinLimit = 40;
- if (!m_isSpinning.test_and_set()) {
- auto clear = makeScopeExit([&] { m_isSpinning.clear(); });
+ for (;;) {
+ uintptr_t currentWordValue = m_word.load();
+
+ if (!(currentWordValue & isLockedBit)) {
+ // It's not possible for someone to hold the queue lock while the lock itself is no longer
+ // held, since we will only attempt to acquire the queue lock when the lock is held and
+ // the queue lock prevents unlock.
+ BASSERT(!(currentWordValue & isQueueLockedBit));
+ if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isLockedBit)) {
+ // Success! We acquired the lock.
+ return;
+ }
+ }
- for (size_t i = 0; i < aLot; ++i) {
- if (try_lock())
+ // If there is no queue and we haven't spun too much, we can just try to spin around again.
+ if (!(currentWordValue & ~queueHeadMask) && spinCount < spinLimit) {
+ spinCount++;
+ sched_yield();
+ continue;
+ }
+
+ // Need to put ourselves on the queue. Create the queue if one does not exist. This requires
+ // owning the queue for a little bit. The lock that controls the queue is itself a spinlock.
+ // But before we acquire the queue spinlock, we make sure that we have a ThreadData for this
+ // thread.
+ ThreadData* me = myThreadData();
+ BASSERT(!me->shouldPark);
+ BASSERT(!me->nextInQueue);
+ BASSERT(!me->queueTail);
+
+ // Reload the current word value, since some time may have passed.
+ currentWordValue = m_word.load();
+
+ // We proceed only if the queue lock is not held, the lock is held, and we succeed in
+ // acquiring the queue lock.
+ if ((currentWordValue & isQueueLockedBit)
+ || !(currentWordValue & isLockedBit)
+ || !compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit)) {
+ sched_yield();
+ continue;
+ }
+
+ me->shouldPark = true;
+
+ // We own the queue. Nobody can enqueue or dequeue until we're done. Also, it's not possible
+ // to release the lock while we hold the queue lock.
+ ThreadData* queueHead = reinterpret_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
+ if (queueHead) {
+ // Put this thread at the end of the queue.
+ queueHead->queueTail->nextInQueue = me;
+ queueHead->queueTail = me;
+
+ // Release the queue lock.
+ currentWordValue = m_word.load();
+ BASSERT(currentWordValue & ~queueHeadMask);
+ BASSERT(currentWordValue & isQueueLockedBit);
+ BASSERT(currentWordValue & isLockedBit);
+ m_word.store(currentWordValue & ~isQueueLockedBit);
+ } else {
+ // Make this thread be the queue-head.
+ queueHead = me;
+ me->queueTail = me;
+
+ // Release the queue lock and install ourselves as the head. No need for a CAS loop, since
+ // we own the queue lock.
+ currentWordValue = m_word.load();
+ BASSERT(!(currentWordValue & ~queueHeadMask));
+ BASSERT(currentWordValue & isQueueLockedBit);
+ BASSERT(currentWordValue & isLockedBit);
+ uintptr_t newWordValue = currentWordValue;
+ newWordValue |= reinterpret_cast<uintptr_t>(queueHead);
+ newWordValue &= ~isQueueLockedBit;
+ m_word.store(newWordValue);
+ }
+
+ // At this point everyone who acquires the queue lock will see me on the queue, and anyone who
+ // acquires me's lock will see that me wants to park. Note that shouldPark may have been
+ // cleared as soon as the queue lock was released above, but it will happen while the
+ // releasing thread holds me's parkingLock.
+
+ {
+ std::unique_lock<std::mutex> locker(me->parkingLock);
+ while (me->shouldPark)
+ me->parkingCondition.wait(locker);
+ }
+
+ BASSERT(!me->shouldPark);
+ BASSERT(!me->nextInQueue);
+ BASSERT(!me->queueTail);
+
+ // Now we can loop around and try to acquire the lock again.
+ }
+}
+
+BNO_INLINE void StaticMutex::unlockSlow()
+{
+ // The fast path can fail either because of spurious weak CAS failure, or because someone put a
+ // thread on the queue, or the queue lock is held. If the queue lock is held, it can only be
+ // because someone *will* enqueue a thread onto the queue.
+
+ // Acquire the queue lock, or release the lock. This loop handles both lock release in case the
+ // fast path's weak CAS spuriously failed and it handles queue lock acquisition if there is
+ // actually something interesting on the queue.
+ for (;;) {
+ uintptr_t currentWordValue = m_word.load();
+
+ BASSERT(currentWordValue & isLockedBit);
+
+ if (currentWordValue == isLockedBit) {
+ if (compareExchangeWeak(m_word, isLockedBit, clear)) {
+ // The fast path's weak CAS had spuriously failed, and now we succeeded. The lock is
+ // unlocked and we're done!
return;
+ }
+ // Loop around and try again.
+ sched_yield();
+ continue;
}
+
+ if (currentWordValue & isQueueLockedBit) {
+ sched_yield();
+ continue;
+ }
+
+ // If it wasn't just a spurious weak CAS failure and if the queue lock is not held, then there
+ // must be an entry on the queue.
+ BASSERT(currentWordValue & ~queueHeadMask);
+
+ if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isQueueLockedBit))
+ break;
}
- // Avoid spinning pathologically.
- while (!try_lock())
- sched_yield();
+ uintptr_t currentWordValue = m_word.load();
+
+ // After we acquire the queue lock, the lock must still be held and the queue must be
+ // non-empty. The queue must be non-empty since only the lockSlow() method could have held the
+ // queue lock and if it did then it only releases it after putting something on the queue.
+ BASSERT(currentWordValue & isLockedBit);
+ BASSERT(currentWordValue & isQueueLockedBit);
+ ThreadData* queueHead = reinterpret_cast<ThreadData*>(currentWordValue & ~queueHeadMask);
+ RELEASE_BASSERT(queueHead);
+ RELEASE_BASSERT(queueHead->shouldPark);
+
+ ThreadData* newQueueHead = queueHead->nextInQueue;
+ // Either this was the only thread on the queue, in which case we delete the queue, or there
+ // are still more threads on the queue, in which case we create a new queue head.
+ if (newQueueHead)
+ newQueueHead->queueTail = queueHead->queueTail;
+
+ // Change the queue head, possibly removing it if newQueueHead is null. No need for a CAS loop,
+ // since we hold the queue lock and the lock itself so nothing about the lock can change right
+ // now.
+ currentWordValue = m_word.load();
+ BASSERT(currentWordValue & isLockedBit);
+ BASSERT(currentWordValue & isQueueLockedBit);
+ BASSERT((currentWordValue & ~queueHeadMask) == reinterpret_cast<uintptr_t>(queueHead));
+ uintptr_t newWordValue = currentWordValue;
+ newWordValue &= ~isLockedBit; // Release the lock.
+ newWordValue &= ~isQueueLockedBit; // Release the queue lock.
+ newWordValue &= queueHeadMask; // Clear out the old queue head.
+ newWordValue |= reinterpret_cast<uintptr_t>(newQueueHead); // Install new queue head.
+ m_word.store(newWordValue);
+
+ // Now the lock is available for acquisition. But we just have to wake up the old queue head.
+ // After that, we're done!
+
+ queueHead->nextInQueue = nullptr;
+ queueHead->queueTail = nullptr;
+
+ // We do this carefully because this may run either before or during the parkingLock critical
+ // section in lockSlow().
+ {
+ std::lock_guard<std::mutex> locker(queueHead->parkingLock);
+ RELEASE_BASSERT(queueHead->shouldPark);
+ queueHead->shouldPark = false;
+ // We have to do the notify while still holding the lock, since otherwise, we could try to
+ // do it after the queueHead has destructed. It's impossible for queueHead to destruct
+ // while we hold the lock, since it is either currently in the wait loop or it's before it
+ // so it has to grab the lock before destructing.
+ queueHead->parkingCondition.notify_one();
+ }
+
+ // The old queue head can now contend for the lock again. We're done!
}
} // namespace bmalloc
Modified: trunk/Source/bmalloc/bmalloc/StaticMutex.h (229435 => 229436)
--- trunk/Source/bmalloc/bmalloc/StaticMutex.h 2018-03-08 22:47:50 UTC (rev 229435)
+++ trunk/Source/bmalloc/bmalloc/StaticMutex.h 2018-03-08 22:58:23 UTC (rev 229436)
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2014 Apple Inc. All rights reserved.
+ * Copyright (C) 2014-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -26,7 +26,9 @@
#ifndef StaticMutex_h
#define StaticMutex_h
+#include "Algorithm.h"
#include "BAssert.h"
+#include "BExport.h"
#include <atomic>
#include <mutex>
#include <thread>
@@ -36,6 +38,16 @@
// Use StaticMutex in static storage, where global constructors and exit-time
// destructors are prohibited, but all memory is zero-initialized automatically.
+// This uses the code from what WTF used to call WordLock. It's a fully adaptive mutex that uses
+// sizeof(void*) storage. It has a fast path that is similar to a spinlock, and a slow path that is
+// similar to std::mutex.
+
+// NOTE: In general, this is a great lock to use if you are very low in the stack. WTF continues to
+// have a copy of this code.
+
+// FIXME: Either fold bmalloc into WTF or find a better way to share code between them.
+// https://bugs.webkit.org/show_bug.cgi?id=177719
+
namespace bmalloc {
class StaticMutex {
@@ -45,57 +57,57 @@
public:
void lock();
- bool try_lock();
+ bool tryLock();
+ bool try_lock() { return tryLock(); }
void unlock();
+
+ bool isHeld() const;
+ bool isLocked() const { return isHeld(); }
private:
- BEXPORT void lockSlowCase();
+ BEXPORT void lockSlow();
+ BEXPORT void unlockSlow();
- std::atomic_flag m_flag;
- std::atomic_flag m_isSpinning;
+ static const uintptr_t clear = 0;
+ static const uintptr_t isLockedBit = 1;
+ static const uintptr_t isQueueLockedBit = 2;
+ static const uintptr_t queueHeadMask = 3;
+
+ std::atomic<uintptr_t> m_word;
};
-static inline void sleep(
- std::unique_lock<StaticMutex>& lock, std::chrono::milliseconds duration)
+inline void StaticMutex::init()
{
- if (duration == std::chrono::milliseconds(0))
- return;
-
- lock.unlock();
- std::this_thread::sleep_for(duration);
- lock.lock();
+ m_word.store(0, std::memory_order_relaxed);
}
-static inline void waitUntilFalse(
- std::unique_lock<StaticMutex>& lock, std::chrono::milliseconds sleepDuration,
- bool& condition)
+inline bool StaticMutex::tryLock()
{
- while (condition) {
- condition = false;
- sleep(lock, sleepDuration);
- }
-}
+ for (;;) {
+ uintptr_t currentWordValue = m_word.load(std::memory_order_relaxed);
-inline void StaticMutex::init()
-{
- m_flag.clear();
- m_isSpinning.clear();
-}
+ if (currentWordValue & isLockedBit)
+ return false;
-inline bool StaticMutex::try_lock()
-{
- return !m_flag.test_and_set(std::memory_order_acquire);
+ if (compareExchangeWeak(m_word, currentWordValue, currentWordValue | isLockedBit, std::memory_order_acquire))
+ return true;
+ }
}
inline void StaticMutex::lock()
{
- if (!try_lock())
- lockSlowCase();
+ if (compareExchangeWeak(m_word, clear, isLockedBit, std::memory_order_acquire))
+ return;
+
+ lockSlow();
}
inline void StaticMutex::unlock()
{
- m_flag.clear(std::memory_order_release);
+ if (compareExchangeWeak(m_word, isLockedBit, clear, std::memory_order_release))
+ return;
+
+ unlockSlow();
}
} // namespace bmalloc