Diff
Modified: trunk/Source/WebCore/ChangeLog (278230 => 278231)
--- trunk/Source/WebCore/ChangeLog 2021-05-29 00:18:45 UTC (rev 278230)
+++ trunk/Source/WebCore/ChangeLog 2021-05-29 00:52:26 UTC (rev 278231)
@@ -1,3 +1,43 @@
+2021-05-28 Chris Dumez <cdu...@apple.com>
+
+ DelayDSPKernel::process() is slow
+ https://bugs.webkit.org/show_bug.cgi?id=226358
+
+ Reviewed by Darin Adler.
+
+ When I profiled the demo at https://jsfiddle.net/KrisJohnson/s5vL24o1/123/ (in the context of Bug 222098),
+ I noticed that 20% of the CPU time was spent under DelayDSPKernel::process().
+
+ To improve this, we now vectorize DelayDSPKernel::process() in the common case where there is no automation
+ and the delay time is constant.
+
+ The implementation is very similar to the one in Blink:
+ - https://github.com/chromium/chromium/blob/master/third_party/blink/renderer/platform/audio/audio_delay_dsp_kernel.cc
+
+ Some differences compared to the Blink implementation:
+ - I did not vectorize the A-rate case for simplicity. It is not as common and it is more complicated.
+ We may consider doing this in the future if really needed.
+ - On Cocoa, we leveage Accelerate's vDSP_vintb() to do the interpolation instead of doing 2 separate
+ operations.
+
+ This doesn't fix Bug 222098 but it does improve the situation quite a bit. I also see that the CPU time
+ spent under DelayDSPKernel::process() went from ~20% to 1.2% on this demo.
+
+ No new tests, no Web-facing behavior change, just a performance optimization.
+
+ * Modules/webaudio/DelayDSPKernel.cpp:
+ (WebCore::copyToCircularBuffer):
+ (WebCore::DelayDSPKernel::DelayDSPKernel):
+ (WebCore::DelayDSPKernel::bufferLengthForDelay const):
+ (WebCore::DelayDSPKernel::process):
+ (WebCore::DelayDSPKernel::processARate):
+ (WebCore::DelayDSPKernel::processKRate):
+ * Modules/webaudio/DelayDSPKernel.h:
+ * platform/audio/VectorMath.cpp:
+ (WebCore::VectorMath::substract):
+ (WebCore::VectorMath::interpolate):
+ * platform/audio/VectorMath.h:
+
2021-05-28 Brent Fulgham <bfulg...@apple.com>
[Cocoa] Prevent GPU Process from attempt to connect to the AppSSO service (Part 2)
Modified: trunk/Source/WebCore/Modules/webaudio/DelayDSPKernel.cpp (278230 => 278231)
--- trunk/Source/WebCore/Modules/webaudio/DelayDSPKernel.cpp 2021-05-29 00:18:45 UTC (rev 278230)
+++ trunk/Source/WebCore/Modules/webaudio/DelayDSPKernel.cpp 2021-05-29 00:52:26 UTC (rev 278231)
@@ -29,13 +29,43 @@
#include "DelayDSPKernel.h"
#include "AudioUtilities.h"
+#include "VectorMath.h"
#include <algorithm>
namespace WebCore {
+static size_t bufferLengthForDelay(double maxDelayTime, double sampleRate)
+{
+ // Compute the length of the buffer needed to handle a max delay of |maxDelayTime|. Add an additional render quantum frame size so we can
+ // vectorize the delay processing. The extra space is needed so that writes to the buffer won't overlap reads from the buffer.
+ return AudioUtilities::renderQuantumSize + AudioUtilities::timeToSampleFrame(maxDelayTime, sampleRate, AudioUtilities::SampleFrameRounding::Up);
+}
+
+// Returns (a - b) if a is greater than b, 0 otherwise.
+template<typename T> static inline size_t positiveSubtract(T a, T b)
+{
+ return a <= b ? 0 : static_cast<size_t>(a - b);
+}
+
+static void copyToCircularBuffer(float* buffer, size_t writeIndex, size_t bufferLength, const float* source, size_t framesToProcess)
+{
+ // The algorithm below depends on this being true because we don't expect to have to fill the entire buffer more than once.
+ RELEASE_ASSERT(bufferLength >= framesToProcess);
+
+ // Copy |framesToProcess| values from |source| to the circular buffer that starts at |buffer| of length |bufferLength|. The
+ // copy starts at index |writeIndex| into the buffer.
+ auto* writePointer = &buffer[writeIndex];
+ size_t remainder = positiveSubtract(bufferLength, writeIndex);
+
+ // Copy the frames over, carefully handling the case where we need to wrap around to the beginning of the buffer.
+ memcpy(writePointer, source, sizeof(*writePointer) * std::min(framesToProcess, remainder));
+ memcpy(buffer, source + remainder, sizeof(*buffer) * positiveSubtract(framesToProcess, remainder));
+}
+
DelayDSPKernel::DelayDSPKernel(DelayProcessor* processor)
: AudioDSPKernel(processor)
, m_delayTimes(AudioUtilities::renderQuantumSize)
+ , m_tempBuffer(AudioUtilities::renderQuantumSize)
{
ASSERT(processor && processor->sampleRate() > 0);
if (!(processor && processor->sampleRate() > 0))
@@ -52,6 +82,7 @@
DelayDSPKernel::DelayDSPKernel(double maxDelayTime, float sampleRate)
: AudioDSPKernel(sampleRate)
, m_maxDelayTime(maxDelayTime)
+ , m_tempBuffer(AudioUtilities::renderQuantumSize)
{
ASSERT(maxDelayTime > 0.0);
if (maxDelayTime <= 0.0)
@@ -65,70 +96,107 @@
m_buffer.resize(bufferLength);
}
-size_t DelayDSPKernel::bufferLengthForDelay(double maxDelayTime, double sampleRate) const
-{
- // Compute the length of the buffer needed to handle a max delay of |maxDelayTime|. One is
- // added to handle the case where the actual delay equals the maximum delay.
- return 1 + AudioUtilities::timeToSampleFrame(maxDelayTime, sampleRate, AudioUtilities::SampleFrameRounding::Up);
-}
-
void DelayDSPKernel::process(const float* source, float* destination, size_t framesToProcess)
{
- size_t bufferLength = m_buffer.size();
- float* buffer = m_buffer.data();
-
- ASSERT(bufferLength);
- if (!bufferLength)
- return;
-
+ ASSERT(m_buffer.size());
ASSERT(source && destination);
- if (!source || !destination)
+ if (UNLIKELY(m_buffer.isEmpty() || !source || !destination))
return;
- float sampleRate = this->sampleRate();
- double delayTime = 0;
- float* delayTimes = m_delayTimes.data();
- double maxTime = maxDelayTime();
-
bool sampleAccurate = delayProcessor() && delayProcessor()->delayTime().hasSampleAccurateValues();
bool shouldUseARate = delayProcessor() && delayProcessor()->delayTime().automationRate() == AutomationRate::ARate;
-
if (sampleAccurate && shouldUseARate)
- delayProcessor()->delayTime().calculateSampleAccurateValues(delayTimes, framesToProcess);
- else {
- delayTime = delayProcessor() ? delayProcessor()->delayTime().finalValue() : m_desiredDelayFrames / sampleRate;
- // Make sure the delay time is in a valid range.
- delayTime = std::clamp(delayTime, 0.0, maxTime);
- }
+ processARate(source, destination, framesToProcess);
+ else
+ processKRate(source, destination, framesToProcess);
+}
+void DelayDSPKernel::processARate(const float* source, float* destination, size_t framesToProcess)
+{
+ size_t bufferLength = m_buffer.size();
+ auto* buffer = m_buffer.data();
+
+ delayProcessor()->delayTime().calculateSampleAccurateValues(m_delayTimes.data(), framesToProcess);
+
+ copyToCircularBuffer(buffer, m_writeIndex, bufferLength, source, framesToProcess);
+
for (unsigned i = 0; i < framesToProcess; ++i) {
- if (sampleAccurate && shouldUseARate) {
- delayTime = delayTimes[i];
- delayTime = std::clamp(delayTime, 0.0, maxTime);
- }
+ double delayTime = std::clamp<double>(m_delayTimes[i], 0.0, maxDelayTime());
+ double desiredDelayFrames = delayTime * sampleRate();
- double desiredDelayFrames = delayTime * sampleRate;
-
double readPosition = m_writeIndex + bufferLength - desiredDelayFrames;
if (readPosition >= bufferLength)
readPosition -= bufferLength;
// Linearly interpolate in-between delay times.
- int readIndex1 = static_cast<int>(readPosition);
- int readIndex2 = (readIndex1 + 1) % bufferLength;
- double interpolationFactor = readPosition - readIndex1;
+ size_t readIndex1 = static_cast<size_t>(readPosition);
+ size_t readIndex2 = (readIndex1 + 1) % bufferLength;
+ float interpolationFactor = readPosition - readIndex1;
- double input = static_cast<float>(*source++);
- buffer[m_writeIndex] = static_cast<float>(input);
m_writeIndex = (m_writeIndex + 1) % bufferLength;
- double sample1 = buffer[readIndex1];
- double sample2 = buffer[readIndex2];
+ float sample1 = buffer[readIndex1];
+ float sample2 = buffer[readIndex2];
+ destination[i] = sample1 + interpolationFactor * (sample2 - sample1);
+ }
+}
- double output = (1.0 - interpolationFactor) * sample1 + interpolationFactor * sample2;
+// Optimized version of processARate() when the delayTime is constant.
+void DelayDSPKernel::processKRate(const float* source, float* destination, size_t framesToProcess)
+{
+ size_t bufferLength = m_buffer.size();
+ auto* buffer = m_buffer.data();
- *destination++ = static_cast<float>(output);
- }
+ double delayTime = delayProcessor() ? delayProcessor()->delayTime().finalValue() : m_desiredDelayFrames / sampleRate();
+ // Make sure the delay time is in a valid range.
+ delayTime = std::clamp(delayTime, 0.0, maxDelayTime());
+ double desiredDelayFrames = delayTime * sampleRate();
+
+ double readPosition = m_writeIndex + bufferLength - desiredDelayFrames;
+ if (readPosition >= bufferLength)
+ readPosition -= bufferLength;
+
+ // Linearly interpolate in-between delay times. |readIndex1| and |readIndex2| are the indices of the frames to be used
+ // for interpolation.
+ size_t readIndex1 = static_cast<size_t>(readPosition);
+ float interpolationFactor = readPosition - readIndex1;
+ auto* bufferEnd = &buffer[bufferLength];
+ ASSERT(static_cast<unsigned>(bufferLength) >= framesToProcess);
+
+ // sample1 and sample2 hold the current and next samples in the buffer. These are used for interoplating the delay value.
+ // To reduce memory usage and an extra memcpy, sample1 can be the same as destination.
+ // VectorMath::interpolate() below has an optimization in the case where the input buffer is the same as the output one.
+ auto* sample1 = destination;
+
+ // Copy data from the source into the buffer, starting at the write index. The buffer is circular, so carefully handle
+ // the wrapping of the write pointer.
+ copyToCircularBuffer(buffer, m_writeIndex, bufferLength, source, framesToProcess);
+ m_writeIndex = (m_writeIndex + framesToProcess) % bufferLength;
+
+ // Now copy out the samples from the buffer, starting at the read pointer, carefully handling wrapping of the read pointer.
+ auto* readPointer = &buffer[readIndex1];
+
+ size_t remainder = positiveSubtract(bufferEnd, readPointer);
+ memcpy(sample1, readPointer, sizeof(*sample1) * std::min(framesToProcess, remainder));
+ memcpy(sample1 + remainder, buffer, sizeof(*sample1) * positiveSubtract(framesToProcess, remainder));
+
+ // If interpolationFactor is 0, we don't need to do any interpolation and sample1 contains the desired values.
+ if (!interpolationFactor)
+ return;
+
+ ASSERT(framesToProcess <= m_tempBuffer.size());
+
+ size_t readIndex2 = (readIndex1 + 1) % bufferLength;
+ auto* sample2 = m_tempBuffer.data();
+
+ readPointer = &buffer[readIndex2];
+ remainder = positiveSubtract(bufferEnd, readPointer);
+ memcpy(sample2, readPointer, sizeof(*sample2) * std::min(framesToProcess, remainder));
+ memcpy(sample2 + remainder, buffer, sizeof(*sample2) * positiveSubtract(framesToProcess, remainder));
+
+ // Interpolate samples.
+ // destination[k] = sample1[k] + interpolationFactor * (sample2[k] - sample1[k]);
+ VectorMath::interpolate(sample1, sample2, interpolationFactor, destination, framesToProcess);
}
void DelayDSPKernel::processOnlyAudioParams(size_t framesToProcess)
Modified: trunk/Source/WebCore/Modules/webaudio/DelayDSPKernel.h (278230 => 278231)
--- trunk/Source/WebCore/Modules/webaudio/DelayDSPKernel.h 2021-05-29 00:18:45 UTC (rev 278230)
+++ trunk/Source/WebCore/Modules/webaudio/DelayDSPKernel.h 2021-05-29 00:52:26 UTC (rev 278231)
@@ -50,15 +50,19 @@
bool requiresTailProcessing() const final;
private:
+ void processARate(const float* source, float* destination, size_t framesToProcess);
+ void processKRate(const float* source, float* destination, size_t framesToProcess);
+
AudioFloatArray m_buffer;
double m_maxDelayTime;
- int m_writeIndex { 0 };
+ size_t m_writeIndex { 0 };
double m_desiredDelayFrames;
AudioFloatArray m_delayTimes;
+ // Temporary buffer used to hold the second sample for interpolation if needed.
+ AudioFloatArray m_tempBuffer;
DelayProcessor* delayProcessor() { return static_cast<DelayProcessor*>(processor()); }
- size_t bufferLengthForDelay(double delayTime, double sampleRate) const;
};
} // namespace WebCore
Modified: trunk/Source/WebCore/platform/audio/AudioArray.h (278230 => 278231)
--- trunk/Source/WebCore/platform/audio/AudioArray.h 2021-05-29 00:18:45 UTC (rev 278230)
+++ trunk/Source/WebCore/platform/audio/AudioArray.h 2021-05-29 00:52:26 UTC (rev 278231)
@@ -72,6 +72,7 @@
T* data() { return m_allocation; }
const T* data() const { return m_allocation; }
size_t size() const { return m_size; }
+ bool isEmpty() const { return !m_size; }
T& at(size_t i)
{
Modified: trunk/Source/WebCore/platform/audio/VectorMath.cpp (278230 => 278231)
--- trunk/Source/WebCore/platform/audio/VectorMath.cpp 2021-05-29 00:18:45 UTC (rev 278230)
+++ trunk/Source/WebCore/platform/audio/VectorMath.cpp 2021-05-29 00:52:26 UTC (rev 278231)
@@ -62,6 +62,11 @@
vDSP_vadd(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}
+void substract(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
+{
+ vDSP_vsub(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
+}
+
void addScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vsadd(inputVector, 1, &scalar, outputVector, 1, numberOfElementsToProcess);
@@ -72,6 +77,11 @@
vDSP_vmul(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}
+void interpolate(const float* inputVector1, float* inputVector2, float interpolationFactor, float* outputVector, size_t numberOfElementsToProcess)
+{
+ vDSP_vintb(inputVector1, 1, inputVector2, 1, &interpolationFactor, outputVector, 1, numberOfElementsToProcess);
+}
+
void multiplyComplex(const float* realVector1, const float* imagVector1, const float* realVector2, const float* imag2P, float* realOutputVector, float* imagDestP, size_t numberOfElementsToProcess)
{
DSPSplitComplex sc1;
@@ -430,6 +440,115 @@
}
}
+void substract(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
+{
+ size_t n = numberOfElementsToProcess;
+
+#if CPU(X86_SSE2)
+ // If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
+ while (!is16ByteAligned(inputVector1) && n) {
+ *outputVector = *inputVector1 - *inputVector2;
+ inputVector1++;
+ inputVector2++;
+ outputVector++;
+ n--;
+ }
+
+ // Now the inputVector1 address is aligned and start to apply SSE.
+ size_t group = n / 4;
+ __m128* pSource1;
+ __m128* pSource2;
+ __m128* pDest;
+ __m128 source2;
+ __m128 dest;
+
+ bool source2Aligned = is16ByteAligned(inputVector2);
+ bool destAligned = is16ByteAligned(outputVector);
+
+ if (source2Aligned && destAligned) { // all aligned
+ while (group--) {
+ pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
+ pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
+ pDest = reinterpret_cast<__m128*>(outputVector);
+ *pDest = _mm_sub_ps(*pSource1, *pSource2);
+
+ inputVector1 += 4;
+ inputVector2 += 4;
+ outputVector += 4;
+ }
+ } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned
+ while (group--) {
+ pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
+ pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
+ dest = _mm_sub_ps(*pSource1, *pSource2);
+ _mm_storeu_ps(outputVector, dest);
+
+ inputVector1 += 4;
+ inputVector2 += 4;
+ outputVector += 4;
+ }
+ } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned
+ while (group--) {
+ pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
+ source2 = _mm_loadu_ps(inputVector2);
+ pDest = reinterpret_cast<__m128*>(outputVector);
+ *pDest = _mm_sub_ps(*pSource1, source2);
+
+ inputVector1 += 4;
+ inputVector2 += 4;
+ outputVector += 4;
+ }
+ } else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned
+ while (group--) {
+ pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
+ source2 = _mm_loadu_ps(inputVector2);
+ dest = _mm_sub_ps(*pSource1, source2);
+ _mm_storeu_ps(outputVector, dest);
+
+ inputVector1 += 4;
+ inputVector2 += 4;
+ outputVector += 4;
+ }
+ }
+
+ // Non-SSE handling for remaining frames which is less than 4.
+ n %= 4;
+#elif HAVE(ARM_NEON_INTRINSICS)
+ size_t tailFrames = n % 4;
+ const float* endP = outputVector + n - tailFrames;
+
+ while (outputVector < endP) {
+ float32x4_t source1 = vld1q_f32(inputVector1);
+ float32x4_t source2 = vld1q_f32(inputVector2);
+ vst1q_f32(outputVector, vsubq_f32(source1, source2));
+
+ inputVector1 += 4;
+ inputVector2 += 4;
+ outputVector += 4;
+ }
+ n = tailFrames;
+#endif
+ while (n--) {
+ *outputVector = *inputVector1 - *inputVector2;
+ ++inputVector1;
+ ++inputVector2;
+ ++outputVector;
+ }
+}
+
+void interpolate(const float* inputVector1, float* inputVector2, float interpolationFactor, float* outputVector, size_t numberOfElementsToProcess)
+{
+ if (inputVector1 != outputVector)
+ memcpy(outputVector, inputVector1, numberOfElementsToProcess * sizeof(float));
+
+ // inputVector2[k] = inputVector2[k] - inputVector1[k]
+ substract(inputVector2, inputVector1, inputVector2, numberOfElementsToProcess);
+
+ // outputVector[k] = outputVector[k] + interpolationFactor * inputVector2[k]
+ // = inputVector1[k] + interpolationFactor * (inputVector2[k] - inputVector1[k]);
+ multiplyByScalarThenAddToOutput(inputVector2, interpolationFactor, outputVector, numberOfElementsToProcess);
+}
+
void multiply(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
Modified: trunk/Source/WebCore/platform/audio/VectorMath.h (278230 => 278231)
--- trunk/Source/WebCore/platform/audio/VectorMath.h 2021-05-29 00:18:45 UTC (rev 278230)
+++ trunk/Source/WebCore/platform/audio/VectorMath.h 2021-05-29 00:52:26 UTC (rev 278231)
@@ -47,6 +47,7 @@
void multiplyByScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess);
void addScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess);
void add(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess);
+void substract(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess);
// Finds the maximum magnitude of a float vector.
float maximumMagnitude(const float* inputVector, size_t numberOfElementsToProcess);
@@ -65,6 +66,12 @@
void linearToDecibels(const float* inputVector, float* outputVector, size_t numberOfElementsToProcess);
+// Calculates the linear interpolation between the supplied single-precision vectors
+// for (n = 0; n < numberOfElementsToProcess; ++n)
+// outputVector[n] = inputVector1[n] + interpolationFactor * (inputVector2[n] - inputVector1[n]);
+// NOTE: Internal implementation may modify inputVector2.
+void interpolate(const float* inputVector1, float* inputVector2, float interpolationFactor, float* outputVector, size_t numberOfElementsToProcess);
+
} // namespace VectorMath
} // namespace WebCore