I'm having a look at the moment; here are the things I find sketchy. Note that
I don't know the low-level VapourSynth API.
1\. You are not using `sampleType` or `bitsPerSample` in your example.
> From what I understood, those format fields are supposed to tell you what
> your `ptr UncheckedArray[byte]` really is, if you are like in the C++ case,
> it's a type-erased float pointer so you should use the same. Note that
> float32 vs int might be very relevant as floating point vector operations are
> way more optimized than integers (for matrix multiplication the ratio is
> between 10x to 100x depending on SSE4.1 AVX2 / AVX512 and fused multiply add
> support)
2\. Profiling shows that row0 access is a bottleneck
3\. Your iteration strategy seems problematic, with repeated queries to the API
to get the stride and pointer in vsframe.nim
proc getPtr(frame:ptr VSFrameRef, plane:cint):uint =
  ## Retrieves an `uint` rather than the pointer in order to perform
  ## pointer arithmetic with it.
  ## NOTE(review): treating the address as a plain integer assumes the
  ## frame's buffer does not move while the value is in use — confirm
  ## against the VapourSynth API contract.
  cast[uint]( API.getWritePtr(frame, plane) )
proc stride(frame:ptr VSFrameRef, plane:cint):uint =
  ## Row stride of `plane` (presumably in bytes — confirm with the
  ## VapourSynth docs), widened to `uint` so it composes with `getPtr`
  ## for address arithmetic.
  result = uint(API.getStride(frame, plane))
#type
# Row = ptr UncheckedArray[uint8]
proc `[]`*(frame:ptr VSFrameRef, plane:cint, row:uint):ptr UncheckedArray[uint8] =
  ## Pointer to the start of `row` within `plane`:
  ## base address + row * stride, viewed as a byte array.
  let base: uint = frame.getPtr(plane)
  # Fix: the original called `frame.getStride(plane)`, but the accessor
  # defined in this file is `stride` (getStride is not in scope here);
  # this also matches the Plane-returning `[]` overload below.
  let rowStride: uint = frame.stride(plane)
  result = cast[ptr UncheckedArray[uint8]](base + row * rowStride)
proc `[]`*(frame:ptr VSFrameRef, plane:cint, row:cint):ptr UncheckedArray[uint8] =
  ## Signed-index overload: pointer to the start of `row` within `plane`,
  ## computed as base address + row * stride.
  let base: uint = frame.getPtr(plane)
  # Fix: the original called `frame.getStride(plane)`, but the accessor
  # defined in this file is `stride` (getStride is not in scope here).
  let rowStride: uint = frame.stride(plane)
  result = cast[ptr UncheckedArray[uint8]](base + row.uint * rowStride)
proc `[]`*(frame:ptr VSFrameRef, plane:cint ):Plane =
  ## Snapshot the plane's base address and stride into a `Plane` value,
  ## so subsequent row lookups need no further API queries.
  Plane(ini: frame.getPtr(plane), stride: frame.stride(plane))
proc `[]`*(plane:Plane, row:cint):Row =
  ## Start-of-row address: base + row * stride, viewed as a byte array.
  result = cast[ptr UncheckedArray[uint8]](plane.ini + uint(row) * plane.stride)
Run
Instead I suggest you use a scheme like this, only get the plane/query the API
once and use the plane object that holds the metadata. This way the compiler
can do inlining and loop hoisting on all those accesses.
The main issue with calling the VapourSynth API repeatedly is that all those
function calls pollute your cache and require the compiler to move function
parameters and results to and from the stack inside an inner loop.
import ../src/vapoursynth
type Plane*[T: SomeNumber] = object
  ## A single image plane with its metadata captured once up front.
  # T depends on the frame format:
  # - sample_type (uint or float)
  # - bitsPerSample (8, 16, 32, 64)
  # The plane must be instantiated with
  # the proper uint8 ... 64, float32, float64
  data: ptr UncheckedArray[T]   # pixel buffer borrowed from the frame, not owned here
  rows: int                     # image height in pixels
  cols: int                     # image width in pixels
  rowStride: int                # distance between consecutive row starts;
                                # NOTE(review): confirm whether this counts
                                # bytes or T elements — it must be elements
                                # for UncheckedArray[T] indexing to be valid
func getPlane*(frame: ptr VSFrameRef, plane: int, T: typedesc[SomeNumber]): Plane[T] {.inline.} =
  ## Extract a plane: query the VapourSynth API once, capturing the buffer
  ## pointer plus its metadata, so per-pixel access needs no further calls.
  ##
  ## `T` must match the frame format (sample type + bits per sample):
  ## uint8..uint64 or float32/float64.
  ## Fix: the constraint is widened from `typedesc[SomeInteger]` to
  ## `typedesc[SomeNumber]` so float formats are accepted, matching
  ## `Plane[T: SomeNumber]` and the comment on that type.
  result.data = cast[ptr UncheckedArray[T]](API.getWritePtr(frame, plane))
  result.rows = API.getFrameHeight(frame, plane)
  result.cols = API.getFrameWidth(frame, plane)
  # Fix: VapourSynth's getStride returns a distance in *bytes*, while
  # `data` is indexed in T elements — convert. Exact division: strides
  # are aligned and sizeof(T) is a power of two for all supported formats.
  # (No-op for uint8, so byte-plane behavior is unchanged.)
  result.rowStride = API.getStride(frame, plane) div sizeof(T)
template `[]`*[T](plane: Plane[T], row: int, col: int): T =
  ## Read the pixel at (`row`, `col`).
  ## Fix: the original indexed `plane.row` / `plane.col`, which are not
  ## fields of `Plane` and would not compile — the template parameters
  ## `row` and `col` are what must be used.
  plane.data[row * plane.rowStride + col]
template `[]=`*[T](plane: Plane[T], row: int, col: int, val: T) =
  ## Write `val` to the pixel at (`row`, `col`).
  ## Fix: the original indexed `plane.row` / `plane.col`, which are not
  ## fields of `Plane` and would not compile — the template parameters
  ## `row` and `col` are what must be used.
  plane.data[row * plane.rowStride + col] = val
Run
Regarding iteration strategy, your scheme is doing 3 times more memory access
than a basic scheme for each row and each column (total 9x). Instead you should
iterate once per row/column but in-place accumulate with `dst[dstIdx] +=
src[srcIdx] * filter[filterIdx]`
A starter point could be my basic convolution scheme in Laser
[https://github.com/numforge/laser/blob/d1e6ae61/benchmarks/convolution/conv2d_direct_convolution.nim#L8-L73](https://github.com/numforge/laser/blob/d1e6ae61/benchmarks/convolution/conv2d_direct_convolution.nim#L8-L73)
It's for batches of images with potentially different plane counts (called
channels) between input and output (for example RGB --> grayscale), but the
iteration should be faster than yours:
Note that here I tracked the indices manually instead of using an intermediate
`Plane` object but both methods should be the same with an optimizing compiler.
proc conv2d_direct*[T](
      oim: var Tensor[T],     # Output images
      iim: Tensor[T],         # Input images
      ishape: TensorShape,    # Shape of a single image
      kernel: Tensor[T],      # filter kernel
      kshape: KernelShape,    # kernel shape (should be const)
      padding: Padding,       # Padding (should be const)
      strides: Strides        # Strides (should be const)
    ) =
  ## Direct (naive) 2D convolution over a batch of images in NCHW layout,
  ## accumulating in place into `oim`.
  ## oim must be zero-initialized.
  # Reminder: convolution in deep learning == cross-correlation in signal processing
  assert ishape.c == kshape.c_in
  let out_shape = conv2d_out_shape(ishape, kshape, padding, strides)
  assert oim.len == out_shape.n * out_shape.c * out_shape.h * out_shape.w

  let
    N = ishape.n
    H = ishape.h
    W = ishape.w
    outH = out_shape.h
    outW = out_shape.w
    kdata = cast[ptr UncheckedArray[T]](kernel[0].unsafeaddr)
  let # Should be const but padding.h causes problem and padding[0] indexing
      # doesn't work in generic proc
    C_out = kshape.c_out
    C_in = kshape.c_in
    kH = kshape.kH
    kW = kshape.kW
    pH = padding.h
    pW = padding.w
    sH = strides.h
    sW = strides.w

  let odata = cast[ptr UncheckedArray[T]](oim[0].addr)
  let idata = cast[ptr UncheckedArray[T]](iim[0].unsafeaddr)

  const arithmetic_intensity = 12 # number of FLOP per byte for the convolution
  const flop_per_elem = arithmetic_intensity div sizeof(T)
  # Only parallelize when the per-channel work exceeds the memory-bound
  # grain size; otherwise thread overhead dominates.
  let parallelize = OMP_MEMORY_BOUND_GRAIN_SIZE div flop_per_elem < outH * outW

  for n in 0 ..< N:
    for co in 0 ..< C_out:
      for ci in 0 ..< C_in:
        # We parallelize over the image height to deal with cases
        # where we have a single image or a low number of channels.
        # We assume no false sharing with a proper grain size.
        omp_parallel_if(parallelize):
          omp_for(oh, outH, use_simd = true, nowait=true): # output height
            let ih = sH * oh # input height
            for ow in 0 ..< outW: # output width
              # Fix: use the *width* stride on the width axis — the
              # original used `sH` here, which is only correct when
              # both strides are equal.
              let iw = sW * ow # input width
              # The following should be loop hoisted by the compiler
              let oidx = ow + outW * (oh + outH * (co + C_out * n)) # odata[n][co][oh][ow]
              # Entering conv kernel region
              for krow in 0 ..< kH:
                let row = ih + krow - pH
                if row <% H: # Unsigned '<' does 0 <= row < H.
                  for kcol in 0 ..< kW:
                    let col = iw + kcol - pW
                    if col <% W: # Unsigned '<' does 0 <= col < W.
                      let iidx = col + W * (row + H * (ci + C_in * n)) # idata[n][ci][row][col]
                      let kidx = kcol + kW * (krow + kH * (ci + C_in * co)) # kdata[co][ci][krow][kcol]
                      odata[oidx] += idata[iidx] * kdata[kidx]
Run