I'm having a look at the moment; here are the things that I find sketchy. Note 
that I don't know the low-level VapourSynth API.

1\. You are not using `sampleType` or `bitsPerSample` in your example.

> From what I understood, those format fields are supposed to tell you what 
> your `ptr UncheckedArray[byte]` really is, if you are like in the C++ case, 
> it's a type-erased float pointer so you should use the same. Note that 
> float32 vs int might be very relevant as floating point vector operations are 
> way more optimized than integers (for matrix multiplication the ratio is 
> between 10x to 100x depending on SSE4.1 AVX2 / AVX512 and fused multiply add 
> support)

2\. Profiling shows that row0 access is a bottleneck

> 3\. Your iteration strategy seems problematic with repeated query to the API 
> to get the stride and pointer in vsframe.nim
    
    
    proc getPtr(frame: ptr VSFrameRef, plane: cint): uint =
      ## Returns the write pointer of `plane` as an `uint` (rather than a raw
      ## pointer) so that callers can perform pointer arithmetic with it.
      ## (The original doc comment was line-wrapped onto a bare continuation
      ## line, which is a syntax error — rejoined here.)
      cast[uint](API.getWritePtr(frame, plane))
    
    proc stride(frame: ptr VSFrameRef, plane: cint): uint =
      ## Row stride (in bytes, per the C API) of `plane`, widened to `uint`
      ## so it composes with `getPtr` for pointer arithmetic.
      result = uint(API.getStride(frame, plane))
    #type
    #  Row = ptr UncheckedArray[uint8]
    
    proc `[]`*(frame: ptr VSFrameRef, plane: cint, row: uint): ptr UncheckedArray[uint8] =
      ## Pointer to the first byte of `row` within `plane` of `frame`.
      ## NOTE(review): this queries the API on every call — hoist it out of
      ## inner loops (see the `Plane` accessor in this file).
      let base: uint = frame.getPtr(plane)        # Don't use other thing here
      # Fixed: was `frame.getStride(plane)` — no such UFCS overload exists in
      # this module and its result is not `uint`; use the local `stride` helper.
      # Also renamed the local so it no longer shadows the `stride` proc.
      let rowStride: uint = frame.stride(plane)
      cast[ptr UncheckedArray[uint8]](base + row * rowStride)
    
    proc `[]`*(frame: ptr VSFrameRef, plane: cint, row: cint): ptr UncheckedArray[uint8] =
      ## `cint`-row convenience overload. Delegates to the `uint` overload
      ## instead of duplicating the address computation.
      frame[plane, row.uint]
    
    proc `[]`*(frame: ptr VSFrameRef, plane: cint): Plane =
      ## Snapshot of `plane`'s base address and stride so the VapourSynth API
      ## is queried once, not once per row.
      Plane(ini: frame.getPtr(plane), stride: frame.stride(plane))
    
    proc `[]`*(plane: Plane, row: cint): Row =
      ## Start-of-row pointer computed from the cached base address and stride
      ## (no API call involved).
      let rowAddr = plane.ini + row.uint * plane.stride
      result = cast[ptr UncheckedArray[uint8]](rowAddr)
    
    
    Run

Instead I suggest you use a scheme like this: query the API for the plane only 
once, and then use a plane object that holds the metadata. This way the 
compiler can do inlining and loop hoisting on all of those accesses.

The main issue with calling the VapourSynth API repeatedly is that all those 
function calls pollute your cache and require the compiler to move the 
function parameters and results to and from the stack in an inner loop.
    
    
    import ../src/vapoursynth
    
    type Plane*[T: SomeNumber] = object
      ## Typed view over a single frame plane; caches everything needed to
      ## address pixels so the VapourSynth API is queried only once.
      ##
      ## T depends on the frame format:
      ## - sample_type (uint or float)
      ## - bitsPerSample (8, 16, 32, 64)
      ## The plane must be instantiated with
      ## the proper uint8 ... 64, float32, float64
      data: ptr UncheckedArray[T]  # type-erased pixel buffer owned by the frame
      rows: int                    # plane height in pixels
      cols: int                    # plane width in pixels
      rowStride: int               # row-to-row step — NOTE(review): getStride
                                   # presumably returns bytes; confirm whether
                                   # this is bytes or elements of T
    
    func getPlane*(frame: ptr VSFrameRef, plane: int,
                   T: typedesc[SomeNumber]): Plane[T] {.inline.} =
      ## Extract a typed view of one plane, querying the API exactly once.
      ## `T` must match the frame's sampleType/bitsPerSample
      ## (uint8..uint64, float32, float64).
      ## Fixed: was `typedesc[SomeInteger]`, which contradicted the
      ## `Plane[T: SomeNumber]` constraint and made float frames
      ## unrepresentable even though the type's comment promises them.
      result.data = cast[ptr UncheckedArray[T]](API.getWritePtr(frame, plane))
      result.rows = API.getFrameHeight(frame, plane)
      result.cols = API.getFrameWidth(frame, plane)
      result.rowStride = API.getStride(frame, plane)
      # NOTE(review): if getStride returns bytes, divide by sizeof(T) here
      # (or index in bytes) — confirm against the VapourSynth API.
    
    template `[]`*[T](plane: Plane[T], row: int, col: int): T =
      ## Element access at (`row`, `col`).
      ## Fixed: the original read `plane.row` / `plane.col`, which are not
      ## fields of `Plane` and would not compile; the template parameters
      ## `row` and `col` are what must index the buffer.
      plane.data[row * plane.rowStride + col]
    
    template `[]=`*[T](plane: Plane[T], row: int, col: int, val: T) =
      ## Element assignment at (`row`, `col`).
      ## Fixed: same bug as `[]` — `plane.row` / `plane.col` are not fields;
      ## use the template parameters directly.
      plane.data[row * plane.rowStride + col] = val
    
    
    Run

Regarding iteration strategy, your scheme does 3 times more memory accesses 
per row and per column than a basic scheme (9x in total). Instead you should 
iterate once per row/column and accumulate in place with `dst[dstIdx] += 
src[srcIdx] * filter[filterIdx]`

A starter point could be my basic convolution scheme in Laser 
[https://github.com/numforge/laser/blob/d1e6ae61/benchmarks/convolution/conv2d_direct_convolution.nim#L8-L73](https://github.com/numforge/laser/blob/d1e6ae61/benchmarks/convolution/conv2d_direct_convolution.nim#L8-L73)

It's for batches of images with potentially different numbers of planes 
(called channels) between input and output (for example RGB --> grayscale), 
but the iteration should be faster than yours:

Note that here I tracked the indices manually instead of using an intermediate 
`Plane` object but both methods should be the same with an optimizing compiler.
    
    
    proc conv2d_direct*[T](
        oim: var Tensor[T],     # Output images
        iim: Tensor[T],         # Input images
        ishape: TensorShape,    # Shape of a single image
        kernel: Tensor[T],      # filter kernel
        kshape: KernelShape,    # kernel shape (should be const)
        padding: Padding,       # Padding (should be const)
        strides: Strides        # Strides (should be const)
      ) =
      ## Direct (naive) 2D convolution over a batch of images.
      ## `oim` must be zero-initialized: results are accumulated in place.
      # Reminder: convolution in deep learning == cross-correlation in
      # signal processing.

      assert ishape.c == kshape.c_in
      let out_shape = conv2d_out_shape(ishape, kshape, padding, strides)
      assert oim.len == out_shape.n * out_shape.c * out_shape.h * out_shape.w

      let
        N = ishape.n
        H = ishape.h
        W = ishape.w
        outH = out_shape.h
        outW = out_shape.w
        kdata = cast[ptr UncheckedArray[T]](kernel[0].unsafeaddr)

      let # Should be const but padding.h causes problems and padding[0]
          # indexing doesn't work in a generic proc
        C_out = kshape.c_out
        C_in = kshape.c_in
        kH = kshape.kH
        kW = kshape.kW
        pH = padding.h
        pW = padding.w
        sH = strides.h
        sW = strides.w

      let odata = cast[ptr UncheckedArray[T]](oim[0].addr)
      let idata = cast[ptr UncheckedArray[T]](iim[0].unsafeaddr)

      # Heuristic: only parallelize when there is enough work per image plane.
      const arithmetic_intensity = 12 # FLOP per byte for the convolution
      const flop_per_elem = arithmetic_intensity div sizeof(T)
      let parallelize =
        OMP_MEMORY_BOUND_GRAIN_SIZE div flop_per_elem < outH * outW

      for n in 0 ..< N:
        for co in 0 ..< C_out:
          for ci in 0 ..< C_in:
            # We parallelize over the image height to deal with cases
            # where we have a single image or a low number of channels.
            # We assume no false sharing with a proper grain size.
            omp_parallel_if(parallelize):
              omp_for(oh, outH, use_simd = true, nowait = true): # output height
                let ih = sH * oh                                 # input height
                for ow in 0 ..< outW:                            # output width
                  # BUG FIX: was `sH * ow` — the height stride was applied to
                  # the width axis; any sH != sW gave wrong input columns.
                  let iw = sW * ow                               # input width
                  # The following should be loop-hoisted by the compiler
                  let oidx = ow + outW * (oh + outH * (co + C_out * n))  # odata[n][co][oh][ow]
                  # Entering the conv kernel region
                  for krow in 0 ..< kH:
                    let row = ih + krow - pH
                    if row <% H:     # unsigned '<' checks 0 <= row < H
                      for kcol in 0 ..< kW:
                        let col = iw + kcol - pW
                        # Comment fix: was a copy-pasted "0 < row < H";
                        # this guard checks 0 <= col < W.
                        if col <% W:
                          let iidx = col + W * (row + H * (ci + C_in * n))       # idata[n][ci][row][col]
                          let kidx = kcol + kW * (krow + kH * (ci + C_in * co))  # kdata[co][ci][krow][kcol]
                          odata[oidx] += idata[iidx] * kdata[kidx]
    
    
    Run

Reply via email to