anijain2305 opened a new pull request #4277: [ARM][Topi] Supporting Int8 in 
Spatial schedule.
URL: https://github.com/apache/incubator-tvm/pull/4277
 
 
   I am working on improving the performance of Int8 conv on Raspberry Pi 3.
   
   For Conv2D, there is an upcast from int8 to int32 before performing the
dot-product. The ARM ISA has an instruction called vmlal.s16 that takes two
64-bit SIMD registers, each containing four 16-bit values, multiplies them
lane-wise, and accumulates the widened products into a 128-bit SIMD register
containing four 32-bit values. However, LLVM (4.0, 6.0 and 8.0) is not able to
figure this out by itself. In the absence of this PR, the assembly looks
something like this:
   
   ~~~
        add     r2, sp, #304
        vmlal.s16       q12, d0, d4
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #208
        vmlal.s16       q12, d10, d0
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #192
        vmlal.s16       q12, d8, d0
        vld1.64 {d0, d1}, [r2:128]
        sub     r2, r6, #7
        mov     r6, r7
        vmlal.s16       q12, d0, d14
        vld1.8  {d0[]}, [r2]
        add     r2, sp, #192
        vmovl.s8        q4, d0
        vst1.64 {d8, d9}, [r2:128]
        add     r2, sp, #240
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #176
        vmlal.s16       q14, d8, d0
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #128
        vmlal.s16       q14, d0, d12
        vld1.64 {d12, d13}, [r2:128]
        add     r2, sp, #288
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #96
        vmlal.s16       q14, d12, d0
        vld1.64 {d8, d9}, [r2:128]
        add     r2, sp, #352
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #224
        vmlal.s16       q14, d8, d0
        vld1.64 {d14, d15}, [r2:128]
        add     r2, sp, #80
        vmlal.s16       q14, d14, d2
        vld1.64 {d2, d3}, [r2:128]
        add     r2, sp, #304
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #208
        vmlal.s16       q14, d6, d4
        vld1.64 {d4, d5}, [r2:128]
        add     r2, sp, #240
        vmlal.s16       q14, d2, d0
        vmlal.s16       q14, d10, d4
        vld1.64 {d10, d11}, [r2:128]
        add     r2, sp, #192
        vld1.64 {d6, d7}, [r2:128]
        add     r2, sp, #160
        vmlal.s16       q13, d7, d11
        vld1.64 {d10, d11}, [r2:128]
        add     r2, sp, #176
        vld1.64 {d6, d7}, [r2:128]
        add     r2, sp, #288
        vmlal.s16       q13, d7, d11
        vld1.64 {d10, d11}, [r2:128]
        add     r2, sp, #352
        vmlal.s16       q13, d13, d11
        vld1.64 {d10, d11}, [r2:128]
        add     r2, sp, #336
        vld1.64 {d6, d7}, [r2:128]
        add     r2, sp, #144
        vmlal.s16       q13, d9, d11
        vld1.64 {d8, d9}, [r2:128]
        add     r2, sp, #320
        vld1.64 {d10, d11}, [r2:128]
        add     r2, sp, #272
        vmlal.s16       q13, d15, d7
        vmlal.s16       q13, d9, d11
        vmlal.s16       q13, d3, d1
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #256
        vmlal.s16       q13, d1, d5
        vld1.64 {d0, d1}, [r2:128]
        add     r2, sp, #112
        vld1.64 {d2, d3}, [r2:128]
        vmlal.s16       q14, d0, d2
        vmlal.s16       q13, d1, d3
        bne     .LBB6_5
   ~~~
   
   However, if we add an intermediate upcast to int16 (i.e., instead of going
directly from int8 to int32, we first go from int8 to int16 and pass that int16
data to conv2D), the result is much better packing of compute and memory
instructions:
   
   ~~~
   .LBB7_6:
        add     r2, lr, r8
        add     r1, r12, #32
        vld1.64 {d0, d1}, [r1:128]
        add     r1, r12, #16
        add     r3, r2, #10
        add     r8, r8, #8
        vld1.64 {d4, d5}, [r1:128]
        mov     r1, r2
        vld1.16 {d6[]}, [r1:16], r7
        cmp     r8, #24
        vld1.16 {d7[]}, [r3:16]
        add     r3, r2, #2
        vld1.16 {d10[]}, [r1:16]
        add     r1, r2, #8
        vld1.16 {d2, d3}, [r10:128], r0
        vmlal.s16       q14, d6, d2
        vmlal.s16       q13, d6, d3
        vld1.16 {d6[]}, [r3:16]
        add     r3, r2, #12
        vmlal.s16       q12, d6, d2
        vmlal.s16       q11, d6, d3
        mov     r12, r10
        vmlal.s16       q14, d6, d4
        vld1.16 {d8[]}, [r3:16]
        vmlal.s16       q13, d6, d5
        add     r3, r2, #4
        vld1.16 {d6[]}, [r1:16]
        vmlal.s16       q15, d7, d3
        vmlal.s16       q9, d6, d2
        add     r1, r2, #6
        vmlal.s16       q8, d7, d2
        vld1.16 {d9[]}, [r3:16]
        vmlal.s16       q10, d6, d3
        vmlal.s16       q15, d8, d5
        vld1.16 {d2[]}, [r1:16]
        vmlal.s16       q12, d9, d4
        vmlal.s16       q11, d9, d5
        vmlal.s16       q9, d7, d4
        vmlal.s16       q10, d7, d5
        vmlal.s16       q8, d8, d4
        vmlal.s16       q13, d9, d1
        vmlal.s16       q14, d9, d0
        vmlal.s16       q15, d10, d1
        vmlal.s16       q8, d10, d0
        vmlal.s16       q10, d8, d1
        vmlal.s16       q9, d8, d0
        vmlal.s16       q11, d2, d1
        vmlal.s16       q12, d2, d0
   ~~~
   
   I tested this with one Conv2D and with auto-tuning.
   * Workload: input shape (1, 64, 16, 16), kernel shape (64, 64, 3, 3)
   * Platform: Raspberry Pi, 0.6 GHz
   * Auto-tuning done for both FP32 and Int8 conv
   * FP32 Latency = 3817 us, Int8 latency = 3015 us
   
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to