anijain2305 opened a new pull request #4277: [ARM][Topi] Supporting Int8 in Spatial schedule. URL: https://github.com/apache/incubator-tvm/pull/4277 I am working on improving the performance of Int8 conv on Raspberry Pi 3. For Conv2D, there is an upcast from int8 to int32 before performing the dot-product. ARM ISA has an instruction called vmlal.s16 that takes 2 SIMD source registers, each containing 4 16-bit values, multiplies them lane by lane, and accumulates the products into 1 SIMD register containing 4 32-bit values (a widening multiply-accumulate). However, LLVM (4.0, 6.0 and 8.0) is not able to figure this out itself. In the absence of this PR, the assembly looks something like this: ~~~ add r2, sp, #304 vmlal.s16 q12, d0, d4 vld1.64 {d0, d1}, [r2:128] add r2, sp, #208 vmlal.s16 q12, d10, d0 vld1.64 {d0, d1}, [r2:128] add r2, sp, #192 vmlal.s16 q12, d8, d0 vld1.64 {d0, d1}, [r2:128] sub r2, r6, #7 mov r6, r7 vmlal.s16 q12, d0, d14 vld1.8 {d0[]}, [r2] add r2, sp, #192 vmovl.s8 q4, d0 vst1.64 {d8, d9}, [r2:128] add r2, sp, #240 vld1.64 {d0, d1}, [r2:128] add r2, sp, #176 vmlal.s16 q14, d8, d0 vld1.64 {d0, d1}, [r2:128] add r2, sp, #128 vmlal.s16 q14, d0, d12 vld1.64 {d12, d13}, [r2:128] add r2, sp, #288 vld1.64 {d0, d1}, [r2:128] add r2, sp, #96 vmlal.s16 q14, d12, d0 vld1.64 {d8, d9}, [r2:128] add r2, sp, #352 vld1.64 {d0, d1}, [r2:128] add r2, sp, #224 vmlal.s16 q14, d8, d0 vld1.64 {d14, d15}, [r2:128] add r2, sp, #80 vmlal.s16 q14, d14, d2 vld1.64 {d2, d3}, [r2:128] add r2, sp, #304 vld1.64 {d0, d1}, [r2:128] add r2, sp, #208 vmlal.s16 q14, d6, d4 vld1.64 {d4, d5}, [r2:128] add r2, sp, #240 vmlal.s16 q14, d2, d0 vmlal.s16 q14, d10, d4 vld1.64 {d10, d11}, [r2:128] add r2, sp, #192 vld1.64 {d6, d7}, [r2:128] add r2, sp, #160 vmlal.s16 q13, d7, d11 vld1.64 {d10, d11}, [r2:128] add r2, sp, #176 vld1.64 {d6, d7}, [r2:128] add r2, sp, #288 vmlal.s16 q13, d7, d11 vld1.64 {d10, d11}, [r2:128] add r2, sp, #352 vmlal.s16 q13, d13, d11 vld1.64 {d10, d11}, [r2:128] add r2, sp, #336 vld1.64 {d6, d7}, [r2:128] add r2, sp, #144 vmlal.s16 q13, d9, d11 
vld1.64 {d8, d9}, [r2:128] add r2, sp, #320 vld1.64 {d10, d11}, [r2:128] add r2, sp, #272 vmlal.s16 q13, d15, d7 vmlal.s16 q13, d9, d11 vmlal.s16 q13, d3, d1 vld1.64 {d0, d1}, [r2:128] add r2, sp, #256 vmlal.s16 q13, d1, d5 vld1.64 {d0, d1}, [r2:128] add r2, sp, #112 vld1.64 {d2, d3}, [r2:128] vmlal.s16 q14, d0, d2 vmlal.s16 q13, d1, d3 bne .LBB6_5 ~~~ However, if we add an intermediate upcast to int16 (i.e., instead of going directly from int8 to int32, we go from int8 to int16 and feed that int16 data to conv2D), it results in much better packing of compute and memory instructions: ~~~ .LBB7_6: add r2, lr, r8 add r1, r12, #32 vld1.64 {d0, d1}, [r1:128] add r1, r12, #16 add r3, r2, #10 add r8, r8, #8 vld1.64 {d4, d5}, [r1:128] mov r1, r2 vld1.16 {d6[]}, [r1:16], r7 cmp r8, #24 vld1.16 {d7[]}, [r3:16] add r3, r2, #2 vld1.16 {d10[]}, [r1:16] add r1, r2, #8 vld1.16 {d2, d3}, [r10:128], r0 vmlal.s16 q14, d6, d2 vmlal.s16 q13, d6, d3 vld1.16 {d6[]}, [r3:16] add r3, r2, #12 vmlal.s16 q12, d6, d2 vmlal.s16 q11, d6, d3 mov r12, r10 vmlal.s16 q14, d6, d4 vld1.16 {d8[]}, [r3:16] vmlal.s16 q13, d6, d5 add r3, r2, #4 vld1.16 {d6[]}, [r1:16] vmlal.s16 q15, d7, d3 vmlal.s16 q9, d6, d2 add r1, r2, #6 vmlal.s16 q8, d7, d2 vld1.16 {d9[]}, [r3:16] vmlal.s16 q10, d6, d3 vmlal.s16 q15, d8, d5 vld1.16 {d2[]}, [r1:16] vmlal.s16 q12, d9, d4 vmlal.s16 q11, d9, d5 vmlal.s16 q9, d7, d4 vmlal.s16 q10, d7, d5 vmlal.s16 q8, d8, d4 vmlal.s16 q13, d9, d1 vmlal.s16 q14, d9, d0 vmlal.s16 q15, d10, d1 vmlal.s16 q8, d10, d0 vmlal.s16 q10, d8, d1 vmlal.s16 q9, d8, d0 vmlal.s16 q11, d2, d1 vmlal.s16 q12, d2, d0 ~~~ I tested this with one Conv2D and with auto-tuning. * Workload - Input shape - 1, 64, 16, 16, Kernel - 64, 64, 3, 3 * Platform - Raspberry Pi - 0.6 GHz * Auto-tuning done for both FP32 and Int8 conv * FP32 Latency = 3817 us, Int8 latency = 3015 us
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
