[ 
https://issues.apache.org/jira/browse/ARROW-13382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17383510#comment-17383510
 ] 

David Li commented on ARROW-13382:
----------------------------------

>From a "good" run, here's the disassembled SumArray:
{noformat}
__ZN5arrow7compute6detail8SumArrayIxxZNS1_8SumArrayIxxEET0_RKNS_9ArrayDataEEUlxE_EENSt3__19enable_ifIXntsr3std17is_floating_pointIS4_EE5valueES4_E4typeES7_OT1_:
00000000004d13a0        pushq   %rbp
00000000004d13a1        movq    %rsp, %rbp
00000000004d13a4        pushq   %r15
00000000004d13a6        pushq   %r14
00000000004d13a8        pushq   %rbx
00000000004d13a9        subq    $0x28, %rsp
00000000004d13ad        movq    0x20(%rdi), %rdx
00000000004d13b1        movq    0x28(%rdi), %rax
00000000004d13b5        movq    0x10(%rax), %rcx
00000000004d13b9        testq   %rcx, %rcx
00000000004d13bc        je      0x4d13d2
00000000004d13be        cmpb    $0x0, 0x9(%rcx)
00000000004d13c2        je      0x4d1458
00000000004d13c8        movq    0x10(%rcx), %rcx
00000000004d13cc        leaq    (%rcx,%rdx,8), %r15
00000000004d13d0        jmp     0x4d13d5
00000000004d13d2        xorl    %r15d, %r15d
00000000004d13d5        movq    0x10(%rdi), %rcx
00000000004d13d9        movq    (%rax), %rax
00000000004d13dc        testq   %rax, %rax
00000000004d13df        je      0x4d1435
00000000004d13e1        cmpb    $0x0, 0x9(%rax)
00000000004d13e5        je      0x4d1435
00000000004d13e7        movq    0x10(%rax), %rsi
00000000004d13eb        testq   %rsi, %rsi
00000000004d13ee        je      0x4d1435
00000000004d13f0        leaq    -0x40(%rbp), %rbx
00000000004d13f4        movq    %rbx, %rdi
00000000004d13f7        callq   
__ZN5arrow8internal19BaseSetBitRunReaderILb0EEC1EPKhxx ## 
arrow::internal::BaseSetBitRunReader<false>::BaseSetBitRunReader(unsigned char 
const*, long long, long long)
00000000004d13fc        movq    %rbx, %rdi
00000000004d13ff        callq   
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ## 
arrow::internal::BaseSetBitRunReader<false>::NextRun()
00000000004d1404        xorl    %ebx, %ebx
00000000004d1406        testq   %rdx, %rdx
00000000004d1409        je      0x4d144a
00000000004d140b        leaq    -0x40(%rbp), %r14
00000000004d140f        testq   %rdx, %rdx
00000000004d1412        jle     0x4d1426
00000000004d1414        leaq    (%r15,%rax,8), %rax
00000000004d1418        xorl    %ecx, %ecx
00000000004d141a        addq    (%rax,%rcx,8), %rbx
00000000004d141e        incq    %rcx
00000000004d1421        cmpq    %rcx, %rdx
00000000004d1424        jne     0x4d141a
00000000004d1426        movq    %r14, %rdi
00000000004d1429        callq   
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ## 
arrow::internal::BaseSetBitRunReader<false>::NextRun()
00000000004d142e        testq   %rdx, %rdx
00000000004d1431        jne     0x4d140f
00000000004d1433        jmp     0x4d144a
00000000004d1435        xorl    %ebx, %ebx
00000000004d1437        testq   %rcx, %rcx
00000000004d143a        jle     0x4d144a
00000000004d143c        xorl    %eax, %eax
00000000004d143e        addq    (%r15,%rax,8), %rbx
00000000004d1442        incq    %rax
00000000004d1445        cmpq    %rax, %rcx
00000000004d1448        jne     0x4d143e
00000000004d144a        movq    %rbx, %rax
00000000004d144d        addq    $0x28, %rsp
00000000004d1451        popq    %rbx
00000000004d1452        popq    %r14
00000000004d1454        popq    %r15
00000000004d1456        popq    %rbp
00000000004d1457        retq
00000000004d1458        xorl    %ecx, %ecx
00000000004d145a        jmp     0x4d13cc
00000000004d145f        nop {noformat}
>From a "bad" run, here's the disassembled SumArray:
{noformat}
__ZN5arrow7compute6detail8SumArrayIxxZNS1_8SumArrayIxxEET0_RKNS_9ArrayDataEEUlxE_EENSt3__19enable_ifIXntsr3std17is_floating_pointIS4_EE5valueES4_E4typeES7_OT1_:
000000000082ce50        pushq   %rbp
000000000082ce51        movq    %rsp, %rbp
000000000082ce54        pushq   %r15
000000000082ce56        pushq   %r14
000000000082ce58        pushq   %rbx
000000000082ce59        subq    $0x28, %rsp
000000000082ce5d        movq    0x20(%rdi), %rdx
000000000082ce61        movq    0x28(%rdi), %rax
000000000082ce65        movq    0x10(%rax), %rcx
000000000082ce69        testq   %rcx, %rcx
000000000082ce6c        je      0x82ce82
000000000082ce6e        cmpb    $0x0, 0x9(%rcx)
000000000082ce72        je      0x82d017
000000000082ce78        movq    0x10(%rcx), %rcx
000000000082ce7c        leaq    (%rcx,%rdx,8), %rbx
000000000082ce80        jmp     0x82ce84
000000000082ce82        xorl    %ebx, %ebx
000000000082ce84        movq    0x10(%rdi), %rcx
000000000082ce88        movq    (%rax), %rax
000000000082ce8b        testq   %rax, %rax
000000000082ce8e        je      0x82cf7a
000000000082ce94        cmpb    $0x0, 0x9(%rax)
000000000082ce98        je      0x82cf7a
000000000082ce9e        movq    0x10(%rax), %rsi
000000000082cea2        testq   %rsi, %rsi
000000000082cea5        je      0x82cf7a
000000000082ceab        leaq    -0x40(%rbp), %r14
000000000082ceaf        movq    %r14, %rdi
000000000082ceb2        callq   
__ZN5arrow8internal19BaseSetBitRunReaderILb0EEC1EPKhxx ## 
arrow::internal::BaseSetBitRunReader<false>::BaseSetBitRunReader(unsigned char 
const*, long long, long long)
000000000082ceb7        movq    %r14, %rdi
000000000082ceba        callq   
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ## 
arrow::internal::BaseSetBitRunReader<false>::NextRun()
000000000082cebf        xorl    %r15d, %r15d
000000000082cec2        testq   %rdx, %rdx
000000000082cec5        je      0x82d006
000000000082cecb        vmovdqa64       0x47c06b(%rip), %zmm3
000000000082ced5        leaq    -0x40(%rbp), %r14
000000000082ced9        testq   %rdx, %rdx
000000000082cedc        jle     0x82cf57
000000000082cede        leaq    0x7(%rdx), %rcx
000000000082cee2        andq    $-0x8, %rcx
000000000082cee6        decq    %rdx
000000000082cee9        vpbroadcastq    %rdx, %zmm0
000000000082ceef        vmovq   %r15, %xmm2
000000000082cef4        leaq    (%rbx,%rax,8), %rax
000000000082cef8        xorl    %edx, %edx
000000000082cefa        vmovdqa64       %zmm2, %zmm1
000000000082cf00        vpbroadcastq    %rdx, %zmm2
000000000082cf06        vpaddq  %zmm3, %zmm2, %zmm2
000000000082cf0c        vpcmpleuq       %zmm0, %zmm2, %k1
000000000082cf13        vmovdqu64       (%rax), %zmm2 {%k1} {z}
000000000082cf19        vpaddq  %zmm1, %zmm2, %zmm2
000000000082cf1f        addq    $0x8, %rdx
000000000082cf23        addq    $0x40, %rax
000000000082cf27        cmpq    %rdx, %rcx
000000000082cf2a        jne     0x82cefa
000000000082cf2c        vmovdqa64       %zmm2, %zmm1 {%k1}
000000000082cf32        vextracti64x4   $0x1, %zmm1, %ymm0
000000000082cf39        vpaddq  %zmm0, %zmm1, %zmm0
000000000082cf3f        vextracti128    $0x1, %ymm0, %xmm1
000000000082cf45        vpaddq  %xmm1, %xmm0, %xmm0
000000000082cf49        vpshufd $0x4e, %xmm0, %xmm1
000000000082cf4e        vpaddq  %xmm1, %xmm0, %xmm0
000000000082cf52        vmovq   %xmm0, %r15
000000000082cf57        movq    %r14, %rdi
000000000082cf5a        vzeroupper
000000000082cf5d        callq   
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ## 
arrow::internal::BaseSetBitRunReader<false>::NextRun()
000000000082cf62        testq   %rdx, %rdx
000000000082cf65        vmovdqa64       0x47bfd1(%rip), %zmm3
000000000082cf6f        jne     0x82ced9
000000000082cf75        jmp     0x82d006
000000000082cf7a        testq   %rcx, %rcx
000000000082cf7d        jle     0x82d003
000000000082cf83        leaq    0x7(%rcx), %rax
000000000082cf87        andq    $-0x8, %rax
000000000082cf8b        decq    %rcx
000000000082cf8e        vpbroadcastq    %rcx, %zmm0
000000000082cf94        vpxor   %xmm3, %xmm3, %xmm3
000000000082cf98        xorl    %ecx, %ecx
000000000082cf9a        vmovdqa64       0x47bf9c(%rip), %zmm2
000000000082cfa4        vmovdqa64       %zmm3, %zmm1
000000000082cfaa        vpbroadcastq    %rcx, %zmm3
000000000082cfb0        vpaddq  %zmm2, %zmm3, %zmm3
000000000082cfb6        vpcmpleuq       %zmm0, %zmm3, %k1
000000000082cfbd        vmovdqu64       (%rbx), %zmm3 {%k1} {z}
000000000082cfc3        vpaddq  %zmm1, %zmm3, %zmm3
000000000082cfc9        addq    $0x8, %rcx
000000000082cfcd        addq    $0x40, %rbx
000000000082cfd1        cmpq    %rcx, %rax
000000000082cfd4        jne     0x82cfa4
000000000082cfd6        vmovdqa64       %zmm3, %zmm1 {%k1}
000000000082cfdc        vextracti64x4   $0x1, %zmm1, %ymm0
000000000082cfe3        vpaddq  %zmm0, %zmm1, %zmm0
000000000082cfe9        vextracti128    $0x1, %ymm0, %xmm1
000000000082cfef        vpaddq  %xmm1, %xmm0, %xmm0
000000000082cff3        vpshufd $0x4e, %xmm0, %xmm1
000000000082cff8        vpaddq  %xmm1, %xmm0, %xmm0
000000000082cffc        vmovq   %xmm0, %r15
000000000082d001        jmp     0x82d006
000000000082d003        xorl    %r15d, %r15d
000000000082d006        movq    %r15, %rax
000000000082d009        addq    $0x28, %rsp
000000000082d00d        popq    %rbx
000000000082d00e        popq    %r14
000000000082d010        popq    %r15
000000000082d012        popq    %rbp
000000000082d013        vzeroupper
000000000082d016        retq
000000000082d017        xorl    %ecx, %ecx
000000000082d019        jmp     0x82ce7c {noformat}
Note the sudden appearance of lots of SIMD instructions and the AVX512 zmm 
registers (as well as the AVX2 ymm registers)

> [C++] Aggregation over scalars fails autobrew R job
> ---------------------------------------------------
>
>                 Key: ARROW-13382
>                 URL: https://issues.apache.org/jira/browse/ARROW-13382
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>            Reporter: David Li
>            Assignee: David Li
>            Priority: Blocker
>             Fix For: 5.0.0
>
>
> [https://github.com/ursacomputing/crossbow/runs/3091873413#step:7:488]
> {noformat}
>  *** caught illegal operation ***
> address 0x109dc30cc, cause 'illegal opcode'Traceback:
>  1: compute__CallFunction(function_name, args, options)
>  2: call_function(FUN, a, options = list(na.rm = na.rm, na.min_count = 
> na.min_count))
>  3: scalar_aggregate("sum", ..., na.rm = na.rm)
>  4: sum.ArrowDatum(<environment>, na.rm = FALSE)
>  5: eval_bare(expr, quo_get_env(quo))
>  6: quasi_label(enquo(object), arg = "object") {noformat}
> I would guess at first glance the compiler is autovectorizing something more 
> than necessary?



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to