[
https://issues.apache.org/jira/browse/ARROW-13382?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17383510#comment-17383510
]
David Li commented on ARROW-13382:
----------------------------------
From a "good" run, here's the disassembled SumArray:
{noformat}
__ZN5arrow7compute6detail8SumArrayIxxZNS1_8SumArrayIxxEET0_RKNS_9ArrayDataEEUlxE_EENSt3__19enable_ifIXntsr3std17is_floating_pointIS4_EE5valueES4_E4typeES7_OT1_:
00000000004d13a0 pushq %rbp
00000000004d13a1 movq %rsp, %rbp
00000000004d13a4 pushq %r15
00000000004d13a6 pushq %r14
00000000004d13a8 pushq %rbx
00000000004d13a9 subq $0x28, %rsp
00000000004d13ad movq 0x20(%rdi), %rdx
00000000004d13b1 movq 0x28(%rdi), %rax
00000000004d13b5 movq 0x10(%rax), %rcx
00000000004d13b9 testq %rcx, %rcx
00000000004d13bc je 0x4d13d2
00000000004d13be cmpb $0x0, 0x9(%rcx)
00000000004d13c2 je 0x4d1458
00000000004d13c8 movq 0x10(%rcx), %rcx
00000000004d13cc leaq (%rcx,%rdx,8), %r15
00000000004d13d0 jmp 0x4d13d5
00000000004d13d2 xorl %r15d, %r15d
00000000004d13d5 movq 0x10(%rdi), %rcx
00000000004d13d9 movq (%rax), %rax
00000000004d13dc testq %rax, %rax
00000000004d13df je 0x4d1435
00000000004d13e1 cmpb $0x0, 0x9(%rax)
00000000004d13e5 je 0x4d1435
00000000004d13e7 movq 0x10(%rax), %rsi
00000000004d13eb testq %rsi, %rsi
00000000004d13ee je 0x4d1435
00000000004d13f0 leaq -0x40(%rbp), %rbx
00000000004d13f4 movq %rbx, %rdi
00000000004d13f7 callq
__ZN5arrow8internal19BaseSetBitRunReaderILb0EEC1EPKhxx ##
arrow::internal::BaseSetBitRunReader<false>::BaseSetBitRunReader(unsigned char
const*, long long, long long)
00000000004d13fc movq %rbx, %rdi
00000000004d13ff callq
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ##
arrow::internal::BaseSetBitRunReader<false>::NextRun()
00000000004d1404 xorl %ebx, %ebx
00000000004d1406 testq %rdx, %rdx
00000000004d1409 je 0x4d144a
00000000004d140b leaq -0x40(%rbp), %r14
00000000004d140f testq %rdx, %rdx
00000000004d1412 jle 0x4d1426
00000000004d1414 leaq (%r15,%rax,8), %rax
00000000004d1418 xorl %ecx, %ecx
00000000004d141a addq (%rax,%rcx,8), %rbx
00000000004d141e incq %rcx
00000000004d1421 cmpq %rcx, %rdx
00000000004d1424 jne 0x4d141a
00000000004d1426 movq %r14, %rdi
00000000004d1429 callq
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ##
arrow::internal::BaseSetBitRunReader<false>::NextRun()
00000000004d142e testq %rdx, %rdx
00000000004d1431 jne 0x4d140f
00000000004d1433 jmp 0x4d144a
00000000004d1435 xorl %ebx, %ebx
00000000004d1437 testq %rcx, %rcx
00000000004d143a jle 0x4d144a
00000000004d143c xorl %eax, %eax
00000000004d143e addq (%r15,%rax,8), %rbx
00000000004d1442 incq %rax
00000000004d1445 cmpq %rax, %rcx
00000000004d1448 jne 0x4d143e
00000000004d144a movq %rbx, %rax
00000000004d144d addq $0x28, %rsp
00000000004d1451 popq %rbx
00000000004d1452 popq %r14
00000000004d1454 popq %r15
00000000004d1456 popq %rbp
00000000004d1457 retq
00000000004d1458 xorl %ecx, %ecx
00000000004d145a jmp 0x4d13cc
00000000004d145f nop
{noformat}
From a "bad" run, here's the disassembled SumArray:
{noformat}
__ZN5arrow7compute6detail8SumArrayIxxZNS1_8SumArrayIxxEET0_RKNS_9ArrayDataEEUlxE_EENSt3__19enable_ifIXntsr3std17is_floating_pointIS4_EE5valueES4_E4typeES7_OT1_:
000000000082ce50 pushq %rbp
000000000082ce51 movq %rsp, %rbp
000000000082ce54 pushq %r15
000000000082ce56 pushq %r14
000000000082ce58 pushq %rbx
000000000082ce59 subq $0x28, %rsp
000000000082ce5d movq 0x20(%rdi), %rdx
000000000082ce61 movq 0x28(%rdi), %rax
000000000082ce65 movq 0x10(%rax), %rcx
000000000082ce69 testq %rcx, %rcx
000000000082ce6c je 0x82ce82
000000000082ce6e cmpb $0x0, 0x9(%rcx)
000000000082ce72 je 0x82d017
000000000082ce78 movq 0x10(%rcx), %rcx
000000000082ce7c leaq (%rcx,%rdx,8), %rbx
000000000082ce80 jmp 0x82ce84
000000000082ce82 xorl %ebx, %ebx
000000000082ce84 movq 0x10(%rdi), %rcx
000000000082ce88 movq (%rax), %rax
000000000082ce8b testq %rax, %rax
000000000082ce8e je 0x82cf7a
000000000082ce94 cmpb $0x0, 0x9(%rax)
000000000082ce98 je 0x82cf7a
000000000082ce9e movq 0x10(%rax), %rsi
000000000082cea2 testq %rsi, %rsi
000000000082cea5 je 0x82cf7a
000000000082ceab leaq -0x40(%rbp), %r14
000000000082ceaf movq %r14, %rdi
000000000082ceb2 callq
__ZN5arrow8internal19BaseSetBitRunReaderILb0EEC1EPKhxx ##
arrow::internal::BaseSetBitRunReader<false>::BaseSetBitRunReader(unsigned char
const*, long long, long long)
000000000082ceb7 movq %r14, %rdi
000000000082ceba callq
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ##
arrow::internal::BaseSetBitRunReader<false>::NextRun()
000000000082cebf xorl %r15d, %r15d
000000000082cec2 testq %rdx, %rdx
000000000082cec5 je 0x82d006
000000000082cecb vmovdqa64 0x47c06b(%rip), %zmm3
000000000082ced5 leaq -0x40(%rbp), %r14
000000000082ced9 testq %rdx, %rdx
000000000082cedc jle 0x82cf57
000000000082cede leaq 0x7(%rdx), %rcx
000000000082cee2 andq $-0x8, %rcx
000000000082cee6 decq %rdx
000000000082cee9 vpbroadcastq %rdx, %zmm0
000000000082ceef vmovq %r15, %xmm2
000000000082cef4 leaq (%rbx,%rax,8), %rax
000000000082cef8 xorl %edx, %edx
000000000082cefa vmovdqa64 %zmm2, %zmm1
000000000082cf00 vpbroadcastq %rdx, %zmm2
000000000082cf06 vpaddq %zmm3, %zmm2, %zmm2
000000000082cf0c vpcmpleuq %zmm0, %zmm2, %k1
000000000082cf13 vmovdqu64 (%rax), %zmm2 {%k1} {z}
000000000082cf19 vpaddq %zmm1, %zmm2, %zmm2
000000000082cf1f addq $0x8, %rdx
000000000082cf23 addq $0x40, %rax
000000000082cf27 cmpq %rdx, %rcx
000000000082cf2a jne 0x82cefa
000000000082cf2c vmovdqa64 %zmm2, %zmm1 {%k1}
000000000082cf32 vextracti64x4 $0x1, %zmm1, %ymm0
000000000082cf39 vpaddq %zmm0, %zmm1, %zmm0
000000000082cf3f vextracti128 $0x1, %ymm0, %xmm1
000000000082cf45 vpaddq %xmm1, %xmm0, %xmm0
000000000082cf49 vpshufd $0x4e, %xmm0, %xmm1
000000000082cf4e vpaddq %xmm1, %xmm0, %xmm0
000000000082cf52 vmovq %xmm0, %r15
000000000082cf57 movq %r14, %rdi
000000000082cf5a vzeroupper
000000000082cf5d callq
__ZN5arrow8internal19BaseSetBitRunReaderILb0EE7NextRunEv ##
arrow::internal::BaseSetBitRunReader<false>::NextRun()
000000000082cf62 testq %rdx, %rdx
000000000082cf65 vmovdqa64 0x47bfd1(%rip), %zmm3
000000000082cf6f jne 0x82ced9
000000000082cf75 jmp 0x82d006
000000000082cf7a testq %rcx, %rcx
000000000082cf7d jle 0x82d003
000000000082cf83 leaq 0x7(%rcx), %rax
000000000082cf87 andq $-0x8, %rax
000000000082cf8b decq %rcx
000000000082cf8e vpbroadcastq %rcx, %zmm0
000000000082cf94 vpxor %xmm3, %xmm3, %xmm3
000000000082cf98 xorl %ecx, %ecx
000000000082cf9a vmovdqa64 0x47bf9c(%rip), %zmm2
000000000082cfa4 vmovdqa64 %zmm3, %zmm1
000000000082cfaa vpbroadcastq %rcx, %zmm3
000000000082cfb0 vpaddq %zmm2, %zmm3, %zmm3
000000000082cfb6 vpcmpleuq %zmm0, %zmm3, %k1
000000000082cfbd vmovdqu64 (%rbx), %zmm3 {%k1} {z}
000000000082cfc3 vpaddq %zmm1, %zmm3, %zmm3
000000000082cfc9 addq $0x8, %rcx
000000000082cfcd addq $0x40, %rbx
000000000082cfd1 cmpq %rcx, %rax
000000000082cfd4 jne 0x82cfa4
000000000082cfd6 vmovdqa64 %zmm3, %zmm1 {%k1}
000000000082cfdc vextracti64x4 $0x1, %zmm1, %ymm0
000000000082cfe3 vpaddq %zmm0, %zmm1, %zmm0
000000000082cfe9 vextracti128 $0x1, %ymm0, %xmm1
000000000082cfef vpaddq %xmm1, %xmm0, %xmm0
000000000082cff3 vpshufd $0x4e, %xmm0, %xmm1
000000000082cff8 vpaddq %xmm1, %xmm0, %xmm0
000000000082cffc vmovq %xmm0, %r15
000000000082d001 jmp 0x82d006
000000000082d003 xorl %r15d, %r15d
000000000082d006 movq %r15, %rax
000000000082d009 addq $0x28, %rsp
000000000082d00d popq %rbx
000000000082d00e popq %r14
000000000082d010 popq %r15
000000000082d012 popq %rbp
000000000082d013 vzeroupper
000000000082d016 retq
000000000082d017 xorl %ecx, %ecx
000000000082d019 jmp 0x82ce7c
{noformat}
Note the sudden appearance, in the "bad" run, of many SIMD instructions using
the AVX512 zmm registers (as well as the AVX2 ymm registers); the "good" run
contains none of them.
> [C++] Aggregation over scalars fails autobrew R job
> ---------------------------------------------------
>
> Key: ARROW-13382
> URL: https://issues.apache.org/jira/browse/ARROW-13382
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++
> Reporter: David Li
> Assignee: David Li
> Priority: Blocker
> Fix For: 5.0.0
>
>
> [https://github.com/ursacomputing/crossbow/runs/3091873413#step:7:488]
> {noformat}
> *** caught illegal operation ***
> address 0x109dc30cc, cause 'illegal opcode'
> Traceback:
> 1: compute__CallFunction(function_name, args, options)
> 2: call_function(FUN, a, options = list(na.rm = na.rm, na.min_count =
> na.min_count))
> 3: scalar_aggregate("sum", ..., na.rm = na.rm)
> 4: sum.ArrowDatum(<environment>, na.rm = FALSE)
> 5: eval_bare(expr, quo_get_env(quo))
> 6: quasi_label(enquo(object), arg = "object")
> {noformat}
> I would guess at first glance the compiler is autovectorizing something more
> than necessary?
--
This message was sent by Atlassian Jira
(v8.3.4#803005)