> Can you post a disassembly of hflip_byte_c?

With -O1 (clang -S -O1 test_asm_gen.c):
## ---------------------------------------------------------------------------
## Listing 1: _hflip_byte_c as emitted for the -O1 build.
## Syntax: AT&T / GAS, x86-64, Mach-O (leading underscore on the symbol).
##
## What the code does: if %edx is > 0 (signed), it copies that many bytes,
## reading from (%rdi) while walking %rdi DOWN and writing to (%rsi) while
## walking %rsi UP, i.e. the output is the byte-reversed input run.
##
## Registers read on entry: %rdi, %rsi, %edx.
## NOTE(review): presumably these are the C arguments (src, dst, count) per
## the SysV AMD64 convention; the C source is not shown here -- confirm.
## Scratch registers written: %rax (loop counter), %ecx (byte being moved).
## ---------------------------------------------------------------------------
        .section __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 12
        .globl _hflip_byte_c
        .p2align 4, 0x90
_hflip_byte_c: ## @hflip_byte_c
        .cfi_startproc
## BB#0:
        pushq %rbp                      ## standard frame-pointer prologue
Ltmp0:
        .cfi_def_cfa_offset 16
Ltmp1:
        .cfi_offset %rbp, -16
        movq %rsp, %rbp
Ltmp2:
        .cfi_def_cfa_register %rbp
        testl %edx, %edx
        jle LBB0_3                      ## count <= 0 (signed): nothing to copy
## BB#1:
        movl %edx, %eax                 ## rax = count, zero-extended to 64 bits
        .p2align 4, 0x90
LBB0_2: ## =>This Inner Loop Header: Depth=1
        movzbl (%rdi), %ecx             ## load one byte from the read pointer
        movb %cl, (%rsi)                ## store it at the write pointer
        decq %rdi                       ## read pointer moves backwards
        incq %rsi                       ## write pointer moves forwards
        decq %rax                       ## one fewer byte to go
        jne LBB0_2
LBB0_3:
        popq %rbp
        retq
        .cfi_endproc
        .subsections_via_symbols

in O2 or O3 : clang -S -O3 test_asm_gen.c
If i correctly understand, same idea than paul's patch but processing two xmm in the main loop

## ---------------------------------------------------------------------------
## Listing 2: _hflip_byte_c as emitted for the -O3 build.
## Same observable copy as listing 1 (read backwards from %rdi, write forwards
## to %rsi, %edx bytes), but split into three phases:
##   1. LBB0_9  : 32 bytes per iteration using two 16-byte SSE registers,
##                each reversed with pshufb and the mask at LCPI0_0.
##                Only entered when count >= 32 and the read and write ranges
##                are disjoint.
##   2. LBB0_13 : (remaining % 4) single-byte copies.
##   3. LBB0_16 : the rest, four single-byte copies per iteration.
##
## Register roles inside the function:
##   %r8  = count (zero-extended); later reused as bytes left for LBB0_16
##   %r11 = number of bytes already copied (also the write index)
##   %edx = count % 32 after LBB0_3; %rdx is later reused as a read pointer
##   %r9, %r10, %rax, %ecx, %xmm0-%xmm2 = scratch
## ---------------------------------------------------------------------------
        .section __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 12
        .section __TEXT,__literal16,16byte_literals
        .p2align 4
## pshufb control mask: output byte i takes input byte 15-i, i.e. it reverses
## the byte order of a 16-byte vector.
LCPI0_0:
        .byte 15 ## 0xf
        .byte 14 ## 0xe
        .byte 13 ## 0xd
        .byte 12 ## 0xc
        .byte 11 ## 0xb
        .byte 10 ## 0xa
        .byte 9 ## 0x9
        .byte 8 ## 0x8
        .byte 7 ## 0x7
        .byte 6 ## 0x6
        .byte 5 ## 0x5
        .byte 4 ## 0x4
        .byte 3 ## 0x3
        .byte 2 ## 0x2
        .byte 1 ## 0x1
        .byte 0 ## 0x0
        .section __TEXT,__text,regular,pure_instructions
        .globl _hflip_byte_c
        .p2align 4, 0x90
_hflip_byte_c: ## @hflip_byte_c
        .cfi_startproc
## BB#0:
        pushq %rbp                      ## standard frame-pointer prologue
Ltmp0:
        .cfi_def_cfa_offset 16
Ltmp1:
        .cfi_offset %rbp, -16
        movq %rsp, %rbp
Ltmp2:
        .cfi_def_cfa_register %rbp
## kill: %EDX<def> %EDX<kill> %RDX<def>
        testl %edx, %edx
        jle LBB0_17                     ## count <= 0 (signed): return immediately
## BB#1:
        movl %edx, %r8d                 ## r8 = count, zero-extended
        cmpl $32, %edx
        jae LBB0_3                      ## at least one full 32-byte block
## BB#2:
        xorl %r11d, %r11d               ## fewer than 32 bytes: nothing copied yet,
        jmp LBB0_11                     ## go straight to the byte loops
LBB0_3:
        andl $31, %edx                  ## edx = count % 32 (left for the byte loops)
        movq %r8, %r11
        subq %rdx, %r11                 ## r11 = count rounded down to a multiple of 32
        je LBB0_7                       ## no full block: use the byte loops only
## BB#4:
## Overlap test, part 1: the read range is [rdi-count+1, rdi], the write
## range is [rsi, rsi+count). If the read range ends at or before the write
## range starts, the vector loop is safe.
        leaq 1(%rdi), %rax              ## rax = one past the highest byte read
        cmpq %rsi, %rax
        jbe LBB0_8
## BB#5:
## Overlap test, part 2: vector loop is also safe if the lowest byte read is
## at or above the end of the write range.
        leaq (%rsi,%r8), %r9            ## r9 = rsi + count (end of write range)
        movl $1, %eax
        subq %r8, %rax
        addq %rdi, %rax                 ## rax = rdi - count + 1 (lowest byte read)
        cmpq %r9, %rax
        jae LBB0_8
LBB0_7:
        xorl %r11d, %r11d               ## ranges may overlap: copy everything bytewise
        jmp LBB0_11
LBB0_8:
        leaq -15(%rdi), %r9             ## r9 = start of the 16-byte chunk ending at rdi
        leaq 16(%rsi), %rax             ## rax = write pointer, biased by +16
        movdqa LCPI0_0(%rip), %xmm0 ## xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
        movq %r11, %r10                 ## r10 = bytes left for the vector loop
        .p2align 4, 0x90
LBB0_9: ## =>This Inner Loop Header: Depth=1
        movdqu -16(%r9), %xmm1          ## lower 16 bytes of the 32-byte read window
        movdqu (%r9), %xmm2             ## upper 16 bytes (ends at the read position)
        pshufb %xmm0, %xmm2             ## reverse byte order within each vector
        pshufb %xmm0, %xmm1
        movdqu %xmm2, -16(%rax)         ## reversed upper half goes out first
        movdqu %xmm1, (%rax)            ## reversed lower half goes out second
        addq $-32, %r9                  ## read window moves down 32 bytes
        addq $32, %rax                  ## write pointer moves up 32 bytes
        addq $-32, %r10
        jne LBB0_9
## BB#10:
        testl %edx, %edx
        je LBB0_17                      ## count was a multiple of 32: done
LBB0_11:
## Byte tail. Here r11 = bytes already copied, r8 = count.
        movl %r8d, %eax
        subl %r11d, %eax                ## eax = bytes remaining
        leaq -1(%r8), %r9
        subq %r11, %r9                  ## r9 = bytes remaining - 1 (tested at LBB0_14)
        andq $3, %rax                   ## rax = bytes remaining % 4
        je LBB0_14                      ## already a multiple of 4: skip the peel loop
## BB#12:
        movq %rdi, %rdx
        subq %r11, %rdx                 ## rdx = current read pointer (rdi - copied)
        negq %rax                       ## count upwards from -(remaining % 4) to 0
        .p2align 4, 0x90
LBB0_13: ## =>This Inner Loop Header: Depth=1
        movzbl (%rdx), %ecx
        movb %cl, (%rsi,%r11)           ## write at rsi + bytes copied so far
        incq %r11
        decq %rdx
        incq %rax
        jne LBB0_13
LBB0_14:
        cmpq $3, %r9
        jb LBB0_17                      ## tail was under 4 bytes: peel loop did it all
## BB#15:
        subq %r11, %r8                  ## r8 = bytes still to copy (a multiple of 4)
        subq %r11, %rdi                 ## rdi = current read pointer
        leaq 3(%rsi,%r11), %rax         ## rax = write pointer, biased by +3
        .p2align 4, 0x90
LBB0_16: ## =>This Inner Loop Header: Depth=1
## Four byte copies per iteration: reads at rdi, rdi-1, rdi-2, rdi-3 go to
## rax-3, rax-2, rax-1, rax respectively.
        movzbl (%rdi), %ecx
        movb %cl, -3(%rax)
        movzbl -1(%rdi), %ecx
        movb %cl, -2(%rax)
        movzbl -2(%rdi), %ecx
        movb %cl, -1(%rax)
        movzbl -3(%rdi), %ecx
        movb %cl, (%rax)
        addq $-4, %rdi
        addq $4, %rax
        addq $-4, %r8
        jne LBB0_16
LBB0_17:
        popq %rbp
        retq
        .cfi_endproc
        .subsections_via_symbols

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel