On 6/10/2011 10:59 AM, S. Erisman wrote:

The _mm_* functions _do_ indeed get compiled down to SSE assembly instructions.

For reference, here is what the non-SSE code compiles down to:

rfx_decode_YCbCr_to_RGB():
   0:    55                       push   %ebp
   1:    31 d2                    xor    %edx,%edx
   3:    89 e5                    mov    %esp,%ebp
   5:    57                       push   %edi
   6:    56                       push   %esi
   7:    53                       push   %ebx
   8:    83 ec 0c                 sub    $0xc,%esp
   b:    90                       nop
   c:    8d 74 26 00              lea    0x0(%esi,%eiz,1),%esi
  10:    8b 5d 0c                 mov    0xc(%ebp),%ebx
  13:    8b 7d 10                 mov    0x10(%ebp),%edi
  16:    8b 45 08                 mov    0x8(%ebp),%eax
  19:    0f b7 0c 53              movzwl (%ebx,%edx,2),%ecx
  1d:    0f b7 1c 57              movzwl (%edi,%edx,2),%ebx
  21:    0f b7 34 50              movzwl (%eax,%edx,2),%esi
  25:    89 d8                    mov    %ebx,%eax
  27:    89 df                    mov    %ebx,%edi
  29:    66 c1 f8 03              sar    $0x3,%ax
  2d:    66 83 ee 80              sub    $0xffffff80,%si
  31:    66 c1 ff 05              sar    $0x5,%di
  35:    66 89 45 f0              mov    %ax,-0x10(%ebp)
  39:    89 d8                    mov    %ebx,%eax
  3b:    66 89 7d f2              mov    %di,-0xe(%ebp)
  3f:    66 c1 f8 02              sar    $0x2,%ax
  43:    8d 3c 1e                 lea    (%esi,%ebx,1),%edi
  46:    8d 04 07                 lea    (%edi,%eax,1),%eax
  49:    31 ff                    xor    %edi,%edi
  4b:    66 03 45 f0              add    -0x10(%ebp),%ax
  4f:    66 03 45 f2              add    -0xe(%ebp),%ax
  53:    78 0c                    js     61 <rfx_decode_YCbCr_to_RGB+0x61>
  55:    66 3d ff 00              cmp    $0xff,%ax
  59:    bf ff 00 00 00           mov    $0xff,%edi
  5e:    0f 4e f8                 cmovle %eax,%edi
  61:    8b 45 08                 mov    0x8(%ebp),%eax
  64:    66 89 3c 50              mov    %di,(%eax,%edx,2)
  68:    89 cf                    mov    %ecx,%edi
  6a:    89 f0                    mov    %esi,%eax
  6c:    66 c1 ff 02              sar    $0x2,%di
  70:    66 89 7d ea              mov    %di,-0x16(%ebp)
  74:    89 cf                    mov    %ecx,%edi
  76:    66 2b 45 ea              sub    -0x16(%ebp),%ax
  7a:    66 c1 ff 05              sar    $0x5,%di
  7e:    66 29 f8                 sub    %di,%ax
  81:    89 df                    mov    %ebx,%edi
  83:    66 2b 45 f0              sub    -0x10(%ebp),%ax
  87:    66 d1 ff                 sar    %di
  8a:    66 c1 fb 04              sar    $0x4,%bx
  8e:    66 29 f8                 sub    %di,%ax
  91:    89 cf                    mov    %ecx,%edi
  93:    66 2b 45 f2              sub    -0xe(%ebp),%ax
  97:    66 c1 ff 04              sar    $0x4,%di
  9b:    66 29 d8                 sub    %bx,%ax
  9e:    31 db                    xor    %ebx,%ebx
  a0:    66 29 f8                 sub    %di,%ax
  a3:    78 0c                    js     b1 <rfx_decode_YCbCr_to_RGB+0xb1>
  a5:    66 3d ff 00              cmp    $0xff,%ax
  a9:    bb ff 00 00 00           mov    $0xff,%ebx
  ae:    0f 4e d8                 cmovle %eax,%ebx
  b1:    8b 45 0c                 mov    0xc(%ebp),%eax
  b4:    01 ce                    add    %ecx,%esi
  b6:    66 89 1c 50              mov    %bx,(%eax,%edx,2)
  ba:    0f b7 5d ea              movzwl -0x16(%ebp),%ebx
  be:    89 c8                    mov    %ecx,%eax
  c0:    66 d1 f8                 sar    %ax
  c3:    66 c1 f9 06              sar    $0x6,%cx
  c7:    8d 3c 1e                 lea    (%esi,%ebx,1),%edi
  ca:    01 c7                    add    %eax,%edi
  cc:    31 c0                    xor    %eax,%eax
  ce:    66 01 f9                 add    %di,%cx
  d1:    78 0d                    js     e0 <rfx_decode_YCbCr_to_RGB+0xe0>
  d3:    66 81 f9 ff 00           cmp    $0xff,%cx
  d8:    b8 ff 00 00 00           mov    $0xff,%eax
  dd:    0f 4e c1                 cmovle %ecx,%eax
  e0:    8b 7d 10                 mov    0x10(%ebp),%edi
  e3:    66 89 04 57              mov    %ax,(%edi,%edx,2)
  e7:    83 c2 01                 add    $0x1,%edx
  ea:    81 fa 00 10 00 00        cmp    $0x1000,%edx
  f0:    0f 85 1a ff ff ff        jne    10 <rfx_decode_YCbCr_to_RGB+0x10>
  f6:    83 c4 0c                 add    $0xc,%esp
  f9:    5b                       pop    %ebx
  fa:    5e                       pop    %esi
  fb:    5f                       pop    %edi
  fc:    5d                       pop    %ebp
  fd:    c3                       ret
  fe:    66 90                    xchg   %ax,%ax

Thanks,
 Steve
------------------------------------------------------------------------------
EditLive Enterprise is the world's most technically advanced content
authoring tool. Experience the power of Track Changes, Inline Image
Editing and ensure content is compliant with Accessibility Checking.
http://p.sf.net/sfu/ephox-dev2dev
_______________________________________________
Freerdp-devel mailing list
Freerdp-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/freerdp-devel

Reply via email to