On 07/24/2018 07:18 PM, Segher Boessenkool wrote:

This patch allows combine to combine two insns into two.  This helps
in many cases, by reducing instruction path length, and also allowing
further combinations to happen.  PR85160 is a typical example of code
that it can improve.

I cannot state with certainty that the improvements to our most notorious routine between 8.2 and current trunk are solely due to this change, but the differences are telling (see attached Fortran code - the analysis is about the third loop).

Number of instructions for this loop (Skylake i9-7900).

gfortran82 -S -Ofast -march=native -mtune=native:

  458 verint.s.82.loop3

gfortran90 -S -Ofast -march=native -mtune=native:

  396 verint.s.90.loop3

But the most stunning difference is the use of the stack [ nn(rsp) ] - see the attached files ...

--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290
Saturnushof 14, 3738 XG  Maartensdijk, The Netherlands
At home: http://moene.org/~toon/; weather: http://moene.org/~hirlam/
Progress of GNU Fortran: http://gcc.gnu.org/wiki/GFortran#news
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
c Library:grdy $RCSfile$, $Revision: 7536 $
c checked in by $Author: ovignes $ at $Date: 2009-12-18 14:23:36 +0100 (Fri, 18 Dec 2009) $
c $State$, $Locker$
c $Log$
c Revision 1.3  1999/04/22 09:30:45  DagBjoerge
c MPP code
c
c Revision 1.2  1999/03/09 10:23:13  GerardCats
c Add SGI parallelisation directives DOACROSS
c
c Revision 1.1  1996/09/06 13:12:18  GCats
c Created from grdy.apl, 1 version 2.6.1, by Gerard Cats
c
      SUBROUTINE VERINT (
     I   KLON   , KLAT   , KLEV   , KINT  , KHALO
     I , KLON1  , KLON2  , KLAT1  , KLAT2
     I , KP     , KQ     , KR
     R , PARG   , PRES
     R , PALFH  , PBETH
     R , PALFA  , PBETA  , PGAMA   )
C
C*******************************************************************
C
C  VERINT - THREE DIMENSIONAL INTERPOLATION
C
C  PURPOSE:
C
C  THREE DIMENSIONAL INTERPOLATION
C
C  FOR EVERY POINT (JX,JY) WITH KLON1 <= JX <= KLON2 AND
C  KLAT1 <= JY <= KLAT2, PRES(JX,JY) IS SET TO A WEIGHTED SUM OF
C  PARG VALUES AROUND (KP(JX,JY), KQ(JX,JY), KR(JX,JY)).
C  THE WEIGHTS ARE TAKEN AS GIVEN FROM PALFA/PBETA/PGAMA (AND
C  PALFH/PBETH FOR KINT = 4); THEY ARE NOT COMPUTED HERE.
C
C  INPUT PARAMETERS:
C
C  KLON      NUMBER OF GRIDPOINTS IN X-DIRECTION
C  KLAT      NUMBER OF GRIDPOINTS IN Y-DIRECTION
C  KLEV      NUMBER OF VERTICAL LEVELS
C            (USED ONLY AS THE LAST DIMENSION OF PARG)
C  KINT      TYPE OF INTERPOLATION
C            = 1 - LINEAR
C            = 2 - QUADRATIC
C            = 3 - CUBIC
C            = 4 - MIXED CUBIC/LINEAR
C  KHALO     SETS THE BOUNDS OF THE FIRST TWO DIMENSIONS OF PARG:
C            (2-KHALO:KLON+KHALO-1, 2-KHALO:KLAT+KHALO-1)
C            PRESUMABLY THE HALO WIDTH - TO BE CONFIRMED
C  KLON1     FIRST GRIDPOINT IN X-DIRECTION
C  KLON2     LAST  GRIDPOINT IN X-DIRECTION
C  KLAT1     FIRST GRIDPOINT IN Y-DIRECTION
C  KLAT2     LAST  GRIDPOINT IN Y-DIRECTION
C  KP        ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C            (USED AS FIRST  INDEX OF PARG)
C  KQ        ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C            (USED AS SECOND INDEX OF PARG)
C  KR        ARRAY OF INDEXES FOR VERTICAL   DISPLACEMENTS
C            (USED AS THIRD  INDEX OF PARG)
C  PARG      ARRAY OF ARGUMENTS
C  PALFH     ALFA HAT   (REFERENCED ONLY WHEN KINT = 4)
C  PBETH     BETA HAT   (REFERENCED ONLY WHEN KINT = 4)
C  PALFA     ARRAY OF WEIGHTS IN X-DIRECTION
C  PBETA     ARRAY OF WEIGHTS IN Y-DIRECTION
C  PGAMA     ARRAY OF WEIGHTS IN VERTICAL DIRECTION
C
C  OUTPUT PARAMETERS:
C
C  PRES      INTERPOLATED FIELD
C            (ONLY ELEMENTS KLON1:KLON2, KLAT1:KLAT2 ARE WRITTEN)
C
C  STENCIL OF PARG READ FOR EACH POINT, RELATIVE TO
C  (IDX,IDY,ILEV) = (KP,KQ,KR)(JX,JY):
C
C  KINT = 1  IDX-1:IDX  , IDY-1:IDY  , ILEV-1:ILEV    WEIGHTS 1-2
C  KINT = 2  IDX-1:IDX+1, IDY-1:IDY+1, ILEV-1:ILEV+1  WEIGHTS 1-3
C  KINT = 3  IDX-2:IDX+1, IDY-2:IDY+1, ILEV-2:ILEV+1  WEIGHTS 1-4
C  KINT = 4  SUBSET OF THE KINT = 3 STENCIL, SEE BRANCH BELOW
C
C  NOTES:
C
C  - IF KINT IS NOT 1, 2, 3 OR 4 NO BRANCH IS TAKEN AND PRES IS
C    LEFT UNCHANGED.
C  - THE INDEXES HELD IN KP, KQ AND KR ARE NOT CHECKED HERE.
C    NOTE(REVIEW): THE CALLER PRESUMABLY GUARANTEES THAT THE WHOLE
C    STENCIL LIES INSIDE THE BOUNDS OF PARG - TO BE CONFIRMED.
C
C  HISTORY:
C
C  J.E. HAUGEN       1      1992
C
C*******************************************************************
C
      IMPLICIT NONE
C
      INTEGER KLON   , KLAT   , KLEV   , KINT   , KHALO,
     I        KLON1  , KLON2  , KLAT1  , KLAT2
C
      INTEGER   KP(KLON,KLAT), KQ(KLON,KLAT), KR(KLON,KLAT)
      REAL    PARG(2-KHALO:KLON+KHALO-1,2-KHALO:KLAT+KHALO-1,KLEV)  ,   
     R        PRES(KLON,KLAT)     ,
     R       PALFH(KLON,KLAT)     ,  PBETH(KLON,KLAT)  ,
     R       PALFA(KLON,KLAT,4)   ,  PBETA(KLON,KLAT,4),
     R       PGAMA(KLON,KLAT,4)
C
C  JX, JY         LOOP INDEXES OVER THE OUTPUT POINTS
C  IDX, IDY, ILEV COPIES OF KP, KQ, KR AT THE CURRENT POINT
C  Z1MAH, Z1MBH   1 - PALFH AND 1 - PBETH (KINT = 4 ONLY)
C
      INTEGER JX, JY, IDX, IDY, ILEV
      REAL Z1MAH, Z1MBH
C
      IF (KINT.EQ.1) THEN
C  LINEAR INTERPOLATION
C
C  2 X 2 X 2 POINTS: X-WEIGHTS PALFA(.,.,1:2) ARE APPLIED FIRST,
C  THEN Y-WEIGHTS PBETA(.,.,1:2), THEN VERTICAL WEIGHTS
C  PGAMA(.,.,1:2) FOR LEVELS ILEV-1 AND ILEV.
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV  ) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.2) THEN
C  QUADRATIC INTERPOLATION
C
C  3 X 3 X 3 POINTS: X-WEIGHTS PALFA(.,.,1:3), Y-WEIGHTS
C  PBETA(.,.,1:3), VERTICAL WEIGHTS PGAMA(.,.,1:3) FOR LEVELS
C  ILEV-1, ILEV AND ILEV+1.
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV  ) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV+1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY  ,ILEV+1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX  ,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV+1) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.3) THEN
C  CUBIC INTERPOLATION
C
C  4 X 4 X 4 POINTS: X-WEIGHTS PALFA(.,.,1:4) FOR IDX-2:IDX+1,
C  Y-WEIGHTS PBETA(.,.,1:4) FOR IDY-2:IDY+1, VERTICAL WEIGHTS
C  PGAMA(.,.,1:4) FOR LEVELS ILEV-2:ILEV+1.
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-2) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-2) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-2) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV-2)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-2) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV  ) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV  ) ) )
C    +
     +               + PGAMA(JX,JY,4)*(
C    +
     +   PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-2,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV+1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV+1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV+1) )
     + + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY+1,ILEV+1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV+1) ) )
      ENDDO
      ENDDO
C
      ELSE
     +IF (KINT.EQ.4) THEN
C  MIXED CUBIC/LINEAR INTERPOLATION
C
C  THE FOUR LEVELS ILEV-2:ILEV+1 ARE STILL COMBINED WITH
C  PGAMA(.,.,1:4), BUT ONLY THE INNERMOST POINTS USE THE FOUR
C  POINT WEIGHTS:
C
C  LEVELS ILEV-2, ILEV+1 (PGAMA 1 AND 4):
C     2 X 2 POINTS IDX-1:IDX, IDY-1:IDY.  X-WEIGHTS ARE PALFH
C     (AT IDX-1) AND Z1MAH (AT IDX); Y-WEIGHTS ARE PBETH
C     (AT IDY-1) AND Z1MBH (AT IDY).
C  LEVELS ILEV-1, ILEV   (PGAMA 2 AND 3):
C     ROWS IDY-2 AND IDY+1 (PBETA 1 AND 4) USE THE TWO POINTS
C     IDX-1:IDX WITH X-WEIGHTS PALFH AND Z1MAH;
C     ROWS IDY-1 AND IDY   (PBETA 2 AND 3) USE THE FOUR POINTS
C     IDX-2:IDX+1 WITH X-WEIGHTS PALFA(.,.,1:4).
C
C  THE TERMS ARE SUMMED IN THE ORDER PGAMA 1, 4, 2, 3.
C
      DO JY = KLAT1,KLAT2
      DO JX = KLON1,KLON2
         IDX  = KP(JX,JY)
         IDY  = KQ(JX,JY)
         ILEV = KR(JX,JY)
C
C  COMPLEMENTS OF THE TWO-POINT WEIGHTS
         Z1MAH = 1.0 - PALFH(JX,JY)
         Z1MBH = 1.0 - PBETH(JX,JY)
C
         PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
     +   PBETH(JX,JY)  *( PALFH(JX,JY)  *PARG(IDX-1,IDY-1,ILEV-2)
     +                  + Z1MAH         *PARG(IDX  ,IDY-1,ILEV-2) )
     + + Z1MBH         *( PALFH(JX,JY)  *PARG(IDX-1,IDY  ,ILEV-2)
     +                  + Z1MAH         *PARG(IDX  ,IDY  ,ILEV-2) ) )
C    +
     +               + PGAMA(JX,JY,4)*(
C    +
     +   PBETH(JX,JY)  *( PALFH(JX,JY)  *PARG(IDX-1,IDY-1,ILEV+1)
     +                  + Z1MAH         *PARG(IDX  ,IDY-1,ILEV+1) )
     + + Z1MBH         *( PALFH(JX,JY)  *PARG(IDX-1,IDY  ,ILEV+1)
     +                  + Z1MAH         *PARG(IDX  ,IDY  ,ILEV+1) ) )
C    +
     +               + PGAMA(JX,JY,2)*(
C    +
     +   PBETA(JX,JY,1)*( PALFH(JX,JY)  *PARG(IDX-1,IDY-2,ILEV-1)
     +                  + Z1MAH         *PARG(IDX  ,IDY-2,ILEV-1) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV-1)
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV-1) )
     + + PBETA(JX,JY,4)*( PALFH(JX,JY)  *PARG(IDX-1,IDY+1,ILEV-1)
     +                  + Z1MAH         *PARG(IDX  ,IDY+1,ILEV-1) ) )
C    +
     +               + PGAMA(JX,JY,3)*(
C    +
     +   PBETA(JX,JY,1)*( PALFH(JX,JY)  *PARG(IDX-1,IDY-2,ILEV  )
     +                  + Z1MAH         *PARG(IDX  ,IDY-2,ILEV  ) )
     + + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY-1,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV  ) ) 
     + + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,2)*PARG(IDX-1,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,3)*PARG(IDX  ,IDY  ,ILEV  )
     +                  + PALFA(JX,JY,4)*PARG(IDX+1,IDY  ,ILEV  ) )
     + + PBETA(JX,JY,4)*( PALFH(JX,JY)  *PARG(IDX-1,IDY+1,ILEV  )
     +                  + Z1MAH         *PARG(IDX  ,IDY+1,ILEV  ) ) )
      ENDDO
      ENDDO
C
C  ANY OTHER VALUE OF KINT FALLS THROUGH HERE WITHOUT WRITING PRES
      ENDIF
C
      RETURN
      END
        # Listing of stack accesses: every instruction below has an
        # offset(%rsp) memory operand.  AT&T syntax (sources first,
        # destination last).
        # NOTE(review): the back-to-back loads into %rdx with no use in
        # between suggest that the instructions not touching the stack
        # were filtered out of this listing - confirm against the full
        # loop.  Which compiler produced this excerpt is not stated
        # here.
        movq    32(%rsp), %rdx
        movq    72(%rsp), %rdx
        movq    80(%rsp), %rdx
        # The only vector register written to the stack in this excerpt.
        vmovaps %ymm18, 296(%rsp)
        movq    56(%rsp), %rdx
        movq    48(%rsp), %rdx
        movq    64(%rsp), %rdx
        movq    40(%rsp), %rdx
        # Reads the slot stored above as a memory operand of the add.
        vaddps  296(%rsp), %ymm19, %ymm19
        # Compares %rax with the 64-bit value held at 24(%rsp).
        cmpq    24(%rsp), %rax
        # Listing of stack accesses: every instruction below has an
        # offset(%rsp) memory operand.  AT&T syntax (sources first,
        # destination last).
        # NOTE(review): presumably the start of a second, separate
        # excerpt (the stack offsets jump to a different range after
        # the preceding cmpq); which compiler produced it is not stated
        # here - confirm.
        movq    1384(%rsp), %rdx
        # 46 stores of 256-bit registers, one to each distinct 32-byte
        # slot from 2152(%rsp) to 3592(%rsp).
        vmovaps %ymm29, 2184(%rsp)
        vmovaps %ymm30, 2216(%rsp)
        vmovaps %ymm28, 2248(%rsp)
        vmovaps %ymm17, 2280(%rsp)
        vmovaps %ymm19, 2312(%rsp)
        vmovaps %ymm31, 2344(%rsp)
        vmovaps %ymm25, 2376(%rsp)
        vmovaps %ymm21, 2408(%rsp)
        vmovaps %ymm16, 2440(%rsp)
        vmovaps %ymm30, 2472(%rsp)
        vmovaps %ymm13, 2504(%rsp)
        vmovaps %ymm11, 2536(%rsp)
        vmovaps %ymm28, 2568(%rsp)
        vmovaps %ymm4, 2600(%rsp)
        vmovaps %ymm19, 2632(%rsp)
        vmovaps %ymm17, 2664(%rsp)
        vmovaps %ymm16, 2696(%rsp)
        vmovaps %ymm19, 2728(%rsp)
        movq    1256(%rsp), %rdx
        vmovaps %ymm31, 2760(%rsp)
        vmovaps %ymm21, 2792(%rsp)
        vmovaps %ymm31, 2152(%rsp)
        vmovaps %ymm17, 2824(%rsp)
        vmovaps %ymm21, 2856(%rsp)
        vmovaps %ymm17, 2888(%rsp)
        vmovaps %ymm21, 2920(%rsp)
        vmovaps %ymm16, 2952(%rsp)
        vmovaps %ymm11, 2984(%rsp)
        vmovaps %ymm31, 3016(%rsp)
        vmovaps %ymm21, 3048(%rsp)
        vmovaps %ymm31, 3080(%rsp)
        vmovaps %ymm21, 3112(%rsp)
        vmovaps %ymm29, 3144(%rsp)
        vmovaps %ymm31, 3176(%rsp)
        vmovaps %ymm16, 3208(%rsp)
        vmovaps %ymm29, 3240(%rsp)
        vmovaps %ymm4, 3272(%rsp)
        vmovaps %ymm31, 3304(%rsp)
        vmovaps %ymm29, 3336(%rsp)
        vmovaps %ymm26, 3368(%rsp)
        vmovaps %ymm25, 3400(%rsp)
        vmovaps %ymm26, 3432(%rsp)
        vmovaps %ymm25, 3464(%rsp)
        vmovaps %ymm25, 3496(%rsp)
        vmovaps %ymm31, 3528(%rsp)
        vmovaps %ymm2, 3560(%rsp)
        vmovaps %ymm29, 3592(%rsp)
        # The slots written above are read back here, either by a plain
        # vmovaps load or directly as the memory operand of
        # vshuff32x4 $0, which combines 128-bit lanes of the register
        # and memory sources (see the instruction reference for the
        # exact lane selection).
        vmovaps 3144(%rsp), %ymm8
        vshuff32x4      $0, 3176(%rsp), %ymm8, %ymm0
        vshuff32x4      $0, 3208(%rsp), %ymm11, %ymm11
        vmovaps 3016(%rsp), %ymm10
        vmovaps 2152(%rsp), %ymm31
        vshuff32x4      $0, 3272(%rsp), %ymm21, %ymm21
        vshuff32x4      $0, 3240(%rsp), %ymm15, %ymm15
        vshuff32x4      $0, 3048(%rsp), %ymm10, %ymm0
        vmovaps 3080(%rsp), %ymm10
        vshuff32x4      $0, 3112(%rsp), %ymm10, %ymm5
        vmovaps 2632(%rsp), %ymm5
        vshuff32x4      $0, 2664(%rsp), %ymm5, %ymm8
        vmovaps 2696(%rsp), %ymm5
        vshuff32x4      $0, 2728(%rsp), %ymm5, %ymm5
        vmovaps 2760(%rsp), %ymm15
        vshuff32x4      $0, 2792(%rsp), %ymm15, %ymm9
        vmovaps 2824(%rsp), %ymm15
        vmovaps 2888(%rsp), %ymm0
        vshuff32x4      $0, 2920(%rsp), %ymm0, %ymm8
        vmovaps 2952(%rsp), %ymm0
        vshuff32x4      $0, 2984(%rsp), %ymm0, %ymm0
        vshuff32x4      $0, 2856(%rsp), %ymm15, %ymm8
        vshuff32x4      $0, 3560(%rsp), %ymm19, %ymm19
        vmovaps 3304(%rsp), %ymm7
        vshuff32x4      $0, 3336(%rsp), %ymm7, %ymm0
        vmovaps 3368(%rsp), %ymm7
        vshuff32x4      $0, 3528(%rsp), %ymm13, %ymm13
        vshuff32x4      $0, 3400(%rsp), %ymm7, %ymm1
        vmovaps 3432(%rsp), %ymm18
        vshuff32x4      $0, 3592(%rsp), %ymm20, %ymm20
        vshuff32x4      $0, 3464(%rsp), %ymm18, %ymm18
        vshuff32x4      $0, 3496(%rsp), %ymm26, %ymm1
        vmovaps 2184(%rsp), %ymm29
        movq    1224(%rsp), %rdx
        vmovaps 2440(%rsp), %ymm20
        vshuff32x4      $0, 2216(%rsp), %ymm29, %ymm0
        vmovaps 2248(%rsp), %ymm29
        vshuff32x4      $0, 2280(%rsp), %ymm29, %ymm1
        vmovaps 2312(%rsp), %ymm29
        movq    1288(%rsp), %rdx
        vshuff32x4      $0, 2344(%rsp), %ymm29, %ymm1
        vmovaps 2376(%rsp), %ymm29
        vshuff32x4      $0, 2408(%rsp), %ymm29, %ymm2
        vshuff32x4      $0, 2472(%rsp), %ymm20, %ymm1
        vmovaps 2504(%rsp), %ymm20
        vshuff32x4      $0, 2600(%rsp), %ymm28, %ymm3
        vshuff32x4      $0, 2536(%rsp), %ymm20, %ymm2
        movq    1320(%rsp), %rdx
        vshuff32x4      $0, 2568(%rsp), %ymm30, %ymm2
        movq    1352(%rsp), %rdx
        movq    1192(%rsp), %rdx
        # Compares %rax with the 64-bit value held at 1160(%rsp).
        cmpq    1160(%rsp), %rax

Reply via email to