On 07/24/2018 07:18 PM, Segher Boessenkool wrote:
> This patch allows combine to combine two insns into two. This helps
> in many cases, by reducing instruction path length, and also allowing
> further combinations to happen. PR85160 is a typical example of code
> that it can improve.
I cannot state with certainty that the improvements to our most
notorious routine between 8.2 and current trunk are solely due to this
change, but the differences are telling (see attached Fortran code - the
analysis is about the third loop).
Number of instructions for this loop (Skylake i9-7900).
gfortran82 -S -Ofast -march=native -mtune=native:
458 verint.s.82.loop3
gfortran90 -S -Ofast -march=native -mtune=native:
396 verint.s.90.loop3
But the most stunning difference is the use of the stack [ nn(rsp) ] -
see the attached files ...
--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290
Saturnushof 14, 3738 XG Maartensdijk, The Netherlands
At home: http://moene.org/~toon/; weather: http://moene.org/~hirlam/
Progress of GNU Fortran: http://gcc.gnu.org/wiki/GFortran#news
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
c Library:grdy $RCSfile$, $Revision: 7536 $
c checked in by $Author: ovignes $ at $Date: 2009-12-18 14:23:36 +0100 (Fri, 18 Dec 2009) $
c $State$, $Locker$
c $Log$
c Revision 1.3 1999/04/22 09:30:45 DagBjoerge
c MPP code
c
c Revision 1.2 1999/03/09 10:23:13 GerardCats
c Add SGI parallelisation directives DOACROSS
c
c Revision 1.1 1996/09/06 13:12:18 GCats
c Created from grdy.apl, 1 version 2.6.1, by Gerard Cats
c
SUBROUTINE VERINT (
I KLON , KLAT , KLEV , KINT , KHALO
I , KLON1 , KLON2 , KLAT1 , KLAT2
I , KP , KQ , KR
R , PARG , PRES
R , PALFH , PBETH
R , PALFA , PBETA , PGAMA )
C
C*******************************************************************
C
C VERINT - THREE DIMENSIONAL INTERPOLATION
C
C PURPOSE:
C
C THREE DIMENSIONAL INTERPOLATION
C
C FOR EVERY (JX,JY) IN KLON1:KLON2 x KLAT1:KLAT2 THE RESULT
C PRES(JX,JY) IS A WEIGHTED SUM OF PARG VALUES AROUND THE BASE
C POINT (KP(JX,JY),KQ(JX,JY),KR(JX,JY)). THE WEIGHTS ARE PRODUCTS
C OF AN X-, A Y- AND A VERTICAL WEIGHT; KINT SELECTS THE STENCIL.
C
C INPUT PARAMETERS:
C
C KLON NUMBER OF GRIDPOINTS IN X-DIRECTION
C KLAT NUMBER OF GRIDPOINTS IN Y-DIRECTION
C KLEV NUMBER OF VERTICAL LEVELS
C KINT TYPE OF INTERPOLATION
C = 1 - LINEAR
C = 2 - QUADRATIC
C = 3 - CUBIC
C = 4 - MIXED CUBIC/LINEAR
C KHALO ENTERS ONLY THE DECLARED BOUNDS OF PARG, WHICH ARE
C (2-KHALO:KLON+KHALO-1,2-KHALO:KLAT+KHALO-1,KLEV)
C KLON1 FIRST GRIDPOINT IN X-DIRECTION
C KLON2 LAST GRIDPOINT IN X-DIRECTION
C KLAT1 FIRST GRIDPOINT IN Y-DIRECTION
C KLAT2 LAST GRIDPOINT IN Y-DIRECTION
C KP ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C KQ ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C KR ARRAY OF INDEXES FOR VERTICAL DISPLACEMENTS
C (KP, KQ, KR ARE READ AT (JX,JY) AND USED AS THE BASE INDICES
C IDX, IDY, ILEV INTO THE 1ST, 2ND AND 3RD DIMENSION OF PARG)
C PARG ARRAY OF ARGUMENTS
C PALFH ALFA HAT
C PBETH BETA HAT
C (PALFH AND PBETH ARE REFERENCED ONLY WHEN KINT = 4)
C PALFA ARRAY OF WEIGHTS IN X-DIRECTION
C PBETA ARRAY OF WEIGHTS IN Y-DIRECTION
C PGAMA ARRAY OF WEIGHTS IN VERTICAL DIRECTION
C
C OUTPUT PARAMETERS:
C
C PRES INTERPOLATED FIELD
C ONLY PRES(KLON1:KLON2,KLAT1:KLAT2) IS ASSIGNED. IF KINT IS
C NOT 1, 2, 3 OR 4 NO BRANCH BELOW IS TAKEN AND PRES IS LEFT
C UNMODIFIED.
C
C HISTORY:
C
C J.E. HAUGEN 1 1992
C
C*******************************************************************
C
IMPLICIT NONE
C
INTEGER KLON , KLAT , KLEV , KINT , KHALO,
I KLON1 , KLON2 , KLAT1 , KLAT2
C
INTEGER KP(KLON,KLAT), KQ(KLON,KLAT), KR(KLON,KLAT)
REAL PARG(2-KHALO:KLON+KHALO-1,2-KHALO:KLAT+KHALO-1,KLEV) ,
R PRES(KLON,KLAT) ,
R PALFH(KLON,KLAT) , PBETH(KLON,KLAT) ,
R PALFA(KLON,KLAT,4) , PBETA(KLON,KLAT,4),
R PGAMA(KLON,KLAT,4)
C
C LOCAL VARIABLES:
C JX, JY LOOP INDICES OVER KLON1:KLON2 AND KLAT1:KLAT2
C IDX, IDY, ILEV BASE INDICES INTO PARG, COPIED FROM KP, KQ, KR
C Z1MAH, Z1MBH 1.0 - PALFH(JX,JY) AND 1.0 - PBETH(JX,JY)
C (SET AND USED ONLY IN THE KINT = 4 BRANCH)
C
INTEGER JX, JY, IDX, IDY, ILEV
REAL Z1MAH, Z1MBH
C
IF (KINT.EQ.1) THEN
C LINEAR INTERPOLATION
C
C 2 x 2 x 2 STENCIL: IDX-1..IDX, IDY-1..IDY, ILEV-1..ILEV,
C WEIGHTED BY PALFA, PBETA AND PGAMA ENTRIES 1 AND 2.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV ) ) )
ENDDO
ENDDO
C
C NOTE(REVIEW): THE LINE AFTER EACH 'ELSE' STARTS WITH '+',
C PRESUMABLY A FIXED-FORM CONTINUATION MARK, SO EACH PAIR READS
C 'ELSE IF (...) THEN'. THAT IS CONSISTENT WITH THE SINGLE ENDIF
C THAT CLOSES THE WHOLE CHAIN AT THE END OF THE ROUTINE.
C
ELSE
+IF (KINT.EQ.2) THEN
C QUADRATIC INTERPOLATION
C
C 3 x 3 x 3 STENCIL: IDX-1..IDX+1, IDY-1..IDY+1, ILEV-1..ILEV+1,
C WEIGHTED BY PALFA, PBETA AND PGAMA ENTRIES 1 TO 3.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV ) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV+1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV+1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV+1) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.3) THEN
C CUBIC INTERPOLATION
C
C 4 x 4 x 4 STENCIL: IDX-2..IDX+1, IDY-2..IDY+1, ILEV-2..ILEV+1,
C WEIGHTED BY PALFA, PBETA AND PGAMA ENTRIES 1 TO 4.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-2) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-2) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-2) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-2) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV ) ) )
C +
+ + PGAMA(JX,JY,4)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV+1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV+1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV+1) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV+1) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.4) THEN
C MIXED CUBIC/LINEAR INTERPOLATION
C
C LEVELS ILEV-2 AND ILEV+1 (PGAMA ENTRIES 1 AND 4): 2 x 2 POINTS
C IDX-1..IDX, IDY-1..IDY, WEIGHTED BY PALFH, Z1MAH, PBETH, Z1MBH.
C LEVELS ILEV-1 AND ILEV (PGAMA ENTRIES 2 AND 3):
C ROWS IDY-2 AND IDY+1 (PBETA ENTRIES 1 AND 4) USE THE TWO POINTS
C IDX-1..IDX WITH PALFH AND Z1MAH;
C ROWS IDY-1 AND IDY (PBETA ENTRIES 2 AND 3) USE THE FOUR POINTS
C IDX-2..IDX+1 WITH PALFA ENTRIES 1 TO 4.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
C COMPLEMENTS OF THE 'HAT' WEIGHTS FOR THE TWO-POINT TERMS
Z1MAH = 1.0 - PALFH(JX,JY)
Z1MBH = 1.0 - PBETH(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETH(JX,JY) *( PALFH(JX,JY) *PARG(IDX-1,IDY-1,ILEV-2)
+ + Z1MAH *PARG(IDX ,IDY-1,ILEV-2) )
+ + Z1MBH *( PALFH(JX,JY) *PARG(IDX-1,IDY ,ILEV-2)
+ + Z1MAH *PARG(IDX ,IDY ,ILEV-2) ) )
C +
+ + PGAMA(JX,JY,4)*(
C +
+ PBETH(JX,JY) *( PALFH(JX,JY) *PARG(IDX-1,IDY-1,ILEV+1)
+ + Z1MAH *PARG(IDX ,IDY-1,ILEV+1) )
+ + Z1MBH *( PALFH(JX,JY) *PARG(IDX-1,IDY ,ILEV+1)
+ + Z1MAH *PARG(IDX ,IDY ,ILEV+1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFH(JX,JY) *PARG(IDX-1,IDY-2,ILEV-1)
+ + Z1MAH *PARG(IDX ,IDY-2,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,4)*( PALFH(JX,JY) *PARG(IDX-1,IDY+1,ILEV-1)
+ + Z1MAH *PARG(IDX ,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFH(JX,JY) *PARG(IDX-1,IDY-2,ILEV )
+ + Z1MAH *PARG(IDX ,IDY-2,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,4)*( PALFH(JX,JY) *PARG(IDX-1,IDY+1,ILEV )
+ + Z1MAH *PARG(IDX ,IDY+1,ILEV ) ) )
ENDDO
ENDDO
C
ENDIF
C
RETURN
END
# Excerpt of x86-64 assembly in AT&T syntax. Every instruction listed
# has an %rsp-relative memory operand, and %rdx is loaded several times
# in a row without an intervening use, so the lines in between have
# evidently been filtered out: this is not a runnable sequence.
# NOTE(review): presumably the stack-referencing instructions of one of
# the two loop-3 listings named in the message (verint.s.82.loop3 or
# verint.s.90.loop3); which one is not stated here - confirm.
movq 32(%rsp), %rdx
movq 72(%rsp), %rdx
movq 80(%rsp), %rdx
# The only vector store in this excerpt: ymm18 goes to 296(%rsp) ...
vmovaps %ymm18, 296(%rsp)
movq 56(%rsp), %rdx
movq 48(%rsp), %rdx
movq 64(%rsp), %rdx
movq 40(%rsp), %rdx
# ... and the same slot is consumed here directly as a memory operand.
vaddps 296(%rsp), %ymm19, %ymm19
# Compares %rax against a value kept in the stack slot at 24(%rsp).
cmpq 24(%rsp), %rax
# Excerpt of x86-64 assembly in AT&T syntax. Every instruction listed
# has an %rsp-relative memory operand; the instructions in between have
# evidently been filtered out, so this is not a runnable sequence.
# NOTE(review): presumably the stack-referencing instructions of the
# other loop-3 listing named in the message - confirm which compiler
# version produced it.
#
# Shape of the excerpt:
#   - 46 vmovaps stores of ymm registers to the 32-byte-spaced slots
#     2152(%rsp) .. 3592(%rsp); each slot is written exactly once.
#   - Each of those 46 slots is then read exactly once, either by a
#     vmovaps load or as the memory source operand of vshuff32x4 $0.
#   - Interleaved 8-byte loads into %rdx from slots in the range
#     1192(%rsp) .. 1384(%rsp), and a final compare of %rax against
#     1160(%rsp).
movq 1384(%rsp), %rdx
# Store phase: ymm registers written out to the stack.
vmovaps %ymm29, 2184(%rsp)
vmovaps %ymm30, 2216(%rsp)
vmovaps %ymm28, 2248(%rsp)
vmovaps %ymm17, 2280(%rsp)
vmovaps %ymm19, 2312(%rsp)
vmovaps %ymm31, 2344(%rsp)
vmovaps %ymm25, 2376(%rsp)
vmovaps %ymm21, 2408(%rsp)
vmovaps %ymm16, 2440(%rsp)
vmovaps %ymm30, 2472(%rsp)
vmovaps %ymm13, 2504(%rsp)
vmovaps %ymm11, 2536(%rsp)
vmovaps %ymm28, 2568(%rsp)
vmovaps %ymm4, 2600(%rsp)
vmovaps %ymm19, 2632(%rsp)
vmovaps %ymm17, 2664(%rsp)
vmovaps %ymm16, 2696(%rsp)
vmovaps %ymm19, 2728(%rsp)
movq 1256(%rsp), %rdx
vmovaps %ymm31, 2760(%rsp)
vmovaps %ymm21, 2792(%rsp)
vmovaps %ymm31, 2152(%rsp)
vmovaps %ymm17, 2824(%rsp)
vmovaps %ymm21, 2856(%rsp)
vmovaps %ymm17, 2888(%rsp)
vmovaps %ymm21, 2920(%rsp)
vmovaps %ymm16, 2952(%rsp)
vmovaps %ymm11, 2984(%rsp)
vmovaps %ymm31, 3016(%rsp)
vmovaps %ymm21, 3048(%rsp)
vmovaps %ymm31, 3080(%rsp)
vmovaps %ymm21, 3112(%rsp)
vmovaps %ymm29, 3144(%rsp)
vmovaps %ymm31, 3176(%rsp)
vmovaps %ymm16, 3208(%rsp)
vmovaps %ymm29, 3240(%rsp)
vmovaps %ymm4, 3272(%rsp)
vmovaps %ymm31, 3304(%rsp)
vmovaps %ymm29, 3336(%rsp)
vmovaps %ymm26, 3368(%rsp)
vmovaps %ymm25, 3400(%rsp)
vmovaps %ymm26, 3432(%rsp)
vmovaps %ymm25, 3464(%rsp)
vmovaps %ymm25, 3496(%rsp)
vmovaps %ymm31, 3528(%rsp)
vmovaps %ymm2, 3560(%rsp)
vmovaps %ymm29, 3592(%rsp)
# Read-back phase: the slots written above are loaded again, either
# with vmovaps or as the memory operand of vshuff32x4 $0.
vmovaps 3144(%rsp), %ymm8
vshuff32x4 $0, 3176(%rsp), %ymm8, %ymm0
vshuff32x4 $0, 3208(%rsp), %ymm11, %ymm11
vmovaps 3016(%rsp), %ymm10
vmovaps 2152(%rsp), %ymm31
vshuff32x4 $0, 3272(%rsp), %ymm21, %ymm21
vshuff32x4 $0, 3240(%rsp), %ymm15, %ymm15
vshuff32x4 $0, 3048(%rsp), %ymm10, %ymm0
vmovaps 3080(%rsp), %ymm10
vshuff32x4 $0, 3112(%rsp), %ymm10, %ymm5
vmovaps 2632(%rsp), %ymm5
vshuff32x4 $0, 2664(%rsp), %ymm5, %ymm8
vmovaps 2696(%rsp), %ymm5
vshuff32x4 $0, 2728(%rsp), %ymm5, %ymm5
vmovaps 2760(%rsp), %ymm15
vshuff32x4 $0, 2792(%rsp), %ymm15, %ymm9
vmovaps 2824(%rsp), %ymm15
vmovaps 2888(%rsp), %ymm0
vshuff32x4 $0, 2920(%rsp), %ymm0, %ymm8
vmovaps 2952(%rsp), %ymm0
vshuff32x4 $0, 2984(%rsp), %ymm0, %ymm0
vshuff32x4 $0, 2856(%rsp), %ymm15, %ymm8
vshuff32x4 $0, 3560(%rsp), %ymm19, %ymm19
vmovaps 3304(%rsp), %ymm7
vshuff32x4 $0, 3336(%rsp), %ymm7, %ymm0
vmovaps 3368(%rsp), %ymm7
vshuff32x4 $0, 3528(%rsp), %ymm13, %ymm13
vshuff32x4 $0, 3400(%rsp), %ymm7, %ymm1
vmovaps 3432(%rsp), %ymm18
vshuff32x4 $0, 3592(%rsp), %ymm20, %ymm20
vshuff32x4 $0, 3464(%rsp), %ymm18, %ymm18
vshuff32x4 $0, 3496(%rsp), %ymm26, %ymm1
vmovaps 2184(%rsp), %ymm29
movq 1224(%rsp), %rdx
vmovaps 2440(%rsp), %ymm20
vshuff32x4 $0, 2216(%rsp), %ymm29, %ymm0
vmovaps 2248(%rsp), %ymm29
vshuff32x4 $0, 2280(%rsp), %ymm29, %ymm1
vmovaps 2312(%rsp), %ymm29
movq 1288(%rsp), %rdx
vshuff32x4 $0, 2344(%rsp), %ymm29, %ymm1
vmovaps 2376(%rsp), %ymm29
vshuff32x4 $0, 2408(%rsp), %ymm29, %ymm2
vshuff32x4 $0, 2472(%rsp), %ymm20, %ymm1
vmovaps 2504(%rsp), %ymm20
vshuff32x4 $0, 2600(%rsp), %ymm28, %ymm3
vshuff32x4 $0, 2536(%rsp), %ymm20, %ymm2
movq 1320(%rsp), %rdx
vshuff32x4 $0, 2568(%rsp), %ymm30, %ymm2
movq 1352(%rsp), %rdx
movq 1192(%rsp), %rdx
# Compares %rax against a value kept in the stack slot at 1160(%rsp).
cmpq 1160(%rsp), %rax