Hello community, here is the log from the commit of package libatlas3 for openSUSE:Factory checked in at 2015-08-11 08:26:22 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/libatlas3 (Old) and /work/SRC/openSUSE:Factory/.libatlas3.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "libatlas3" Changes: -------- --- /work/SRC/openSUSE:Factory/libatlas3/libatlas3.changes 2015-06-09 12:25:13.000000000 +0200 +++ /work/SRC/openSUSE:Factory/.libatlas3.new/libatlas3.changes 2015-08-11 08:26:23.000000000 +0200 @@ -1,0 +2,40 @@ +Sun Aug 9 13:01:20 UTC 2015 - p.drou...@gmail.com + +- Update to version 3.10.2 + * Fixed all errataed bugs: + + Failure to init workspace can cause NaNs in SYRK + + Complex row-major Q-type factorizations produce bad TAU + + Failure to cast causes integer overflow on 64-bit platforms + + Missing IBM S390 assembly file + * Fixed Make.bin to have threaded latime built to do parallel cache flushing + * Extended extract string lengths as patched by SAGE folks + * Backported fixes & some arch support to configure framework, including + host of Itanium and UST1 stuff provided by SAGE folks + NOTE: 3.10.2 is terribly out of date, and was released only because the + threading rewrite is taking too long. If possible, you should use a + developer release after testing that it works for your particular + platform. In particular, developer releases are *much* faster for any + x86 that uses AVX or later SIMD ISA, or any machine with ncores >= 8. + The developer release also supports ARM architectures better (though + performance is not hugely better if you can get stable installed). 
+ +------------------------------------------------------------------- +Wed Aug 5 13:05:41 UTC 2015 - norm...@linux.vnet.ibm.com + +- For ppc64/ppc64le architectures: + Add support of Power8 cpu + Do not support lvx files for ppc64le (temporarily) + In spec create power8 archives files if do not exist yet + POWER864VSX from POWER764VSX and + POWER864LEVSX from POWER764LEVSX + removed patch: + xlf.command.not.found.patch + libatlas.ppc64le-abiv2.patch + new patches: + issue_64.patch + atlas.3.10.1-ppc64le_abiv2.patch + atlas-new_archdef_for_ppc64le.patch + atlas.3.10.1-add_power8_cpu.patch + atlas.3.10.2-ppc64le_do_not_use_files_with_lvx.patch + +------------------------------------------------------------------- Old: ---- atlas3.10.1.tar.bz2 libatlas.ppc64le-abiv2.patch xlf.command.not.found.patch New: ---- atlas-new_archdef_for_ppc64le.patch atlas.3.10.1-add_power8_cpu.patch atlas.3.10.1-ppc64le_abiv2.patch atlas.3.10.2-ppc64le_do_not_use_files_with_lvx.patch atlas3.10.2.tar.bz2 issue_64.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ libatlas3.spec ++++++ --- /var/tmp/diff_new_pack.sslbli/_old 2015-08-11 08:26:24.000000000 +0200 +++ /var/tmp/diff_new_pack.sslbli/_new 2015-08-11 08:26:24.000000000 +0200 @@ -19,7 +19,7 @@ %define enable_native_atlas 0 Name: libatlas3 -Version: 3.10.1 +Version: 3.10.2 Release: 0 Summary: Automatically Tuned Linear Algebra Software License: BSD-3-Clause and GPL-2.0 @@ -33,12 +33,16 @@ Source5: %name-rpmlintrc Patch0: atlas-suse-shared.patch Patch1: atlas-hack.patch -# for ppc64le -# http://sourceforge.net/p/math-atlas/mailman/message/32471499/ +# for ppc64 ppc64le +# https://bugzilla.redhat.com/show_bug.cgi?id=1080073#c40 Patch10: getdoublearr.stripwhite.patch -Patch11: xlf.command.not.found.patch +Patch11: issue_64.patch Patch12: initialize_malloc_memory.invtrsm.wms.oct23.patch -Patch13: libatlas.ppc64le-abiv2.patch +Patch13: atlas.3.10.1-ppc64le_abiv2.patch 
+Patch14: atlas-new_archdef_for_ppc64le.patch +Patch15: atlas.3.10.1-add_power8_cpu.patch +# for ppc64le tempo patch +Patch16: atlas.3.10.2-ppc64le_do_not_use_files_with_lvx.patch BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: gcc-fortran @@ -194,16 +198,51 @@ %ifarch x86_64 i586 %patch1 %endif -%ifarch ppc64le +%ifarch ppc64le ppc64 %patch10 -p1 %patch11 -p1 %patch12 -p1 %patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%endif +%ifarch ppc64le +%patch16 -p1 %endif cp %{SOURCE2} doc cp %{SOURCE3} %{SOURCE4} CONFIG/ARCHS/ +# if Power8 archdef do not exist yet +# then use the Power7 one that may be the same. +# do that for BE and LE: +%ifarch ppc64 ppc64le +P8archdef='POWER864VSX.tar.bz2' +P7archdef='POWER764VSX.tar.bz2' +if [ ! -e CONFIG/ARCHS/${P8archdef} ]; then + cp CONFIG/ARCHS/${P7archdef} /tmp/ + pushd /tmp + tar -xjf ${P7archdef} + rm -rf POWER864VSX + mv POWER764VSX POWER864VSX + tar -cjf ${P8archdef} POWER864VSX + popd + mv /tmp/${P8archdef} CONFIG/ARCHS/ +fi +P8archdef='POWER864LEVSX.tar.bz2' +P7archdef='POWER764LEVSX.tar.bz2' +if [ ! -e CONFIG/ARCHS/${P8archdef} ]; then + cp CONFIG/ARCHS/${P7archdef} /tmp/ + pushd /tmp + tar -xjf ${P7archdef} + rm -rf POWER864LEVSX + mv POWER764LEVSX POWER864LEVSX + tar -cjf ${P8archdef} POWER864LEVSX + popd + mv /tmp/${P8archdef} CONFIG/ARCHS/ +fi +%endif + %build for type in %{types}; do if [ "$type" = "base" ]; then @@ -239,12 +278,6 @@ sed -i 's#-m64#-m32#g' Make.inc %endif -# use the provided archdef file for ppc64le -# and force its usage in INSTFLAGS. 
-%ifarch ppc64le - sed -i 's#\(ARCH = POWER.64\)VSX#\1LEVSX#' Make.inc - sed -i 's#\(INSTFLAGS =.*\) -a 0#\1 -a 1#' Make.inc -%endif make build %{?_smp_mflags} cd lib make shared %{?_smp_mflags} ++++++ atlas-new_archdef_for_ppc64le.patch ++++++ Subject: atlas new archdef for ppc64le From: Michel Normand <norm...@linux.vnet.ibm.com> Date: Sun, 13 Jun 2014 18:02:47 +0200 Need to define different archdef names for ppc64 (that is Big Endian) and ppc64le (that is Little Endian). This is already done upstream in atlas 3.11.30 with issue https://sourceforge.net/p/math-atlas/patches/66/ Required at least as long as I need the bypass of atlas.3.10.2-ppc64le_do_not_use_files_with_lvx.patch Signed-off-by: Michel Normand <norm...@linux.vnet.ibm.com> --- CONFIG/src/SpewMakeInc.c | 4 ++++ 1 file changed, 4 insertions(+) Index: ATLAS/CONFIG/src/SpewMakeInc.c =================================================================== --- ATLAS.orig/CONFIG/src/SpewMakeInc.c +++ ATLAS/CONFIG/src/SpewMakeInc.c @@ -542,6 +542,10 @@ int main(int nargs, char **args) fprintf(fpout, "# -------------------------------------------------\n"); fprintf(fpout, " ARCH = %s", machnam[mach]); fprintf(fpout, "%d", ptrbits); + /* for ppc64le archi add 'LE' characters */ + #if defined(__powerpc64__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + fprintf(fpout, "%s", "LE"); + #endif if (ISAX) fprintf(fpout, "%s", ISAXNAM[ISAX]); if (!USEIEEE) ++++++ atlas.3.10.1-add_power8_cpu.patch ++++++ From: Michel Normand <norm...@linux.vnet.ibm.com> Subject: atlas.3.10.1 add power8 cpu Date: Thu, 18 Sep 2014 15:13:24 +0200 atlas.3.10.1 add Power8 cpu tracked upstream by issue 67 https://sourceforge.net/p/math-atlas/patches/67/ Signed-off-by: Michel Normand <norm...@linux.vnet.ibm.com> --- CONFIG/ARCHS/Make.ext | 7 +++++++ CONFIG/include/atlconf.h | 6 +++--- CONFIG/src/atlcomp.txt | 6 ++++++ CONFIG/src/backend/archinfo_aix.c | 2 ++ CONFIG/src/backend/archinfo_linux.c | 1 + include/atlas_pca.h | 2 +- 6 files changed, 20 
insertions(+), 4 deletions(-) Index: ATLAS/CONFIG/ARCHS/Make.ext =================================================================== --- ATLAS.orig/CONFIG/ARCHS/Make.ext +++ ATLAS/CONFIG/ARCHS/Make.ext @@ -33,6 +33,7 @@ files = AMD64K10h32SSE3.tar.bz2 AMD64K10 MIPSR1xK64.tar.bz2 Makefile P432SSE2.tar.bz2 P4E32SSE3.tar.bz2 \ P4E64SSE3.tar.bz2 PIII32SSE1.tar.bz2 POWER432.tar.bz2 \ POWER464.tar.bz2 POWER564.tar.bz2 POWER764VSX.tar.bz2 \ + POWER864VSX.tar.bz2 \ PPCG432AltiVec.tar.bz2 PPCG532AltiVec.tar.bz2 PPCG564AltiVec.tar.bz2 \ PPRO32.tar.bz2 USIII32.tar.bz2 USIII64.tar.bz2 USIV32.tar.bz2 \ USIV64.tar.bz2 UST232.tar.bz2 UST264.tar.bz2 atlas_test1.1.3.tar.bz2 \ @@ -302,6 +303,12 @@ POWER764VSX.tar.bz2 : $(basdr)/POWER764V /tmp/POWER764VSX.tar POWER764VSX bzip2 /tmp/POWER764VSX.tar mv /tmp/POWER764VSX.tar.bz2 ./. +POWER864VSX.tar.bz2 : $(basdr)/POWER864VSX + - rm -f /tmp/POWER864VSX.tar /tmp/POWER864VSX.tar.bz2 + cd $(basdr) ; tar --dereference --exclude 'CVS' -c -f \ + /tmp/POWER864VSX.tar POWER864VSX + bzip2 /tmp/POWER864VSX.tar + mv /tmp/POWER864VSX.tar.bz2 ./. 
IBMz1032.tar.bz2 : $(basdr)/IBMz1032 - rm -f /tmp/IBMz1032.tar /tmp/IBMz1032.tar.bz2 cd $(basdr) ; tar --dereference --exclude 'CVS' -c -f \ Index: ATLAS/CONFIG/include/atlconf.h =================================================================== --- ATLAS.orig/CONFIG/include/atlconf.h +++ ATLAS/CONFIG/include/atlconf.h @@ -18,10 +18,10 @@ enum OSTYPE {OSOther=0, OSLinux, OSSunOS enum ARCHFAM {AFOther=0, AFPPC, AFSPARC, AFALPHA, AFX86, AFIA64, AFMIPS, AFARM, AFS390}; -#define NMACH 47 +#define NMACH 48 static char *machnam[NMACH] = {"UNKNOWN", "POWER3", "POWER4", "POWER5", "PPCG4", "PPCG5", - "POWER6", "POWER7", "IBMz9", "IBMz10", "IBMz196", + "POWER6", "POWER7", "POWER8", "IBMz9", "IBMz10", "IBMz196", "x86x87", "x86SSE1", "x86SSE2", "x86SSE3", "P5", "P5MMX", "PPRO", "PII", "PIII", "PM", "CoreSolo", "CoreDuo", "Core2Solo", "Core2", "Corei1", "Corei2", "Atom", "P4", "P4E", @@ -30,7 +30,7 @@ static char *machnam[NMACH] = "USI", "USII", "USIII", "USIV", "UST2", "UnknownUS", "MIPSR1xK", "MIPSICE9", "ARMv7"}; enum MACHTYPE {MACHOther, IbmPwr3, IbmPwr4, IbmPwr5, PPCG4, PPCG5, - IbmPwr6, IbmPwr7, + IbmPwr6, IbmPwr7, IbmPwr8, IbmZ9, IbmZ10, IbmZ196, /* s390(x) in Linux */ x86x87, x86SSE1, x86SSE2, x86SSE3, /* generic targets */ IntP5, IntP5MMX, IntPPRO, IntPII, IntPIII, IntPM, IntCoreS, Index: ATLAS/CONFIG/src/atlcomp.txt =================================================================== --- ATLAS.orig/CONFIG/src/atlcomp.txt +++ ATLAS/CONFIG/src/atlcomp.txt @@ -186,6 +186,10 @@ MACH=PPCG5 OS=ALL LVL=1000 COMPS=dmc,icc 'gcc' '-mpowerpc64 -maltivec -mabi=altivec -mcpu=970 -mtune=970 -O2' MACH=PPCG5 OS=ALL LVL=1000 COMPS=skc 'gcc' '-mpowerpc64 -maltivec -mabi=altivec -mcpu=970 -mtune=970 -O2 -mvrsave' +MACH=POWER8 OS=ALL LVL=1010 COMPS=icc,smc,dmc,skc,dkc,xcc,gcc + 'gcc' '-O2 -mvsx -mcpu=power8 -mtune=power8 -m64 -mvrsave -funroll-all-loops' +MACH=POWER8 OS=ALL LVL=1010 COMPS=f77 + 'gfortran' '-O2 -mvsx -mcpu=power8 -mtune=power8 -m64 -mvrsave -funroll-all-loops' MACH=POWER7 
OS=ALL LVL=1010 COMPS=icc,smc,dmc,skc,dkc,xcc,gcc 'gcc' '-O2 -mvsx -mcpu=power7 -mtune=power7 -m64 -mvrsave -funroll-all-loops' MACH=POWER7 OS=ALL LVL=1010 COMPS=f77 @@ -206,6 +210,8 @@ MACH=POWER4 OS=ALL LVL=1010 COMPS=icc,dm 'gcc' '-mcpu=power4 -mtune=power4 -O3 -fno-schedule-insns -fno-rerun-loop-opt' MACH=POWER4 OS=ALL LVL=1010 COMPS=f77 'xlf' '-qtune=pwr4 -qarch=pwr4 -O3 -qmaxmem=-1 -qfloat=hsflt' +MACH=POWER8 OS=ALL LVL=1010 COMPS=f77 + 'xlf' '-qtune=pwr8 -qarch=pwr8 -O3 -qmaxmem=-1 -qfloat=hsflt' # # IBM System z or zEnterprise. # These compiler flags given by IBM; -O3 -funroll-loops are chosen because Index: ATLAS/CONFIG/src/backend/archinfo_linux.c =================================================================== --- ATLAS.orig/CONFIG/src/backend/archinfo_linux.c +++ ATLAS/CONFIG/src/backend/archinfo_linux.c @@ -77,6 +77,7 @@ enum MACHTYPE ProbeArch() else if (strstr(res, "7455")) mach = PPCG4; else if (strstr(res, "PPC970FX")) mach = PPCG5; else if (strstr(res, "PPC970MP")) mach = PPCG5; + else if (strstr(res, "POWER8")) mach = IbmPwr8; else if (strstr(res, "POWER7")) mach = IbmPwr7; else if (strstr(res, "POWER6")) mach = IbmPwr6; else if (strstr(res, "POWER5")) mach = IbmPwr5; Index: ATLAS/include/atlas_pca.h =================================================================== --- ATLAS.orig/include/atlas_pca.h +++ ATLAS/include/atlas_pca.h @@ -26,7 +26,7 @@ #endif #elif defined(ATL_ARCH_POWER3) || defined(ATL_ARCH_POWER4) || \ defined(ATL_ARCH_POWER5) || defined(ATL_ARCH_POWER6) || \ - defined(ATL_ARCH_POWER7) + defined(ATL_ARCH_POWER7) || defined(ATL_ARCH_POWER8) #ifdef __GNUC__ #define ATL_membarrier __asm__ __volatile__ ("dcs") /* #define ATL_USEPCA 1 */ Index: ATLAS/CONFIG/src/backend/archinfo_aix.c =================================================================== --- ATLAS.orig/CONFIG/src/backend/archinfo_aix.c +++ ATLAS/CONFIG/src/backend/archinfo_aix.c @@ -67,6 +67,8 @@ enum MACHTYPE ProbeArch() { if (strstr(res, "PowerPC_POWER5")) mach = 
IbmPwr5; + else if (strstr(res, "PowerPC_POWER8")) + mach = IbmPwr8; else if (strstr(res, "PowerPC_POWER7")) mach = IbmPwr7; else if (strstr(res, "PowerPC_POWER6")) ++++++ atlas.3.10.1-ppc64le_abiv2.patch ++++++ From: Michel Normand <norm...@linux.vnet.ibm.com> Subject: atlas.ppc64le abiv2 Date: Mon, 14 Apr 2014 18:03:06 +0200 References: http://sourceforge.net/p/math-atlas/mailman/message/32471499/ atlas.ppc64le abiv2 * do not use opd section for ABI V2 * define TOC in r2 in function call TODO: may be not required everywhere. based on work of Guy and Thierry TODO: still have to work on stack FSIZE TODO: for ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c need to better understand the change about ld pC0 Signed-off-by: Michel Normand <norm...@linux.vnet.ibm.com> --- CONFIG/src/backend/probe_AltiVec.S | 2 +- CONFIG/src/backend/probe_VSX.S | 2 +- src/threads/ATL_DecAtomicCount_ppc.S | 2 +- src/threads/ATL_ResetAtomicCount_ppc.S | 2 +- tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c | 9 ++++++++- tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c | 9 ++++++++- tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c | 9 ++++++++- tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c | 20 ++++++++++++++++++-- tune/blas/gemm/CASES/ATL_smm4x4x128_av.c | 23 ++++++++++++++++++++++- 9 files changed, 68 insertions(+), 10 deletions(-) Index: ATLAS/CONFIG/src/backend/probe_AltiVec.S =================================================================== --- ATLAS.orig/CONFIG/src/backend/probe_AltiVec.S +++ ATLAS/CONFIG/src/backend/probe_AltiVec.S @@ -6,7 +6,7 @@ * */ .text -#if defined(ATL_USE64BITS) && defined (ATL_OS_Linux) +#if defined(ATL_USE64BITS) && defined (ATL_OS_Linux) && _CALL_ELF != 2 .align 2 .globl ATL_asmdecor(do_vsum) .section ".opd","aw" Index: ATLAS/CONFIG/src/backend/probe_VSX.S =================================================================== --- ATLAS.orig/CONFIG/src/backend/probe_VSX.S +++ ATLAS/CONFIG/src/backend/probe_VSX.S @@ -6,7 +6,7 @@ * */ .text -#if defined(ATL_USE64BITS) && defined 
(ATL_OS_Linux) +#if defined(ATL_USE64BITS) && defined (ATL_OS_Linux) && _CALL_ELF != 2 .align 2 .globl ATL_asmdecor(do_vsum) .section ".opd","aw" Index: ATLAS/src/threads/ATL_DecAtomicCount_ppc.S =================================================================== --- ATLAS.orig/src/threads/ATL_DecAtomicCount_ppc.S +++ ATLAS/src/threads/ATL_DecAtomicCount_ppc.S @@ -4,7 +4,7 @@ .globl _ATL_DecAtomicCount _ATL_DecAtomicCount: #else - #if defined(ATL_USE64BITS) + #if defined(ATL_USE64BITS) && _CALL_ELF != 2 /* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ Index: ATLAS/src/threads/ATL_ResetAtomicCount_ppc.S =================================================================== --- ATLAS.orig/src/threads/ATL_ResetAtomicCount_ppc.S +++ ATLAS/src/threads/ATL_ResetAtomicCount_ppc.S @@ -4,7 +4,7 @@ .globl _ATL_ResetAtomicCount _ATL_ResetAtomicCount: #else - #if defined(ATL_USE64BITS) + #if defined(ATL_USE64BITS) && _CALL_ELF != 2 /* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ Index: ATLAS/tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c +++ ATLAS/tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c @@ -181,7 +181,7 @@ void ATL_USERMM(const int M, const int N .globl Mjoin(_,ATL_USERMM) Mjoin(_,ATL_USERMM): #else - #if defined(ATL_USE64BITS) + #if defined(ATL_USE64BITS) && _CALL_ELF != 2 /* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ @@ -258,8 +258,15 @@ ATL_USERMM: eqv r0, r0, r0 /* all 1s */ ATL_WriteVRSAVE(r0) /* signal we use all vector regs */ #if defined (ATL_USE64BITS) +#if _CALL_ELF == 2 +/* ABIv2 */ + ld pC0, FSIZE+104(r1) + ld ldc, FSIZE+112(r1) +#else +/* ABIv1 */ ld pC0, FSIZE+120(r1) ld ldc, FSIZE+128(r1) +#endif #elif defined(ATL_AS_OSX_PPC) lwz pC0, FSIZE+60(r1) lwz ldc, FSIZE+64(r1) Index: ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c 
=================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c +++ ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c @@ -279,7 +279,7 @@ void ATL_USERMM(const int M, const int N #endif .text #ifdef ATL_GAS_LINUX_PPC - #if defined(ATL_USE64BITS) + #if defined(ATL_USE64BITS) && _CALL_ELF != 2 /* * No idea what this does, but seg fault without it (I think it is * partially resp for making code callable from both static & PIC code) @@ -405,8 +405,15 @@ Mjoin(_,ATL_USERMM): */ #ifdef ATL_GAS_LINUX_PPC #ifdef ATL_USE64BITS + #if _CALL_ELF == 2 + /* ABIv2 */ + ld pC0, 104(r1) + ld incCn, 112(r1) + #else + /* ABIv1 */ ld pC0, 120(r1) ld incCn, 128(r1) + #endif #else lwz incCn, FSIZE+8(r1) #endif Index: ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c +++ ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c @@ -268,7 +268,7 @@ Mjoin(.,ATL_USERMM): .globl Mjoin(_,ATL_USERMM) Mjoin(_,ATL_USERMM): #else - #if defined(ATL_USE64BITS) + #if defined(ATL_USE64BITS) && _CALL_ELF != 2 /* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ @@ -324,8 +324,15 @@ ATL_USERMM: #endif #ifdef ATL_USE64BITS +#if _CALL_ELF == 2 +/* ABIv2 */ + ld pC0, 104(r1) + ld incCn, 112(r1) +#else +/* ABIv1 */ ld pC0, 120(r1) ld incCn, 128(r1) +#endif #elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC) lwz pC0, 68(r1) lwz incCn, 72(r1) Index: ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c +++ ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c @@ -170,13 +170,21 @@ void ATL_USERMM(const int M, const int N const TYPE beta, TYPE *C, const int ldc) (r10) 8(r1) ******************************************************************************* -64 bit ABIs: +64 bit ABIv1s: r3 r4 r5 r6/f1 void 
ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, r7 r8 r9 r10 const TYPE *A, const int lda, const TYPE *B, const int ldb, f2 120(r1) 128(r1) const TYPE beta, TYPE *C, const int ldc) + +64 bit ABIv2s: + r3 r4 r5 r6/f1 +void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, + r7 r8 r9 r10 + const TYPE *A, const int lda, const TYPE *B, const int ldb, + f2 104(r1) 112(r1) + const TYPE beta, TYPE *C, const int ldc) #endif #ifdef ATL_AS_AIX_PPC .csect .text[PR] @@ -202,7 +210,7 @@ Mjoin(.,ATL_USERMM): .globl Mjoin(_,ATL_USERMM) Mjoin(_,ATL_USERMM): #else - #if defined(ATL_USE64BITS) + #if defined(ATL_USE64BITS) && _CALL_ELF != 2 /* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ @@ -257,9 +265,17 @@ ATL_USERMM: #endif #endif + #if defined (ATL_USE64BITS) +#if _CALL_ELF == 2 +/* ABIv2 */ + ld pC0, 104(r1) + ld incCn, 112(r1) +#else +/* ABIv1 */ ld pC0, 120(r1) ld incCn, 128(r1) +#endif #elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC) lwz pC0, 68(r1) lwz incCn, 72(r1) Index: ATLAS/tune/blas/gemm/CASES/ATL_smm4x4x128_av.c =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/ATL_smm4x4x128_av.c +++ ATLAS/tune/blas/gemm/CASES/ATL_smm4x4x128_av.c @@ -196,7 +196,7 @@ void ATL_USERMM(const int M, const int N .globl Mjoin(_,ATL_USERMM) Mjoin(_,ATL_USERMM): #else - #if defined(ATL_USE64BITS) + #if defined(ATL_USE64BITS) && _CALL_ELF != 2 /* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ @@ -221,8 +221,15 @@ ATL_USERMM: * kernel instead */ #if defined (ATL_USE64BITS) +#if _CALL_ELF == 2 +/* ABIv2 */ + ld r10, 104(r1) + ld r5, 112(r1) +#else +/* ABIv1 */ ld r10, 120(r1) ld r5, 128(r1) +#endif #elif defined(ATL_AS_OSX_PPC) lwz r10, 60(r1) lwz r5, 64(r1) @@ -285,8 +292,15 @@ ATL_USERMM: eqv r0, r0, r0 /* all 1s */ ATL_WriteVRSAVE(r0) /* signal we use all vector regs */ #if defined (ATL_USE64BITS) +#if _CALL_ELF == 2 + /* ABIv2 */ + ld 
pC0, FSIZE+104(r1) + ld ldc, FSIZE+112(r1) +#else + /* ABIv1 */ ld pC0, FSIZE+120(r1) ld ldc, FSIZE+128(r1) +#endif #elif defined(ATL_AS_OSX_PPC) lwz pC0, FSIZE+60(r1) lwz ldc, FSIZE+64(r1) @@ -4258,8 +4272,15 @@ UNALIGNED_C: eqv r0, r0, r0 /* all 1s */ ATL_WriteVRSAVE(r0) /* signal we use all vector regs */ #if defined (ATL_USE64BITS) +#if _CALL_ELF == 2 + /* ABIv2 */ + ld pC0, FSIZE+104(r1) + ld ldc, FSIZE+112(r1) +#else + /* ABIv1 */ ld pC0, FSIZE+120(r1) ld ldc, FSIZE+128(r1) +#endif #elif defined(ATL_AS_OSX_PPC) lwz pC0, FSIZE+60(r1) lwz ldc, FSIZE+64(r1) ++++++ atlas.3.10.2-ppc64le_do_not_use_files_with_lvx.patch ++++++ From: Michel Normand <norm...@linux.vnet.ibm.com> Subject: atlas.3.10.2 ppc64le do not use files with lvx Date: Tue, 12 Aug 2014 16:07:06 +0200 ppc64le do not use files with lvx This is a temporary patch as long as the related files are not ported yet to ppc64 little-endian. Warning: patch to be applied only for ppc64le architecture and will also need atlas-new_archdef_for_ppc64le.patch Signed-off-by: Michel Normand <norm...@linux.vnet.ibm.com> --- tune/blas/gemm/CASES/ccases.flg | 6 +----- tune/blas/gemm/CASES/dcases.flg | 8 +------- tune/blas/gemm/CASES/dcases.vnb | 4 ---- tune/blas/gemm/CASES/scases.flg | 9 +-------- tune/blas/gemm/CASES/scases.vnb | 3 --- tune/blas/gemm/CASES/zcases.flg | 8 +------- 6 files changed, 4 insertions(+), 34 deletions(-) Index: ATLAS/tune/blas/gemm/CASES/ccases.flg =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/ccases.flg +++ ATLAS/tune/blas/gemm/CASES/ccases.flg @@ -1,5 +1,5 @@ <ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>" -24 +22 304 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O @@ -48,13 +48,9 @@ gcc 328 480 8 8 2 1 1 8 8 2 ATL_mm8x8x2.c "R. 
Clint Whaley" \ gcc -fomit-frame-pointer -O2 -fno-tree-loop-optimize -329 192 4 4 4 1 16 4 4 4 ATL_cmm4x4x128_av.c "R. Clint Whaley" \ -gcc --x assembler-with-cpp 331 192 4 4 1 1 1 4 4 1 ATL_smm4x4xURx_mips.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mips4 -332 192 8 2 4 1 0 8 2 4 ATL_smm8x2x4_av.c "IBM" 333 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_arm.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mfpu=vfpv3 Index: ATLAS/tune/blas/gemm/CASES/scases.flg =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/scases.flg +++ ATLAS/tune/blas/gemm/CASES/scases.flg @@ -1,5 +1,5 @@ <ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>" -25 +22 304 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O @@ -48,16 +48,9 @@ gcc 328 480 8 8 2 1 1 8 8 2 ATL_mm8x8x2.c "R. Clint Whaley" \ gcc -fomit-frame-pointer -O2 -fno-tree-loop-optimize -329 192 4 4 4 1 16 4 4 4 ATL_smm4x4x128_av.c "R. Clint Whaley" \ -gcc --x assembler-with-cpp -330 200 92 92 92 1 16 92 92 92 ATL_smm4x4x128_av.c "R. Clint Whaley" \ -gcc --x assembler-with-cpp 331 192 4 4 1 1 1 4 4 1 ATL_smm4x4xURx_mips.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mips4 -332 192 8 2 4 1 0 8 2 4 ATL_smm8x2x4_av.c "IBM" 333 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_arm.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mfpu=vfpv3 Index: ATLAS/tune/blas/gemm/CASES/scases.vnb =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/scases.vnb +++ ATLAS/tune/blas/gemm/CASES/scases.vnb @@ -31,9 +31,6 @@ # Defaults: TA='t', TB='n', SSE=0, X87=0, LDBOT=1, RTKU=0, AOUTER=0, # KBMAX=KU, KBMIN=KU, BETAN1=0, RTMN=1 # -ID=1 ROUT='ATL_smm4x4x128_av.c' AUTH='R. Clint Whaley' MU=4 NU=4 KU=4 \ - LDKB=1 LDBOT=1 KBMIN=4 KBMAX=128 ASM=GAS_PPC \ - COMP='gcc' FLAGS='-x assembler-with-cpp' ID=2 ROUT='ATL_smm4x4x16_av.c' AUTH='R. 
Clint Whaley' MU=4 NU=4 KU=16 \ LDKB=1 LDBOT=0 KBMIN=16 KBMAX=2048 ASM=GAS_SPARC \ COMP='gcc' FLAGS='-x assembler-with-cpp' Index: ATLAS/tune/blas/gemm/CASES/dcases.flg =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/dcases.flg +++ ATLAS/tune/blas/gemm/CASES/dcases.flg @@ -1,5 +1,5 @@ <ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>" -32 +30 306 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O -fno-schedule-insns -fno-schedule-insns2 @@ -79,12 +79,6 @@ gcc 336 192 4 4 1 1 1 4 4 1 ATL_dmm4x4xURx_mips.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mips4 -337 192 4 4 1 1 16 4 4 1 ATL_dmm4x4x80_ppc.c "Whaley & Castaldo" \ -gcc --x assembler-with-cpp -338 192 8 4 2 1 0 8 4 2 ATL_dmm8x4x2_vsx.c "IBM" \ -gcc --O3 -mvsx 339 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_arm.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mfpu=vfpv3 Index: ATLAS/tune/blas/gemm/CASES/dcases.vnb =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/dcases.vnb +++ ATLAS/tune/blas/gemm/CASES/dcases.vnb @@ -53,10 +53,6 @@ ID=6 ROUT='ATL_dmm4x1x90_x87.c' AUTH='R ID=7 ROUT='ATL_dmm8x1x120_sse2.c' AUTH='R. Clint Whaley' \ MU=8 NU=1 KU=1 KBMAX=512 ASM=GAS_x8664 BETAN1=1 \ COMP='gcc' FLAGS='-m64 -x assembler-with-cpp' -ID=70 ROUT='ATL_dmm4x4x80_ppc.c' AUTH='R. Clint Whaley' TA='T', TB='N' \ - MU=4 NU=4 KU=1 KBMIN=1 KBMAX=80 ASM=GAS_PPC BETAN1=0 LDBOT=0 \ - LDAB=0 LDISKB=1 RTN=1 RTM=1 RTK=0 \ - COMP='gcc' FLAGS='-x assembler-with-cpp' ID=80 ROUT='ATL_dmm4x4x16r8_US.c' AUTH='R. 
Clint Whaley' TA='T', TB='N' \ MU=4 NU=4 KU=24 KBMIN=24 KBMAX=512 ASM=GAS_SPARC BETAN1=0 \ LDAB=0 RTK=1 RTN=1 RTM=1 LDBOT=0 LDISKB=1 LDAB=1 \ Index: ATLAS/tune/blas/gemm/CASES/zcases.flg =================================================================== --- ATLAS.orig/tune/blas/gemm/CASES/zcases.flg +++ ATLAS/tune/blas/gemm/CASES/zcases.flg @@ -1,5 +1,5 @@ <ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>" -31 +29 306 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O -fno-schedule-insns -fno-schedule-insns2 @@ -76,12 +76,6 @@ gcc 336 192 4 4 1 1 1 4 4 1 ATL_dmm4x4xURx_mips.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mips4 -337 192 4 4 1 1 16 4 4 1 ATL_dmm4x4x80_ppc.c "Whaley & Castaldo" \ -gcc --x assembler-with-cpp -338 192 8 4 2 1 0 8 4 2 ATL_dmm8x4x2_vsx.c "IBM" \ -gcc --O3 -mvsx 339 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_arm.c "R. Clint Whaley" \ gcc -x assembler-with-cpp -mfpu=vfpv3 ++++++ atlas3.10.1.tar.bz2 -> atlas3.10.2.tar.bz2 ++++++ ++++ 10483 lines of diff (skipped) ++++++ issue_64.patch ++++++ From: Michel Normand <norm...@linux.vnet.ibm.com> Subject: issue 64 Date: Mon, 07 Jul 2014 17:15:03 +0200 issue 64, patch as suggested by Clint but not tested by myself. Signed-off-by: Michel Normand <norm...@linux.vnet.ibm.com> --- tune/blas/level3/invtrsm.c | 3 +++ 1 file changed, 3 insertions(+) Index: ATLAS/tune/blas/level3/invtrsm.c =================================================================== --- ATLAS.orig/tune/blas/level3/invtrsm.c +++ ATLAS/tune/blas/level3/invtrsm.c @@ -257,6 +257,9 @@ static void MakeHEDiagDom int j; const int lda2=(lda SHIFT), ldap1=((lda+1)SHIFT); + /* as per issue 64 */ + Mjoin(PATL,gegen)(N, N, A, lda, N*N+lda); + if (Order == CblasRowMajor) { if (Uplo == CblasLower) Uplo = CblasUpper;