Blob Blame Raw
From: Michel Normand <normand@linux.vnet.ibm.com>
Subject: atlas.3.10.2 ppc64le abiv2 patch
Date: Mon, 28 Jul 2014 04:29:05 -0400

atlas.3.10.2 ppc64le abiv2 step2: complete the changes already present in atlas 3.10.2
* some files still have the ABI v1 opd section, which must be disabled for ABI v2:
 tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c
 tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c
 tune/blas/gemm/CASES/ATL_smm4x4x128_av.c

atlas.3.10.2 ppc64le abiv2 step3
* change the offsets of the parameters read from the stack to avoid some segfaults.
  (values changed: 120 => 104 and 128 => 112, identified by gdb investigation)

Despite this step3 patch, two problems remain for the ppc64le architecture:
* TODO: segfaults are still reported on the console during build/check.
This is not critical (without make check): the rpms are still generated on Fedora.
Unable to investigate further because of the problem tracked by issue 950:
https://sourceforge.net/p/math-atlas/support-requests/950/

* TODO: make check fails because xsslvtst fails at execution.
This is related to vector assembly code that assumes a big-endian environment,
as written in ATL_cmm4x4x128_av.c and ATL_smm4x4x128_av.c.
It would need significant work to support little-endian, as per the
endianness comments on all PowerPC vector instructions in:
https://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/FBFA164F824370F987256D6A006F424D/$file/vector_simd_pem.ppc.2005AUG23.pdf

Signed-off-by: Michel Normand <normand@linux.vnet.ibm.com>
---
 tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c |    7 +++++++
 tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c |    7 +++++++
 tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c |    9 ++++++++-
 tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c |   20 ++++++++++++++++++--
 tune/blas/gemm/CASES/ATL_smm4x4x128_av.c |   23 ++++++++++++++++++++++-
 5 files changed, 62 insertions(+), 4 deletions(-)

Index: ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c
===================================================================
--- ATLAS.orig/tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c
+++ ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x32_ppc.c
@@ -268,7 +268,7 @@ Mjoin(.,ATL_USERMM):
 	.globl  Mjoin(_,ATL_USERMM)
 Mjoin(_,ATL_USERMM):
    #else
-      #if defined(ATL_USE64BITS)
+      #if defined(ATL_USE64BITS) && _CALL_ELF != 2
 /*
  *      Official Program Descripter section, seg fault w/o it on Linux/PPC64
  */
@@ -324,8 +324,15 @@ ATL_USERMM:
 #endif
 
 #ifdef ATL_USE64BITS
+#if _CALL_ELF == 2
+/* ABIv2 */
+        ld      pC0, 104(r1)
+        ld      incCn, 112(r1)
+#else
+/* ABIv1 */
         ld      pC0, 120(r1)
         ld      incCn, 128(r1)
+#endif
 #elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)
         lwz     pC0, 68(r1)
         lwz     incCn,  72(r1)
Index: ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c
===================================================================
--- ATLAS.orig/tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c
+++ ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x80_ppc.c
@@ -170,13 +170,21 @@ void ATL_USERMM(const int M, const int N
                 const TYPE beta, TYPE *C, const int ldc)
                                   (r10)    8(r1)
 *******************************************************************************
-64 bit ABIs:
+64 bit ABIv1s:
                          r3           r4           r5             r6/f1
 void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
                            r7             r8             r9            r10
                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
                              f2   120(r1)        128(r1)
                 const TYPE beta, TYPE *C, const int ldc)
+
+64 bit ABIv2s:
+                         r3           r4           r5             r6/f1
+void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
+                           r7             r8             r9            r10
+                const TYPE *A, const int lda, const TYPE *B, const int ldb,
+                             f2   104(r1)        112(r1)
+                const TYPE beta, TYPE *C, const int ldc)
 #endif
 #ifdef ATL_AS_AIX_PPC
         .csect .text[PR]
@@ -202,7 +210,7 @@ Mjoin(.,ATL_USERMM):
 	.globl  Mjoin(_,ATL_USERMM)
 Mjoin(_,ATL_USERMM):
    #else
-      #if defined(ATL_USE64BITS)
+      #if defined(ATL_USE64BITS) && _CALL_ELF != 2
 /*
  *      Official Program Descripter section, seg fault w/o it on Linux/PPC64
  */
@@ -257,9 +265,17 @@ ATL_USERMM:
    #endif
 #endif
 
+
 #if defined (ATL_USE64BITS)
+#if _CALL_ELF == 2
+/* ABIv2 */
+        ld      pC0, 104(r1)
+        ld      incCn, 112(r1)
+#else
+/* ABIv1 */
         ld      pC0, 120(r1)
         ld      incCn, 128(r1)
+#endif
 #elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)
         lwz     pC0, 68(r1)
         lwz     incCn,  72(r1)
Index: ATLAS/tune/blas/gemm/CASES/ATL_smm4x4x128_av.c
===================================================================
--- ATLAS.orig/tune/blas/gemm/CASES/ATL_smm4x4x128_av.c
+++ ATLAS/tune/blas/gemm/CASES/ATL_smm4x4x128_av.c
@@ -196,7 +196,7 @@ void ATL_USERMM(const int M, const int N
 	.globl  Mjoin(_,ATL_USERMM)
 Mjoin(_,ATL_USERMM):
 #else
-   #if defined(ATL_USE64BITS)
+   #if defined(ATL_USE64BITS) && _CALL_ELF != 2
 /*
  *      Official Program Descripter section, seg fault w/o it on Linux/PPC64
  */
@@ -221,8 +221,15 @@ ATL_USERMM:
  *      kernel instead
  */
 #if defined (ATL_USE64BITS)
+#if _CALL_ELF == 2
+/* ABIv2 */
+        ld      r10, 104(r1)
+        ld      r5, 112(r1)
+#else
+/* ABIv1 */
         ld      r10, 120(r1)
         ld      r5, 128(r1)
+#endif
 #elif defined(ATL_AS_OSX_PPC)
         lwz     r10, 60(r1)
         lwz     r5,  64(r1)
@@ -285,8 +292,15 @@ ATL_USERMM:
         eqv     r0, r0, r0      /* all 1s */
         ATL_WriteVRSAVE(r0)     /* signal we use all vector regs */
 #if defined (ATL_USE64BITS)
+#if _CALL_ELF == 2
+        /* ABIv2 */
+        ld      pC0, FSIZE+104(r1)
+        ld      ldc, FSIZE+112(r1)
+#else
+        /* ABIv1 */
         ld      pC0, FSIZE+120(r1)
         ld      ldc, FSIZE+128(r1)
+#endif
 #elif defined(ATL_AS_OSX_PPC)
         lwz     pC0, FSIZE+60(r1)
         lwz     ldc,  FSIZE+64(r1)
@@ -4258,8 +4272,15 @@ UNALIGNED_C:
         eqv     r0, r0, r0      /* all 1s */
         ATL_WriteVRSAVE(r0)     /* signal we use all vector regs */
 #if defined (ATL_USE64BITS)
+#if _CALL_ELF == 2
+        /* ABIv2 */
+        ld      pC0, FSIZE+104(r1)
+        ld      ldc, FSIZE+112(r1)
+#else
+        /* ABIv1 */
         ld      pC0, FSIZE+120(r1)
         ld      ldc, FSIZE+128(r1)
+#endif
 #elif defined(ATL_AS_OSX_PPC)
         lwz     pC0, FSIZE+60(r1)
         lwz     ldc,  FSIZE+64(r1)
Index: ATLAS/tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c
===================================================================
--- ATLAS.orig/tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c
+++ ATLAS/tune/blas/gemm/CASES/ATL_cmm4x4x128_av.c
@@ -258,8 +258,15 @@ ATL_USERMM:
         eqv     r0, r0, r0      /* all 1s */
         ATL_WriteVRSAVE(r0)     /* signal we use all vector regs */
 #if defined (ATL_USE64BITS)
+#if _CALL_ELF == 2
+/* ABIv2 */
+        ld      pC0, FSIZE+104(r1)
+        ld      ldc, FSIZE+112(r1)
+#else
+/* ABIv1 */
         ld      pC0, FSIZE+120(r1)
         ld      ldc, FSIZE+128(r1)
+#endif
 #elif defined(ATL_AS_OSX_PPC)
         lwz     pC0, FSIZE+60(r1)
         lwz     ldc,  FSIZE+64(r1)
Index: ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c
===================================================================
--- ATLAS.orig/tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c
+++ ATLAS/tune/blas/gemm/CASES/ATL_dmm4x4x2pf_av.c
@@ -405,8 +405,15 @@ Mjoin(_,ATL_USERMM):
  */
 #ifdef ATL_GAS_LINUX_PPC
    #ifdef ATL_USE64BITS
+      #if _CALL_ELF == 2
+      /* ABIv2 */
+        ld      pC0, 104(r1)
+        ld      incCn, 112(r1)
+      #else
+      /* ABIv1 */
 	ld 	pC0, 120(r1)
 	ld 	incCn, 128(r1)
+      #endif
    #else
 	lwz	incCn, FSIZE+8(r1)
    #endif