diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index cdd846891f..cf10cc4756 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -92,7 +92,18 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); rowC = (v4sf_t *) &CO[1* ldc+J]; \ rowC[0] += result[1] * alpha; #endif - +#define KERNEL(i) \ + rowA = (vec_t *)&AO[i<< 3];\ + rowB = *((__vector_pair *)((void *)&BO[i << 3]));\ + rowB1 = *((__vector_pair *)((void *)&BO[(i << 3) + 4]));\ + __builtin_mma_xvf64gerpp(&acc0, rowB, rowA[0]);\ + __builtin_mma_xvf64gerpp(&acc1, rowB1, rowA[0]);\ + __builtin_mma_xvf64gerpp(&acc2, rowB, rowA[1]);\ + __builtin_mma_xvf64gerpp(&acc3, rowB1, rowA[1]);\ + __builtin_mma_xvf64gerpp(&acc4, rowB, rowA[2]);\ + __builtin_mma_xvf64gerpp(&acc5, rowB1, rowA[2]);\ + __builtin_mma_xvf64gerpp(&acc6, rowB, rowA[3]);\ + __builtin_mma_xvf64gerpp(&acc7, rowB1, rowA[3]); #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -188,7 +199,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; - BLASLONG l = 0; + BLASLONG l = 1; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; rowB = *((__vector_pair *)((void *)&BO[0])); @@ -201,20 +212,55 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 3]; - rowB = *((__vector_pair *)((void *)&BO[l << 3])); - rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); - __builtin_mma_xvf64gerpp (&acc2, rowB, 
rowA[1]); - __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); - } + for (l = 1; l + 15 < temp; l += 16) + { + KERNEL (l); + KERNEL (l+1); + KERNEL (l+2); + KERNEL (l+3); + KERNEL (l+4); + KERNEL (l+5); + KERNEL (l+6); + KERNEL (l+7); + KERNEL (l+8); + KERNEL (l+9); + KERNEL (l+10); + KERNEL (l+11); + KERNEL (l+12); + KERNEL (l+13); + KERNEL (l+14); + KERNEL (l+15); + } + if ((temp - l) & 8) + { + KERNEL(l); + KERNEL(l+1); + KERNEL(l+2); + KERNEL(l+3); + KERNEL(l+4); + KERNEL(l+5); + KERNEL(l+6); + KERNEL(l+7); + l += 8; + } + if ((temp - l) & 4) + { + KERNEL(l); + KERNEL(l+1); + KERNEL(l+2); + KERNEL(l+3); + l += 4; + } + if ((temp - l) & 2) + { + KERNEL(l); + KERNEL(l+1); + l += 2; + } + if ((temp - l) & 1) + { + KERNEL(l); + } SAVE_ACC (&acc0, 0); SAVE_ACC1 (&acc1, 0); SAVE_ACC (&acc2, 2); diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index 65743731ea..bbc1154691 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -25,8 +25,53 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#define HAVE_KERNEL_4x2 1 +#define HAVE_KERNEL_4x1 1 #define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + FLOAT x0,x1; + x0 = xo[0] * alpha; + x1 = xo[1] * alpha; + __vector double v_x0 = {x0,x0}; + __vector double v_x1 = {x1,x1}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)a0; + __vector double* va1 = (__vector double*)a1; + for (int i=0; i< n/2; i+=2) + { + + v_y[i]+= va0[i] * v_x0 + va1[i] * v_x1; + v_y[i+1]+= va0[i+1] * v_x0 + va1[i+1] * v_x1; + + } + + +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + + + + FLOAT x0; + x0 = xo[0] * alpha; + + __vector double v_x0 = {x0,x0}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)a0; + for (int i=0; i< n/2; i+=2) + { + + v_y[i]+= va0[i] * v_x0 ; + v_y[i+1]+= va0[i+1] * v_x0 ; + + } + +} + + static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) { double *a0;