From 680a77fafc322696f07dcf18c1ca0247c5613a12 Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 5 Mar 2024 20:36:59 +0800 Subject: [PATCH] Optimized ssymv and dsymv LSX kernels for LoongArch --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 6 + kernel/loongarch64/dsymv_L_lsx.S | 432 +++++++++++++++++++++++ kernel/loongarch64/dsymv_U_lsx.S | 420 ++++++++++++++++++++++ kernel/loongarch64/ssymv_L_lsx.S | 429 ++++++++++++++++++++++ kernel/loongarch64/ssymv_U_lsx.S | 417 ++++++++++++++++++++++ 5 files changed, 1704 insertions(+) create mode 100644 kernel/loongarch64/dsymv_L_lsx.S create mode 100644 kernel/loongarch64/dsymv_U_lsx.S create mode 100644 kernel/loongarch64/ssymv_L_lsx.S create mode 100644 kernel/loongarch64/ssymv_U_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index 5b54a2adad..068b3cf4c4 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -88,9 +88,15 @@ ZSUMKERNEL = csum_lsx.S SGEMVNKERNEL = sgemv_n_lsx.S SGEMVTKERNEL = sgemv_t_lsx.S +SSYMV_U_KERNEL = ssymv_U_lsx.S +SSYMV_L_KERNEL = ssymv_L_lsx.S + DGEMVNKERNEL = dgemv_n_lsx.S DGEMVTKERNEL = dgemv_t_lsx.S +DSYMV_U_KERNEL = dsymv_U_lsx.S +DSYMV_L_KERNEL = dsymv_L_lsx.S + DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S DGEMMITCOPY = dgemm_tcopy_8_lsx.S diff --git a/kernel/loongarch64/dsymv_L_lsx.S b/kernel/loongarch64/dsymv_L_lsx.S new file mode 100644 index 0000000000..1fd0d26f58 --- /dev/null +++ b/kernel/loongarch64/dsymv_L_lsx.S @@ -0,0 +1,432 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r6 +#define LDA $r7 +#define X $r8 +#define INCX $r9 +#define Y $r10 +#define INCY $r11 +#define BUFFER $r16 +#define ALPHA $f0 + +#define JY $r18 +#define JX $r31 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr31 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define U16 $vr16 +#define VALPHA $vr17 + +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define a8 $f8 +#define a9 $f9 + + + PROLOGUE + + LDARG BUFFER, $sp, 0 + + addi.d $sp, $sp, -88 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 + + vldrepl.d VALPHA, $sp, 80 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move JY, $r0 + move JX, $r0 + move AO1, A + + beq J, N, .L999 + +.L01: + MTC a2, $r0 //temp2 + fldx.d a6, X, JX + fmul.d a3, ALPHA, a6 //temp1 + vshuf4i.d U3, U3, 0x00 + vshuf4i.d U2, U2, 0x00 + + mul.d T0, J, LDA + slli.d T1, J, BASE_SHIFT + add.d T0, T0, T1 + fldx.d a6, AO1, T0 + fldx.d a4, Y, JY + fmadd.d a4, a3, a6, a4 + fstx.d a4, Y, JY + + move IY, JY + move IX, JX + addi.d II, J, 1 + move I, II + slli.d II, II, BASE_SHIFT + + sub.d T0, M, J + addi.d T0, T0, -1 + srai.d T0, T0, 3 + add.d T0, T0, J + addi.d T0, T0, 1 + beq I, T0, .L03 + bge I, T0, .L03 + + mul.d T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + vldx U1, AO1, T1 + addi.d T1, T1, 16 + vldx U14, AO1, T1 + addi.d T1, T1, 16 + vldx U15, AO1, T1 + addi.d T1, T1, 16 + vldx U16, AO1, T1 + addi.d T1, T1, 16 + + add.d T2, IY, INCY + fldx.d $f4, Y, T2 + add.d T2, T2, INCY + fldx.d $f5, Y, T2 + add.d T2, T2, INCY + fldx.d $f6, Y, T2 + add.d T2, T2, INCY + fldx.d $f7, Y, T2 + + add.d T2, T2, INCY + fldx.d $f8, Y, T2 + add.d T2, T2, INCY + fldx.d $f9, Y, T2 + add.d T2, T2, INCY + fldx.d $f10, Y, T2 + add.d T2, T2, INCY + fldx.d $f11, Y, T2 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + + vfmadd.d U4, U3, U1, U4 + vfmadd.d U6, U3, U14, U6 + vfmadd.d U8, U3, U15, U8 + vfmadd.d U10, U3, U16, U10 + + vextrins.d U5, U4, 0x01 + vextrins.d U7, U6, 0x01 + vextrins.d U9, U8, 0x01 + vextrins.d U11, U10, 0x01 + + add.d T2, IY, INCY + fstx.d $f4, Y, T2 + add.d T2, T2, INCY + fstx.d $f5, Y, T2 + add.d T2, T2, INCY + fstx.d $f6, Y, T2 + add.d T2, T2, INCY + fstx.d $f7, Y, T2 + + add.d T2, T2, INCY + fstx.d $f8, Y, T2 + add.d T2, T2, INCY + fstx.d $f9, Y, T2 + add.d T2, T2, INCY + fstx.d $f10, Y, T2 + add.d T2, T2, INCY + fstx.d $f11, Y, T2 + + slli.d T2, INCY, 3 + add.d IY, IY, T2 + + add.d T2, IX, INCX + fldx.d $f4, X, T2 + add.d T2, T2, INCX + fldx.d $f5, X, T2 + add.d T2, T2, INCX + fldx.d $f6, X, T2 + add.d T2, T2, INCX + fldx.d $f7, X, T2 + + add.d T2, T2, INCX + fldx.d $f8, 
X, T2 + add.d T2, T2, INCX + fldx.d $f9, X, T2 + add.d T2, T2, INCX + fldx.d $f10, X, T2 + add.d T2, T2, INCX + fldx.d $f11, X, T2 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.d U2, U1, U4, U2 + vfsub.d U2, U2, $vr12 + vfmadd.d U2, U14, U6, U2 + vfmadd.d U2, U15, U8, U2 + vfmadd.d U2, U16, U10, U2 + + vextrins.d U4, U2, 0x01 + + fadd.d $f2, $f2, $f4 + fadd.d $f2, $f2, $f12 + + vextrins.d U2, U2, 0x10 + + slli.d T2, INCX, 3 + add.d IX, IX, T2 + + addi.d II, II, 64 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: /* &4 */ + sub.d T0, M, J + addi.d T0, T0, -1 + andi T0, T0, 4 + beq $r0, T0, .L04 + + mul.d T1, J, LDA + add.d T1, T1, II + addi.d T2, T1, 16 + + vldx U1, AO1, T1 + vldx U14, AO1, T2 + + add.d T1, IY, INCY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.d $f4, Y, T1 + fldx.d $f5, Y, T2 + fldx.d $f6, Y, T3 + fldx.d $f7, Y, T4 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + + vfmadd.d U4, U3, U1, U4 + vfmadd.d U6, U3, U14, U6 + + vextrins.d U5, U4, 0x01 + vextrins.d U7, U6, 0x01 + + fstx.d $f4, Y, T1 + fstx.d $f5, Y, T2 + fstx.d $f6, Y, T3 + fstx.d $f7, Y, T4 + + slli.d T1, INCY, 2 + add.d IY, IY, T1 + + add.d T1, IX, INCX + add.d T2, T1, INCX + add.d T3, T2, INCX + add.d T4, T3, INCX + + fldx.d $f4, X, T1 + fldx.d $f5, X, T2 + fldx.d $f6, X, T3 + fldx.d $f7, X, T4 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.d U2, U1, U4, U2 + vfsub.d U2, U2, $vr12 + vfmadd.d U2, U14, U6, U2 + + vextrins.d U4, U2, 0x01 + + fadd.d $f2, $f2, $f4 + fadd.d $f2, $f2, $f12 + + vextrins.d U2, U2, 0x10 + + slli.d T2, INCX, 2 + add.d IX, IX, T2 + + addi.d II, II, 32 + +.L04: /* &2 */ + sub.d T0, M, J + addi.d T0, T0, -1 + andi T0, T0, 2 + beq $r0, T0, .L05 + + mul.d T1, J, LDA + add.d T1, T1, II + + vldx U1, AO1, T1 + + add.d T1, IY, INCY + add.d T2, T1, INCY + + fldx.d $f6, Y, T1 + fldx.d $f7, Y, T2 + + vextrins.d U6, U7, 0x10 + vfmadd.d U6, U3, U1, U6 + vextrins.d U7, U6, 0x01 + + fstx.d $f6, Y, T1 + fstx.d $f7, Y, T2 + + slli.d T1, INCY, 1 + add.d IY, IY, T1 + + add.d T1, IX, INCX + add.d T2, T1, INCX + + fldx.d $f6, X, T1 + fldx.d $f7, X, T2 + + vextrins.d U6, U7, 0x10 + vand.v U12, U2, U2 + + vfmadd.d U2, U1, U6, U2 + vfsub.d U2, U2, U12 + + vextrins.d U4, U2, 0x01 + fadd.d $f2, $f2, $f4 + fadd.d $f2, $f2, $f12 + + vextrins.d U2, U2, 0x10 + + slli.d T2, INCX, 1 + add.d IX, IX, T2 + + addi.d II, II, 16 + +.L05: /* &1 */ + sub.d T0, M, J + addi.d T0, T0, -1 + andi T0, T0, 1 + beq $r0, T0, .L06 + + mul.d T1, J, LDA + add.d T1, T1, II + + fldx.d $f4, AO1, T1 + add.d IY, IY, INCY + fldx.d $f6, Y, IY + fmadd.d $f6, $f3, $f4, $f6 + fstx.d $f6, Y, IY + + add.d IX, IX, INCX + fldx.d $f6, X, IX + fmadd.d $f2, $f4, $f6, $f2 + + addi.d II, II, 8 + +.L06: + fldx.d $f6, Y, JY + fmadd.d $f6, ALPHA, $f2, $f6 + fstx.d $f6, Y, JY + + add.d JX, JX, INCX + add.d JY, JY, INCY + + addi.d J, J, 1 + blt J, N, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LDARG $r31, $sp, 72 + + addi.d $sp, $sp, 88 + jirl $r0, $r1, 0x0 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/dsymv_U_lsx.S b/kernel/loongarch64/dsymv_U_lsx.S new file mode 100644 index 0000000000..f708196aaa --- /dev/null +++ b/kernel/loongarch64/dsymv_U_lsx.S @@ -0,0 +1,420 @@ 
+/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r6 +#define LDA $r7 +#define X $r8 +#define INCX $r9 +#define Y $r10 +#define INCY $r11 +#define BUFFER $r16 +#define ALPHA $f0 + +#define JY $r18 +#define JX $r31 +#define T0 $r19 +#define T1 $r20 +#define M1 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr31 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define U16 $vr16 +#define VALPHA $vr17 + +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define a8 $f8 +#define a9 $f9 + + + PROLOGUE + + LDARG BUFFER, $sp, 0 + + addi.d $sp, $sp, -88 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 + + vldrepl.d VALPHA, $sp, 80 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + sub.d M1, M, N + + mul.d JY, M1, INCY + mul.d JX, M1, INCX + + move J, M1 + move AO1, A + + beq J, M, .L999 + +.L01: + MTC $f2, $r0 //temp2 + fldx.d $f6, X, JX + fmul.d $f3, ALPHA, $f6 //temp1 + vshuf4i.d U3, U3, 0x00 + vshuf4i.d U2, U2, 0x00 + + move IY, $r0 + move IX, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, J, 3 + beq I, T0, .L03 + + mul.d T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + vldx U1, AO1, T1 + addi.d T1, T1, 16 + 
vldx U14, AO1, T1 + addi.d T1, T1, 16 + vldx U15, AO1, T1 + addi.d T1, T1, 16 + vldx U16, AO1, T1 + addi.d T1, T1, 16 + + fldx.d $f4, Y, IY + add.d T2, IY, INCY + fldx.d $f5, Y, T2 + add.d T2, T2, INCY + fldx.d $f6, Y, T2 + add.d T2, T2, INCY + fldx.d $f7, Y, T2 + + add.d T2, T2, INCY + fldx.d $f8, Y, T2 + add.d T2, T2, INCY + fldx.d $f9, Y, T2 + add.d T2, T2, INCY + fldx.d $f10, Y, T2 + add.d T2, T2, INCY + fldx.d $f11, Y, T2 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + + vfmadd.d U4, U3, U1, U4 + vfmadd.d U6, U3, U14, U6 + vfmadd.d U8, U3, U15, U8 + vfmadd.d U10, U3, U16, U10 + + vextrins.d U5, U4, 0x01 + vextrins.d U7, U6, 0x01 + vextrins.d U9, U8, 0x01 + vextrins.d U11, U10, 0x01 + + fstx.d $f4, Y, IY + add.d T2, IY, INCY + fstx.d $f5, Y, T2 + add.d T2, T2, INCY + fstx.d $f6, Y, T2 + add.d T2, T2, INCY + fstx.d $f7, Y, T2 + + add.d T2, T2, INCY + fstx.d $f8, Y, T2 + add.d T2, T2, INCY + fstx.d $f9, Y, T2 + add.d T2, T2, INCY + fstx.d $f10, Y, T2 + add.d T2, T2, INCY + fstx.d $f11, Y, T2 + + slli.d T2, INCY, 3 + add.d IY, IY, T2 + + fldx.d $f4, X, IX + add.d T2, IX, INCX + fldx.d $f5, X, T2 + add.d T2, T2, INCX + fldx.d $f6, X, T2 + add.d T2, T2, INCX + fldx.d $f7, X, T2 + + add.d T2, T2, INCX + fldx.d $f8, X, T2 + add.d T2, T2, INCX + fldx.d $f9, X, T2 + add.d T2, T2, INCX + fldx.d $f10, X, T2 + add.d T2, T2, INCX + fldx.d $f11, X, T2 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.d U2, U1, U4, U2 + vfsub.d U2, U2, $vr12 + vfmadd.d U2, U14, U6, U2 + vfmadd.d U2, U15, U8, U2 + vfmadd.d U2, U16, U10, U2 + + vextrins.d U4, U2, 0x01 + + fadd.d $f2, $f2, $f4 + fadd.d $f2, $f2, $f12 + + vextrins.d U2, U2, 0x10 + + slli.d T2, INCX, 3 + add.d IX, IX, T2 + + addi.d II, II, 64 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: /* &4 */ + andi T0, J, 4 + beq $r0, T0, .L04 + + mul.d T1, J, LDA + add.d T1, T1, II + addi.d T2, T1, 16 + + vldx U1, AO1, T1 + vldx U14, AO1, T2 + + move T1, IY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.d $f4, Y, T1 + fldx.d $f5, Y, T2 + fldx.d $f6, Y, T3 + fldx.d $f7, Y, T4 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + + vfmadd.d U4, U3, U1, U4 + vfmadd.d U6, U3, U14, U6 + + vextrins.d U5, U4, 0x01 + vextrins.d U7, U6, 0x01 + + fstx.d $f4, Y, T1 + fstx.d $f5, Y, T2 + fstx.d $f6, Y, T3 + fstx.d $f7, Y, T4 + + slli.d T1, INCY, 2 + add.d IY, IY, T1 + + move T1, IX + add.d T2, T1, INCX + add.d T3, T2, INCX + add.d T4, T3, INCX + + fldx.d $f4, X, T1 + fldx.d $f5, X, T2 + fldx.d $f6, X, T3 + fldx.d $f7, X, T4 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.d U2, U1, U4, U2 + vfsub.d U2, U2, $vr12 + vfmadd.d U2, U14, U6, U2 + + vextrins.d U4, U2, 0x01 + + fadd.d $f2, $f2, $f4 + fadd.d $f2, $f2, $f12 + + vextrins.d U2, U2, 0x10 + + slli.d T2, INCX, 2 + add.d IX, IX, T2 + + addi.d II, II, 32 + +.L04: /* &2 */ + andi T0, J, 2 + beq $r0, T0, .L05 + + mul.d T1, J, LDA + add.d T1, T1, II + + vldx $vr1, AO1, T1 + + move T1, IY + add.d T2, T1, INCY + + fldx.d $f6, Y, T1 + fldx.d $f7, Y, T2 + + vextrins.d U6, U7, 0x10 + vfmadd.d U6, U3, U1, U6 + vextrins.d U7, U6, 0x01 + + fstx.d $f6, Y, T1 + fstx.d $f7, Y, T2 + + slli.d T1, INCY, 1 + add.d IY, IY, T1 + + move T1, IX + add.d T2, T1, INCX + + fldx.d $f6, X, T1 + fldx.d $f7, X, T2 + + vextrins.d U6, U7, 0x10 + vand.v U12, U2, U2 + + vfmadd.d U2, U1, U6, U2 + vfsub.d U2, U2, U12 + + vextrins.d U4, 
U2, 0x01 + fadd.d $f2, $f2, $f4 + fadd.d $f2, $f2, $f12 + + vextrins.d U2, U2, 0x10 + + slli.d T2, INCX, 1 + add.d IX, IX, T2 + + addi.d II, II, 16 + +.L05: /* &1 */ + andi T0, J, 1 + beq $r0, T0, .L06 + + mul.d T1, J, LDA + add.d T1, T1, II + + fldx.d $f4, AO1, T1 + fldx.d $f6, Y, IY + fmadd.d $f6, $f3, $f4, $f6 + fstx.d $f6, Y, IY + add.d IY, IY, INCY + + fldx.d $f6, X, IX + fmadd.d $f2, $f4, $f6, $f2 + add.d IX, IX, INCX + + addi.d II, II, 8 + +.L06: + mul.d T1, J, LDA + slli.d T2, J, BASE_SHIFT + add.d T1, T1, T2 + + fldx.d $f6, Y, JY + fldx.d $f4, AO1, T1 + fmadd.d $f6, $f3, $f4, $f6 + fmul.d $f7, ALPHA, $f2 + fadd.d $f6, $f6, $f7 + + fstx.d $f6, Y, JY + + add.d JX, JX, INCX + add.d JY, JY, INCY + + addi.d J, J, 1 + blt J, M, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LDARG $r31, $sp, 72 + + addi.d $sp, $sp, 88 + jirl $r0, $r1, 0x0 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssymv_L_lsx.S b/kernel/loongarch64/ssymv_L_lsx.S new file mode 100644 index 0000000000..949e9e9025 --- /dev/null +++ b/kernel/loongarch64/ssymv_L_lsx.S @@ -0,0 +1,429 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r6 +#define LDA $r7 +#define X $r8 +#define INCX $r9 +#define Y $r10 +#define INCY $r11 +#define BUFFER $r16 +#define ALPHA $f0 + +#define JY $r18 +#define JX $r31 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr31 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define U16 $vr16 +#define VALPHA $vr17 + +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define a8 $f8 +#define a9 $f9 + + + PROLOGUE + + LDARG BUFFER, $sp, 0 + + addi.d $sp, $sp, -88 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 + + vldrepl.w VALPHA, $sp, 80 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move JY, $r0 + move JX, $r0 + move AO1, A + + beq J, N, .L999 + +.L01: + MTC a2, $r0 //temp2 + fldx.s a6, X, JX + fmul.s a3, ALPHA, a6 //temp1 + vpermi.w U3, U3, 0x00 + vpermi.w U2, U2, 0x00 + + mul.w T0, J, LDA + slli.d T1, J, BASE_SHIFT + add.w T0, T0, T1 + fldx.s a6, AO1, T0 + fldx.s a4, Y, JY + fmadd.s a4, a3, a6, a4 + fstx.s a4, Y, JY + + move IY, JY + move IX, JX + addi.d II, J, 1 + move I, II + slli.d II, II, BASE_SHIFT + + sub.d T0, M, J + addi.d T0, T0, -1 + srai.d T0, T0, 3 + add.d T0, T0, J + addi.d T0, T0, 1 + beq I, T0, .L03 + bge I, T0, .L03 + + mul.w T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + vldx U1, AO1, T1 + addi.d T1, T1, 16 + vldx U14, AO1, T1 + addi.d T1, T1, 16 + + add.d T2, IY, INCY + fldx.s $f4, Y, T2 + add.d T2, T2, INCY + fldx.s $f5, Y, T2 + add.d T2, T2, INCY + fldx.s $f6, Y, T2 + add.d T2, T2, INCY + fldx.s $f7, Y, T2 + + add.d T2, T2, INCY + fldx.s $f8, Y, T2 + add.d T2, T2, INCY + fldx.s $f9, Y, T2 + add.d T2, T2, INCY + fldx.s $f10, Y, T2 + add.d T2, T2, INCY + fldx.s $f11, Y, T2 + + vextrins.w U4, U5, 0x10 + vextrins.w U4, U6, 0x20 + vextrins.w U4, U7, 0x30 + vextrins.w U8, U9, 0x10 + vextrins.w U8, U10, 0x20 + vextrins.w U8, U11, 0x30 + + vfmadd.s U4, U3, U1, U4 + vfmadd.s U8, U3, U14, U8 + + vextrins.w U5, U4, 0x01 + vextrins.w U6, U4, 0x02 + vextrins.w U7, U4, 0x03 + vextrins.w U9, U8, 0x01 + vextrins.w U10, U8, 0x02 + vextrins.w U11, U8, 0x03 + + add.d T2, IY, INCY + fstx.s $f4, Y, T2 + add.d T2, T2, INCY + fstx.s $f5, Y, T2 + add.d T2, T2, INCY + fstx.s $f6, Y, T2 + add.d T2, T2, INCY + fstx.s $f7, Y, T2 + + add.d T2, T2, INCY + fstx.s $f8, Y, T2 + add.d T2, T2, INCY + fstx.s $f9, Y, T2 + add.d T2, T2, INCY + fstx.s $f10, Y, T2 + add.d T2, T2, INCY + fstx.s $f11, Y, T2 + + slli.d T2, INCY, 3 + add.d IY, IY, T2 + + add.d T2, IX, INCX + fldx.s $f4, X, T2 + add.d T2, T2, INCX + fldx.s $f5, X, T2 + add.d T2, T2, INCX + fldx.s $f6, X, T2 + add.d T2, T2, INCX + fldx.s $f7, X, T2 + + add.d T2, T2, INCX + fldx.s $f8, X, T2 + add.d T2, T2, INCX + 
fldx.s $f9, X, T2 + add.d T2, T2, INCX + fldx.s $f10, X, T2 + add.d T2, T2, INCX + fldx.s $f11, X, T2 + + vextrins.w $vr4, $vr5, 0x10 + vextrins.w $vr4, $vr6, 0x20 + vextrins.w $vr4, $vr7, 0x30 + vextrins.w $vr8, $vr9, 0x10 + vextrins.w $vr8, $vr10, 0x20 + vextrins.w $vr8, $vr11, 0x30 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.s U2, U1, U4, U2 + vfsub.s U2, U2, $vr12 + vfmadd.s U2, U14, U8, U2 + + vextrins.w U4, U2, 0x01 + vextrins.w U5, U2, 0x02 + vextrins.w U6, U2, 0x03 + + fadd.s $f2, $f2, $f4 + fadd.s $f2, $f2, $f5 + fadd.s $f2, $f2, $f6 + fadd.s $f2, $f2, $f12 + + vpermi.w U2, U2, 0x00 + + slli.d T2, INCX, 3 + add.d IX, IX, T2 + + addi.d II, II, 32 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: /* &4 */ + sub.d T0, M, J + addi.d T0, T0, -1 + andi T0, T0, 4 + beq $r0, T0, .L04 + + mul.w T1, J, LDA + add.d T1, T1, II + + vldx U1, AO1, T1 + + add.d T1, IY, INCY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.s $f4, Y, T1 + fldx.s $f5, Y, T2 + fldx.s $f6, Y, T3 + fldx.s $f7, Y, T4 + + vextrins.w U4, U5, 0x10 + vextrins.w U4, U6, 0x20 + vextrins.w U4, U7, 0x30 + + vfmadd.s U4, U3, U1, U4 + + vextrins.w U5, U4, 0x01 + vextrins.w U6, U4, 0x02 + vextrins.w U7, U4, 0x03 + + fstx.s $f4, Y, T1 + fstx.s $f5, Y, T2 + fstx.s $f6, Y, T3 + fstx.s $f7, Y, T4 + + slli.d T1, INCY, 2 + add.d IY, IY, T1 + + add.d T1, IX, INCX + add.d T2, T1, INCX + add.d T3, T2, INCX + add.d T4, T3, INCX + + fldx.s $f4, X, T1 + fldx.s $f5, X, T2 + fldx.s $f6, X, T3 + fldx.s $f7, X, T4 + + vextrins.w U4, U5, 0x10 + vextrins.w U4, U6, 0x20 + vextrins.w U4, U7, 0x30 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.s U2, U1, U4, U2 + vfsub.s $vr2, $vr2, $vr12 + + vextrins.w U4, U2, 0x01 + vextrins.w U5, U2, 0x02 + vextrins.w U6, U2, 0x03 + + fadd.s $f2, $f2, $f4 + fadd.s $f2, $f2, $f5 + fadd.s $f2, $f2, $f6 + fadd.s $f2, $f2, $f12 + + vpermi.w U2, U2, 0x00 + + slli.d T2, INCX, 2 + add.d IX, IX, T2 + + addi.d II, II, 16 + +.L04: /* &2 */ + sub.d T0, M, J + addi.d T0, T0, -1 + andi T0, T0, 2 + beq $r0, T0, .L05 + + mul.w T1, J, LDA + add.d T1, T1, II + addi.d T2, T1, 4 + + fldx.s $f4, AO1, T1 + fldx.s $f5, AO1, T2 + + add.d T1, IY, INCY + add.d T2, T1, INCY + + fldx.s $f6, Y, T1 + fldx.s $f7, Y, T2 + + fmadd.s $f6, $f3, $f4, $f6 + fmadd.s $f7, $f3, $f5, $f7 + + fstx.s $f6, Y, T1 + fstx.s $f7, Y, T2 + + slli.d T1, INCY, 1 + add.d IY, IY, T1 + + add.d T1, IX, INCX + add.d T2, T1, INCX + + fldx.s $f6, X, T1 + fldx.s $f7, X, T2 + + fmadd.s $f2, $f4, $f6, $f2 + fmadd.s $f2, $f5, $f7, $f2 + + slli.d T2, INCX, 1 + add.d IX, IX, T2 + + addi.d II, II, 8 + +.L05: /* &1 */ + sub.d T0, M, J + addi.d T0, T0, -1 + andi T0, T0, 1 + beq $r0, T0, .L06 + + mul.w T1, J, LDA + add.d T1, T1, II + + fldx.s $f4, AO1, T1 + add.d IY, IY, INCY + fldx.s $f6, Y, IY + fmadd.s $f6, $f3, $f4, $f6 + fstx.s $f6, Y, IY + + add.d IX, IX, INCX + fldx.s $f6, X, IX + fmadd.s $f2, $f4, $f6, $f2 + + addi.d II, II, 4 + +.L06: + fldx.s $f6, Y, JY + fmadd.s $f6, ALPHA, $f2, $f6 + fstx.s $f6, Y, JY + + add.d JX, JX, INCX + add.d JY, JY, INCY + + addi.d J, J, 1 + blt J, N, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LDARG $r31, $sp, 72 + + addi.d $sp, $sp, 88 + jirl $r0, $r1, 0x0 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/ssymv_U_lsx.S b/kernel/loongarch64/ssymv_U_lsx.S new file mode 100644 index 0000000000..f3898e1483 --- /dev/null +++ b/kernel/loongarch64/ssymv_U_lsx.S @@ -0,0 +1,417 @@ 
+/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r6 +#define LDA $r7 +#define X $r8 +#define INCX $r9 +#define Y $r10 +#define INCY $r11 +#define BUFFER $r16 +#define ALPHA $f0 + +#define JY $r18 +#define JX $r31 +#define T0 $r19 +#define T1 $r20 +#define M1 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr31 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define U16 $vr16 +#define VALPHA $vr17 + +#define a2 $f2 +#define a3 $f3 +#define a4 $f4 +#define a5 $f5 +#define a6 $f6 +#define a7 $f7 +#define a8 $f8 +#define a9 $f9 + + + PROLOGUE + + LDARG BUFFER, $sp, 0 + + addi.d $sp, $sp, -88 + + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 + + vldrepl.w VALPHA, $sp, 80 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + sub.d M1, M, N + + mul.d JY, M1, INCY + mul.d JX, M1, INCX + + move J, M1 + move AO1, A + + beq J, M, .L999 + +.L01: + MTC $f2, $r0 //temp2 + fldx.s $f6, X, JX + fmul.s $f3, ALPHA, $f6 //temp1 + vpermi.w U3, U3, 0x00 + vpermi.w U2, U2, 0x00 + + move IY, $r0 + move IX, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, J, 3 + beq I, T0, .L03 + + mul.w T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + vldx U1, AO1, T1 + addi.d T1, T1, 16 + vldx 
U14, AO1, T1 + addi.d T1, T1, 16 + + fldx.s $f4, Y, IY + add.d T2, IY, INCY + fldx.s $f5, Y, T2 + add.d T2, T2, INCY + fldx.s $f6, Y, T2 + add.d T2, T2, INCY + fldx.s $f7, Y, T2 + + add.d T2, T2, INCY + fldx.s $f8, Y, T2 + add.d T2, T2, INCY + fldx.s $f9, Y, T2 + add.d T2, T2, INCY + fldx.s $f10, Y, T2 + add.d T2, T2, INCY + fldx.s $f11, Y, T2 + + vextrins.w U4, U5, 0x10 + vextrins.w U4, U6, 0x20 + vextrins.w U4, U7, 0x30 + vextrins.w U8, U9, 0x10 + vextrins.w U8, U10, 0x20 + vextrins.w U8, U11, 0x30 + + vfmadd.s U4, U3, U1, U4 + vfmadd.s U8, U3, U14, U8 + + vextrins.w U5, U4, 0x01 + vextrins.w U6, U4, 0x02 + vextrins.w U7, U4, 0x03 + vextrins.w U9, U8, 0x01 + vextrins.w U10, U8, 0x02 + vextrins.w U11, U8, 0x03 + + fstx.s $f4, Y, IY + add.d T2, IY, INCY + fstx.s $f5, Y, T2 + add.d T2, T2, INCY + fstx.s $f6, Y, T2 + add.d T2, T2, INCY + fstx.s $f7, Y, T2 + + add.d T2, T2, INCY + fstx.s $f8, Y, T2 + add.d T2, T2, INCY + fstx.s $f9, Y, T2 + add.d T2, T2, INCY + fstx.s $f10, Y, T2 + add.d T2, T2, INCY + fstx.s $f11, Y, T2 + + slli.d T2, INCY, 3 + add.d IY, IY, T2 + + fldx.s $f4, X, IX + add.d T2, IX, INCX + fldx.s $f5, X, T2 + add.d T2, T2, INCX + fldx.s $f6, X, T2 + add.d T2, T2, INCX + fldx.s $f7, X, T2 + + add.d T2, T2, INCX + fldx.s $f8, X, T2 + add.d T2, T2, INCX + fldx.s $f9, X, T2 + add.d T2, T2, INCX + fldx.s $f10, X, T2 + add.d T2, T2, INCX + fldx.s $f11, X, T2 + + vextrins.w $vr4, $vr5, 0x10 + vextrins.w $vr4, $vr6, 0x20 + vextrins.w $vr4, $vr7, 0x30 + vextrins.w $vr8, $vr9, 0x10 + vextrins.w $vr8, $vr10, 0x20 + vextrins.w $vr8, $vr11, 0x30 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.s U2, U1, U4, U2 + vfsub.s U2, U2, $vr12 + vfmadd.s U2, U14, U8, U2 + + vextrins.w U4, U2, 0x01 + vextrins.w U5, U2, 0x02 + vextrins.w U6, U2, 0x03 + + fadd.s $f2, $f2, $f4 + fadd.s $f2, $f2, $f5 + fadd.s $f2, $f2, $f6 + fadd.s $f2, $f2, $f12 + + vpermi.w U2, U2, 0x00 + + slli.d T2, INCX, 3 + add.d IX, IX, T2 + + addi.d II, II, 32 + addi.d I, I, 1 + blt I, T0, .L02 + +.L03: /* &4 */ + andi T0, J, 4 + beq $r0, T0, .L04 + + mul.w T1, J, LDA + add.d T1, T1, II + + vldx U1, AO1, T1 + + move T1, IY + add.d T2, T1, INCY + add.d T3, T2, INCY + add.d T4, T3, INCY + + fldx.s $f4, Y, T1 + fldx.s $f5, Y, T2 + fldx.s $f6, Y, T3 + fldx.s $f7, Y, T4 + + vextrins.w U4, U5, 0x10 + vextrins.w U4, U6, 0x20 + vextrins.w U4, U7, 0x30 + + vfmadd.s U4, U3, U1, U4 + + vextrins.w U5, U4, 0x01 + vextrins.w U6, U4, 0x02 + vextrins.w U7, U4, 0x03 + + fstx.s $f4, Y, T1 + fstx.s $f5, Y, T2 + fstx.s $f6, Y, T3 + fstx.s $f7, Y, T4 + + slli.d T1, INCY, 2 + add.d IY, IY, T1 + + move T1, IX + add.d T2, T1, INCX + add.d T3, T2, INCX + add.d T4, T3, INCX + + fldx.s $f4, X, T1 + fldx.s $f5, X, T2 + fldx.s $f6, X, T3 + fldx.s $f7, X, T4 + + vextrins.w U4, U5, 0x10 + vextrins.w U4, U6, 0x20 + vextrins.w U4, U7, 0x30 + + vand.v $vr12, $vr2, $vr2 + + vfmadd.s U2, U1, U4, U2 + vfsub.s $vr2, $vr2, $vr12 + + vextrins.w U4, U2, 0x01 + vextrins.w U5, U2, 0x02 + vextrins.w U6, U2, 0x03 + + fadd.s $f2, $f2, $f4 + fadd.s $f2, $f2, $f5 + fadd.s $f2, $f2, $f6 + fadd.s $f2, $f2, $f12 + + vpermi.w U2, U2, 0x00 + + slli.d T2, INCX, 2 + add.d IX, IX, T2 + + addi.d II, II, 16 + +.L04: /* &2 */ + andi T0, J, 2 + beq $r0, T0, .L05 + + mul.w T1, J, LDA + add.d T1, T1, II + addi.d T2, T1, 4 + + fldx.s $f4, AO1, T1 + fldx.s $f5, AO1, T2 + + move T1, IY + add.d T2, T1, INCY + + fldx.s $f6, Y, T1 + fldx.s $f7, Y, T2 + + fmadd.s $f6, $f3, $f4, $f6 + fmadd.s $f7, $f3, $f5, $f7 + + fstx.s $f6, Y, T1 + fstx.s $f7, Y, T2 + + slli.d T1, INCY, 1 + add.d IY, IY, T1 + + move T1, 
IX + add.d T2, T1, INCX + + fldx.s $f6, X, T1 + fldx.s $f7, X, T2 + + fmadd.s $f2, $f4, $f6, $f2 + fmadd.s $f2, $f5, $f7, $f2 + + slli.d T2, INCX, 1 + add.d IX, IX, T2 + + addi.d II, II, 8 + +.L05: /* &1 */ + andi T0, J, 1 + beq $r0, T0, .L06 + + mul.w T1, J, LDA + add.d T1, T1, II + + fldx.s $f4, AO1, T1 + fldx.s $f6, Y, IY + fmadd.s $f6, $f3, $f4, $f6 + fstx.s $f6, Y, IY + add.d IY, IY, INCY + + fldx.s $f6, X, IX + fmadd.s $f2, $f4, $f6, $f2 + add.d IX, IX, INCX + + addi.d II, II, 4 + +.L06: + mul.w T1, J, LDA + slli.d T2, J, BASE_SHIFT + add.d T1, T1, T2 + + fldx.s $f6, Y, JY + fldx.s $f4, AO1, T1 + fmadd.s $f6, $f3, $f4, $f6 + fmul.s $f7, ALPHA, $f2 + fadd.s $f6, $f6, $f7 + + fstx.s $f6, Y, JY + + add.d JX, JX, INCX + add.d JY, JY, INCY + + addi.d J, J, 1 + blt J, M, .L01 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 32 + LDARG $r27, $sp, 40 + LDARG $r28, $sp, 48 + LDARG $r29, $sp, 56 + LDARG $r30, $sp, 64 + LDARG $r31, $sp, 72 + + addi.d $sp, $sp, 88 + jirl $r0, $r1, 0x0 + + EPILOGUE \ No newline at end of file
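
Reviewer note, for reference only (not part of the patch): all four kernels above implement the same column-by-column SYMV update that OpenBLAS's portable C kernels express in scalar form (e.g. kernel/arm/symv_L.c); the LSX code unrolls the inner loop by 8 elements per iteration (four 128-bit vectors for double, two for float, label .L02) with 4/2/1-element tails (.L03 through .L05). A minimal scalar sketch of the lower-triangular variant, assuming the usual OpenBLAS symv kernel signature (m, offset, alpha, a, lda, x, incx, y, incy, buffer) with the unused buffer argument dropped, is:

/* Scalar reference for the symv_L kernels above, modeled on OpenBLAS's
   generic C implementation; double precision shown, the float version
   differs only in type. incx/incy are element strides here (the assembly
   pre-scales them by the element size with slli.d). */
typedef long BLASLONG;

int symv_L_ref(BLASLONG m, BLASLONG offset, double alpha,
               double *a, BLASLONG lda,
               double *x, BLASLONG incx,
               double *y, BLASLONG incy)
{
    BLASLONG jx = 0, jy = 0;                    /* JX/JY in the assembly      */
    for (BLASLONG j = 0; j < offset; j++) {     /* .L01: one column per pass  */
        double temp1 = alpha * x[jx];           /* a3, broadcast into U3      */
        double temp2 = 0.0;                     /* a2, accumulated in U2      */
        y[jy] += temp1 * a[j * lda + j];        /* diagonal element           */
        BLASLONG ix = jx, iy = jy;
        for (BLASLONG i = j + 1; i < m; i++) {  /* .L02-.L05: 8/4/2/1 blocks  */
            ix += incx;
            iy += incy;
            y[iy] += temp1 * a[j * lda + i];    /* vfmadd into the y lanes    */
            temp2 += a[j * lda + i] * x[ix];    /* dot-product accumulation   */
        }
        y[jy] += alpha * temp2;                 /* .L06: fold temp2 back in   */
        jx += incx;
        jy += incy;
    }
    return 0;
}

The upper-triangular kernels walk the same structure from the other end: j runs from m - offset to m, the inner loop covers rows 0 through j - 1 starting from ix = iy = 0, and the diagonal term is folded into the final y[jy] update at .L06.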