Add invscalv, invertv (flame#12)

Co-authored-by: Michael Yeh <[email protected]>
sifive · Mar 28, 2023 · c33f542 · c33f542
1 parent 60374c0
commit c33f542
Show file tree

Hide file tree

Showing 4 changed files with 411 additions and 0 deletions.
diff --git a/config/x280/bli_cntx_init_x280.c b/config/x280/bli_cntx_init_x280.c
@@ -30,6 +30,16 @@ void bli_cntx_init_x280( cntx_t* cntx )
         BLIS_COPYV_KER,      BLIS_SCOMPLEX, bli_ccopyv_x280_asm,
         BLIS_COPYV_KER,      BLIS_DCOMPLEX, bli_zcopyv_x280_asm,
 
+        BLIS_INVERTV_KER,    BLIS_FLOAT,    bli_sinvertv_x280_asm,
+        BLIS_INVERTV_KER,    BLIS_DOUBLE,   bli_dinvertv_x280_asm,
+        BLIS_INVERTV_KER,    BLIS_SCOMPLEX, bli_cinvertv_x280_asm,
+        BLIS_INVERTV_KER,    BLIS_DCOMPLEX, bli_zinvertv_x280_asm,
+
+        BLIS_INVSCALV_KER,   BLIS_FLOAT,    bli_sinvscalv_x280_asm,
+        BLIS_INVSCALV_KER,   BLIS_DOUBLE,   bli_dinvscalv_x280_asm,
+        BLIS_INVSCALV_KER,   BLIS_SCOMPLEX, bli_cinvscalv_x280_asm,
+        BLIS_INVSCALV_KER,   BLIS_DCOMPLEX, bli_zinvscalv_x280_asm,
+
         BLIS_SETV_KER,       BLIS_FLOAT,    bli_ssetv_x280_asm,
         BLIS_SETV_KER,       BLIS_DOUBLE,   bli_dsetv_x280_asm,
         BLIS_SETV_KER,       BLIS_SCOMPLEX, bli_csetv_x280_asm,

diff --git a/kernels/x280/1/invertv.c b/kernels/x280/1/invertv.c
@@ -0,0 +1,175 @@
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4 /* bytes per float element; 8 * FLT_SIZE is passed to vsetvli as the element width in bits */
+#define FLT_LOAD "flw " /* scalar load mnemonic for a 32-bit float */
+#define VLE "vle32.v " /* unit-stride vector load, 32-bit elements */
+#define VLSE "vlse32.v " /* strided vector load, 32-bit elements */
+#define VSE "vse32.v " /* unit-stride vector store, 32-bit elements */
+#define VSSE "vsse32.v " /* strided vector store, 32-bit elements */
+
+/*
+ * In-place element-wise reciprocal of a single-precision vector:
+ * x[i] := 1 / x[i] for i = 0 .. n-1.
+ *
+ *   n     number of elements; the call is a no-op when n <= 0
+ *   x     vector to invert, read and overwritten
+ *   incx  distance between consecutive elements, counted in elements
+ *   cntx  unused
+ *
+ * Each strip is issued as ONE asm statement. The vector configuration, the
+ * scalar constant and the data registers never have to survive between
+ * separate asm statements, and the operand/clobber lists tell the compiler
+ * exactly what is touched: v0-v7 (the LMUL=8 group) and the memory behind x.
+ */
+void bli_sinvertv_x280_asm(dim_t n, float *restrict x, inc_t incx,
+                           cntx_t *restrict cntx) {
+    (void)cntx;
+    /* Guard before converting to size_t: a negative n would wrap to a huge count. */
+    if (n <= 0) {
+        return;
+    }
+    const float one = 1.f;
+    const inc_t stride = incx * FLT_SIZE; /* byte stride for the strided load/store */
+    size_t avl = (size_t)n;               /* elements still to process */
+    while (avl) {
+        size_t vl; /* elements handled by this strip, chosen by vsetvli */
+        if (incx == 1) {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m8, ta, ma\n\t"
+                             VLE "v0, (%[x])\n\t"
+                             "vfrdiv.vf v0, v0, %[one]\n\t" /* v0 = one / v0 */
+                             VSE "v0, (%[x])"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x), [one] "f"(one)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7");
+        } else {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m8, ta, ma\n\t"
+                             VLSE "v0, (%[x]), %[stride]\n\t"
+                             "vfrdiv.vf v0, v0, %[one]\n\t" /* v0 = one / v0 */
+                             VSSE "v0, (%[x]), %[stride]"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x), [stride] "r"(stride),
+                               [one] "f"(one)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7");
+        }
+        /* Step past the vl elements just processed (signed, so negative incx works). */
+        x += (inc_t)vl * incx;
+        avl -= vl;
+    }
+}
+
+#undef FLT_SIZE /* drop the 32-bit definitions before redefining them for double */
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8 /* bytes per double element; 8 * FLT_SIZE is passed to vsetvli as the element width in bits */
+#define FLT_LOAD "fld " /* scalar load mnemonic for a 64-bit double */
+#define VLE "vle64.v " /* unit-stride vector load, 64-bit elements */
+#define VLSE "vlse64.v " /* strided vector load, 64-bit elements */
+#define VSE "vse64.v " /* unit-stride vector store, 64-bit elements */
+#define VSSE "vsse64.v " /* strided vector store, 64-bit elements */
+
+/*
+ * In-place element-wise reciprocal of a double-precision vector:
+ * x[i] := 1 / x[i] for i = 0 .. n-1.
+ *
+ *   n     number of elements; the call is a no-op when n <= 0
+ *   x     vector to invert, read and overwritten
+ *   incx  distance between consecutive elements, counted in elements
+ *   cntx  unused
+ *
+ * Each strip is issued as ONE asm statement. The vector configuration, the
+ * scalar constant and the data registers never have to survive between
+ * separate asm statements, and the operand/clobber lists tell the compiler
+ * exactly what is touched: v0-v7 (the LMUL=8 group) and the memory behind x.
+ */
+void bli_dinvertv_x280_asm(dim_t n, double *restrict x, inc_t incx,
+                           cntx_t *restrict cntx) {
+    (void)cntx;
+    /* Guard before converting to size_t: a negative n would wrap to a huge count. */
+    if (n <= 0) {
+        return;
+    }
+    const double one = 1.;
+    const inc_t stride = incx * FLT_SIZE; /* byte stride for the strided load/store */
+    size_t avl = (size_t)n;               /* elements still to process */
+    while (avl) {
+        size_t vl; /* elements handled by this strip, chosen by vsetvli */
+        if (incx == 1) {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m8, ta, ma\n\t"
+                             VLE "v0, (%[x])\n\t"
+                             "vfrdiv.vf v0, v0, %[one]\n\t" /* v0 = one / v0 */
+                             VSE "v0, (%[x])"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x), [one] "f"(one)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7");
+        } else {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m8, ta, ma\n\t"
+                             VLSE "v0, (%[x]), %[stride]\n\t"
+                             "vfrdiv.vf v0, v0, %[one]\n\t" /* v0 = one / v0 */
+                             VSSE "v0, (%[x]), %[stride]"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x), [stride] "r"(stride),
+                               [one] "f"(one)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7");
+        }
+        /* Step past the vl elements just processed (signed, so negative incx works). */
+        x += (inc_t)vl * incx;
+        avl -= vl;
+    }
+}
+
+#undef FLT_SIZE /* drop the 64-bit real definitions before the single-precision complex set */
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4 /* bytes per real component; the complex kernel below scales incx by 2 * FLT_SIZE */
+#define VLSEG2 "vlseg2e32.v " /* unit-stride two-field segment load, 32-bit fields */
+#define VLSSEG2 "vlsseg2e32.v " /* strided two-field segment load, 32-bit fields */
+#define VSSEG2 "vsseg2e32.v " /* unit-stride two-field segment store, 32-bit fields */
+#define VSSSEG2 "vssseg2e32.v " /* strided two-field segment store, 32-bit fields */
+
+/*
+ * In-place element-wise reciprocal of a single-precision complex vector:
+ * x[i] := 1 / x[i], computed as (a - b*i) / (a*a + b*b) for x[i] = a + b*i.
+ *
+ *   n     number of complex elements; the call is a no-op when n <= 0
+ *   x     vector to invert, read and overwritten
+ *   incx  distance between consecutive elements, counted in complex elements
+ *   cntx  unused
+ *
+ * Register use per strip (LMUL=4 groups): the two-field segment load puts
+ * the real parts in v0-v3 and the imaginary parts in v4-v7; v8-v11 hold
+ * a*a + b*b. Each strip is ONE asm statement so none of that state has to
+ * survive between asm statements, and the clobber list covers v0-v11 and
+ * the memory behind x.
+ *
+ * NOTE(review): a*a + b*b is formed directly, so it can overflow or
+ * underflow for extreme magnitudes; this matches the original arithmetic.
+ */
+void bli_cinvertv_x280_asm(dim_t n, scomplex *restrict x, inc_t incx,
+                           cntx_t *restrict cntx) {
+    (void)cntx;
+    /* Guard before converting to size_t: a negative n would wrap to a huge count. */
+    if (n <= 0) {
+        return;
+    }
+    const inc_t stride = incx * 2 * FLT_SIZE; /* byte stride between complex elements */
+    size_t avl = (size_t)n;                   /* elements still to process */
+    while (avl) {
+        size_t vl; /* elements handled by this strip, chosen by vsetvli */
+        if (incx == 1) {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m4, ta, ma\n\t"
+                             VLSEG2 "v0, (%[x])\n\t"
+                             "vfneg.v v4, v4\n\t"       /* v4 = -b */
+                             "vfmul.vv v8, v0, v0\n\t"  /* v8 = a*a */
+                             "vfmacc.vv v8, v4, v4\n\t" /* v8 += b*b */
+                             "vfdiv.vv v0, v0, v8\n\t"  /* real part of 1/x */
+                             "vfdiv.vv v4, v4, v8\n\t"  /* imaginary part of 1/x */
+                             VSSEG2 "v0, (%[x])"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7", "v8", "v9", "v10", "v11");
+        } else {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m4, ta, ma\n\t"
+                             VLSSEG2 "v0, (%[x]), %[stride]\n\t"
+                             "vfneg.v v4, v4\n\t"       /* v4 = -b */
+                             "vfmul.vv v8, v0, v0\n\t"  /* v8 = a*a */
+                             "vfmacc.vv v8, v4, v4\n\t" /* v8 += b*b */
+                             "vfdiv.vv v0, v0, v8\n\t"  /* real part of 1/x */
+                             "vfdiv.vv v4, v4, v8\n\t"  /* imaginary part of 1/x */
+                             VSSSEG2 "v0, (%[x]), %[stride]"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x), [stride] "r"(stride)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7", "v8", "v9", "v10", "v11");
+        }
+        /* Step past the vl elements just processed (signed, so negative incx works). */
+        x += (inc_t)vl * incx;
+        avl -= vl;
+    }
+}
+
+#undef FLT_SIZE /* drop the single-precision complex definitions before the double-precision set */
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8 /* bytes per real component; the complex kernel below scales incx by 2 * FLT_SIZE */
+#define VLSEG2 "vlseg2e64.v " /* unit-stride two-field segment load, 64-bit fields */
+#define VLSSEG2 "vlsseg2e64.v " /* strided two-field segment load, 64-bit fields */
+#define VSSEG2 "vsseg2e64.v " /* unit-stride two-field segment store, 64-bit fields */
+#define VSSSEG2 "vssseg2e64.v " /* strided two-field segment store, 64-bit fields */
+
+/*
+ * In-place element-wise reciprocal of a double-precision complex vector:
+ * x[i] := 1 / x[i], computed as (a - b*i) / (a*a + b*b) for x[i] = a + b*i.
+ *
+ *   n     number of complex elements; the call is a no-op when n <= 0
+ *   x     vector to invert, read and overwritten
+ *   incx  distance between consecutive elements, counted in complex elements
+ *   cntx  unused
+ *
+ * Register use per strip (LMUL=4 groups): the two-field segment load puts
+ * the real parts in v0-v3 and the imaginary parts in v4-v7; v8-v11 hold
+ * a*a + b*b. Each strip is ONE asm statement so none of that state has to
+ * survive between asm statements, and the clobber list covers v0-v11 and
+ * the memory behind x.
+ *
+ * NOTE(review): a*a + b*b is formed directly, so it can overflow or
+ * underflow for extreme magnitudes; this matches the original arithmetic.
+ */
+void bli_zinvertv_x280_asm(dim_t n, dcomplex *restrict x, inc_t incx,
+                           cntx_t *restrict cntx) {
+    (void)cntx;
+    /* Guard before converting to size_t: a negative n would wrap to a huge count. */
+    if (n <= 0) {
+        return;
+    }
+    const inc_t stride = incx * 2 * FLT_SIZE; /* byte stride between complex elements */
+    size_t avl = (size_t)n;                   /* elements still to process */
+    while (avl) {
+        size_t vl; /* elements handled by this strip, chosen by vsetvli */
+        if (incx == 1) {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m4, ta, ma\n\t"
+                             VLSEG2 "v0, (%[x])\n\t"
+                             "vfneg.v v4, v4\n\t"       /* v4 = -b */
+                             "vfmul.vv v8, v0, v0\n\t"  /* v8 = a*a */
+                             "vfmacc.vv v8, v4, v4\n\t" /* v8 += b*b */
+                             "vfdiv.vv v0, v0, v8\n\t"  /* real part of 1/x */
+                             "vfdiv.vv v4, v4, v8\n\t"  /* imaginary part of 1/x */
+                             VSSEG2 "v0, (%[x])"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7", "v8", "v9", "v10", "v11");
+        } else {
+            __asm__ volatile("vsetvli %[vl], %[avl], e%[sew], m4, ta, ma\n\t"
+                             VLSSEG2 "v0, (%[x]), %[stride]\n\t"
+                             "vfneg.v v4, v4\n\t"       /* v4 = -b */
+                             "vfmul.vv v8, v0, v0\n\t"  /* v8 = a*a */
+                             "vfmacc.vv v8, v4, v4\n\t" /* v8 += b*b */
+                             "vfdiv.vv v0, v0, v8\n\t"  /* real part of 1/x */
+                             "vfdiv.vv v4, v4, v8\n\t"  /* imaginary part of 1/x */
+                             VSSSEG2 "v0, (%[x]), %[stride]"
+                             : [vl] "=&r"(vl)
+                             : [avl] "r"(avl), [sew] "i"(8 * FLT_SIZE),
+                               [x] "r"(x), [stride] "r"(stride)
+                             : "memory", "v0", "v1", "v2", "v3", "v4", "v5",
+                               "v6", "v7", "v8", "v9", "v10", "v11");
+        }
+        /* Step past the vl elements just processed (signed, so negative incx works). */
+        x += (inc_t)vl * incx;
+        avl -= vl;
+    }
+}