Skip to content

Commit

Permalink
Add invscalv, invertv (flame#12)
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Yeh <[email protected]>
  • Loading branch information
2 people authored and Aaron-Hutchinson committed Mar 28, 2023
1 parent 60374c0 commit c33f542
Show file tree
Hide file tree
Showing 4 changed files with 411 additions and 0 deletions.
10 changes: 10 additions & 0 deletions config/x280/bli_cntx_init_x280.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ void bli_cntx_init_x280( cntx_t* cntx )
BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_x280_asm,
BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_x280_asm,

BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_x280_asm,
BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_x280_asm,
BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_x280_asm,
BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_x280_asm,

BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_x280_asm,
BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_x280_asm,
BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_x280_asm,
BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_x280_asm,

BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_x280_asm,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_x280_asm,
BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_x280_asm,
Expand Down
175 changes: 175 additions & 0 deletions kernels/x280/1/invertv.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#include "blis.h"
#include <math.h>
#include <riscv_vector.h>
#include <stdbool.h>
#include <stddef.h>

// Single-precision real configuration: element size in bytes plus the scalar
// load and vector load/store mnemonics spliced into the asm strings below.
#define FLT_SIZE 4
#define FLT_LOAD "flw "
#define VLE "vle32.v "
#define VLSE "vlse32.v "
#define VSE "vse32.v "
#define VSSE "vsse32.v "

/*
 * In-place element-wise reciprocal of a single-precision vector:
 * x[i] := 1 / x[i] for the n elements starting at x, spaced incx elements
 * apart.
 *
 *   n     number of elements to process; n <= 0 is a no-op.
 *   x     first element of the vector; overwritten in place.
 *   incx  stride between consecutive elements, in units of elements
 *         (converted to a byte stride internally).
 *   cntx  unused.
 *
 * The work is strip-mined: each pass asks vsetvli for up to `avl` elements
 * (SEW = 8 * FLT_SIZE bits, LMUL = 8), processes the `vl` elements granted,
 * and advances x by vl * incx bytes.
 *
 * NOTE(review): the asm statements communicate through f0 and the v0 register
 * group without listing them as clobbers; this relies on the compiler not
 * using those registers between the statements.
 */
void bli_sinvertv_x280_asm(dim_t n, float *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;

    // n is converted to size_t below; if dim_t is signed, a negative count
    // would otherwise become an enormous trip count and run off the vector.
    if (n <= 0) {
        return;
    }

    // The scalar 1.0 is loaded into f0 through a pointer. The "memory"
    // clobber forces `one` to actually be stored before the asm reads it.
    float one = 1.f;
    __asm__ volatile(FLT_LOAD "f0, (%0)" : : "r"(&one) : "memory");

    incx *= FLT_SIZE;  // element stride -> byte stride
    size_t avl = n;
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == FLT_SIZE) {
            // Contiguous data: unit-stride load/store.
            __asm__ volatile(VLE "v0, (%0)" : : "r"(x) : "memory");
            // vfrdiv.vf is the reversed divide: v0[i] = f0 / v0[i].
            __asm__ volatile("vfrdiv.vf v0, v0, f0");
            __asm__ volatile(VSE "v0, (%0)" : : "r"(x) : "memory");
        } else {
            // General case: strided load/store with the byte stride.
            __asm__ volatile(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
            __asm__ volatile("vfrdiv.vf v0, v0, f0");
            __asm__ volatile(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
        }
        // Advance x by the number of bytes just processed.
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
}

#undef FLT_SIZE
#undef FLT_LOAD
#undef VLE
#undef VLSE
#undef VSE
#undef VSSE

// Double-precision real configuration: element size in bytes plus the scalar
// load and vector load/store mnemonics spliced into the asm strings below.
#define FLT_SIZE 8
#define FLT_LOAD "fld "
#define VLE "vle64.v "
#define VLSE "vlse64.v "
#define VSE "vse64.v "
#define VSSE "vsse64.v "

/*
 * In-place element-wise reciprocal of a double-precision vector:
 * x[i] := 1 / x[i] for the n elements starting at x, spaced incx elements
 * apart.
 *
 *   n     number of elements to process; n <= 0 is a no-op.
 *   x     first element of the vector; overwritten in place.
 *   incx  stride between consecutive elements, in units of elements
 *         (converted to a byte stride internally).
 *   cntx  unused.
 *
 * The work is strip-mined: each pass asks vsetvli for up to `avl` elements
 * (SEW = 8 * FLT_SIZE bits, LMUL = 8), processes the `vl` elements granted,
 * and advances x by vl * incx bytes.
 *
 * NOTE(review): the asm statements communicate through f0 and the v0 register
 * group without listing them as clobbers; this relies on the compiler not
 * using those registers between the statements.
 */
void bli_dinvertv_x280_asm(dim_t n, double *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;

    // n is converted to size_t below; if dim_t is signed, a negative count
    // would otherwise become an enormous trip count and run off the vector.
    if (n <= 0) {
        return;
    }

    // The scalar 1.0 is loaded into f0 through a pointer. The "memory"
    // clobber forces `one` to actually be stored before the asm reads it.
    double one = 1.;
    __asm__ volatile(FLT_LOAD "f0, (%0)" : : "r"(&one) : "memory");

    incx *= FLT_SIZE;  // element stride -> byte stride
    size_t avl = n;
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == FLT_SIZE) {
            // Contiguous data: unit-stride load/store.
            __asm__ volatile(VLE "v0, (%0)" : : "r"(x) : "memory");
            // vfrdiv.vf is the reversed divide: v0[i] = f0 / v0[i].
            __asm__ volatile("vfrdiv.vf v0, v0, f0");
            __asm__ volatile(VSE "v0, (%0)" : : "r"(x) : "memory");
        } else {
            // General case: strided load/store with the byte stride.
            __asm__ volatile(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
            __asm__ volatile("vfrdiv.vf v0, v0, f0");
            __asm__ volatile(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
        }
        // Advance x by the number of bytes just processed.
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
}

#undef FLT_SIZE
#undef FLT_LOAD
#undef VLE
#undef VLSE
#undef VSE
#undef VSSE

// Single-precision complex configuration: size in bytes of one real component
// plus the two-field segment load/store mnemonics used to split each complex
// element into separate real and imaginary register groups.
#define FLT_SIZE 4
#define VLSEG2 "vlseg2e32.v "
#define VLSSEG2 "vlsseg2e32.v "
#define VSSEG2 "vsseg2e32.v "
#define VSSSEG2 "vssseg2e32.v "

/*
 * In-place element-wise reciprocal of a single-precision complex vector:
 * x[i] := 1 / x[i] for the n elements starting at x, spaced incx elements
 * apart.
 *
 *   n     number of elements to process; n <= 0 is a no-op.
 *   x     first element of the vector; overwritten in place.
 *   incx  stride between consecutive complex elements, in units of elements
 *         (converted to a byte stride of 2 * FLT_SIZE per element internally).
 *   cntx  unused.
 *
 * Each pass loads real parts into the v0 group and imaginary parts into the
 * v4 group (LMUL = 4), then computes
 *     1 / (re + i*im) = (re - i*im) / (re*re + im*im)
 * using the v8 group for the denominator.
 *
 * NOTE(review): the denominator re*re + im*im is formed without scaling, so
 * it can overflow or underflow for inputs of extreme magnitude even when the
 * true reciprocal is representable.
 * NOTE(review): the asm statements communicate through the v0/v4/v8 register
 * groups without listing them as clobbers; this relies on the compiler not
 * using vector registers between the statements.
 */
void bli_cinvertv_x280_asm(dim_t n, scomplex *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;

    // n is converted to size_t below; if dim_t is signed, a negative count
    // would otherwise become an enormous trip count and run off the vector.
    if (n <= 0) {
        return;
    }

    incx *= 2 * FLT_SIZE;  // element stride -> byte stride
    size_t avl = n;
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == 2 * FLT_SIZE) {
            // Contiguous data: unit-stride segment load/store.
            __asm__ volatile(VLSEG2 "v0, (%0)" : : "r"(x) : "memory");
            __asm__ volatile("vfneg.v v4, v4");          // im := -im (conjugate)
            __asm__ volatile("vfmul.vv v8, v0, v0");     // den := re * re
            __asm__ volatile("vfmacc.vv v8, v4, v4");    // den += im * im
            __asm__ volatile("vfdiv.vv v0, v0, v8");     // re := re / den
            __asm__ volatile("vfdiv.vv v4, v4, v8");     // im := -im / den
            __asm__ volatile(VSSEG2 "v0, (%0)" : : "r"(x) : "memory");
        } else {
            // General case: strided segment load/store with the byte stride.
            __asm__ volatile(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
            __asm__ volatile("vfneg.v v4, v4");
            __asm__ volatile("vfmul.vv v8, v0, v0");
            __asm__ volatile("vfmacc.vv v8, v4, v4");
            __asm__ volatile("vfdiv.vv v0, v0, v8");
            __asm__ volatile("vfdiv.vv v4, v4, v8");
            __asm__ volatile(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
        }
        // Advance x by the number of bytes just processed.
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
}

#undef FLT_SIZE
#undef VLSEG2
#undef VLSSEG2
#undef VSSEG2
#undef VSSSEG2

// Double-precision complex configuration: size in bytes of one real component
// plus the two-field segment load/store mnemonics used to split each complex
// element into separate real and imaginary register groups.
#define FLT_SIZE 8
#define VLSEG2 "vlseg2e64.v "
#define VLSSEG2 "vlsseg2e64.v "
#define VSSEG2 "vsseg2e64.v "
#define VSSSEG2 "vssseg2e64.v "

/*
 * In-place element-wise reciprocal of a double-precision complex vector:
 * x[i] := 1 / x[i] for the n elements starting at x, spaced incx elements
 * apart.
 *
 *   n     number of elements to process; n <= 0 is a no-op.
 *   x     first element of the vector; overwritten in place.
 *   incx  stride between consecutive complex elements, in units of elements
 *         (converted to a byte stride of 2 * FLT_SIZE per element internally).
 *   cntx  unused.
 *
 * Each pass loads real parts into the v0 group and imaginary parts into the
 * v4 group (LMUL = 4), then computes
 *     1 / (re + i*im) = (re - i*im) / (re*re + im*im)
 * using the v8 group for the denominator.
 *
 * NOTE(review): the denominator re*re + im*im is formed without scaling, so
 * it can overflow or underflow for inputs of extreme magnitude even when the
 * true reciprocal is representable.
 * NOTE(review): the asm statements communicate through the v0/v4/v8 register
 * groups without listing them as clobbers; this relies on the compiler not
 * using vector registers between the statements.
 */
void bli_zinvertv_x280_asm(dim_t n, dcomplex *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;

    // n is converted to size_t below; if dim_t is signed, a negative count
    // would otherwise become an enormous trip count and run off the vector.
    if (n <= 0) {
        return;
    }

    incx *= 2 * FLT_SIZE;  // element stride -> byte stride
    size_t avl = n;
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == 2 * FLT_SIZE) {
            // Contiguous data: unit-stride segment load/store.
            __asm__ volatile(VLSEG2 "v0, (%0)" : : "r"(x) : "memory");
            __asm__ volatile("vfneg.v v4, v4");          // im := -im (conjugate)
            __asm__ volatile("vfmul.vv v8, v0, v0");     // den := re * re
            __asm__ volatile("vfmacc.vv v8, v4, v4");    // den += im * im
            __asm__ volatile("vfdiv.vv v0, v0, v8");     // re := re / den
            __asm__ volatile("vfdiv.vv v4, v4, v8");     // im := -im / den
            __asm__ volatile(VSSEG2 "v0, (%0)" : : "r"(x) : "memory");
        } else {
            // General case: strided segment load/store with the byte stride.
            __asm__ volatile(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
            __asm__ volatile("vfneg.v v4, v4");
            __asm__ volatile("vfmul.vv v8, v0, v0");
            __asm__ volatile("vfmacc.vv v8, v4, v4");
            __asm__ volatile("vfdiv.vv v0, v0, v8");
            __asm__ volatile("vfdiv.vv v4, v4, v8");
            __asm__ volatile(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx) : "memory");
        }
        // Advance x by the number of bytes just processed.
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
}
Loading

0 comments on commit c33f542

Please sign in to comment.