forked from flame/blis
Commit c33f542
Co-authored-by: Michael Yeh <[email protected]>
1 parent 60374c0
Showing 4 changed files with 411 additions and 0 deletions.
@@ -0,0 +1,175 @@
#include "blis.h" | ||
#include <math.h> | ||
#include <riscv_vector.h> | ||
#include <stdbool.h> | ||
#include <stddef.h> | ||
|
||
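// Single-precision (32-bit) scalar and vector load/store mnemonics.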
#define FLT_SIZE 4
#define FLT_LOAD "flw "
#define VLE "vle32.v "
#define VLSE "vlse32.v "
#define VSE "vse32.v "
#define VSSE "vsse32.v "

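// bli_sinvertv_x280_asm: overwrite each element of x with its reciprocal,
// x[i] := 1 / x[i], for single-precision real vectors.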
void bli_sinvertv_x280_asm(dim_t n, float *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;
    float one = 1.f;
    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
    incx *= FLT_SIZE;
    size_t avl = n;
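    // Strip-mine over the n elements: vsetvli picks the count (vl) handled
    // per pass at e32/LMUL=8, and vfrdiv.vf computes f0 / x[i] = 1 / x[i].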
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == FLT_SIZE) {
            __asm__(VLE "v0, (%0)" : : "r"(x));
            __asm__("vfrdiv.vf v0, v0, f0");
            __asm__(VSE "v0, (%0)" : : "r"(x));
        } else {
            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
            __asm__("vfrdiv.vf v0, v0, f0");
            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
        }
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
    return;
}

#undef FLT_SIZE
#undef FLT_LOAD
#undef VLE
#undef VLSE
#undef VSE
#undef VSSE

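// Double-precision (64-bit) scalar and vector load/store mnemonics.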
#define FLT_SIZE 8
#define FLT_LOAD "fld "
#define VLE "vle64.v "
#define VLSE "vlse64.v "
#define VSE "vse64.v "
#define VSSE "vsse64.v "

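// bli_dinvertv_x280_asm: x[i] := 1 / x[i] for double-precision real vectors;
// same strip-mined structure as the single-precision kernel above.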
void bli_dinvertv_x280_asm(dim_t n, double *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;
    double one = 1.;
    __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
    incx *= FLT_SIZE;
    size_t avl = n;
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == FLT_SIZE) {
            __asm__(VLE "v0, (%0)" : : "r"(x));
            __asm__("vfrdiv.vf v0, v0, f0");
            __asm__(VSE "v0, (%0)" : : "r"(x));
        } else {
            __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
            __asm__("vfrdiv.vf v0, v0, f0");
            __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
        }
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
    return;
}

#undef FLT_SIZE
#undef FLT_LOAD
#undef VLE
#undef VLSE
#undef VSE
#undef VSSE

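// Single-precision complex: two-field segment loads/stores split each
// element into its real and imaginary parts.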
#define FLT_SIZE 4
#define VLSEG2 "vlseg2e32.v "
#define VLSSEG2 "vlsseg2e32.v "
#define VSSEG2 "vsseg2e32.v "
#define VSSSEG2 "vssseg2e32.v "

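// bli_cinvertv_x280_asm: x[i] := 1 / x[i] for single-precision complex
// vectors, using 1 / (a + bi) = (a - bi) / (a^2 + b^2).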
void bli_cinvertv_x280_asm(dim_t n, scomplex *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;
    incx *= 2 * FLT_SIZE;
    size_t avl = n;
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == 2 * FLT_SIZE) {
            __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
            __asm__("vfneg.v v4, v4");
            __asm__("vfmul.vv v8, v0, v0");
            __asm__("vfmacc.vv v8, v4, v4");
            __asm__("vfdiv.vv v0, v0, v8");
            __asm__("vfdiv.vv v4, v4, v8");
            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
        } else {
            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
            __asm__("vfneg.v v4, v4");
            __asm__("vfmul.vv v8, v0, v0");
            __asm__("vfmacc.vv v8, v4, v4");
            __asm__("vfdiv.vv v0, v0, v8");
            __asm__("vfdiv.vv v4, v4, v8");
            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
        }
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
    return;
}

#undef FLT_SIZE
#undef VLSEG2
#undef VLSSEG2
#undef VSSEG2
#undef VSSSEG2

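// Double-precision complex segment load/store mnemonics.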
#define FLT_SIZE 8
#define VLSEG2 "vlseg2e64.v "
#define VLSSEG2 "vlsseg2e64.v "
#define VSSEG2 "vsseg2e64.v "
#define VSSSEG2 "vssseg2e64.v "

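// bli_zinvertv_x280_asm: x[i] := 1 / x[i] for double-precision complex
// vectors; same register layout and formula as the single-precision complex
// kernel above.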
void bli_zinvertv_x280_asm(dim_t n, dcomplex *restrict x, inc_t incx,
                           cntx_t *restrict cntx) {
    (void)cntx;
    incx *= 2 * FLT_SIZE;
    size_t avl = n;
    while (avl) {
        size_t vl;
        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
                         : "=r"(vl)
                         : "r"(avl), "i"(8 * FLT_SIZE));
        if (incx == 2 * FLT_SIZE) {
            __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
            __asm__("vfneg.v v4, v4");
            __asm__("vfmul.vv v8, v0, v0");
            __asm__("vfmacc.vv v8, v4, v4");
            __asm__("vfdiv.vv v0, v0, v8");
            __asm__("vfdiv.vv v4, v4, v8");
            __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
        } else {
            __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
            __asm__("vfneg.v v4, v4");
            __asm__("vfmul.vv v8, v0, v0");
            __asm__("vfmacc.vv v8, v4, v4");
            __asm__("vfdiv.vv v0, v0, v8");
            __asm__("vfdiv.vv v4, v4, v8");
            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
        }
        inc_t tmp1 = vl * incx;
        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
        avl -= vl;
    }
    return;
}