diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c
index 24ebde05e70..84f138e8955 100644
--- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c
+++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x12.c
@@ -118,9 +118,9 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12(
     vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_gt(vz4567, vdenorm_cutoff));
     vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_gt(vz89AB, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -159,7 +159,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -195,7 +195,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x12(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c
index 0560ddc6637..2c72d9a9c74 100644
--- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c
+++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x16.c
@@ -140,10 +140,10 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16(
     vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_gt(vz89AB, vdenorm_cutoff));
     vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_gt(vzCDEF, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
-    vfCDEF = wasm_v128_bitselect(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), wasm_i32x4_shr(vxCDEF, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
+    vfCDEF = __builtin_wasm_signselect_i32x4(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), vxCDEF);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -183,7 +183,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -219,7 +219,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x16(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c
index 5002b95581d..93d7aa99bf0 100644
--- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c
+++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x20.c
@@ -162,11 +162,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x20(
     vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_gt(vzCDEF, vdenorm_cutoff));
     vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_gt(vzGHIJ, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
-    vfCDEF = wasm_v128_bitselect(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), wasm_i32x4_shr(vxCDEF, 31));
-    vfGHIJ = wasm_v128_bitselect(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), wasm_i32x4_shr(vxGHIJ, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
+    vfCDEF = __builtin_wasm_signselect_i32x4(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), vxCDEF);
+    vfGHIJ = __builtin_wasm_signselect_i32x4(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), vxGHIJ);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -207,7 +207,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x20(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -243,7 +243,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x20(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c
index f6f0bc16123..9651e8857c9 100644
--- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c
+++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c
@@ -184,12 +184,12 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x24(
     vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_gt(vzGHIJ, vdenorm_cutoff));
     vfKLMN = wasm_v128_andnot(vfKLMN, wasm_f32x4_gt(vzKLMN, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
-    vfCDEF = wasm_v128_bitselect(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), wasm_i32x4_shr(vxCDEF, 31));
-    vfGHIJ = wasm_v128_bitselect(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), wasm_i32x4_shr(vxGHIJ, 31));
-    vfKLMN = wasm_v128_bitselect(vfKLMN, wasm_f32x4_sub(vone, vfKLMN), wasm_i32x4_shr(vxKLMN, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
+    vfCDEF = __builtin_wasm_signselect_i32x4(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), vxCDEF);
+    vfGHIJ = __builtin_wasm_signselect_i32x4(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), vxGHIJ);
+    vfKLMN = __builtin_wasm_signselect_i32x4(vfKLMN, wasm_f32x4_sub(vone, vfKLMN), vxKLMN);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -231,7 +231,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x24(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -267,7 +267,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x24(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c
index 3651e8474ee..1bca9c813a5 100644
--- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c
+++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x4.c
@@ -66,7 +66,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x4(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -102,7 +102,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x4(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c
index 912d2495d77..b73b1d18614 100644
--- a/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c
+++ b/src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x8.c
@@ -96,8 +96,8 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x8(
     vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_gt(vz0123, vdenorm_cutoff));
     vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_gt(vz4567, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -135,7 +135,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x8(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -171,7 +171,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x8(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-p5-div-x12.c b/src/f32-sigmoid/gen/wasmsimd-p5-div-x12.c
index 1a9e2de5bc4..9f3cb5dd7d9 100644
--- a/src/f32-sigmoid/gen/wasmsimd-p5-div-x12.c
+++ b/src/f32-sigmoid/gen/wasmsimd-p5-div-x12.c
@@ -101,9 +101,9 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x12(
     vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_gt(vz4567, vdenorm_cutoff));
     vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_gt(vz89AB, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -134,7 +134,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x12(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -162,7 +162,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x12(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-p5-div-x16.c b/src/f32-sigmoid/gen/wasmsimd-p5-div-x16.c
index ad22b685af0..55b86ae5d3f 100644
--- a/src/f32-sigmoid/gen/wasmsimd-p5-div-x16.c
+++ b/src/f32-sigmoid/gen/wasmsimd-p5-div-x16.c
@@ -117,10 +117,10 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x16(
     vf89AB = wasm_v128_andnot(vf89AB, wasm_f32x4_gt(vz89AB, vdenorm_cutoff));
     vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_gt(vzCDEF, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
-    vfCDEF = wasm_v128_bitselect(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), wasm_i32x4_shr(vxCDEF, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
+    vfCDEF = __builtin_wasm_signselect_i32x4(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), vxCDEF);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -152,7 +152,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x16(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -180,7 +180,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x16(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-p5-div-x20.c b/src/f32-sigmoid/gen/wasmsimd-p5-div-x20.c
index f8f0333f9a6..339c13f069f 100644
--- a/src/f32-sigmoid/gen/wasmsimd-p5-div-x20.c
+++ b/src/f32-sigmoid/gen/wasmsimd-p5-div-x20.c
@@ -133,11 +133,11 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x20(
     vfCDEF = wasm_v128_andnot(vfCDEF, wasm_f32x4_gt(vzCDEF, vdenorm_cutoff));
     vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_gt(vzGHIJ, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
-    vfCDEF = wasm_v128_bitselect(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), wasm_i32x4_shr(vxCDEF, 31));
-    vfGHIJ = wasm_v128_bitselect(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), wasm_i32x4_shr(vxGHIJ, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
+    vfCDEF = __builtin_wasm_signselect_i32x4(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), vxCDEF);
+    vfGHIJ = __builtin_wasm_signselect_i32x4(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), vxGHIJ);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -170,7 +170,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x20(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -198,7 +198,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x20(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-p5-div-x24.c b/src/f32-sigmoid/gen/wasmsimd-p5-div-x24.c
index 94ee6b85c93..8a6b9e875b4 100644
--- a/src/f32-sigmoid/gen/wasmsimd-p5-div-x24.c
+++ b/src/f32-sigmoid/gen/wasmsimd-p5-div-x24.c
@@ -149,12 +149,12 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x24(
     vfGHIJ = wasm_v128_andnot(vfGHIJ, wasm_f32x4_gt(vzGHIJ, vdenorm_cutoff));
     vfKLMN = wasm_v128_andnot(vfKLMN, wasm_f32x4_gt(vzKLMN, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
-    vf89AB = wasm_v128_bitselect(vf89AB, wasm_f32x4_sub(vone, vf89AB), wasm_i32x4_shr(vx89AB, 31));
-    vfCDEF = wasm_v128_bitselect(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), wasm_i32x4_shr(vxCDEF, 31));
-    vfGHIJ = wasm_v128_bitselect(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), wasm_i32x4_shr(vxGHIJ, 31));
-    vfKLMN = wasm_v128_bitselect(vfKLMN, wasm_f32x4_sub(vone, vfKLMN), wasm_i32x4_shr(vxKLMN, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
+    vf89AB = __builtin_wasm_signselect_i32x4(vf89AB, wasm_f32x4_sub(vone, vf89AB), vx89AB);
+    vfCDEF = __builtin_wasm_signselect_i32x4(vfCDEF, wasm_f32x4_sub(vone, vfCDEF), vxCDEF);
+    vfGHIJ = __builtin_wasm_signselect_i32x4(vfGHIJ, wasm_f32x4_sub(vone, vfGHIJ), vxGHIJ);
+    vfKLMN = __builtin_wasm_signselect_i32x4(vfKLMN, wasm_f32x4_sub(vone, vfKLMN), vxKLMN);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -188,7 +188,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x24(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -216,7 +216,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x24(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-p5-div-x4.c b/src/f32-sigmoid/gen/wasmsimd-p5-div-x4.c
index 253a75ec160..4673811f84f 100644
--- a/src/f32-sigmoid/gen/wasmsimd-p5-div-x4.c
+++ b/src/f32-sigmoid/gen/wasmsimd-p5-div-x4.c
@@ -59,7 +59,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x4(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -87,7 +87,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x4(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/gen/wasmsimd-p5-div-x8.c b/src/f32-sigmoid/gen/wasmsimd-p5-div-x8.c
index f93e3510965..4b0773469cd 100644
--- a/src/f32-sigmoid/gen/wasmsimd-p5-div-x8.c
+++ b/src/f32-sigmoid/gen/wasmsimd-p5-div-x8.c
@@ -85,8 +85,8 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x8(
     vf0123 = wasm_v128_andnot(vf0123, wasm_f32x4_gt(vz0123, vdenorm_cutoff));
     vf4567 = wasm_v128_andnot(vf4567, wasm_f32x4_gt(vz4567, vdenorm_cutoff));
 
-    vf0123 = wasm_v128_bitselect(vf0123, wasm_f32x4_sub(vone, vf0123), wasm_i32x4_shr(vx0123, 31));
-    vf4567 = wasm_v128_bitselect(vf4567, wasm_f32x4_sub(vone, vf4567), wasm_i32x4_shr(vx4567, 31));
+    vf0123 = __builtin_wasm_signselect_i32x4(vf0123, wasm_f32x4_sub(vone, vf0123), vx0123);
+    vf4567 = __builtin_wasm_signselect_i32x4(vf4567, wasm_f32x4_sub(vone, vf4567), vx4567);
 
     wasm_v128_store(y, vf0123);
     wasm_v128_store(y + 4, vf4567);
@@ -116,7 +116,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x8(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -144,7 +144,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x8(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in b/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in
index cf6db17adfd..eb5585daad1 100644
--- a/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in
+++ b/src/f32-sigmoid/wasmsimd-lut64-p2-div.c.in
@@ -92,7 +92,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x${BATCH_TILE}(
       vf${ABC[N:N+4]} = wasm_v128_andnot(vf${ABC[N:N+4]}, wasm_f32x4_gt(vz${ABC[N:N+4]}, vdenorm_cutoff));
 
     $for N in range(0, BATCH_TILE, 4):
-      vf${ABC[N:N+4]} = wasm_v128_bitselect(vf${ABC[N:N+4]}, wasm_f32x4_sub(vone, vf${ABC[N:N+4]}), wasm_i32x4_shr(vx${ABC[N:N+4]}, 31));
+      vf${ABC[N:N+4]} = __builtin_wasm_signselect_i32x4(vf${ABC[N:N+4]}, wasm_f32x4_sub(vone, vf${ABC[N:N+4]}), vx${ABC[N:N+4]});
 
    wasm_v128_store(y, vf${ABC[0:4]});
    $for N in range(4, BATCH_TILE, 4):
@@ -131,7 +131,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x${BATCH_TILE}(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -167,7 +167,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_lut64_p2_div_x${BATCH_TILE}(
     v128_t vf = wasm_f32x4_div(vy, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
diff --git a/src/f32-sigmoid/wasmsimd-p5-div.c.in b/src/f32-sigmoid/wasmsimd-p5-div.c.in
index 6d7567332c3..221f6e21987 100644
--- a/src/f32-sigmoid/wasmsimd-p5-div.c.in
+++ b/src/f32-sigmoid/wasmsimd-p5-div.c.in
@@ -87,7 +87,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x${BATCH_TILE}(
      vf${ABC[N:N+4]} = wasm_v128_andnot(vf${ABC[N:N+4]}, wasm_f32x4_gt(vz${ABC[N:N+4]}, vdenorm_cutoff));
 
    $for N in range(0, BATCH_TILE, 4):
-     vf${ABC[N:N+4]} = wasm_v128_bitselect(vf${ABC[N:N+4]}, wasm_f32x4_sub(vone, vf${ABC[N:N+4]}), wasm_i32x4_shr(vx${ABC[N:N+4]}, 31));
+     vf${ABC[N:N+4]} = __builtin_wasm_signselect_i32x4(vf${ABC[N:N+4]}, wasm_f32x4_sub(vone, vf${ABC[N:N+4]}), vx${ABC[N:N+4]});
 
    wasm_v128_store(y, vf${ABC[0:4]});
    $for N in range(4, BATCH_TILE, 4):
@@ -118,7 +118,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x${BATCH_TILE}(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     wasm_v128_store(y, vf);
     y += 4;
@@ -146,7 +146,7 @@ void xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x${BATCH_TILE}(
     v128_t vf = wasm_f32x4_div(ve, vd);
 
     vf = wasm_v128_andnot(vf, wasm_f32x4_gt(vz, vdenorm_cutoff));
-    vf = wasm_v128_bitselect(vf, wasm_f32x4_sub(vone, vf), wasm_i32x4_shr(vx, 31));
+    vf = __builtin_wasm_signselect_i32x4(vf, wasm_f32x4_sub(vone, vf), vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vf, 0);
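Note on the sigmoid changes above: the kernels compute f = sigmoid(-|x|) and reconstruct the result as f for negative x and 1 - f otherwise. The old code built an all-ones-or-zero mask with wasm_i32x4_shr(vx, 31) and blended with wasm_v128_bitselect; the new code folds both steps into a single sign-select. A minimal one-lane model in C, assuming __builtin_wasm_signselect_i32x4(a, b, c) returns the lane of a where the corresponding i32 lane of c is negative and the lane of b otherwise (the semantics of the experimental sign-select instructions this builtin targets):

  #include <stdint.h>

  /* Lane model of wasm_v128_bitselect(a, b, mask): take bits of a where mask is 1. */
  static inline uint32_t bitselect_lane(uint32_t a, uint32_t b, uint32_t mask) {
    return (a & mask) | (b & ~mask);
  }

  /* Assumed lane model of __builtin_wasm_signselect_i32x4(a, b, c). */
  static inline uint32_t signselect_lane(uint32_t a, uint32_t b, int32_t c) {
    return c < 0 ? a : b;
  }

For any i32 lane x, the arithmetic shift x >> 31 is all-ones exactly when x < 0, so bitselect_lane(a, b, (uint32_t) (x >> 31)) == signselect_lane(a, b, x): the substitution is value-preserving while saving the shift.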
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x12.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x12.c
index 3d2e81e97c1..e2cc978e365 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x12.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x12.c
@@ -124,16 +124,13 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x12(
     const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha);
     const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -172,9 +169,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x12(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -210,9 +206,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x12(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x16.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x16.c
index a8a5e7f6c75..183b05a8a03 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x16.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x16.c
@@ -146,19 +146,15 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x16(
     const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha);
     const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -198,9 +194,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x16(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -236,9 +231,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x16(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x20.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x20.c
index 3bb62df333e..e70c9b016e9 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x20.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x20.c
@@ -168,22 +168,17 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x20(
     const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha);
     const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
-    const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31);
     vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
-    const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
+    const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -224,9 +219,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x20(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -262,9 +256,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x20(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x24.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x24.c
index bedb6b68791..63fc4d8dc38 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x24.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x24.c
@@ -190,25 +190,19 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x24(
     const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha);
     const v128_t veKLMN = wasm_f32x4_mul(wasm_f32x4_add(vpKLMN, vsKLMN), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
-    const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31);
     vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta);
-    const v128_t vmKLMN = wasm_i32x4_shr(vxKLMN, 31);
     vxKLMN = wasm_f32x4_mul(vxKLMN, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
-    const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ);
-    const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vmKLMN);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
+    const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ);
+    const v128_t vyKLMN = __builtin_wasm_signselect_i32x4(veKLMN, vxKLMN, vxKLMN);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -250,9 +244,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x24(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -288,9 +281,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x24(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x4.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x4.c
index cccfde5edc5..54bffad77c9 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x4.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x4.c
@@ -74,9 +74,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x4(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -112,9 +111,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x4(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x8.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x8.c
index 34ea3237d12..a5adcd74020 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x8.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x8.c
@@ -102,13 +102,11 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x8(
     const v128_t ve0123 = wasm_f32x4_mul(wasm_f32x4_add(vp0123, vs0123), valpha);
     const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -146,9 +144,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x8(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -184,9 +181,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x8(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
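Note on the ELU kernels: besides saving the shift, sign-select also removes the ordering constraint that forced the old code to capture vm = wasm_i32x4_shr(vx, 31) before vx was scaled by vbeta. The scaled vx can serve as its own control lane because multiplying by a positive beta preserves the sign bit, including for -0.0f (bit pattern 0x80000000, still negative as an i32 lane). A one-lane sketch under the same assumed sign-select semantics as above (elu_blend is a hypothetical helper, not part of the patch):

  #include <stdint.h>
  #include <string.h>

  /* One lane of the ELU blend: e is the alpha * (exp(x) - 1) branch computed
     above, x_scaled is beta * x; assumes beta > 0 so sign(beta * x) == sign(x). */
  static inline float elu_blend(float e, float x_scaled) {
    int32_t c;
    memcpy(&c, &x_scaled, sizeof(c));  /* reinterpret the float bits as an i32 lane */
    return c < 0 ? e : x_scaled;       /* negative lane -> exponential branch */
  }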
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x12.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x12.c
index 9b7bb25f18f..654a6b020ec 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x12.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x12.c
@@ -108,16 +108,13 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x12(
     const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha);
     const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -148,9 +145,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x12(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -178,9 +174,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x12(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x16.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x16.c
index f991127c07f..278dd3400b3 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x16.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x16.c
@@ -124,19 +124,15 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x16(
     const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha);
     const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -168,9 +164,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x16(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -198,9 +193,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x16(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x20.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x20.c
index d465f5f129d..a6310782973 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x20.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x20.c
@@ -140,22 +140,17 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20(
     const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha);
     const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
-    const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31);
     vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
-    const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
+    const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -188,9 +183,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -218,9 +212,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x24.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x24.c
index 6398b04e772..678f6432ef5 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x24.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x24.c
@@ -156,25 +156,19 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x24(
     const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha);
     const v128_t veKLMN = wasm_f32x4_mul(wasm_f32x4_add(vpKLMN, vsKLMN), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
-    const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31);
     vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta);
-    const v128_t vmKLMN = wasm_i32x4_shr(vxKLMN, 31);
     vxKLMN = wasm_f32x4_mul(vxKLMN, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
-    const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ);
-    const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vmKLMN);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
+    const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ);
+    const v128_t vyKLMN = __builtin_wasm_signselect_i32x4(veKLMN, vxKLMN, vxKLMN);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -208,9 +202,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x24(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -238,9 +231,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x24(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x4.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x4.c
index 51a075340e3..d2ba58e4e97 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x4.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x4.c
@@ -66,9 +66,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x4(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -96,9 +95,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x4(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x8.c b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x8.c
index dc59ebc969f..6740c8376dd 100644
--- a/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x8.c
+++ b/src/f32-velu/gen/velu-wasmsimd-arm-rr2-p6-x8.c
@@ -92,13 +92,11 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x8(
     const v128_t ve0123 = wasm_f32x4_mul(wasm_f32x4_add(vp0123, vs0123), valpha);
     const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -128,9 +126,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x8(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -158,9 +155,8 @@ void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x8(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
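The same rewrite repeats below in the x86-tuned ELU variants. A self-contained check of the lane identity that both the sigmoid and ELU changes rely on (a hypothetical test harness under the sign-select semantics assumed above, not part of the patch):

  #include <assert.h>
  #include <limits.h>
  #include <stddef.h>
  #include <stdint.h>

  static uint32_t bitselect_lane(uint32_t a, uint32_t b, uint32_t mask) {
    return (a & mask) | (b & ~mask);
  }

  static uint32_t signselect_lane(uint32_t a, uint32_t b, int32_t c) {
    return c < 0 ? a : b;  /* assumed sign-select lane semantics */
  }

  int main(void) {
    const int32_t controls[] = { INT32_MIN, -2, -1, 0, 1, 2, INT32_MAX };
    for (size_t i = 0; i < sizeof(controls) / sizeof(controls[0]); i++) {
      const int32_t c = controls[i];
      /* Arithmetic shift: all-ones iff c < 0 (implementation-defined in C,
         but this is what wasm_i32x4_shr computes per lane). */
      const uint32_t mask = (uint32_t) (c >> 31);
      assert(bitselect_lane(0xAAAAAAAAu, 0x55555555u, mask) ==
             signselect_lane(0xAAAAAAAAu, 0x55555555u, c));
    }
    return 0;
  }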
diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x12.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x12.c
index 34304b8d0f6..f0209b33d3f 100644
--- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x12.c
+++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x12.c
@@ -127,16 +127,13 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x12(
     const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha);
     const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -176,9 +173,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x12(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -215,9 +211,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x12(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x16.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x16.c
index 9c53ef642bf..8a974706888 100644
--- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x16.c
+++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x16.c
@@ -150,19 +150,15 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x16(
     const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha);
     const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -203,9 +199,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x16(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -242,9 +237,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x16(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x20.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x20.c
index b334696b34b..abb93fe2ace 100644
--- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x20.c
+++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x20.c
@@ -173,22 +173,17 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x20(
     const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha);
     const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
-    const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31);
     vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta);
 
-    const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123);
-    const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567);
-    const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB);
-    const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF);
-    const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ);
+    const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123);
+    const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567);
+    const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB);
+    const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF);
+    const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ);
 
     wasm_v128_store(y, vy0123);
     wasm_v128_store(y + 4, vy4567);
@@ -230,9 +225,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x20(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    const v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     wasm_v128_store(y, vy);
     y += 4;
@@ -269,9 +263,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x20(
     vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
 
     const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
-    const v128_t vm = wasm_i32x4_shr(vx, 31);
     vx = wasm_f32x4_mul(vx, vbeta);
-    v128_t vy = wasm_v128_bitselect(ve, vx, vm);
+    v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x24.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x24.c
index 3f2cf2aa6e5..cf0b0476e97 100644
--- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x24.c
+++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x24.c
@@ -196,25 +196,19 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x24(
     const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha);
     const v128_t veKLMN = wasm_f32x4_mul(wasm_f32x4_add(vpKLMN, vsKLMN), valpha);
 
-    const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31);
     vx0123 = wasm_f32x4_mul(vx0123, vbeta);
-    const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31);
     vx4567 = wasm_f32x4_mul(vx4567, vbeta);
-    const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31);
     vx89AB = wasm_f32x4_mul(vx89AB, vbeta);
-    const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31);
     vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta);
-    const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31);
     vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta);
-    const v128_t vmKLMN = wasm_i32x4_shr(vxKLMN, 31);
wasm_f32x4_mul(vxKLMN, vbeta); - const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123); - const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567); - const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB); - const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF); - const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ); - const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vmKLMN); + const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123); + const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567); + const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB); + const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF); + const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ); + const v128_t vyKLMN = __builtin_wasm_signselect_i32x4(veKLMN, vxKLMN, vxKLMN); wasm_v128_store(y, vy0123); wasm_v128_store(y + 4, vy4567); @@ -257,9 +251,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x24( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -296,9 +289,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x24( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x4.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x4.c index 6c794e3509b..017e5d54759 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x4.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x4.c @@ -75,9 +75,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x4( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -114,9 +113,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x4( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x8.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x8.c index b142a3afc8e..3f4ad7c3ce7 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x8.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-lut16-p3-x8.c @@ -104,13 +104,11 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x8( const v128_t ve0123 = wasm_f32x4_mul(wasm_f32x4_add(vp0123, vs0123), valpha); const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha); - const v128_t vm0123 = 
wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); - const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); - const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123); - const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567); + const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123); + const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567); wasm_v128_store(y, vy0123); wasm_v128_store(y + 4, vy4567); @@ -149,9 +147,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x8( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -188,9 +185,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x8( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x12.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x12.c index 32084bd3117..770f49aae0e 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x12.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x12.c @@ -111,16 +111,13 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x12( const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha); const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha); - const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); - const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); - const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31); vx89AB = wasm_f32x4_mul(vx89AB, vbeta); - const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123); - const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567); - const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB); + const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123); + const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567); + const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB); wasm_v128_store(y, vy0123); wasm_v128_store(y + 4, vy4567); @@ -152,9 +149,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x12( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -183,9 +179,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x12( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git 
a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x16.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x16.c index 316c099ebd1..aa585d3f69f 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x16.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x16.c @@ -128,19 +128,15 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x16( const v128_t ve89AB = wasm_f32x4_mul(wasm_f32x4_add(vp89AB, vs89AB), valpha); const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha); - const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); - const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); - const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31); vx89AB = wasm_f32x4_mul(vx89AB, vbeta); - const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31); vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta); - const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123); - const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567); - const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB); - const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF); + const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123); + const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567); + const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB); + const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF); wasm_v128_store(y, vy0123); wasm_v128_store(y + 4, vy4567); @@ -173,9 +169,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x16( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -204,9 +199,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x16( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x20.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x20.c index 55665ca07c6..41bbb786e01 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x20.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x20.c @@ -145,22 +145,17 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20( const v128_t veCDEF = wasm_f32x4_mul(wasm_f32x4_add(vpCDEF, vsCDEF), valpha); const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha); - const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); - const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); - const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31); vx89AB = wasm_f32x4_mul(vx89AB, vbeta); - const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31); vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta); - const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31); vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta); - const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123); - const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567); - const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB); - const v128_t vyCDEF = 
wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF); - const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ); + const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123); + const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567); + const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB); + const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF); + const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ); wasm_v128_store(y, vy0123); wasm_v128_store(y + 4, vy4567); @@ -194,9 +189,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -225,9 +219,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x24.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x24.c index 4d8c0dc5955..976921ddcfb 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x24.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x24.c @@ -162,25 +162,19 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x24( const v128_t veGHIJ = wasm_f32x4_mul(wasm_f32x4_add(vpGHIJ, vsGHIJ), valpha); const v128_t veKLMN = wasm_f32x4_mul(wasm_f32x4_add(vpKLMN, vsKLMN), valpha); - const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); - const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); - const v128_t vm89AB = wasm_i32x4_shr(vx89AB, 31); vx89AB = wasm_f32x4_mul(vx89AB, vbeta); - const v128_t vmCDEF = wasm_i32x4_shr(vxCDEF, 31); vxCDEF = wasm_f32x4_mul(vxCDEF, vbeta); - const v128_t vmGHIJ = wasm_i32x4_shr(vxGHIJ, 31); vxGHIJ = wasm_f32x4_mul(vxGHIJ, vbeta); - const v128_t vmKLMN = wasm_i32x4_shr(vxKLMN, 31); vxKLMN = wasm_f32x4_mul(vxKLMN, vbeta); - const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123); - const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567); - const v128_t vy89AB = wasm_v128_bitselect(ve89AB, vx89AB, vm89AB); - const v128_t vyCDEF = wasm_v128_bitselect(veCDEF, vxCDEF, vmCDEF); - const v128_t vyGHIJ = wasm_v128_bitselect(veGHIJ, vxGHIJ, vmGHIJ); - const v128_t vyKLMN = wasm_v128_bitselect(veKLMN, vxKLMN, vmKLMN); + const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123); + const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567); + const v128_t vy89AB = __builtin_wasm_signselect_i32x4(ve89AB, vx89AB, vx89AB); + const v128_t vyCDEF = __builtin_wasm_signselect_i32x4(veCDEF, vxCDEF, vxCDEF); + const v128_t vyGHIJ = __builtin_wasm_signselect_i32x4(veGHIJ, vxGHIJ, vxGHIJ); + const v128_t vyKLMN = __builtin_wasm_signselect_i32x4(veKLMN, vxKLMN, vxKLMN); wasm_v128_store(y, vy0123); wasm_v128_store(y + 4, vy4567); @@ -215,9 +209,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x24( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, 
vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -246,9 +239,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x24( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x4.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x4.c index 62399a74e75..84242100021 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x4.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x4.c @@ -67,9 +67,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x4( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -98,9 +97,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x4( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x8.c b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x8.c index 44e71bce445..18688724a74 100644 --- a/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x8.c +++ b/src/f32-velu/gen/velu-wasmsimd-x86-rr2-p6-x8.c @@ -94,13 +94,11 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x8( const v128_t ve0123 = wasm_f32x4_mul(wasm_f32x4_add(vp0123, vs0123), valpha); const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha); - const v128_t vm0123 = wasm_i32x4_shr(vx0123, 31); vx0123 = wasm_f32x4_mul(vx0123, vbeta); - const v128_t vm4567 = wasm_i32x4_shr(vx4567, 31); vx4567 = wasm_f32x4_mul(vx4567, vbeta); - const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vm0123); - const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vm4567); + const v128_t vy0123 = __builtin_wasm_signselect_i32x4(ve0123, vx0123, vx0123); + const v128_t vy4567 = __builtin_wasm_signselect_i32x4(ve4567, vx4567, vx4567); wasm_v128_store(y, vy0123); wasm_v128_store(y + 4, vy4567); @@ -131,9 +129,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x8( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -162,9 +159,8 @@ void xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x8( vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, 
vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/wasmsimd-rr2-lut16-p3.c.in b/src/f32-velu/wasmsimd-rr2-lut16-p3.c.in index c51b9198ec8..91d15d14f7b 100644 --- a/src/f32-velu/wasmsimd-rr2-lut16-p3.c.in +++ b/src/f32-velu/wasmsimd-rr2-lut16-p3.c.in @@ -101,11 +101,10 @@ void xnn_f32_velu_ukernel__wasmsimd_${"x86" if X86 else "arm"}_rr2_lut16_p3_x${B const v128_t ve${ABC[N:N+4]} = wasm_f32x4_mul(wasm_f32x4_add(vp${ABC[N:N+4]}, vs${ABC[N:N+4]}), valpha); $for N in range(0, BATCH_TILE, 4): - const v128_t vm${ABC[N:N+4]} = wasm_i32x4_shr(vx${ABC[N:N+4]}, 31); vx${ABC[N:N+4]} = wasm_f32x4_mul(vx${ABC[N:N+4]}, vbeta); $for N in range(0, BATCH_TILE, 4): - const v128_t vy${ABC[N:N+4]} = wasm_v128_bitselect(ve${ABC[N:N+4]}, vx${ABC[N:N+4]}, vm${ABC[N:N+4]}); + const v128_t vy${ABC[N:N+4]} = __builtin_wasm_signselect_i32x4(ve${ABC[N:N+4]}, vx${ABC[N:N+4]}, vx${ABC[N:N+4]}); wasm_v128_store(y, vy${ABC[0:4]}); $for N in range(4, BATCH_TILE, 4): @@ -149,9 +148,8 @@ void xnn_f32_velu_ukernel__wasmsimd_${"x86" if X86 else "arm"}_rr2_lut16_p3_x${B vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -192,9 +190,8 @@ void xnn_f32_velu_ukernel__wasmsimd_${"x86" if X86 else "arm"}_rr2_lut16_p3_x${B vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0); diff --git a/src/f32-velu/wasmsimd-rr2-p6.c.in b/src/f32-velu/wasmsimd-rr2-p6.c.in index 31c3343e0ae..fd5240b9499 100644 --- a/src/f32-velu/wasmsimd-rr2-p6.c.in +++ b/src/f32-velu/wasmsimd-rr2-p6.c.in @@ -99,11 +99,10 @@ void xnn_f32_velu_ukernel__wasmsimd_${"x86" if X86 else "arm"}_rr2_p6_x${BATCH_T const v128_t ve${ABC[N:N+4]} = wasm_f32x4_mul(wasm_f32x4_add(vp${ABC[N:N+4]}, vs${ABC[N:N+4]}), valpha); $for N in range(0, BATCH_TILE, 4): - const v128_t vm${ABC[N:N+4]} = wasm_i32x4_shr(vx${ABC[N:N+4]}, 31); vx${ABC[N:N+4]} = wasm_f32x4_mul(vx${ABC[N:N+4]}, vbeta); $for N in range(0, BATCH_TILE, 4): - const v128_t vy${ABC[N:N+4]} = wasm_v128_bitselect(ve${ABC[N:N+4]}, vx${ABC[N:N+4]}, vm${ABC[N:N+4]}); + const v128_t vy${ABC[N:N+4]} = __builtin_wasm_signselect_i32x4(ve${ABC[N:N+4]}, vx${ABC[N:N+4]}, vx${ABC[N:N+4]}); wasm_v128_store(y, vy${ABC[0:4]}); $for N in range(4, BATCH_TILE, 4): @@ -139,9 +138,8 @@ void xnn_f32_velu_ukernel__wasmsimd_${"x86" if X86 else "arm"}_rr2_p6_x${BATCH_T vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - const v128_t vy = wasm_v128_bitselect(ve, vx, vm); + const v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); wasm_v128_store(y, vy); y += 4; @@ -174,9 +172,8 @@ void xnn_f32_velu_ukernel__wasmsimd_${"x86" if X86 else "arm"}_rr2_p6_x${BATCH_T vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt); const v128_t ve = 
wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha); - const v128_t vm = wasm_i32x4_shr(vx, 31); vx = wasm_f32x4_mul(vx, vbeta); - v128_t vy = wasm_v128_bitselect(ve, vx, vm); + v128_t vy = __builtin_wasm_signselect_i32x4(ve, vx, vx); if (n & (2 * sizeof(float))) { *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
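
Note on the transformation applied throughout this patch: the two-instruction sequence `const v128_t vm = wasm_i32x4_shr(vx, 31);` followed by `wasm_v128_bitselect(ve, vx, vm)` is collapsed into a single `__builtin_wasm_signselect_i32x4(ve, vx, vx)`. Because `wasm_i32x4_shr` is an arithmetic shift, shifting each lane right by 31 smears the sign bit into an all-ones or all-zeros mask, and the bitselect then returns the `ve` lane exactly where the control lane is negative; `i32x4.signselect` performs that selection directly from the sign bit. The scalar sketch below models one 32-bit lane to illustrate the equivalence; `bitselect_lane` and `signselect_lane` are hypothetical helpers written for this note, not functions from wasm_simd128.h or XNNPACK.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Models wasm_v128_bitselect(a, b, m) on one lane: bits of a where m is 1, bits of b elsewhere. */
static uint32_t bitselect_lane(uint32_t a, uint32_t b, uint32_t m) {
  return (a & m) | (b & ~m);
}

/* Models i32x4.signselect(a, b, c) on one lane: a where c is negative, b otherwise. */
static uint32_t signselect_lane(uint32_t a, uint32_t b, uint32_t c) {
  return ((int32_t) c < 0) ? a : b;
}

int main(void) {
  const uint32_t a = 0xAAAAAAAAu;
  const uint32_t b = 0xBBBBBBBBu;
  /* Bit patterns of 0.0f, 1.0f, -0.0f, and -1.0f as the control value. */
  const uint32_t x[4] = { 0x00000000u, 0x3F800000u, 0x80000000u, 0xBF800000u };
  for (int i = 0; i < 4; i++) {
    /* Arithmetic shift by 31: all-ones mask iff the sign bit of x[i] is set. */
    const uint32_t m = (uint32_t) ((int32_t) x[i] >> 31);
    /* Both columns print the same value for every control pattern. */
    printf("x=%08" PRIX32 "  bitselect=%08" PRIX32 "  signselect=%08" PRIX32 "\n",
           x[i], bitselect_lane(a, b, m), signselect_lane(a, b, x[i]));
  }
  return 0;
}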
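One subtlety in the ELU kernels above: the old code built the mask from `vx` before the `vx = wasm_f32x4_mul(vx, vbeta)` scaling, while the new code passes the already-scaled `vx` as the sign source. Assuming `vbeta` holds a positive value, as it does for ELU, this is equivalent: IEEE 754 multiplication by a positive factor preserves the sign bit of every lane, including negative zero, so the selector is unchanged (NaN lanes produce NaN from either selected operand). Also note that `__builtin_wasm_signselect_i32x4` targets the experimental WASM SIMD sign-select instructions, so building and running these kernels presumably requires a toolchain and engine with that proposal enabled.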