@@ -319,22 +319,27 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
319
319
uint32_t neg_imag = extract32 (desc , SIMD_DATA_SHIFT + 1 , 1 );
320
320
intptr_t index = extract32 (desc , SIMD_DATA_SHIFT + 2 , 2 );
321
321
uint32_t neg_real = flip ^ neg_imag ;
322
- uintptr_t i ;
323
- float16 e1 = m [ H2 ( 2 * index + flip )] ;
324
- float16 e3 = m [ H2 ( 2 * index + 1 - flip )] ;
322
+ intptr_t elements = opr_sz / sizeof ( float16 ) ;
323
+ intptr_t eltspersegment = 16 / sizeof ( float16 ) ;
324
+ intptr_t i , j ;
325
325
326
326
/* Shift boolean to the sign bit so we can xor to negate. */
327
327
neg_real <<= 15 ;
328
328
neg_imag <<= 15 ;
329
- e1 ^= neg_real ;
330
- e3 ^= neg_imag ;
331
329
332
- for (i = 0 ; i < opr_sz / 2 ; i += 2 ) {
333
- float16 e2 = n [H2 (i + flip )];
334
- float16 e4 = e2 ;
330
+ for (i = 0 ; i < elements ; i += eltspersegment ) {
331
+ float16 mr = m [H2 (i + 2 * index + 0 )];
332
+ float16 mi = m [H2 (i + 2 * index + 1 )];
333
+ float16 e1 = neg_real ^ (flip ? mi : mr );
334
+ float16 e3 = neg_imag ^ (flip ? mr : mi );
335
335
336
- d [H2 (i )] = float16_muladd (e2 , e1 , d [H2 (i )], 0 , fpst );
337
- d [H2 (i + 1 )] = float16_muladd (e4 , e3 , d [H2 (i + 1 )], 0 , fpst );
336
+ for (j = i ; j < i + eltspersegment ; j += 2 ) {
337
+ float16 e2 = n [H2 (j + flip )];
338
+ float16 e4 = e2 ;
339
+
340
+ d [H2 (j )] = float16_muladd (e2 , e1 , d [H2 (j )], 0 , fpst );
341
+ d [H2 (j + 1 )] = float16_muladd (e4 , e3 , d [H2 (j + 1 )], 0 , fpst );
342
+ }
338
343
}
339
344
clear_tail (d , opr_sz , simd_maxsz (desc ));
340
345
}
@@ -380,22 +385,27 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
380
385
uint32_t neg_imag = extract32 (desc , SIMD_DATA_SHIFT + 1 , 1 );
381
386
intptr_t index = extract32 (desc , SIMD_DATA_SHIFT + 2 , 2 );
382
387
uint32_t neg_real = flip ^ neg_imag ;
383
- uintptr_t i ;
384
- float32 e1 = m [ H4 ( 2 * index + flip )] ;
385
- float32 e3 = m [ H4 ( 2 * index + 1 - flip )] ;
388
+ intptr_t elements = opr_sz / sizeof ( float32 ) ;
389
+ intptr_t eltspersegment = 16 / sizeof ( float32 ) ;
390
+ intptr_t i , j ;
386
391
387
392
/* Shift boolean to the sign bit so we can xor to negate. */
388
393
neg_real <<= 31 ;
389
394
neg_imag <<= 31 ;
390
- e1 ^= neg_real ;
391
- e3 ^= neg_imag ;
392
395
393
- for (i = 0 ; i < opr_sz / 4 ; i += 2 ) {
394
- float32 e2 = n [H4 (i + flip )];
395
- float32 e4 = e2 ;
396
+ for (i = 0 ; i < elements ; i += eltspersegment ) {
397
+ float32 mr = m [H4 (i + 2 * index + 0 )];
398
+ float32 mi = m [H4 (i + 2 * index + 1 )];
399
+ float32 e1 = neg_real ^ (flip ? mi : mr );
400
+ float32 e3 = neg_imag ^ (flip ? mr : mi );
396
401
397
- d [H4 (i )] = float32_muladd (e2 , e1 , d [H4 (i )], 0 , fpst );
398
- d [H4 (i + 1 )] = float32_muladd (e4 , e3 , d [H4 (i + 1 )], 0 , fpst );
402
+ for (j = i ; j < i + eltspersegment ; j += 2 ) {
403
+ float32 e2 = n [H4 (j + flip )];
404
+ float32 e4 = e2 ;
405
+
406
+ d [H4 (j )] = float32_muladd (e2 , e1 , d [H4 (j )], 0 , fpst );
407
+ d [H4 (j + 1 )] = float32_muladd (e4 , e3 , d [H4 (j + 1 )], 0 , fpst );
408
+ }
399
409
}
400
410
clear_tail (d , opr_sz , simd_maxsz (desc ));
401
411
}
0 commit comments