From f7f52445c802ec8ef6574891647d19d07734c90c Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sun, 20 Jun 2021 14:45:32 +0100
Subject: [PATCH] aarch64: Implement lowering rotl/rotr for i128 values

---
 .../codegen/src/isa/aarch64/lower_inst.rs     | 55 +++++++++++++++
 .../filetests/isa/aarch64/shift-rotate.clif   | 69 +++++++++++++++++++
 .../filetests/runtests/i128-rotate.clif       |  2 +-
 3 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 25c83eede65a..4c783785dcab 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -878,6 +878,61 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let ty = ty.unwrap();
             let ty_bits_size = ty_bits(ty) as u8;
 
+            // TODO: We can do much better codegen if we have a constant amt
+            if ty == I128 {
+                let dst = get_output_reg(ctx, outputs[0]);
+                let src = put_input_in_regs(ctx, inputs[0]);
+                let amt_src = put_input_in_regs(ctx, inputs[1]).regs()[0];
+
+                let tmp = ctx.alloc_tmp(I128);
+                let inv_amt = ctx.alloc_tmp(I64).only_reg().unwrap();
+
+                lower_constant_u64(ctx, inv_amt, 128);
+                ctx.emit(Inst::AluRRR {
+                    alu_op: ALUOp::Sub64,
+                    rd: inv_amt,
+                    rn: inv_amt.to_reg(),
+                    rm: amt_src,
+                });
+
+                if is_rotl {
+                    // rotl
+                    // (shl.i128 tmp, amt)
+                    // (ushr.i128 dst, 128-amt)
+
+                    emit_shl_i128(ctx, src, tmp, amt_src);
+                    emit_shr_i128(
+                        ctx,
+                        src,
+                        dst,
+                        inv_amt.to_reg(),
+                        /* is_signed = */ false,
+                    );
+                } else {
+                    // rotr
+                    // (ushr.i128 tmp, amt)
+                    // (shl.i128 dst, 128-amt)
+
+                    emit_shr_i128(ctx, src, tmp, amt_src, /* is_signed = */ false);
+                    emit_shl_i128(ctx, src, dst, inv_amt.to_reg());
+                }
+
+                ctx.emit(Inst::AluRRR {
+                    alu_op: ALUOp::Orr64,
+                    rd: dst.regs()[0],
+                    rn: dst.regs()[0].to_reg(),
+                    rm: tmp.regs()[0].to_reg(),
+                });
+                ctx.emit(Inst::AluRRR {
+                    alu_op: ALUOp::Orr64,
+                    rd: dst.regs()[1],
+                    rn: dst.regs()[1].to_reg(),
+                    rm: tmp.regs()[1].to_reg(),
+                });
+
+                return Ok(());
+            }
+
             let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let rn = put_input_in_reg(
                 ctx,
diff --git a/cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif b/cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif
index e3aab98b7f50..7bdfd3404af7 100644
--- a/cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif
@@ -6,6 +6,39 @@ target aarch64
 
 ;; ROR, variable ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+function %i128_rotr(i128, i128) -> i128 {
+block0(v0: i128, v1: i128):
+    v2 = rotr.i128 v0, v1
+    return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp +; nextln: movz x3, #128 +; nextln: sub x5, x3, x2 +; nextln: orn w4, wzr, w2 +; nextln: lsl x6, x1, #1 +; nextln: lsr x3, x0, x2 +; nextln: lsl x6, x6, x4 +; nextln: lsr x4, x1, x2 +; nextln: ands xzr, x2, #64 +; nextln: orr x2, x3, x6 +; nextln: csel x3, xzr, x4, ne +; nextln: csel x4, x4, x2, ne +; nextln: orn w2, wzr, w5 +; nextln: lsr x6, x0, #1 +; nextln: lsl x1, x1, x5 +; nextln: lsr x2, x6, x2 +; nextln: lsl x0, x0, x5 +; nextln: ands xzr, x5, #64 +; nextln: orr x1, x1, x2 +; nextln: csel x1, x0, x1, ne +; nextln: csel x0, xzr, x0, ne +; nextln: orr x0, x0, x4 +; nextln: orr x1, x1, x3 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + function %f0(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = rotr.i64 v0, v1 @@ -70,6 +103,42 @@ block0(v0: i8, v1: i8): ;; ROL, variable ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +function %i128_rotl(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = rotl.i128 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x3, #128 +; nextln: sub x5, x3, x2 +; nextln: orn w4, wzr, w2 +; nextln: lsr x6, x0, #1 +; nextln: lsl x3, x1, x2 +; nextln: lsr x6, x6, x4 +; nextln: lsl x4, x0, x2 +; nextln: ands xzr, x2, #64 +; nextln: orr x2, x3, x6 +; nextln: csel x3, x4, x2, ne +; nextln: csel x4, xzr, x4, ne +; nextln: orn w2, wzr, w5 +; nextln: lsl x6, x1, #1 +; nextln: lsr x0, x0, x5 +; nextln: lsl x2, x6, x2 +; nextln: lsr x1, x1, x5 +; nextln: ands xzr, x5, #64 +; nextln: orr x2, x0, x2 +; nextln: csel x0, xzr, x1, ne +; nextln: csel x1, x1, x2, ne +; nextln: orr x1, x1, x4 +; nextln: orr x0, x0, x3 +; nextln: mov x2, x0 +; nextln: mov x0, x1 +; nextln: mov x1, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + function %f4(i64, i64) -> i64 { block0(v0: i64, v1: i64): v2 = rotl.i64 v0, v1 diff --git a/cranelift/filetests/filetests/runtests/i128-rotate.clif b/cranelift/filetests/filetests/runtests/i128-rotate.clif index ef2c111194d4..359e9c1a7c0e 100644 --- a/cranelift/filetests/filetests/runtests/i128-rotate.clif +++ b/cranelift/filetests/filetests/runtests/i128-rotate.clif @@ -1,5 +1,5 @@ test run -; target aarch64 TODO: Not yet implemented on aarch64 +target aarch64 ; target s390x TODO: Not yet implemented on s390x target x86_64 machinst