[Arm64] "Move" Intrinsics #35037

tannergooding · 2020-04-16T02:04:07Z

namespace System.Runtime.Intrinsics.Arm
{
    // API-proposal sketch: members are listed as bare signatures (no access
    // modifiers or bodies), grouped under one shared <summary> per overload family.
    public abstract class AdvSimd
    {
        /// <summary>
        /// Duplicate general-purpose register to vector
        /// For each element result[elem] = value
        /// Corresponds to vector forms of DUP and VDUP
        /// </summary>
        // Overloads cover the 8-, 16- and 32-bit integer element types only;
        // the 64-bit integer forms are declared in the nested Arm64 class.
        Vector64<byte>    DuplicateToVector64(byte value);
        Vector64<short>   DuplicateToVector64(short value);
        Vector64<int>     DuplicateToVector64(int value);
        Vector64<sbyte>   DuplicateToVector64(sbyte value);
        Vector64<ushort>  DuplicateToVector64(ushort value);
        Vector64<uint>    DuplicateToVector64(uint value);

        Vector128<byte>   DuplicateToVector128(byte value);
        Vector128<short>  DuplicateToVector128(short value);
        Vector128<int>    DuplicateToVector128(int value);
        Vector128<sbyte>  DuplicateToVector128(sbyte value);
        Vector128<ushort> DuplicateToVector128(ushort value);
        Vector128<uint>   DuplicateToVector128(uint value);

        /// <summary>
        /// Duplicate vector element to vector
        /// For each element result[elem] = value[index]
        /// Corresponds to vector forms of DUP and VDUP
        /// </summary>
        // Four overload groups follow, one per (source width, result width) pair:
        // Vector64 -> Vector64, Vector128 -> Vector64, Vector64 -> Vector128 and
        // Vector128 -> Vector128. Only the last group declares the 64-bit element
        // types (double, long, ulong).
        // NOTE(review): index presumably has to be a constant within the element
        // count of value -- confirm against the DUP (element) encoding.
        Vector64<byte>    DuplicateSelectedScalarToVector64(Vector64<byte> value,     byte index);
        Vector64<short>   DuplicateSelectedScalarToVector64(Vector64<short> value,    byte index);
        Vector64<int>     DuplicateSelectedScalarToVector64(Vector64<int> value,      byte index);
        Vector64<float>   DuplicateSelectedScalarToVector64(Vector64<float> value,    byte index);
        Vector64<sbyte>   DuplicateSelectedScalarToVector64(Vector64<sbyte> value,    byte index);
        Vector64<ushort>  DuplicateSelectedScalarToVector64(Vector64<ushort> value,   byte index);
        Vector64<uint>    DuplicateSelectedScalarToVector64(Vector64<uint> value,     byte index);

        Vector64<byte>    DuplicateSelectedScalarToVector64(Vector128<byte> value,     byte index);
        Vector64<short>   DuplicateSelectedScalarToVector64(Vector128<short> value,    byte index);
        Vector64<int>     DuplicateSelectedScalarToVector64(Vector128<int> value,      byte index);
        Vector64<float>   DuplicateSelectedScalarToVector64(Vector128<float> value,    byte index);
        Vector64<sbyte>   DuplicateSelectedScalarToVector64(Vector128<sbyte> value,    byte index);
        Vector64<ushort>  DuplicateSelectedScalarToVector64(Vector128<ushort> value,   byte index);
        Vector64<uint>    DuplicateSelectedScalarToVector64(Vector128<uint> value,     byte index);

        Vector128<byte>   DuplicateSelectedScalarToVector128(Vector64<byte> value,    byte index);
        Vector128<short>  DuplicateSelectedScalarToVector128(Vector64<short> value,   byte index);
        Vector128<int>    DuplicateSelectedScalarToVector128(Vector64<int> value,     byte index);
        Vector128<float>  DuplicateSelectedScalarToVector128(Vector64<float> value,   byte index);
        Vector128<sbyte>  DuplicateSelectedScalarToVector128(Vector64<sbyte> value,   byte index);
        Vector128<ushort> DuplicateSelectedScalarToVector128(Vector64<ushort> value,  byte index);
        Vector128<uint>   DuplicateSelectedScalarToVector128(Vector64<uint> value,    byte index);

        Vector128<byte>   DuplicateSelectedScalarToVector128(Vector128<byte> value,   byte index);
        Vector128<double> DuplicateSelectedScalarToVector128(Vector128<double> value, byte index);
        Vector128<short>  DuplicateSelectedScalarToVector128(Vector128<short> value,  byte index);
        Vector128<int>    DuplicateSelectedScalarToVector128(Vector128<int> value,    byte index);
        Vector128<long>   DuplicateSelectedScalarToVector128(Vector128<long> value,   byte index);
        Vector128<float>  DuplicateSelectedScalarToVector128(Vector128<float> value,  byte index);
        Vector128<sbyte>  DuplicateSelectedScalarToVector128(Vector128<sbyte> value,  byte index);
        Vector128<ushort> DuplicateSelectedScalarToVector128(Vector128<ushort> value, byte index);
        Vector128<uint>   DuplicateSelectedScalarToVector128(Vector128<uint> value,   byte index);
        Vector128<ulong>  DuplicateSelectedScalarToVector128(Vector128<ulong> value,  byte index);

        // Members declared here are documented as corresponding to DUP / INS only
        // (no VDUP counterpart is listed, unlike the members of the outer class).
        public abstract class Arm64
        {
            /// <summary>
            /// Duplicate general-purpose register to vector
            /// For each element result[elem] = value
            /// Corresponds to vector forms of DUP
            /// </summary>
            // NOTE(review): both overloads return Vector128<T> but are named
            // DuplicateToVector64 -- presumably DuplicateToVector128 was intended;
            // confirm before implementing.
            Vector128<long>   DuplicateToVector64(long value);
            Vector128<ulong>  DuplicateToVector64(ulong value);

            /// <summary>
            /// Insert vector element from another vector element
            /// result[resultIndex] = value[valueIndex]
            /// Corresponds to vector forms of INS
            /// </summary>
            // Every overload takes and returns a Vector128 destination; the source
            // (value) may be Vector64 or Vector128. The Vector64-source group omits
            // the 64-bit element types (double, long, ulong).
            Vector128<byte>   InsertSelectedScalar(Vector128<byte> result,   byte resultIndex, Vector64<byte> value,    byte valueIndex);
            Vector128<short>  InsertSelectedScalar(Vector128<short> result,  byte resultIndex, Vector64<short> value,   byte valueIndex);
            Vector128<int>    InsertSelectedScalar(Vector128<int> result,    byte resultIndex, Vector64<int> value,     byte valueIndex);
            Vector128<float>  InsertSelectedScalar(Vector128<float> result,  byte resultIndex, Vector64<float> value,   byte valueIndex);
            Vector128<sbyte>  InsertSelectedScalar(Vector128<sbyte> result,  byte resultIndex, Vector64<sbyte> value,   byte valueIndex);
            Vector128<ushort> InsertSelectedScalar(Vector128<ushort> result, byte resultIndex, Vector64<ushort> value,  byte valueIndex);
            Vector128<uint>   InsertSelectedScalar(Vector128<uint> result,   byte resultIndex, Vector64<uint> value,    byte valueIndex);

            Vector128<byte>   InsertSelectedScalar(Vector128<byte> result,   byte resultIndex, Vector128<byte> value,   byte valueIndex);
            Vector128<double> InsertSelectedScalar(Vector128<double> result, byte resultIndex, Vector128<double> value, byte valueIndex);
            Vector128<short>  InsertSelectedScalar(Vector128<short> result,  byte resultIndex, Vector128<short> value,  byte valueIndex);
            Vector128<int>    InsertSelectedScalar(Vector128<int> result,    byte resultIndex, Vector128<int> value,    byte valueIndex);
            Vector128<long>   InsertSelectedScalar(Vector128<long> result,   byte resultIndex, Vector128<long> value,   byte valueIndex);
            Vector128<float>  InsertSelectedScalar(Vector128<float> result,  byte resultIndex, Vector128<float> value,  byte valueIndex);
            Vector128<sbyte>  InsertSelectedScalar(Vector128<sbyte> result,  byte resultIndex, Vector128<sbyte> value,  byte valueIndex);
            Vector128<ushort> InsertSelectedScalar(Vector128<ushort> result, byte resultIndex, Vector128<ushort> value, byte valueIndex);
            Vector128<uint>   InsertSelectedScalar(Vector128<uint> result,   byte resultIndex, Vector128<uint> value,   byte valueIndex);
            Vector128<ulong>  InsertSelectedScalar(Vector128<ulong> result,  byte resultIndex, Vector128<ulong> value,  byte valueIndex);
        }
    }
}

ghost · 2020-04-16T02:04:10Z

Tagging subscribers to this area: @tannergooding
Notify danmosemsft if you want to be subscribed.

tannergooding · 2020-04-16T02:04:39Z

CC. @echesakovMSFT, @CarolEidt, @TamarChristinaArm

echesakov · 2020-04-16T02:21:32Z

For InsertSelectedScalar I believe we want Cartesian product of [typeof(result) is Vector64<T>; typeof(result) is Vector128<T>] x [typeof(value) is Vector64<T>; typeof(value) is Vector128<T>]?

echesakov · 2020-04-16T02:23:22Z

Same for DuplicateSelectedScalarToVector128 and DuplicateSelectedScalarToVector64 corresponding to DUP (element)?

tannergooding · 2020-04-16T02:43:57Z

Added the Vector128 DuplicateSelectedScalarToVector64(Vector128, byte) overloads that were missing.

For INS the decoding isn't clear on whether the full set is supported...

tannergooding · 2020-04-16T02:44:52Z

Actually, it looks like the result should always be 128-bits:

Operation
    CheckFPAdvSIMDEnabled64();
    bits(idxdsize) operand = V[n];
    bits(128) result;
    result = V[d];
    Elem[result, dst_index, esize] = Elem[operand, src_index, esize];
    V[d] = result;

TamarChristinaArm · 2020-04-16T11:41:48Z

DuplicateSelectedScalarToVector64 and DuplicateSelectedScalarToVector128 are both definitely implementable on AArch32. InsertSelectedScalar is tricky for 16 and 8 bit ones on AArch32. I'll have to have a think on these..

tannergooding · 2020-04-16T15:20:25Z

DuplicateSelectedScalarToVector64 and DuplicateSelectedScalarToVector128 are both definitely implementable on AArch32

This would be incrementing the register by 1 if the index is greater than Count / 2, correct?

TamarChristinaArm · 2020-04-16T15:55:19Z

Yup, that's correct.

terrajobst · 2020-05-05T17:54:55Z

Video

Looks good as proposed
Should there be an InsertSelectedScalar that returns Vector64?

namespace System.Runtime.Intrinsics.Arm
{
    // API-proposal sketch: members are listed as bare signatures (no access
    // modifiers or bodies), grouped under one shared <summary> per overload family.
    public abstract class AdvSimd
    {
        /// <summary>
        /// Duplicate general-purpose register to vector
        /// For each element result[elem] = value
        /// Corresponds to vector forms of DUP and VDUP
        /// </summary>
        // Overloads cover the 8-, 16- and 32-bit integer element types only;
        // the 64-bit integer forms are declared in the nested Arm64 class.
        Vector64<byte>    DuplicateToVector64(byte value);
        Vector64<short>   DuplicateToVector64(short value);
        Vector64<int>     DuplicateToVector64(int value);
        Vector64<sbyte>   DuplicateToVector64(sbyte value);
        Vector64<ushort>  DuplicateToVector64(ushort value);
        Vector64<uint>    DuplicateToVector64(uint value);

        Vector128<byte>   DuplicateToVector128(byte value);
        Vector128<short>  DuplicateToVector128(short value);
        Vector128<int>    DuplicateToVector128(int value);
        Vector128<sbyte>  DuplicateToVector128(sbyte value);
        Vector128<ushort> DuplicateToVector128(ushort value);
        Vector128<uint>   DuplicateToVector128(uint value);

        /// <summary>
        /// Duplicate vector element to vector
        /// For each element result[elem] = value[index]
        /// Corresponds to vector forms of DUP and VDUP
        /// </summary>
        // Four overload groups follow, one per (source width, result width) pair:
        // Vector64 -> Vector64, Vector128 -> Vector64, Vector64 -> Vector128 and
        // Vector128 -> Vector128. Only the last group declares the 64-bit element
        // types (double, long, ulong).
        // NOTE(review): index presumably has to be a constant within the element
        // count of value -- confirm against the DUP (element) encoding.
        Vector64<byte>    DuplicateSelectedScalarToVector64(Vector64<byte> value,     byte index);
        Vector64<short>   DuplicateSelectedScalarToVector64(Vector64<short> value,    byte index);
        Vector64<int>     DuplicateSelectedScalarToVector64(Vector64<int> value,      byte index);
        Vector64<float>   DuplicateSelectedScalarToVector64(Vector64<float> value,    byte index);
        Vector64<sbyte>   DuplicateSelectedScalarToVector64(Vector64<sbyte> value,    byte index);
        Vector64<ushort>  DuplicateSelectedScalarToVector64(Vector64<ushort> value,   byte index);
        Vector64<uint>    DuplicateSelectedScalarToVector64(Vector64<uint> value,     byte index);

        Vector64<byte>    DuplicateSelectedScalarToVector64(Vector128<byte> value,     byte index);
        Vector64<short>   DuplicateSelectedScalarToVector64(Vector128<short> value,    byte index);
        Vector64<int>     DuplicateSelectedScalarToVector64(Vector128<int> value,      byte index);
        Vector64<float>   DuplicateSelectedScalarToVector64(Vector128<float> value,    byte index);
        Vector64<sbyte>   DuplicateSelectedScalarToVector64(Vector128<sbyte> value,    byte index);
        Vector64<ushort>  DuplicateSelectedScalarToVector64(Vector128<ushort> value,   byte index);
        Vector64<uint>    DuplicateSelectedScalarToVector64(Vector128<uint> value,     byte index);

        Vector128<byte>   DuplicateSelectedScalarToVector128(Vector64<byte> value,    byte index);
        Vector128<short>  DuplicateSelectedScalarToVector128(Vector64<short> value,   byte index);
        Vector128<int>    DuplicateSelectedScalarToVector128(Vector64<int> value,     byte index);
        Vector128<float>  DuplicateSelectedScalarToVector128(Vector64<float> value,   byte index);
        Vector128<sbyte>  DuplicateSelectedScalarToVector128(Vector64<sbyte> value,   byte index);
        Vector128<ushort> DuplicateSelectedScalarToVector128(Vector64<ushort> value,  byte index);
        Vector128<uint>   DuplicateSelectedScalarToVector128(Vector64<uint> value,    byte index);

        Vector128<byte>   DuplicateSelectedScalarToVector128(Vector128<byte> value,   byte index);
        Vector128<double> DuplicateSelectedScalarToVector128(Vector128<double> value, byte index);
        Vector128<short>  DuplicateSelectedScalarToVector128(Vector128<short> value,  byte index);
        Vector128<int>    DuplicateSelectedScalarToVector128(Vector128<int> value,    byte index);
        Vector128<long>   DuplicateSelectedScalarToVector128(Vector128<long> value,   byte index);
        Vector128<float>  DuplicateSelectedScalarToVector128(Vector128<float> value,  byte index);
        Vector128<sbyte>  DuplicateSelectedScalarToVector128(Vector128<sbyte> value,  byte index);
        Vector128<ushort> DuplicateSelectedScalarToVector128(Vector128<ushort> value, byte index);
        Vector128<uint>   DuplicateSelectedScalarToVector128(Vector128<uint> value,   byte index);
        Vector128<ulong>  DuplicateSelectedScalarToVector128(Vector128<ulong> value,  byte index);

        // Members declared here are documented as corresponding to DUP / INS only
        // (no VDUP counterpart is listed, unlike the members of the outer class).
        public abstract class Arm64
        {
            /// <summary>
            /// Duplicate general-purpose register to vector
            /// For each element result[elem] = value
            /// Corresponds to vector forms of DUP
            /// </summary>
            // NOTE(review): both overloads return Vector128<T> but are named
            // DuplicateToVector64 -- presumably DuplicateToVector128 was intended;
            // confirm before implementing.
            Vector128<long>   DuplicateToVector64(long value);
            Vector128<ulong>  DuplicateToVector64(ulong value);

            /// <summary>
            /// Insert vector element from another vector element
            /// result[resultIndex] = value[valueIndex]
            /// Corresponds to vector forms of INS
            /// </summary>
            // Every overload takes and returns a Vector128 destination; the source
            // (value) may be Vector64 or Vector128. The Vector64-source group omits
            // the 64-bit element types (double, long, ulong).
            Vector128<byte>   InsertSelectedScalar(Vector128<byte> result,   byte resultIndex, Vector64<byte> value,    byte valueIndex);
            Vector128<short>  InsertSelectedScalar(Vector128<short> result,  byte resultIndex, Vector64<short> value,   byte valueIndex);
            Vector128<int>    InsertSelectedScalar(Vector128<int> result,    byte resultIndex, Vector64<int> value,     byte valueIndex);
            Vector128<float>  InsertSelectedScalar(Vector128<float> result,  byte resultIndex, Vector64<float> value,   byte valueIndex);
            Vector128<sbyte>  InsertSelectedScalar(Vector128<sbyte> result,  byte resultIndex, Vector64<sbyte> value,   byte valueIndex);
            Vector128<ushort> InsertSelectedScalar(Vector128<ushort> result, byte resultIndex, Vector64<ushort> value,  byte valueIndex);
            Vector128<uint>   InsertSelectedScalar(Vector128<uint> result,   byte resultIndex, Vector64<uint> value,    byte valueIndex);

            Vector128<byte>   InsertSelectedScalar(Vector128<byte> result,   byte resultIndex, Vector128<byte> value,   byte valueIndex);
            Vector128<double> InsertSelectedScalar(Vector128<double> result, byte resultIndex, Vector128<double> value, byte valueIndex);
            Vector128<short>  InsertSelectedScalar(Vector128<short> result,  byte resultIndex, Vector128<short> value,  byte valueIndex);
            Vector128<int>    InsertSelectedScalar(Vector128<int> result,    byte resultIndex, Vector128<int> value,    byte valueIndex);
            Vector128<long>   InsertSelectedScalar(Vector128<long> result,   byte resultIndex, Vector128<long> value,   byte valueIndex);
            Vector128<float>  InsertSelectedScalar(Vector128<float> result,  byte resultIndex, Vector128<float> value,  byte valueIndex);
            Vector128<sbyte>  InsertSelectedScalar(Vector128<sbyte> result,  byte resultIndex, Vector128<sbyte> value,  byte valueIndex);
            Vector128<ushort> InsertSelectedScalar(Vector128<ushort> result, byte resultIndex, Vector128<ushort> value, byte valueIndex);
            Vector128<uint>   InsertSelectedScalar(Vector128<uint> result,   byte resultIndex, Vector128<uint> value,   byte valueIndex);
            Vector128<ulong>  InsertSelectedScalar(Vector128<ulong> result,  byte resultIndex, Vector128<ulong> value,  byte valueIndex);
        }
    }
}

tannergooding · 2020-05-07T19:02:09Z

@echesakovMSFT, @kunalspathak. If neither of you are working on this one, I'm going to pick it up.

It will unblock being able to do #35857 for ARM64.

echesakov · 2020-05-07T19:11:28Z

@tannergooding Please go ahead - I un-assigned myself

tannergooding · 2020-05-08T13:47:03Z

@TamarChristinaArm, could you indicate what instruction is used for:

int64x1_t vdup_lane_s64 (int64x1_t vec, const int lane)
int64x2_t vdupq_lane_s64 (int64x1_t vec, const int lane)

https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?page=3&search=DUP indicates it is supported on ARM32 but VDUP and the other ARM instructions I can find are restricted to 8, 16, and 32 from what I can tell.

It also looks like the variants that take a Vector128<T> input are A64 only, is that correct?

kunalspathak · 2020-05-08T21:18:55Z

I am wondering what is the motivation to implement AdvSimd.DuplicateToVector64() and AdvSimd.DuplicateToVector128() methods. They are same as Vector64.Create() or Vector128.Create() that takes single argument and duplicates it to all lanes.

tannergooding · 2020-05-08T21:33:07Z

Vector64.Create() and Vector128.Create() aren't guaranteed to be DUP instructions, they could be a chain of instructions or be some other instruction depending on the hardware or various environment variables. This is the case on x86, for example, where Broadcast is only available on AVX+. It is also the case for double, long, and ulong on ARM which use MOV and FMOV instead for Vector64.Create.

AdvSimd.Duplicate* however is contracted to map exactly to the DUP instruction and is only supported on types the underlying instruction supports. It's the way we've exposed the APIs so far and we are remaining consistent with that premise as there are all kinds of microarchitectural differences that are essentially impossible to account for, so we try to perform minimal changes to the actual intrinsics and instead emit them exactly as the developer asked us to.

kunalspathak · 2020-05-08T21:35:58Z

Got it. Yeah, I was a little confused as a .NET developer given the set of APIs we expose and the end result of some of them being identical.

tannergooding · 2020-05-08T21:43:09Z

The APIs on Vector64/128/256 mainly exist for convenience because you don't want to have to remember all the different paths for efficiently creating a vector on ARM vs x86 or SSE vs SSE2 vs ... vs AVX vs AVX2; you just want whatever does that the most efficiently. It also gives a convenient place for us to place optimizations like generating constants (#35857) or having low/zero cost conversion between them and the scalar types or between each other.

The set of APIs exposed was based on allowing interaction with the vectors even when HWIntrinsics aren't supported so you can trivially access the various elements from the debugger or to provide a trivial software fallback.

TamarChristinaArm · 2020-05-11T11:11:07Z

@TamarChristinaArm, could you indicate what instruction is used for:
int64x1_t vdup_lane_s64 (int64x1_t vec, const int lane)
int64x2_t vdupq_lane_s64 (int64x1_t vec, const int lane)
https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?page=3&search=DUP indicates it is supported on ARM32 but VDUP and the other ARM instructions I can find are restricted to 8, 16, and 32 from what I can tell.

VDUP (scalar) itself is indeed restricted but the intrinsics use VMOV. Note that vdup_lane_s64 becomes a no-op later since it's VMOV d0, d0.

It also looks like the variants that take a Vector128<T> input are A64 only, is that correct?

No, that looks like an oversight. The same register pair trick can be used to implement them on A32.

tannergooding · 2020-05-12T03:41:02Z

No, that looks like an oversight. the same register pair trick can be used to implement them on A32.

Could you elaborate? The normal register trick is for dealing with just the upper half, while this requires broadcasting to the entire vector which would require, afaict, a dup and then a move?

tannergooding · 2020-05-12T04:17:36Z

Could you also explain how Insert(long/ulong/double) is supported on ARM32? As best as I can tell, there isn't any single instruction it can map to.
Likewise, I can't find anything for InsertSelectedScalar for any type, but it sounded like you had an idea of how it would be supported

TamarChristinaArm · 2020-05-12T08:44:34Z

No, that looks like an oversight. the same register pair trick can be used to implement them on A32.

Could you elaborate? The normal register trick is for dealing with just the upper half, while this requires broadcasting to the entire vector which would require, afaict, a dup and then a move?

The Vector128<T> vector input can be split into the same register pair Dn, Dn+1.

Let's take, e.g., vdupq_laneq_s32. Calling vdupq_laneq_s32(x, 2) should result in

VDUP.32 Q0, D1[1] for instance. vdup_laneq_s32(x, 2) becomes VDUP.32 D0, D1[1] for instance.

TamarChristinaArm · 2020-05-12T14:11:46Z

Could you also explain how Insert(long/ulong/double) is supported on ARM32? As best as I can tell, there isn't any single instruction it can map to.

They map to VMOV. The index determines which part of the pair you're writing to, and you just overwrite that single part, e.g.:

vsetq_lane_u64 (b, a, 0) maps to vmov d0, r0, r1 and vsetq_lane_u64 (b, a, 1) maps to vmov d1, r0, r1. If you're carrying the value on the SIMD side, you can just use the same VMOV Dn, Dm variant, VMOV (register, SIMD).

ACLE doesn't define these because we don't have a float64_t or float64x1_t on AArch32.

If your values are on the SIMD side, you use the SIMD variant of VMOV, but the same principle applies.

Likewise, I can't find anything for InsertSelectedScalar for any type, but it sounded like you had an idea of how it would be supported

Same as the above. The indices determine which one of the pair is used. They all become VMOVs, just different variants.

for 8 and 16 bit you need to go through the GPR using VMOV (general-purpose register to scalar)
for 32 and 64 bit floating point you have VMOV (register) which do it SIMD->SIMD.
for 64 bit int element sizes you can go through VMOV (register, SIMD)
for 32 bit int you need again two GPR VMOVs.

So it's split about half way in complexity, for the ones that go through GPR there's no difference between that and the user using get/set on the elements themselves. In that regard it's a question whether they should be provided.

If it's for ease of use then that may be an argument.

tannergooding · 2020-05-12T14:26:42Z

VDUP.32 Q0, D1[1] for instance. vdup_laneq_s32(x, 2) becomes VDUP.32 D0, D1[1] for instance.

I'm not connecting how this would achieve a Vector128<T> where every element contains VDUP.32 D0, D1[1]. This, as best as I can tell, would result in D0 matching that, but not D1.
You would need to do: VDUP.32 D0, D1[1]; VDUP.32 D1, D1[1] to achieve this instead and so it would be a convenience method and not necessarily a true HWIntrinsic.

for 64 bit int element sizes you can go through VMOV (register, SIMD)
for 32 bit int you need again two GPR VMOVs.

Did you get these two reversed? It sounds like, similarly to the above, that Insert(Vector128<value> byte index, long data) is emulated using two 32-bit inserts so it would be a convenience method and not a true HWIntrinsic?

tannergooding · 2020-05-12T14:36:12Z

If that is the case, it's something we should definitely raise to API review to make sure they are aware and agree. We haven't really exposed "convenience" methods that don't map to a single hardware instruction (ignoring needing to move data into the correct register or from memory).

Instead, all the intrinsics are essentially 1-to-1 mappings which means in the ideal case you get ins tgt, op1, op2 and in the worst case you would get mov op1, [mem]; mov op2, [mem]; ins tgt op1, op2 but never more than the data being loaded.
Anything that is a combination of multiple intrinsics is either part of Vector64/128/256 (such as Create, GetElement, and WithElement) or part of the user library.

Most of the Lower/Upper combinations can still fit the 1-to-1 mapping since the two halves of the V128 are individually addressable as registers, but it doesn't sound like the case for Insert or Duplicate.

InsertSelectedScalar feels like it is in the same boat, as it really requires an Extract + Insert to accomplish (if I understood correctly).

TamarChristinaArm · 2020-05-12T15:44:19Z

VDUP.32 Q0, D1[1] for instance. vdup_laneq_s32(x, 2) becomes VDUP.32 D0, D1[1] for instance.

I'm not connecting how this would achieve a Vector128<T> where every element contains VDUP.32 D0, D1[1]. This, as best as I can tell, would result in D0 matching that, but not D1.
You would need to do: VDUP.32 D0, D1[1]; VDUP.32 D1, D1[1] to achieve this instead and so it would be a convenience method and not necessarily a true HWIntrinsic.

You have two versions where you can have a Vector128<T> as an input.

When the output is a Vector64<T>. that's when you use VDUP.32 D0, Dn(+1)[0-1];
When the output is a Vector128<T>, in which case you use VDUP.32 Q0, Dn(+1)[0-1];

The instruction has a Q variant. The only variant that doesn't use VDUP is when the input is a Vector128<64-bit type>, which just uses VMOV. This is the only one that requires two instructions, which is similar to a user doing a get and two sets themselves.

for 64 bit int element sizes you can go through VMOV (register, SIMD)
for 32 bit int you need again two GPR VMOVs.

Did you get these two reversed? It sounds like, similarly to the above, that

No, if you have a Vector128 and you're inserting a long, the only thing you have to do is overwrite the right Dn half. If you're inserting in lane 0 you get VMOV Dm, Dn; if lane 1, you get VMOV Dm+1, Dn. That is the case if your long was in a vector register. If it's not, you just use the VMOV from the GPR: VMOV Dn, rY, rX.

The problem with the 32-bit variant is that you don't have a 32-bit variant of VMOV for non-FP that does a VMOV between S registers. So you have to move it back to a GPR to use the GPR->SIMD instruction that can.

Insert(Vector128<value> byte index, long data) is emulated using two 32-bit inserts so it would be a convenience method and not a true HWIntrinsic?

The long no, that's just VMOV Dn(+1), Dm(+1).

Insert(a, 0, x) becomes VMOV D[lower reg pair a], D[reg x], and Insert(a, 1, x) becomes VMOV D[upper reg pair a], D[reg x] if x is in a SIMD register. Otherwise, you use the variant that transfers from GPR, i.e. VMOV D[upper reg pair a], r[reg_lowpart x], r[reg_highpart x]

tannergooding · 2020-05-12T15:55:49Z

Ah, ok. I think I understand now.

I missed that VMOV (between two general-purpose registers and a doubleword floating-point register) was a thing and wasn't considering that Vector128<long> DuplicateSelectedScalarToVector64 is really just a move low to high.

Thanks for the explanation (and sorry for the confusion)!

TamarChristinaArm · 2020-05-12T16:11:57Z

Ah, ok. I think I understand now.

I missed that VMOV (between two general-purpose registers and a doubleword floating-point register) was a thing

Yeah there are quite a lot of VMOV variants :)

and wasn't considering that Vector128<long> DuplicateSelectedScalarToVector64 is really just a move low to high.

Yeah, though you also need a mov low to low in the new register to copy it, which is why it can be considered a helper.

Thanks for the explanation (and sorry for the confusion)!

No worries, glad it's clear :) The helpers do indeed need an API design to see if it's worth it. I suspect we didn't do them in ACLE because it wasn't worth the complication.

Dotnet-GitSync-Bot added area-System.Runtime.Intrinsics untriaged New issue has not been triaged by the area owner labels Apr 16, 2020

tannergooding added api-ready-for-review and removed untriaged New issue has not been triaged by the area owner labels Apr 16, 2020

echesakov mentioned this issue May 1, 2020

ARM64 intrinsic support for Vector64.Create() and Vector128.Create() #35590

Merged

terrajobst added api-approved API was approved in API review, it can be implemented and removed api-ready-for-review labels May 5, 2020

echesakov self-assigned this May 6, 2020

echesakov removed their assignment May 7, 2020

tannergooding self-assigned this May 7, 2020

tannergooding mentioned this issue May 8, 2020

[ARM64] Implement Duplicate and DuplicateSelectedScalar #36144

Merged

TamarChristinaArm mentioned this issue May 12, 2020

Optimize Vector64 and Vector128.Create methods #36267

Merged

JulieLeeMSFT added this to the 5.0 milestone May 18, 2020

echesakov assigned echesakov and unassigned tannergooding Jun 10, 2020

echesakov mentioned this issue Jun 11, 2020

[Arm64] ASIMD By Element Intrinsics #36916

Merged

echesakov closed this as completed in #36916 Jun 17, 2020

echesakov mentioned this issue Jun 19, 2020

[Arm64] ASIMD InsertScalar #38137

Closed

ghost locked as resolved and limited conversation to collaborators Dec 9, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Arm64] "Move" Intrinsics #35037

[Arm64] "Move" Intrinsics #35037

tannergooding commented Apr 16, 2020 •

edited

Loading

ghost commented Apr 16, 2020

tannergooding commented Apr 16, 2020

echesakov commented Apr 16, 2020

echesakov commented Apr 16, 2020

tannergooding commented Apr 16, 2020

tannergooding commented Apr 16, 2020

TamarChristinaArm commented Apr 16, 2020 •

edited

Loading

tannergooding commented Apr 16, 2020

TamarChristinaArm commented Apr 16, 2020

terrajobst commented May 5, 2020 •

edited

Loading

tannergooding commented May 7, 2020

echesakov commented May 7, 2020

tannergooding commented May 8, 2020

kunalspathak commented May 8, 2020 •

edited

Loading

tannergooding commented May 8, 2020

kunalspathak commented May 8, 2020

tannergooding commented May 8, 2020

TamarChristinaArm commented May 11, 2020 •

edited

Loading

tannergooding commented May 12, 2020

tannergooding commented May 12, 2020

TamarChristinaArm commented May 12, 2020

TamarChristinaArm commented May 12, 2020

tannergooding commented May 12, 2020

tannergooding commented May 12, 2020

TamarChristinaArm commented May 12, 2020

tannergooding commented May 12, 2020 •

edited

Loading

TamarChristinaArm commented May 12, 2020

[Arm64] "Move" Intrinsics #35037

[Arm64] "Move" Intrinsics #35037

Comments

tannergooding commented Apr 16, 2020 • edited Loading

ghost commented Apr 16, 2020

tannergooding commented Apr 16, 2020

echesakov commented Apr 16, 2020

echesakov commented Apr 16, 2020

tannergooding commented Apr 16, 2020

tannergooding commented Apr 16, 2020

TamarChristinaArm commented Apr 16, 2020 • edited Loading

tannergooding commented Apr 16, 2020

TamarChristinaArm commented Apr 16, 2020

terrajobst commented May 5, 2020 • edited Loading

tannergooding commented May 7, 2020

echesakov commented May 7, 2020

tannergooding commented May 8, 2020

kunalspathak commented May 8, 2020 • edited Loading

tannergooding commented May 8, 2020

kunalspathak commented May 8, 2020

tannergooding commented May 8, 2020

TamarChristinaArm commented May 11, 2020 • edited Loading

tannergooding commented May 12, 2020

tannergooding commented May 12, 2020

TamarChristinaArm commented May 12, 2020

TamarChristinaArm commented May 12, 2020

tannergooding commented May 12, 2020

tannergooding commented May 12, 2020

TamarChristinaArm commented May 12, 2020

tannergooding commented May 12, 2020 • edited Loading

TamarChristinaArm commented May 12, 2020

tannergooding commented Apr 16, 2020 •

edited

Loading

TamarChristinaArm commented Apr 16, 2020 •

edited

Loading

terrajobst commented May 5, 2020 •

edited

Loading

kunalspathak commented May 8, 2020 •

edited

Loading

TamarChristinaArm commented May 11, 2020 •

edited

Loading

tannergooding commented May 12, 2020 •

edited

Loading