[interpreter] Implement SIMD extended multiply instructions

These were accepted into the proposal in WebAssembly#376. There are 12 instructions in total:

- i16x8.extmul_{low,high}_i8x16_{s,u}
- i32x4.extmul_{low,high}_i16x8_{s,u}
- i64x2.extmul_{low,high}_i32x4_{s,u}

The implementation is straightforward: widen both operands (using existing operations), then multiply in the wider shape.

Added a test generation script that reuses some logic from the generator for arithmetic instructions. Since these instructions have different src and dst shapes, I tweaked the base class to allow the source and destination shapes to differ.
ngzhian · Feb 2, 2021 · 270d6c2 · 270d6c2
1 parent 0cd0a20
commit 270d6c2
Show file tree

Hide file tree

Showing 15 changed files with 1,435 additions and 18 deletions.
diff --git a/interpreter/binary/decode.ml b/interpreter/binary/decode.ml
@@ -365,7 +365,11 @@ let simd_prefix s =
   | 0x97l -> i16x8_min_u
   | 0x98l -> i16x8_max_s
   | 0x99l -> i16x8_max_u
+  | 0x9al -> i16x8_extmul_low_i8x16_s
   | 0x9bl -> i16x8_avgr_u
+  | 0x9dl -> i16x8_extmul_high_i8x16_s
+  | 0x9el -> i16x8_extmul_low_i8x16_u
+  | 0x9fl -> i16x8_extmul_high_i8x16_u
   | 0xa0l -> i32x4_abs
   | 0xa1l -> i32x4_neg
   | 0xa3l -> i32x4_all_true
@@ -385,12 +389,20 @@ let simd_prefix s =
   | 0xb8l -> i32x4_max_s
   | 0xb9l -> i32x4_max_u
   | 0xbal -> i32x4_dot_i16x8_s
+  | 0xbbl -> i32x4_extmul_low_i16x8_s
+  | 0xbdl -> i32x4_extmul_high_i16x8_s
+  | 0xbel -> i32x4_extmul_low_i16x8_u
+  | 0xbfl -> i32x4_extmul_high_i16x8_u
   | 0xc1l -> i64x2_neg
   | 0xcbl -> i64x2_shl
   | 0xccl -> i64x2_shr_s
   | 0xcdl -> i64x2_shr_u
   | 0xcel -> i64x2_add
   | 0xd1l -> i64x2_sub
+  | 0xd2l -> i64x2_extmul_low_i32x4_s
+  | 0xd3l -> i64x2_extmul_high_i32x4_s
+  | 0xd6l -> i64x2_extmul_low_i32x4_u
+  | 0xd7l -> i64x2_extmul_high_i32x4_u
   | 0xd5l -> i64x2_mul
   | 0xd8l -> f32x4_ceil
   | 0xd9l -> f32x4_floor

diff --git a/interpreter/binary/encode.ml b/interpreter/binary/encode.ml
@@ -467,6 +467,10 @@ let encode m =
       | Binary (V128 V128Op.(I16x8 MaxS)) -> simd_op 0x98l
       | Binary (V128 V128Op.(I16x8 MaxU)) -> simd_op 0x99l
       | Binary (V128 V128Op.(I16x8 AvgrU)) -> simd_op 0x9bl
+      | Binary (V128 V128Op.(I16x8 ExtMulLowS)) -> simd_op 0x9al
+      | Binary (V128 V128Op.(I16x8 ExtMulHighS)) -> simd_op 0x9dl
+      | Binary (V128 V128Op.(I16x8 ExtMulLowU)) -> simd_op 0x9el
+      | Binary (V128 V128Op.(I16x8 ExtMulHighU)) -> simd_op 0x9fl
       | Binary (V128 V128Op.(I32x4 Add)) -> simd_op 0xael
       | Binary (V128 V128Op.(I32x4 Sub)) -> simd_op 0xb1l
       | Binary (V128 V128Op.(I32x4 MinS)) -> simd_op 0xb6l
@@ -485,9 +489,17 @@ let encode m =
       | Binary (V128 V128Op.(I32x4 LeU)) -> simd_op 0x3el
       | Binary (V128 V128Op.(I32x4 GeS)) -> simd_op 0x3fl
       | Binary (V128 V128Op.(I32x4 GeU)) -> simd_op 0x40l
+      | Binary (V128 V128Op.(I32x4 ExtMulLowS)) -> simd_op 0xbbl
+      | Binary (V128 V128Op.(I32x4 ExtMulHighS)) -> simd_op 0xbdl
+      | Binary (V128 V128Op.(I32x4 ExtMulLowU)) -> simd_op 0xbel
+      | Binary (V128 V128Op.(I32x4 ExtMulHighU)) -> simd_op 0xbfl
       | Binary (V128 V128Op.(I64x2 Add)) -> simd_op 0xcel
       | Binary (V128 V128Op.(I64x2 Sub)) -> simd_op 0xd1l
       | Binary (V128 V128Op.(I64x2 Mul)) -> simd_op 0xd5l
+      | Binary (V128 V128Op.(I64x2 ExtMulLowS)) -> simd_op 0xd2l
+      | Binary (V128 V128Op.(I64x2 ExtMulHighS)) -> simd_op 0xd3l
+      | Binary (V128 V128Op.(I64x2 ExtMulLowU)) -> simd_op 0xd6l
+      | Binary (V128 V128Op.(I64x2 ExtMulHighU)) -> simd_op 0xd7l
       | Binary (V128 V128Op.(F32x4 Eq)) -> simd_op 0x41l
       | Binary (V128 V128Op.(F32x4 Ne)) -> simd_op 0x42l
       | Binary (V128 V128Op.(F32x4 Lt)) -> simd_op 0x43l

diff --git a/interpreter/exec/eval_simd.ml b/interpreter/exec/eval_simd.ml
@@ -101,6 +101,10 @@ module SimdOp (SXX : Simd.S) (Value : ValueType with type t = SXX.t) = struct
       | I16x8 MaxS -> SXX.I16x8.max_s
       | I16x8 MaxU -> SXX.I16x8.max_u
       | I16x8 AvgrU -> SXX.I16x8.avgr_u
+      | I16x8 ExtMulLowS -> SXX.I16x8_convert.extmul_low_s
+      | I16x8 ExtMulHighS -> SXX.I16x8_convert.extmul_high_s
+      | I16x8 ExtMulLowU -> SXX.I16x8_convert.extmul_low_u
+      | I16x8 ExtMulHighU -> SXX.I16x8_convert.extmul_high_u
       | I32x4 Add -> SXX.I32x4.add
       | I32x4 Sub -> SXX.I32x4.sub
       | I32x4 MinS -> SXX.I32x4.min_s
@@ -119,9 +123,17 @@ module SimdOp (SXX : Simd.S) (Value : ValueType with type t = SXX.t) = struct
       | I32x4 GeS -> SXX.I32x4.ge_s
       | I32x4 GeU -> SXX.I32x4.ge_u
       | I32x4 DotI16x8S -> SXX.I32x4_convert.dot_i16x8_s
+      | I32x4 ExtMulLowS -> SXX.I32x4_convert.extmul_low_s
+      | I32x4 ExtMulHighS -> SXX.I32x4_convert.extmul_high_s
+      | I32x4 ExtMulLowU -> SXX.I32x4_convert.extmul_low_u
+      | I32x4 ExtMulHighU -> SXX.I32x4_convert.extmul_high_u
       | I64x2 Add -> SXX.I64x2.add
       | I64x2 Sub -> SXX.I64x2.sub
       | I64x2 Mul -> SXX.I64x2.mul
+      | I64x2 ExtMulLowS -> SXX.I64x2_convert.extmul_low_s
+      | I64x2 ExtMulHighS -> SXX.I64x2_convert.extmul_high_s
+      | I64x2 ExtMulLowU -> SXX.I64x2_convert.extmul_low_u
+      | I64x2 ExtMulHighU -> SXX.I64x2_convert.extmul_high_u
       | F32x4 Eq -> SXX.F32x4.eq
       | F32x4 Ne -> SXX.F32x4.ne
       | F32x4 Lt -> SXX.F32x4.lt

diff --git a/interpreter/exec/simd.ml b/interpreter/exec/simd.ml
@@ -177,6 +177,10 @@ sig
     val widen_high_s : t -> t
     val widen_low_u : t -> t
     val widen_high_u : t -> t
+    val extmul_low_s : t -> t -> t
+    val extmul_high_s : t -> t -> t
+    val extmul_low_u : t -> t -> t
+    val extmul_high_u : t -> t -> t
   end
   module I32x4_convert : sig
     val trunc_sat_f32x4_s : t -> t
@@ -186,10 +190,20 @@ sig
     val widen_low_u : t -> t
     val widen_high_u : t -> t
     val dot_i16x8_s : t -> t -> t
+    val extmul_low_s : t -> t -> t
+    val extmul_high_s : t -> t -> t
+    val extmul_low_u : t -> t -> t
+    val extmul_high_u : t -> t -> t
   end
   module I64x2_convert : sig
     val widen_low_s : t -> t
+    val widen_high_s : t -> t
     val widen_low_u : t -> t
+    val widen_high_u : t -> t
+    val extmul_low_s : t -> t -> t
+    val extmul_high_s : t -> t -> t
+    val extmul_low_u : t -> t -> t
+    val extmul_high_u : t -> t -> t
   end
   module F32x4_convert : sig
     val convert_i32x4_s : t -> t
@@ -417,6 +431,10 @@ struct
     let widen_low_u = widen Lib.List.take 0xffl
     let widen_high_u = widen Lib.List.drop 0xffl
 
+    let extmul_low_s x y = I16x8.mul (widen_low_s x) (widen_low_s y)
+    let extmul_high_s x y = I16x8.mul (widen_high_s x) (widen_high_s y)
+    let extmul_low_u x y = I16x8.mul (widen_low_u x) (widen_low_u y)
+    let extmul_high_u x y = I16x8.mul (widen_high_u x) (widen_high_u y)
   end
 
   module I32x4_convert = struct
@@ -441,16 +459,28 @@ struct
         | [], [] -> []
         | _, _ -> assert false
       in Rep.of_i32x4 (dot xs ys)
+
+    let extmul_low_s x y = I32x4.mul (widen_low_s x) (widen_low_s y)
+    let extmul_high_s x y = I32x4.mul (widen_high_s x) (widen_high_s y)
+    let extmul_low_u x y = I32x4.mul (widen_low_u x) (widen_low_u y)
+    let extmul_high_u x y = I32x4.mul (widen_high_u x) (widen_high_u y)
   end
 
   module I64x2_convert = struct
-    let widen mask x =
+    let widen take_or_drop mask x =
       Rep.of_i64x2
         (List.map
            (fun i32 -> Int64.(logand mask (of_int32 i32)))
-           (Lib.List.take 2 (Rep.to_i32x4 x)))
-    let widen_low_s = widen 0xffffffffffffffffL
-    let widen_low_u = widen 0xffffffffL
+           (take_or_drop 2 (Rep.to_i32x4 x)))
+    let widen_low_s = widen Lib.List.take 0xffffffffffffffffL
+    let widen_high_s = widen Lib.List.drop 0xffffffffffffffffL
+    let widen_low_u = widen Lib.List.take 0xffffffffL
+    let widen_high_u = widen Lib.List.drop 0xffffffffL
+
+    let extmul_low_s x y = I64x2.mul (widen_low_s x) (widen_low_s y)
+    let extmul_high_s x y = I64x2.mul (widen_high_s x) (widen_high_s y)
+    let extmul_low_u x y = I64x2.mul (widen_low_u x) (widen_low_u y)
+    let extmul_high_u x y = I64x2.mul (widen_high_u x) (widen_high_u y)
   end
 
   module F32x4_convert = struct

diff --git a/interpreter/syntax/ast.ml b/interpreter/syntax/ast.ml
@@ -55,6 +55,7 @@ struct
               | Swizzle | Shuffle of int list | NarrowS | NarrowU
               | AddSatS | AddSatU | SubSatS | SubSatU
               | DotI16x8S
+              | ExtMulLowS | ExtMulHighS | ExtMulLowU | ExtMulHighU
   type funop = Abs | Neg | Sqrt
              | Ceil | Floor | Trunc | Nearest
              | ConvertI32x4S | ConvertI32x4U

diff --git a/interpreter/syntax/operators.ml b/interpreter/syntax/operators.ml
@@ -340,6 +340,10 @@ let i16x8_min_u = Binary (V128 V128Op.(I16x8 MinU))
 let i16x8_max_s = Binary (V128 V128Op.(I16x8 MaxS))
 let i16x8_max_u = Binary (V128 V128Op.(I16x8 MaxU))
 let i16x8_avgr_u = Binary (V128 V128Op.(I16x8 AvgrU))
+let i16x8_extmul_low_i8x16_s = Binary (V128 V128Op.(I16x8 ExtMulLowS))
+let i16x8_extmul_high_i8x16_s = Binary (V128 V128Op.(I16x8 ExtMulHighS))
+let i16x8_extmul_low_i8x16_u = Binary (V128 V128Op.(I16x8 ExtMulLowU))
+let i16x8_extmul_high_i8x16_u = Binary (V128 V128Op.(I16x8 ExtMulHighU))
 
 let i32x4_splat = Convert (V128 V128Op.(I32x4 Splat))
 let i32x4_extract_lane imm = SimdExtract (V128Op.I32x4 (ZX, imm))
@@ -375,6 +379,10 @@ let i32x4_mul = Binary (V128 V128Op.(I32x4 Mul))
 let i32x4_trunc_sat_f32x4_s = Unary (V128 V128Op.(I32x4 TruncSatF32x4S))
 let i32x4_trunc_sat_f32x4_u = Unary (V128 V128Op.(I32x4 TruncSatF32x4U))
 let i32x4_dot_i16x8_s = Binary (V128 V128Op.(I32x4 DotI16x8S))
+let i32x4_extmul_low_i16x8_s = Binary (V128 V128Op.(I32x4 ExtMulLowS))
+let i32x4_extmul_high_i16x8_s = Binary (V128 V128Op.(I32x4 ExtMulHighS))
+let i32x4_extmul_low_i16x8_u = Binary (V128 V128Op.(I32x4 ExtMulLowU))
+let i32x4_extmul_high_i16x8_u = Binary (V128 V128Op.(I32x4 ExtMulHighU))
 
 let i64x2_splat = Convert (V128 V128Op.(I64x2 Splat))
 let i64x2_extract_lane imm = SimdExtract (V128Op.I64x2 (ZX, imm))
@@ -386,6 +394,10 @@ let i64x2_mul = Binary (V128 V128Op.(I64x2 Mul))
 let i64x2_shl = SimdShift V128Op.(I64x2 Shl)
 let i64x2_shr_s = SimdShift V128Op.(I64x2 ShrS)
 let i64x2_shr_u = SimdShift V128Op.(I64x2 ShrU)
+let i64x2_extmul_low_i32x4_s = Binary (V128 V128Op.(I64x2 ExtMulLowS))
+let i64x2_extmul_high_i32x4_s = Binary (V128 V128Op.(I64x2 ExtMulHighS))
+let i64x2_extmul_low_i32x4_u = Binary (V128 V128Op.(I64x2 ExtMulLowU))
+let i64x2_extmul_high_i32x4_u = Binary (V128 V128Op.(I64x2 ExtMulHighU))
 
 let f32x4_splat = Convert (V128 V128Op.(F32x4 Splat))
 let f32x4_extract_lane imm = SimdExtract (V128Op.F32x4 (ZX, imm))

diff --git a/interpreter/text/arrange.ml b/interpreter/text/arrange.ml
@@ -296,6 +296,10 @@ struct
     | I16x8 MaxS -> "i16x8.max_s"
     | I16x8 MaxU -> "i16x8.max_u"
     | I16x8 AvgrU -> "i16x8.avgr_u"
+    | I16x8 ExtMulLowS -> "i16x8.extmul_low_i8x16_s"
+    | I16x8 ExtMulHighS -> "i16x8.extmul_high_i8x16_s"
+    | I16x8 ExtMulLowU -> "i16x8.extmul_low_i8x16_u"
+    | I16x8 ExtMulHighU -> "i16x8.extmul_high_i8x16_u"
     | I32x4 Add -> "i32x4.add"
     | I32x4 Sub -> "i32x4.sub"
     | I32x4 Mul -> "i32x4.mul"
@@ -304,9 +308,17 @@ struct
     | I32x4 MaxS -> "i32x4.max_s"
     | I32x4 MaxU -> "i32x4.max_u"
     | I32x4 DotI16x8S -> "i32x4.dot_i16x8_s"
+    | I32x4 ExtMulLowS -> "i32x4.extmul_low_i16x8_s"
+    | I32x4 ExtMulHighS -> "i32x4.extmul_high_i16x8_s"
+    | I32x4 ExtMulLowU -> "i32x4.extmul_low_i16x8_u"
+    | I32x4 ExtMulHighU -> "i32x4.extmul_high_i16x8_u"
     | I64x2 Add -> "i64x2.add"
     | I64x2 Sub -> "i64x2.sub"
     | I64x2 Mul -> "i64x2.mul"
+    | I64x2 ExtMulLowS -> "i64x2.extmul_low_i32x4_s"
+    | I64x2 ExtMulHighS -> "i64x2.extmul_high_i32x4_s"
+    | I64x2 ExtMulLowU -> "i64x2.extmul_low_i32x4_u"
+    | I64x2 ExtMulHighU -> "i64x2.extmul_high_i32x4_u"
     | F32x4 Eq -> "f32x4.eq"
     | F32x4 Ne -> "f32x4.ne"
     | F32x4 Lt -> "f32x4.lt"

diff --git a/interpreter/text/lexer.mll b/interpreter/text/lexer.mll
@@ -578,6 +578,19 @@ rule token = parse
   | "i32x4.dot_i16x8_s"
   { BINARY i32x4_dot_i16x8_s }
 
+  | "i16x8.extmul_low_i8x16_"(sign as s)
+  { BINARY (ext s i16x8_extmul_low_i8x16_s i16x8_extmul_low_i8x16_u) }
+  | "i16x8.extmul_high_i8x16_"(sign as s)
+  { BINARY (ext s i16x8_extmul_high_i8x16_s i16x8_extmul_high_i8x16_u) }
+  | "i32x4.extmul_low_i16x8_"(sign as s)
+  { BINARY (ext s i32x4_extmul_low_i16x8_s i32x4_extmul_low_i16x8_u) }
+  | "i32x4.extmul_high_i16x8_"(sign as s)
+  { BINARY (ext s i32x4_extmul_high_i16x8_s i32x4_extmul_high_i16x8_u) }
+  | "i64x2.extmul_low_i32x4_"(sign as s)
+  { BINARY (ext s i64x2_extmul_low_i32x4_s i64x2_extmul_low_i32x4_u) }
+  | "i64x2.extmul_high_i32x4_"(sign as s)
+  { BINARY (ext s i64x2_extmul_high_i32x4_s i64x2_extmul_high_i32x4_u) }
+
   | (simd_shape as s) { SIMD_SHAPE (simd_shape s) }
 
   | name as s { VAR s }

diff --git a/test/core/simd/meta/gen_tests.py b/test/core/simd/meta/gen_tests.py
@@ -32,6 +32,7 @@
     'simd_f64x2_pmin_pmax',
     'simd_i32x4_dot_i16x8',
     'simd_load_lane',
+    'simd_ext_mul',
 )
 
 

diff --git a/test/core/simd/meta/simd_arithmetic.py b/test/core/simd/meta/simd_arithmetic.py
@@ -35,14 +35,27 @@ def __str__(self):
     def lane(self):
         return self.LANE_VALUE.get(self.LANE_TYPE)
 
+    @property
+    def dst_lane(self):
+        return self.lane
+
+    @property
+    def src_lane(self):
+        # Source lane for arithmetic that extends lanes, e.g. extended
+        # multiply reads i16x8 lanes and produces i32x4 lanes.
+        if hasattr(self, 'SRC_LANE_TYPE'):
+            return self.LANE_VALUE.get(self.SRC_LANE_TYPE)
+        else:
+            return self.lane
+
     @property
     def normal_unary_op_test_data(self):
-        lane = self.lane
+        lane = self.src_lane
         return [0, 1, -1, lane.max - 1, lane.min + 1, lane.min, lane.max, lane.mask]
 
     @property
     def normal_binary_op_test_data(self):
-        lane = self.lane
+        lane = self.src_lane
         return [
             (0, 0),
             (0, 1),
@@ -170,7 +183,7 @@ def get_case_data(self):
             for data_group, v128_forms in self.bin_test_data:
                 for data in data_group:
                     case_data.append([op_name, [str(data[0]), str(data[1])],
-                                      str(o.binary_op(data[0], data[1], self.lane)),
+                                      str(o.binary_op(data[0], data[1], self.src_lane, self.dst_lane)),
                                      v128_forms])
             for data_group in self.full_bin_test_data:
                 for data in data_group.get(op_name):
@@ -183,7 +196,7 @@ def get_case_data(self):
             for data_group, v128_forms in self.unary_test_data:
                 for data in data_group:
                     case_data.append([op_name, [str(data)],
-                                      str(o.unary_op(data, self.lane)),
+                                      str(o.unary_op(data, self.dst_lane)),
                                       v128_forms])
 
         return case_data

diff --git a/test/core/simd/meta/simd_ext_mul.py b/test/core/simd/meta/simd_ext_mul.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+""" Base class for generating extended multiply instructions.  These
+instructions take 2 inputs of the same (narrower) lane shape and multiply
+corresponding lanes with extension (no overflow/wraparound), producing 1 output
+of a (wider) shape. These instructions can choose to work on the low or high
+halves of the inputs, and perform signed or unsigned multiply.
+
+Subclasses need to define 3 attributes:
+  - LANE_TYPE (this is the output shape)
+  - SRC_LANE_TYPE (this is the input (narrower) shape)
+  - BINARY_OPS (list of operations)
+"""
+
+from simd_arithmetic import SimdArithmeticCase
+
+
+class SimdExtMulCase(SimdArithmeticCase):
+    UNARY_OPS = ()
+
+    @property
+    def full_bin_test_data(self):
+        return []
+
+    def get_combine_cases(self):
+        return ''
+
+    @property
+    def bin_test_data(self):
+        lane_forms = [self.SRC_LANE_TYPE, self.SRC_LANE_TYPE, self.LANE_TYPE]
+        return [(self.normal_binary_op_test_data, lane_forms)]
+
+    @property
+    def hex_binary_op_test_data(self):
+        return []
+
+    def gen_test_cases(self):
+        wast_filename = '../simd_{wide}_extmul_{narrow}.wast'.format(
+                wide=self.LANE_TYPE, narrow=self.SRC_LANE_TYPE)
+        with open(wast_filename, 'w') as fp:
+            fp.write(self.get_all_cases())
+
+
+class SimdI16x8ExtMulCase(SimdExtMulCase):
+    LANE_TYPE = 'i16x8'
+    SRC_LANE_TYPE = 'i8x16'
+    BINARY_OPS = ('extmul_low_i8x16_s', 'extmul_high_i8x16_s',
+                  'extmul_low_i8x16_u', 'extmul_high_i8x16_u')
+
+
+class SimdI32x4ExtMulCase(SimdExtMulCase):
+    LANE_TYPE = 'i32x4'
+    SRC_LANE_TYPE = 'i16x8'
+    BINARY_OPS = ('extmul_low_i16x8_s', 'extmul_high_i16x8_s',
+                  'extmul_low_i16x8_u', 'extmul_high_i16x8_u')
+
+
+class SimdI64x2ExtMulCase(SimdExtMulCase):
+    LANE_TYPE = 'i64x2'
+    SRC_LANE_TYPE = 'i32x4'
+    BINARY_OPS = ('extmul_low_i32x4_s', 'extmul_high_i32x4_s',
+                  'extmul_low_i32x4_u', 'extmul_high_i32x4_u')
+
+
+def gen_test_cases():
+    simd_i16x8_ext_mul_case = SimdI16x8ExtMulCase()
+    simd_i16x8_ext_mul_case.gen_test_cases()
+    simd_i32x4_ext_mul_case = SimdI32x4ExtMulCase()
+    simd_i32x4_ext_mul_case.gen_test_cases()
+    simd_i64x2_ext_mul_case = SimdI64x2ExtMulCase()
+    simd_i64x2_ext_mul_case.gen_test_cases()
+
+
+if __name__ == '__main__':
+    gen_test_cases()