From 946718556c931dcc19135359e0178a413eaa0309 Mon Sep 17 00:00:00 2001
From: gnzlbg <gnzlbg@users.noreply.github.com>
Date: Wed, 15 Aug 2018 18:20:33 +0200
Subject: [PATCH] Add wasm32 simd128 intrinsics (#549)

* Add wasm32 simd128 intrinsics

* test wasm32 simd128 instructions

* Run wasm tests like all other tests

* use modules instead of types to access wasm simd128 interpretations

* generate docs for wasm32-unknown-unknown

* fix typo

* Enable #[assert_instr] on wasm32

* Shell out to Node's `execSync` to execute `wasm2wat` over our wasm file
* Parse the wasm file line-by-line, looking for various function markers and
  such
* Use the `elem` section to build a function pointer table, allowing us to map
  exactly from function pointer to a function
* Avoid losing debug info (the names section) in release mode by stripping
  `--strip-debug` from `rust-lld`.

* remove exclude list from Cargo.toml

* fix assert_instr for non-wasm targets

* re-format assert-instr changes

* add crate that uses assert_instr

* Fix instructions having extra quotes

* Add assert_instr for wasm memory intrinsics

* Remove hacks for git wasm-bindgen

* add wasm_simd128 feature

* make wasm32 build correctly

* run simd128 tests on ci

* remove wasm-assert-instr-tests
---
 .travis.yml                                 |   12 -
 Cargo.toml                                  |    3 +
 ci/docker/wasm32-unknown-unknown/Dockerfile |   37 +
 ci/dox.sh                                   |    1 +
 ci/lld-shim.rs                              |   11 +
 ci/run-docker.sh                            |    6 +-
 ci/run.sh                                   |    6 +-
 coresimd/simd_llvm.rs                       |    3 +-
 coresimd/{wasm32.rs => wasm32/mod.rs}       |   19 +
 coresimd/wasm32/simd128.rs                  | 1424 +++++++++++++++++++
 crates/assert-instr-macro/src/lib.rs        |   28 +-
 crates/coresimd/Cargo.toml                  |    5 +
 crates/coresimd/src/lib.rs                  |    6 +-
 crates/stdsimd-test/Cargo.toml              |    4 +
 crates/stdsimd-test/src/lib.rs              |  174 ++-
 crates/stdsimd/Cargo.toml                   |    4 +
 crates/stdsimd/src/lib.rs                   |    1 +
 17 files changed, 1684 insertions(+), 60 deletions(-)
 create mode 100644 ci/docker/wasm32-unknown-unknown/Dockerfile
 create mode 100644 ci/lld-shim.rs
 rename coresimd/{wasm32.rs => wasm32/mod.rs} (71%)
 create mode 100644 coresimd/wasm32/simd128.rs

diff --git a/.travis.yml b/.travis.yml
index 0746da3949..6b21652456 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,18 +30,6 @@ matrix:
       env: TARGET=x86_64-apple-darwin NO_ADD=1
       script: ci/run.sh
     - env: TARGET=wasm32-unknown-unknown
-      before_script:
-        - git clone --recursive https://github.com/WebAssembly/wabt
-        - (cd wabt && git reset --hard a0bdeb7 && make -j4)
-        - export PATH=$PATH:$PWD/wabt/bin
-      script:
-        - cargo build --target wasm32-unknown-unknown -p stdsimd
-        - cargo build --target wasm32-unknown-unknown -p stdsimd --release
-        - cargo rustc --target wasm32-unknown-unknown -p stdsimd --release --example wasm -- -C lto
-        - wasm2wat target/wasm32-unknown-unknown/release/examples/wasm.wasm -o wasm.wat
-        - cat wasm.wat
-        - grep current_memory wasm.wat
-        - grep grow_memory wasm.wat
     - env: TARGET=thumbv6m-none-eabi NOSTD=1
     - env: TARGET=thumbv7m-none-eabi NOSTD=1
     - env: TARGET=thumbv7em-none-eabi NOSTD=1
diff --git a/Cargo.toml b/Cargo.toml
index d789fed9aa..4e96e5f494 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,9 @@ members = [
   "crates/stdsimd-verify",
   "crates/stdsimd",
 ]
+exclude = [
+  "crates/wasm-assert-instr-tests"
+]
 
 [profile.release]
 debug = true
diff --git a/ci/docker/wasm32-unknown-unknown/Dockerfile b/ci/docker/wasm32-unknown-unknown/Dockerfile
new file mode 100644
index 0000000000..56eef71204
--- /dev/null
+++ b/ci/docker/wasm32-unknown-unknown/Dockerfile
@@ -0,0 +1,37 @@
+FROM ubuntu:18.04
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends \
+  ca-certificates \
+  clang \
+  cmake \
+  curl \
+  git \
+  libc6-dev \
+  make \
+  python \
+  xz-utils
+
+# Install `wasm2wat`
+RUN git clone --recursive https://github.com/WebAssembly/wabt
+RUN make -C wabt -j$(nproc)
+ENV PATH=$PATH:/wabt/bin
+
+# Install `wasm-bindgen-test-runner`
+RUN curl -L https://github.com/rustwasm/wasm-bindgen/releases/download/0.2.16/wasm-bindgen-0.2.16-x86_64-unknown-linux-musl.tar.gz \
+  | tar xzf -
+ENV PATH=$PATH:/wasm-bindgen-0.2.16-x86_64-unknown-linux-musl
+ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner
+
+# Install `node`
+RUN curl https://nodejs.org/dist/v10.8.0/node-v10.8.0-linux-x64.tar.xz | tar xJf -
+ENV PATH=$PATH:/node-v10.8.0-linux-x64/bin
+
+# We use a shim linker that removes `--strip-debug` when passed to LLD. While
+# this typically results in invalid debug information in release mode it doesn't
+# result in an invalid names section which is what we're interested in.
+COPY lld-shim.rs /
+ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_LINKER=/tmp/lld-shim
+
+# Rustc isn't available until this container starts, so defer compilation of the
+# shim.
+ENTRYPOINT /rust/bin/rustc /lld-shim.rs -o /tmp/lld-shim && exec bash "$@"
diff --git a/ci/dox.sh b/ci/dox.sh
index a604fb541d..fe7e04711d 100755
--- a/ci/dox.sh
+++ b/ci/dox.sh
@@ -44,6 +44,7 @@ dox aarch64 aarch64-unknown-linux-gnu
 dox powerpc64le powerpc64le-unknown-linux-gnu
 dox mips mips-unknown-linux-gnu
 dox mips64 mips64-unknown-linux-gnuabi64
+dox wasm32 wasm32-unknown-unknown
 
 # If we're on travis, not a PR, and on the right branch, publish!
 if [ "$TRAVIS_PULL_REQUEST" = "false" ] && [ "$TRAVIS_BRANCH" = "master" ]; then
diff --git a/ci/lld-shim.rs b/ci/lld-shim.rs
new file mode 100644
index 0000000000..10263869e8
--- /dev/null
+++ b/ci/lld-shim.rs
@@ -0,0 +1,11 @@
+use std::os::unix::prelude::*;
+use std::process::Command;
+use std::env;
+
+fn main() {
+    let args = env::args()
+        .skip(1)
+        .filter(|s| s != "--strip-debug")
+        .collect::<Vec<_>>();
+    panic!("failed to exec: {}", Command::new("rust-lld").args(&args).exec());
+}
diff --git a/ci/run-docker.sh b/ci/run-docker.sh
index 0c560c825c..5226363410 100755
--- a/ci/run-docker.sh
+++ b/ci/run-docker.sh
@@ -13,8 +13,8 @@ run() {
       --user `id -u`:`id -g` \
       --rm \
       --init \
-      --volume $HOME/.cargo:/cargo \
-      --env CARGO_HOME=/cargo \
+      --volume $HOME/.cargo:/cargo-h \
+      --env CARGO_HOME=/cargo-h \
       --volume `rustc --print sysroot`:/rust:ro \
       --env TARGET=$target \
       --env STDSIMD_TEST_EVERYTHING \
@@ -25,7 +25,7 @@ run() {
       --privileged \
       stdsimd \
       bash \
-      -c 'PATH=$PATH:/rust/bin exec ci/run.sh'
+      -c 'PATH=/rust/bin:$PATH exec ci/run.sh'
 }
 
 if [ -z "$1" ]; then
diff --git a/ci/run.sh b/ci/run.sh
index d2350fc6c7..8bc915d38b 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -59,13 +59,17 @@ cargo_test() {
 cargo_test
 cargo_test "--release"
 
-# Test x86 targets compiled with AVX.
+# Test targets compiled with extra features.
 case ${TARGET} in
     x86*)
         RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
         export STDSIMD_DISABLE_ASSERT_INSTR=1
         cargo_test "--release"
         ;;
+    wasm32-unknown-unknown*)
+        # export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128"
+        cargo_test "--release --features=wasm_simd128"
+        ;;
     *)
         ;;
 esac
diff --git a/coresimd/simd_llvm.rs b/coresimd/simd_llvm.rs
index 2ba3944bd4..072a950b4c 100644
--- a/coresimd/simd_llvm.rs
+++ b/coresimd/simd_llvm.rs
@@ -51,8 +51,7 @@ extern "platform-intrinsic" {
     pub fn simd_select<M, T>(m: M, a: T, b: T) -> T;
 
     pub fn simd_fmin<T>(a: T, b: T) -> T;
-    // FIXME: https://github.com/rust-lang-nursery/stdsimd/issues/416
-    // pub fn simd_fmax<T>(a: T, b: T) -> T;
+    pub fn simd_fmax<T>(a: T, b: T) -> T;
 
     pub fn simd_fsqrt<T>(a: T) -> T;
     pub fn simd_fma<T>(a: T, b: T, c: T) -> T;
diff --git a/coresimd/wasm32.rs b/coresimd/wasm32/mod.rs
similarity index 71%
rename from coresimd/wasm32.rs
rename to coresimd/wasm32/mod.rs
index ac13458122..2862d296a9 100644
--- a/coresimd/wasm32.rs
+++ b/coresimd/wasm32/mod.rs
@@ -1,3 +1,20 @@
+//! WASM32 intrinsics
+
+
+#[macro_use]
+#[cfg(all(not(test), feature = "wasm_simd128"))]
+mod simd128;
+
+#[cfg(all(test, feature = "wasm_simd128"))]
+pub mod simd128;
+#[cfg(all(test, feature = "wasm_simd128"))]
+pub use self::simd128::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+#[cfg(test)]
+use wasm_bindgen_test::wasm_bindgen_test;
+
 extern "C" {
     #[link_name = "llvm.wasm.grow.memory.i32"]
     fn llvm_grow_memory(pages: i32) -> i32;
@@ -12,6 +29,7 @@ extern "C" {
 ///
 /// [instr]: https://github.com/WebAssembly/design/blob/master/Semantics.md#resizing
 #[inline]
+#[cfg_attr(test, assert_instr("memory.size"))]
 pub unsafe fn current_memory() -> i32 {
     llvm_current_memory()
 }
@@ -25,6 +43,7 @@ pub unsafe fn current_memory() -> i32 {
 ///
 /// [instr]: https://github.com/WebAssembly/design/blob/master/Semantics.md#resizing
 #[inline]
+#[cfg_attr(test, assert_instr("memory.grow"))]
 pub unsafe fn grow_memory(delta: i32) -> i32 {
     llvm_grow_memory(delta)
 }
diff --git a/coresimd/wasm32/simd128.rs b/coresimd/wasm32/simd128.rs
new file mode 100644
index 0000000000..3c23189942
--- /dev/null
+++ b/coresimd/wasm32/simd128.rs
@@ -0,0 +1,1424 @@
+//! This module implements the [WebAssembly `SIMD128` ISA].
+//!
+//! [WebAssembly `SIMD128` ISA]:
+//! https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md
+//
+// This file is structured as follows:
+// * first the types are defined
+// * then macros implementing the different APIs are provided
+// * finally the API of each type is implemented
+//
+#![allow(non_camel_case_types)]
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+#[cfg(test)]
+use wasm_bindgen_test::wasm_bindgen_test;
+
+////////////////////////////////////////////////////////////////////////////////
+// Types
+
+/// A single unconstrained byte (0-255).
+pub type ImmByte = u8;
+/// A byte with values in the range 0–1 identifying a lane.
+pub type LaneIdx2 = u8;
+/// A byte with values in the range 0–3 identifying a lane.
+pub type LaneIdx4 = u8;
+/// A byte with values in the range 0–7 identifying a lane.
+pub type LaneIdx8 = u8;
+/// A byte with values in the range 0–15 identifying a lane.
+pub type LaneIdx16 = u8;
+/// A byte with values in the range 0–31 identifying a lane.
+pub type LaneIdx32 = u8;
+
+types! {
+    /// WASM-specific 128-bit wide SIMD vector type
+    pub struct v128(i128);
+}
+
+mod sealed {
+    types! {
+        /// 128-bit wide SIMD vector type with 16 8-bit wide signed lanes
+        pub struct v8x16(
+            pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
+            pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8, pub i8,
+        );
+        /// 128-bit wide SIMD vector type with 8 16-bit wide signed lanes
+        pub struct v16x8(
+            pub i16, pub i16, pub i16, pub i16,
+            pub i16, pub i16, pub i16, pub i16
+        );
+        /// 128-bit wide SIMD vector type with 4 32-bit wide signed lanes
+        pub struct v32x4(pub i32, pub i32, pub i32, pub i32);
+        /// 128-bit wide SIMD vector type with 2 64-bit wide signed lanes
+        pub struct v64x2(pub i64, pub i64);
+
+        /// 128-bit wide SIMD vector type with 16 8-bit wide unsigned lanes
+        pub struct u8x16(
+            pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
+            pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8, pub u8,
+        );
+        /// 128-bit wide SIMD vector type with 8 16-bit wide unsigned lanes
+        pub struct u16x8(
+            pub u16, pub u16, pub u16, pub u16,
+            pub u16, pub u16, pub u16, pub u16
+        );
+        /// 128-bit wide SIMD vector type with 4 32-bit wide unsigned lanes
+        pub struct u32x4(pub u32, pub u32, pub u32, pub u32);
+        /// 128-bit wide SIMD vector type with 2 64-bit wide unsigned lanes
+        pub struct u64x2(pub u64, pub u64);
+
+        /// 128-bit wide SIMD vector type with 4 32-bit wide floating-point lanes
+        pub struct f32x4(pub f32, pub f32, pub f32, pub f32);
+        /// 128-bit wide SIMD vector type with 2 64-bit wide floating-point lanes
+        pub struct f64x2(pub f64, pub f64);
+    }
+
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[link_name = "llvm.fabs.v4f32"]
+        fn abs_v4f32(x: f32x4) -> f32x4;
+        #[link_name = "llvm.fabs.v2f64"]
+        fn abs_v2f64(x: f64x2) -> f64x2;
+        #[link_name = "llvm.sqrt.v4f32"]
+        fn sqrt_v4f32(x: f32x4) -> f32x4;
+        #[link_name = "llvm.sqrt.v2f64"]
+        fn sqrt_v2f64(x: f64x2) -> f64x2;
+        #[link_name = "shufflevector"]
+        pub fn shufflevector_v16i8(x: v8x16, y: v8x16, i: v8x16) -> v8x16;
+
+    }
+    impl f32x4 {
+        #[inline(always)]
+        pub unsafe fn abs(self) -> Self {
+            abs_v4f32(self)
+        }
+        #[inline(always)]
+        pub unsafe fn sqrt(self) -> Self {
+            sqrt_v4f32(self)
+        }
+    }
+    impl f64x2 {
+        #[inline(always)]
+        pub unsafe fn abs(self) -> Self {
+            abs_v2f64(self)
+        }
+        #[inline(always)]
+        pub unsafe fn sqrt(self) -> Self {
+            sqrt_v2f64(self)
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Macros implementing the spec APIs:
+
+macro_rules! impl_splat {
+    ($id:ident[$ivec_ty:ident : $elem_ty:ident] <= $x_ty:ident | $($lane_id:ident),*) => {
+        /// Create vector with identical lanes
+        ///
+        /// Construct a vector with `x` replicated to all lanes.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.splat))]
+        pub const unsafe fn splat(x: $x_ty) -> v128 {
+            union U {
+                vec: self::sealed::$ivec_ty,
+                res: v128
+            }
+            U { vec: self::sealed::$ivec_ty($({ struct $lane_id; x as $elem_ty}),*) }.res
+        }
+    }
+}
+
+macro_rules! impl_extract_lane {
+    ($id:ident[$ivec_ty:ident : $selem_ty:ident|$uelem_ty:ident]($lane_idx:ty)
+     => $x_ty:ident) => {
+        /// Extract lane as a scalar (sign-extend)
+        ///
+        /// Extract the scalar value of lane specified in the immediate
+        /// mode operand `imm` from `a` by sign-extending it.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.extract_lane_s, imm =
+        // 0))]
+        #[rustc_args_required_const(1)]
+        pub unsafe fn extract_lane_s(a: v128, imm: $lane_idx) -> $x_ty {
+            use coresimd::simd_llvm::simd_extract;
+            union U {
+                vec: self::sealed::$ivec_ty,
+                a: v128,
+            }
+            // the vectors store a signed integer => extract into it
+            let v: $selem_ty = simd_extract(
+                U { a }.vec,
+                imm as u32, /* zero-extends index */
+            );
+            v as $x_ty
+        }
+
+        /// Extract lane as a scalar (zero-extend)
+        ///
+        /// Extract the scalar value of lane specified in the immediate
+        /// mode operand `imm` from `a` by zero-extending it.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.extract_lane_u, imm =
+        // 0))]
+        #[rustc_args_required_const(1)]
+        pub unsafe fn extract_lane_u(a: v128, imm: $lane_idx) -> $x_ty {
+            use coresimd::simd_llvm::simd_extract;
+            union U {
+                vec: self::sealed::$ivec_ty,
+                a: v128,
+            }
+            // the vectors store a signed integer => extract into it
+            let v: $selem_ty = simd_extract(
+                U { a }.vec,
+                imm as u32, /* zero-extends index */
+            );
+            // re-interpret the signed integer as an unsigned one of the
+            // same size (no-op)
+            let v: $uelem_ty = ::mem::transmute(v);
+            // cast the internal unsigned integer to a larger signed
+            // integer (zero-extends)
+            v as $x_ty
+        }
+    };
+    ($id:ident[$ivec_ty:ident]($lane_idx:ty) => $x_ty:ident) => {
+        /// Extract lane as a scalar
+        ///
+        /// Extract the scalar value of lane specified in the immediate
+        /// mode operand `imm` from `a`.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.extract_lane, imm =
+        // 0))]
+        #[rustc_args_required_const(1)]
+        pub unsafe fn extract_lane(a: v128, imm: $lane_idx) -> $x_ty {
+            use coresimd::simd_llvm::simd_extract;
+            union U {
+                vec: self::sealed::$ivec_ty,
+                a: v128,
+            }
+            // the lane type is already `$x_ty` => extract it directly
+            simd_extract(U { a }.vec, imm as u32 /* zero-extends index */)
+        }
+    };
+}
+
+macro_rules! impl_replace_lane {
+    ($id:ident[$ivec_ty:ident:$ielem_ty:ident]($lane_idx:ty) <= $x_ty:ident) => {
+        /// Replace lane value
+        ///
+        /// Return a new vector with lanes identical to `a`, except for
+        /// lane specified in the immediate mode argument `i` which
+        /// has the value `x`.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.replace_lane))]
+        #[rustc_args_required_const(1)]
+        pub unsafe fn replace_lane(a: v128, imm: $lane_idx, x: $x_ty) -> v128 {
+            use coresimd::simd_llvm::simd_insert;
+            union U {
+                vec: self::sealed::$ivec_ty,
+                a: v128,
+            }
+            // the vectors store a signed integer => cast `x` to it and insert
+            ::mem::transmute(simd_insert(
+                U { a }.vec,
+                imm as u32, /* zero-extends index */
+                x as $ielem_ty,
+            ))
+        }
+    };
+}
+
+macro_rules! impl_wrapping_add_sub_neg {
+    ($id:ident[$ivec_ty:ident]) => {
+        /// Lane-wise wrapping integer addition
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.add))]
+        pub unsafe fn add(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_add;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            ::mem::transmute(simd_add(a, b))
+        }
+
+        /// Lane-wise wrapping integer subtraction
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.sub))]
+        pub unsafe fn sub(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_sub;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            ::mem::transmute(simd_sub(a, b))
+        }
+
+        /// Lane-wise wrapping integer negation
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.neg))]
+        pub unsafe fn neg(a: v128) -> v128 {
+            use coresimd::simd_llvm::simd_mul;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute($id::splat(-1));
+            ::mem::transmute(simd_mul(b, a))
+        }
+
+        // note: multiplication explicitly omitted because i64x2 does
+        // not implement it
+    };
+}
+
+// TODO: Saturating integer arithmetic
+// need to add intrinsics to rustc
+
+// note: multiplication explicitly implemented separately because i64x2 does
+// not implement it
+
+macro_rules! impl_wrapping_mul {
+    ($id:ident[$ivec_ty:ident]) => {
+        /// Lane-wise wrapping integer multiplication
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.mul))]
+        pub unsafe fn mul(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_mul;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            ::mem::transmute(simd_mul(a, b))
+        }
+    };
+}
+
+macro_rules! impl_shl_scalar {
+    ($id:ident[$ivec_ty:ident : $t:ty]) => {
+        /// Left shift by scalar.
+        ///
+        /// Shift the bits in each lane to the left by the same amount.
+        /// Only the low bits of the shift amount are used.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.shl))]
+        pub unsafe fn shl(a: v128, y: i32) -> v128 {
+            use coresimd::simd_llvm::simd_shl;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute($id::splat(y as $t));
+            ::mem::transmute(simd_shl(a, b))
+        }
+    };
+}
+
+macro_rules! impl_shr_scalar {
+    ($id:ident[$svec_ty:ident : $uvec_ty:ident : $t:ty]) => {
+        /// Arithmetic right shift by scalar.
+        ///
+        /// Shift the bits in each lane to the right by the same amount.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.shr_s))]
+        pub unsafe fn shr_s(a: v128, y: i32) -> v128 {
+            use coresimd::simd_llvm::simd_shr;
+            let a: sealed::$svec_ty = ::mem::transmute(a);
+            let b: sealed::$svec_ty = ::mem::transmute($id::splat(y as $t));
+            ::mem::transmute(simd_shr(a, b))
+        }
+
+        /// Logical right shift by scalar.
+        ///
+        /// Shift the bits in each lane to the right by the same amount.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.shr_u))]
+        pub unsafe fn shr_u(a: v128, y: i32) -> v128 {
+            use coresimd::simd_llvm::simd_shr;
+            let a: sealed::$uvec_ty = ::mem::transmute(a);
+            let b: sealed::$uvec_ty = ::mem::transmute($id::splat(y as $t));
+            ::mem::transmute(simd_shr(a, b))
+        }
+    };
+}
+
+macro_rules! impl_boolean_reduction {
+    ($id:ident[$ivec_ty:ident]) => {
+        /// Any lane true
+        ///
+        /// Returns `1` if any lane in `a` is non-zero, `0` otherwise.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.any_true))]
+        pub unsafe fn any_true(a: v128) -> i32 {
+            use coresimd::simd_llvm::simd_reduce_any;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            if simd_reduce_any(a) {
+                1
+            } else {
+                0
+            }
+        }
+
+        /// All lanes true
+        ///
+        /// Returns `1` if all lanes in `a` are non-zero, `0` otherwise.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.all_true))]
+        pub unsafe fn all_true(a: v128) -> i32 {
+            use coresimd::simd_llvm::simd_reduce_all;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            if simd_reduce_all(a) {
+                1
+            } else {
+                0
+            }
+        }
+    };
+}
+
+macro_rules! impl_comparisons {
+    ($id:ident[$ivec_ty:ident]) => {
+        impl_comparisons!($id[$ivec_ty=>$ivec_ty]);
+    };
+    ($id:ident[$ivec_ty:ident=>$rvec_ty:ident]) => {
+        /// Equality
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.eq))]
+        pub unsafe fn eq(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_eq;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            let c: sealed::$rvec_ty = simd_eq(a, b);
+            ::mem::transmute(c)
+        }
+        /// Non-Equality
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.ne))]
+        pub unsafe fn ne(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_ne;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            let c: sealed::$rvec_ty = simd_ne(a, b);
+            ::mem::transmute(c)
+        }
+        /// Less-than
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.lt))]
+        pub unsafe fn lt(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_lt;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            let c: sealed::$rvec_ty = simd_lt(a, b);
+            ::mem::transmute(c)
+        }
+        /// Less-than or equal
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.le))]
+        pub unsafe fn le(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_le;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            let c: sealed::$rvec_ty = simd_le(a, b);
+            ::mem::transmute(c)
+        }
+        /// Greater-than
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.gt))]
+        pub unsafe fn gt(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_gt;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            let c: sealed::$rvec_ty = simd_gt(a, b);
+            ::mem::transmute(c)
+        }
+        /// Greater-than or equal
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.ge))]
+        pub unsafe fn ge(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_ge;
+            let a: sealed::$ivec_ty = ::mem::transmute(a);
+            let b: sealed::$ivec_ty = ::mem::transmute(b);
+            let c: sealed::$rvec_ty = simd_ge(a, b);
+            ::mem::transmute(c)
+        }
+    }
+}
+
+// Floating-point operations
+macro_rules! impl_floating_point_ops {
+    ($id:ident) => {
+        /// Negation
+        ///
+        /// Apply the IEEE `negate(x)` function to each lane. This simply
+        /// inverts the sign bit, preserving all other bits, even for `NaN`
+        /// inputs.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.neg))]
+        pub unsafe fn neg(a: v128) -> v128 {
+            use coresimd::simd_llvm::simd_mul;
+            let a: sealed::$id = ::mem::transmute(a);
+            let b: sealed::$id = ::mem::transmute($id::splat(-1.));
+            ::mem::transmute(simd_mul(b, a))
+        }
+        /// Absolute value
+        ///
+        /// Apply the IEEE `abs(x)` function to each lane. This simply
+        /// clears the sign bit, preserving all other bits, even for `NaN`
+        /// inputs.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.abs))]
+        pub unsafe fn abs(a: v128) -> v128 {
+            let a: sealed::$id = ::mem::transmute(a);
+            ::mem::transmute(a.abs())
+        }
+        /// NaN-propagating minimum
+        ///
+        /// Lane-wise minimum value, propagating `NaN`s (FIXME: not if only `a` is `NaN`).
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.min))]
+        pub unsafe fn min(a: v128, b: v128) -> v128 {
+            v128::bitselect(a, b, $id::lt(a, b))
+        }
+        /// NaN-propagating maximum
+        ///
+        /// Lane-wise maximum value, propagating `NaN`s (FIXME: not if only `a` is `NaN`).
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.max))]
+        pub unsafe fn max(a: v128, b: v128) -> v128 {
+            v128::bitselect(a, b, $id::gt(a, b))
+        }
+        /// Square-root
+        ///
+        /// Lane-wise square-root.
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.sqrt))]
+        pub unsafe fn sqrt(a: v128) -> v128 {
+            let a: sealed::$id = ::mem::transmute(a);
+            ::mem::transmute(a.sqrt())
+        }
+        /// Lane-wise addition
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.add))]
+        pub unsafe fn add(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_add;
+            let a: sealed::$id = ::mem::transmute(a);
+            let b: sealed::$id = ::mem::transmute(b);
+            ::mem::transmute(simd_add(a, b))
+        }
+        /// Lane-wise subtraction
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.sub))]
+        pub unsafe fn sub(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_sub;
+            let a: sealed::$id = ::mem::transmute(a);
+            let b: sealed::$id = ::mem::transmute(b);
+            ::mem::transmute(simd_sub(a, b))
+        }
+        /// Lane-wise multiplication
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.mul))]
+        pub unsafe fn mul(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_mul;
+            let a: sealed::$id = ::mem::transmute(a);
+            let b: sealed::$id = ::mem::transmute(b);
+            ::mem::transmute(simd_mul(a, b))
+        }
+        /// Lane-wise division
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($id.div))]
+        pub unsafe fn div(a: v128, b: v128) -> v128 {
+            use coresimd::simd_llvm::simd_div;
+            let a: sealed::$id = ::mem::transmute(a);
+            let b: sealed::$id = ::mem::transmute(b);
+            ::mem::transmute(simd_div(a, b))
+        }
+    };
+}
+
+macro_rules! impl_conversion {
+    ($conversion:ident[$instr:expr]: $from_ty:ident => $to_ty:ident | $id:ident) => {
+        #[inline]
+        // #[target_feature(enable = "simd128")]
+        // FIXME: #[cfg_attr(test, assert_instr($instr))]
+        pub unsafe fn $conversion(a: v128) -> v128 {
+            use coresimd::simd_llvm::simd_cast;
+            let a: sealed::$from_ty = ::mem::transmute(a);
+            let b: sealed::$to_ty = simd_cast(a);
+            ::mem::transmute(b)
+        }
+    };
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Implementations:
+
+// v128
+impl v128 {
+    ///////////////////////////////////////////////////////////////////////////
+    // Const constructor:
+
+    /// Materialize a constant SIMD value from the immediate operands.
+    ///
+    /// The `v128.const` instruction is encoded with 16 immediate bytes
+    /// `imm` which provide the bits of the vector directly.
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr(v128.const, imm =
+    // [42; 16]))]
+    #[rustc_args_required_const(0)]
+    pub const unsafe fn const_(imm: [ImmByte; 16]) -> v128 {
+        union U {
+            imm: [ImmByte; 16],
+            vec: v128,
+        }
+        U { imm }.vec
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Bitwise logical operations:
+
+    /// Bitwise logical and
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr($id.and))]
+    pub unsafe fn and(a: v128, b: v128) -> v128 {
+        use coresimd::simd_llvm::simd_and;
+        simd_and(a, b)
+    }
+
+    /// Bitwise logical or
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr($id.or))]
+    pub unsafe fn or(a: v128, b: v128) -> v128 {
+        use coresimd::simd_llvm::simd_or;
+        simd_or(a, b)
+    }
+
+    /// Bitwise logical xor
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr($id.xor))]
+    pub unsafe fn xor(a: v128, b: v128) -> v128 {
+        use coresimd::simd_llvm::simd_xor;
+        simd_xor(a, b)
+    }
+
+    /// Bitwise logical not
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr($id.not))]
+    pub unsafe fn not(a: v128) -> v128 {
+        union U {
+            v: u128,
+            c: [ImmByte; 16],
+        }
+        // FIXME: https://github.com/rust-lang/rust/issues/53193
+        const C: [ImmByte; 16] = unsafe {
+            U {
+                v: ::_core::u128::MAX,
+            }.c
+        };
+        Self::xor(v128::const_(C), a)
+    }
+
+    /// Bitwise select
+    ///
+    /// Use the bits in the control mask `c` to select the corresponding bit
+    /// from `v1` when `1` and `v2` when `0`.
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr($id.bitselect))]
+    pub unsafe fn bitselect(v1: v128, v2: v128, c: v128) -> v128 {
+        // FIXME: use llvm.select instead - we need to add a `simd_bitselect`
+        // intrinsic to rustc that converts a v128 vector into a i1x128. The
+        // `simd_select` intrinsic converts e.g. a i8x16 into a i1x16 which is
+        // not what we want here:
+        Self::or(Self::and(v1, c), Self::and(v2, Self::not(c)))
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Memory load/stores:
+
+    /// Load a `v128` vector from the given heap address.
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr($id.load))]
+    pub unsafe fn load(m: *const v128) -> v128 {
+        ::_core::ptr::read(m)
+    }
+
+    /// Store a `v128` vector to the given heap address.
+    #[inline]
+    // #[target_feature(enable = "simd128")]
+    // FIXME: #[cfg_attr(test, assert_instr($id.store))]
+    pub unsafe fn store(m: *mut v128, a: v128) {
+        ::_core::ptr::write(m, a)
+    }
+}
+
+pub use self::sealed::v8x16 as __internal_v8x16;
+pub use coresimd::simd_llvm::simd_shuffle16 as __internal_v8x16_shuffle;
+/// Shuffle lanes
+///
+/// Create vector with lanes selected from the lanes of two input vectors
+/// `a` and `b` by the indices specified in the immediate mode operand
+/// `imm`. Each index selects an element of the result vector, where the
+/// indices `i` in range `[0, 15]` select the `i`-th elements of `a`, and
+/// the indices in range `[16, 31]` select the `i - 16`-th element of `b`.
+#[macro_export]
+macro_rules! v8x16_shuffle {
+    ($a:expr, $b:expr, [
+        $imm0:expr, $imm1:expr, $imm2:expr, $imm3:expr,
+        $imm4:expr, $imm5:expr, $imm6:expr, $imm7:expr,
+        $imm8:expr, $imm9:expr, $imm10:expr, $imm11:expr,
+        $imm12:expr, $imm13:expr, $imm14:expr, $imm15:expr
+    ]) => {
+        #[allow(unused_unsafe)]
+        unsafe {
+            let a: $crate::arch::wasm32::v128 = $a;
+            let b: $crate::arch::wasm32::v128 = $b;
+            union U {
+                e: $crate::arch::wasm32::v128,
+                i: $crate::arch::wasm32::__internal_v8x16,
+            }
+            let a = U { e: a }.i;
+            let b = U { e: b }.i;
+
+            let r: $crate::arch::wasm32::__internal_v8x16 =
+                $crate::arch::wasm32::__internal_v8x16_shuffle(
+                    a,
+                    b,
+                    [
+                        $imm0 as u32,
+                        $imm1 as u32,
+                        $imm2 as u32,
+                        $imm3 as u32,
+                        $imm4 as u32,
+                        $imm5 as u32,
+                        $imm6 as u32,
+                        $imm7 as u32,
+                        $imm8 as u32,
+                        $imm9 as u32,
+                        $imm10 as u32,
+                        $imm11 as u32,
+                        $imm12 as u32,
+                        $imm13 as u32,
+                        $imm14 as u32,
+                        $imm15 as u32,
+                    ],
+                );
+            U { i: r }.e
+        }
+    };
+}
+
+/// WASM-specific instructions on a `v128` viewed as 16 wrapping `i8` lanes
+pub mod i8x16 {
+    use super::*;
+    impl_splat!(
+        i8x16[v8x16: i8] <= i32 | x0,
+        x1,
+        x2,
+        x3,
+        x4,
+        x5,
+        x6,
+        x7,
+        x8,
+        x9,
+        x10,
+        x11,
+        x12,
+        x13,
+        x14,
+        x15
+    );
+    impl_extract_lane!(i8x16[v8x16:i8|u8](LaneIdx16) => i32);
+    impl_replace_lane!(i8x16[v8x16: i8](LaneIdx16) <= i32);
+    impl_wrapping_add_sub_neg!(i8x16[v8x16]);
+    impl_wrapping_mul!(i8x16[v8x16]);
+    impl_shl_scalar!(i8x16[v8x16: i32]);
+    impl_shr_scalar!(i8x16[v8x16: u8x16: i32]);
+    impl_boolean_reduction!(i8x16[v8x16]);
+    impl_comparisons!(i8x16[v8x16]);
+}
+
+/// WASM-specific instructions on a `v128` viewed as 8 wrapping `i16` lanes
+pub mod i16x8 {
+    use super::*;
+    impl_splat!(i16x8[v16x8: i16] <= i32 | x0, x1, x2, x3, x4, x5, x6, x7);
+    impl_extract_lane!(i16x8[v16x8:i16|u16](LaneIdx8) => i32);
+    impl_replace_lane!(i16x8[v16x8: i16](LaneIdx8) <= i32);
+    impl_wrapping_add_sub_neg!(i16x8[v16x8]);
+    impl_wrapping_mul!(i16x8[v16x8]);
+    impl_shl_scalar!(i16x8[v16x8: i32]);
+    impl_shr_scalar!(i16x8[v16x8: u16x8: i32]);
+    impl_boolean_reduction!(i16x8[v16x8]);
+    impl_comparisons!(i16x8[v16x8]);
+}
+
+/// WASM-specific instructions on a `v128` viewed as 4 wrapping `i32` lanes
+pub mod i32x4 {
+    use super::*;
+    impl_splat!(i32x4[v32x4: i32] <= i32 | x0, x1, x2, x3);
+    impl_extract_lane!(i32x4[v32x4](LaneIdx4) => i32);
+    impl_replace_lane!(i32x4[v32x4: i32](LaneIdx4) <= i32);
+    impl_wrapping_add_sub_neg!(i32x4[v32x4]);
+    impl_wrapping_mul!(i32x4[v32x4]);
+    impl_shl_scalar!(i32x4[v32x4: i32]);
+    impl_shr_scalar!(i32x4[v32x4: u32x4: i32]);
+    impl_boolean_reduction!(i32x4[v32x4]);
+    impl_comparisons!(i32x4[v32x4]);
+    impl_conversion!(trunc_s_f32x4_sat["i32x4.trunc_s/f32x4:sat"]: f32x4 => v32x4 | i32x4);
+    impl_conversion!(trunc_u_f32x4_sat["i32x4.trunc_u/f32x4:sat"]: f32x4 => u32x4 | i32x4);
+}
+
+/// WASM-specific instructions on a `v128` viewed as 2 wrapping `i64` lanes
+pub mod i64x2 {
+    use super::*;
+    impl_splat!(i64x2[v64x2: i64] <= i64 | x0, x1);
+    impl_extract_lane!(i64x2[v64x2](LaneIdx2) => i64);
+    impl_replace_lane!(i64x2[v64x2: i64](LaneIdx2) <= i64);
+    impl_wrapping_add_sub_neg!(i64x2[v64x2]);
+    // note: wrapping multiplication for i64x2 is not part of the spec
+    impl_shl_scalar!(i64x2[v64x2: i64]);
+    impl_shr_scalar!(i64x2[v64x2: u64x2: i64]);
+    impl_boolean_reduction!(i64x2[v64x2]);
+    impl_comparisons!(i64x2[v64x2]);
+    impl_conversion!(trunc_s_f64x2_sat["i64x2.trunc_s/f64x2:sat"]: f64x2 => v64x2 | i64x2);
+    impl_conversion!(trunc_u_f64x2_sat["i64x2.trunc_u/f64x2:sat"]: f64x2 => u64x2 | i64x2);
+}
+
+/// WASM-specific instructions on a `v128` viewed as 4 `f32` lanes
+pub mod f32x4 {
+    use super::*;
+    impl_splat!(f32x4[f32x4: f32] <= f32 | x0, x1, x2, x3);
+    impl_extract_lane!(f32x4[f32x4](LaneIdx4) => f32);
+    impl_replace_lane!(f32x4[f32x4: f32](LaneIdx4) <= f32);
+    impl_comparisons!(f32x4[f32x4=>v32x4]);
+    impl_floating_point_ops!(f32x4);
+    impl_conversion!(convert_s_i32x4["f32x4.convert_s/i32x4"]: v32x4 => f32x4 | f32x4);
+    impl_conversion!(convert_u_i32x4["f32x4.convert_u/i32x4"]: u32x4 => f32x4 | f32x4);
+
+}
+
+/// WASM-specific instructions on a `v128` viewed as 2 `f64` lanes
+pub mod f64x2 {
+    use super::*;
+    impl_splat!(f64x2[f64x2: f64] <= f64 | x0, x1);
+    impl_extract_lane!(f64x2[f64x2](LaneIdx2) => f64);
+    impl_replace_lane!(f64x2[f64x2: f64](LaneIdx2) <= f64);
+    impl_comparisons!(f64x2[f64x2=>v64x2]);
+    impl_floating_point_ops!(f64x2);
+    impl_conversion!(convert_s_i64x2["f64x2.convert_s/i64x2"]: v64x2 => f64x2 | f64x2);
+    impl_conversion!(convert_u_i64x2["f64x2.convert_u/i64x2"]: u64x2 => f64x2 | f64x2);
+}
+
+#[cfg(test)]
+pub mod tests {
+    use super::*;
+    use std;
+    use std::mem;
+    use std::prelude::v1::*;
+    use wasm_bindgen_test::*;
+
+    fn compare_bytes(a: v128, b: v128) {
+        let a: [u8; 16] = unsafe { mem::transmute(a) };
+        let b: [u8; 16] = unsafe { mem::transmute(b) };
+        assert_eq!(a, b);
+    }
+
+    #[wasm_bindgen_test]
+    fn v128_const() {
+        const A: v128 = unsafe {
+            v128::const_([
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+            ])
+        };
+        compare_bytes(A, A);
+    }
+
+    macro_rules! test_splat {
+        ($test_id:ident: $id:ident($val:expr) => $($vals:expr),*) => {
+            #[wasm_bindgen_test]
+            fn $test_id() {
+                const A: v128 = unsafe {
+                    $id::splat($val)
+                };
+                const B: v128 = unsafe {
+                    v128::const_([$($vals),*])
+                };
+                compare_bytes(A, B);
+            }
+        }
+    }
+
+    test_splat!(i8x16_splat: i8x16(42) => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42);
+    test_splat!(i16x8_splat: i16x8(42) => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0);
+    test_splat!(i32x4_splat: i32x4(42) => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0);
+    test_splat!(i64x2_splat: i64x2(42) => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0);
+    test_splat!(f32x4_splat: f32x4(42.) => 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66);
+    test_splat!(f64x2_splat: f64x2(42.) => 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64);
+
+    // tests extract and replace lanes
+    macro_rules! test_extract {
+        ($test_id:ident: $id:ident[$ety:ident] => $extract_fn:ident | [$val:expr; $count:expr]
+         | [$($vals:expr),*] => ($other:expr)
+         | $($ids:expr),*) => {
+            #[wasm_bindgen_test]
+            fn $test_id() {
+                unsafe {
+                    // splat vector and check that all indices contain the same value
+                    // splatted:
+                    const A: v128 = unsafe {
+                        $id::splat($val)
+                    };
+                    $(
+                        assert_eq!($id::$extract_fn(A, $ids) as $ety, $val);
+                    )*;
+
+                    // create a vector from array and check that the indices contain
+                    // the same values as in the array:
+                    let arr: [$ety; $count] = [$($vals),*];
+                    let mut vec: v128 = mem::transmute(arr);
+                    $(
+                        assert_eq!($id::$extract_fn(vec, $ids) as $ety, arr[$ids]);
+                    )*;
+
+                    // replace lane 0 with another value
+                    vec = $id::replace_lane(vec, 0, $other);
+                    assert_ne!($id::$extract_fn(vec, 0) as $ety, arr[0]);
+                    assert_eq!($id::$extract_fn(vec, 0) as $ety, $other);
+                }
+            }
+        }
+    }
+
+    test_extract!(i8x16_extract_u: i8x16[u8] => extract_lane_u | [255; 16]
+                  | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] => (42)
+                  | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    );
+    test_extract!(i8x16_extract_s: i8x16[i8] => extract_lane_s | [-122; 16]
+                  | [0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15] => (-42)
+                  | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    );
+
+    test_extract!(i16x8_extract_u: i16x8[u16] => extract_lane_u | [255; 8]
+                  | [0, 1, 2, 3, 4, 5, 6, 7]  => (42) | 0, 1, 2, 3, 4, 5, 6, 7
+    );
+    test_extract!(i16x8_extract_s: i16x8[i16] => extract_lane_s | [-122; 8]
+                  | [0, -1, 2, -3, 4, -5, 6, -7]  => (-42) | 0, 1, 2, 3, 4, 5, 6, 7
+    );
+    test_extract!(i32x4_extract: i32x4[i32] => extract_lane | [-122; 4]
+                  | [0, -1, 2, -3]  => (42) | 0, 1, 2, 3
+    );
+    test_extract!(i64x2_extract: i64x2[i64] => extract_lane | [-122; 2]
+                  | [0, -1]  => (42) | 0, 1
+    );
+    test_extract!(f32x4_extract: f32x4[f32] => extract_lane | [-122.; 4]
+                  | [0., -1., 2., -3.]  => (42.) | 0, 1, 2, 3
+    );
+    test_extract!(f64x2_extract: f64x2[f64] => extract_lane | [-122.; 2]
+                  | [0., -1.]  => (42.) | 0, 1
+    );
+
+    #[wasm_bindgen_test]
+    fn v8x16_shuffle() {
+        unsafe {
+            let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+            let b = [
+                16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+                31,
+            ];
+
+            let vec_a: v128 = mem::transmute(a);
+            let vec_b: v128 = mem::transmute(b);
+
+            let vec_r = v8x16_shuffle!(
+                vec_a,
+                vec_b,
+                [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]
+            );
+
+            let e =
+                [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30];
+            let vec_e: v128 = mem::transmute(e);
+            compare_bytes(vec_r, vec_e);
+        }
+    }
+
+    macro_rules! floating_point {
+        (f32) => {
+            true
+        };
+        (f64) => {
+            true
+        };
+        ($id:ident) => {
+            false
+        };
+    }
+
+    trait IsNan: Sized {
+        fn is_nan(self) -> bool {
+            false
+        }
+    }
+    impl IsNan for i8 {}
+    impl IsNan for i16 {}
+    impl IsNan for i32 {}
+    impl IsNan for i64 {}
+
+    macro_rules! test_bop {
+        ($id:ident[$ety:ident; $ecount:expr] |
+         $binary_op:ident [$op_test_id:ident] :
+         ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => {
+            test_bop!(
+                $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]:
+                ([$($in_a),*], [$($in_b),*]) => [$($out),*]
+            );
+
+        };
+        ($id:ident[$ety:ident; $ecount:expr] => $oty:ident |
+         $binary_op:ident [$op_test_id:ident] :
+         ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => {
+            #[wasm_bindgen_test]
+            fn $op_test_id() {
+                unsafe {
+                    let a_input: [$ety; $ecount] = [$($in_a),*];
+                    let b_input: [$ety; $ecount] = [$($in_b),*];
+                    let output: [$oty; $ecount] = [$($out),*];
+
+                    let a_vec_in: v128 = mem::transmute(a_input);
+                    let b_vec_in: v128 = mem::transmute(b_input);
+                    let vec_res: v128 = $id::$binary_op(a_vec_in, b_vec_in);
+
+                    let res: [$oty; $ecount] = mem::transmute(vec_res);
+
+                    if !floating_point!($ety) {
+                        assert_eq!(res, output);
+                    } else {
+                        for i in 0..$ecount {
+                            let r = res[i];
+                            let o = output[i];
+                            assert_eq!(r.is_nan(), o.is_nan());
+                            if !r.is_nan() {
+                                assert_eq!(r, o);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    macro_rules! test_bops {
+        ($id:ident[$ety:ident; $ecount:expr] |
+         $binary_op:ident [$op_test_id:ident]:
+         ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => {
+            #[wasm_bindgen_test]
+            fn $op_test_id() {
+                unsafe {
+                    let a_input: [$ety; $ecount] = [$($in_a),*];
+                    let output: [$ety; $ecount] = [$($out),*];
+
+                    let a_vec_in: v128 = mem::transmute(a_input);
+                    let vec_res: v128 = $id::$binary_op(a_vec_in, $in_b);
+
+                    let res: [$ety; $ecount] = mem::transmute(vec_res);
+                    assert_eq!(res, output);
+                }
+            }
+        }
+    }
+
+    macro_rules! test_uop {
+        ($id:ident[$ety:ident; $ecount:expr] |
+         $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => {
+            #[wasm_bindgen_test]
+            fn $op_test_id() {
+                unsafe {
+                    let a_input: [$ety; $ecount] = [$($in_a),*];
+                    let output: [$ety; $ecount] = [$($out),*];
+
+                    let a_vec_in: v128 = mem::transmute(a_input);
+                    let vec_res: v128 = $id::$unary_op(a_vec_in);
+
+                    let res: [$ety; $ecount] = mem::transmute(vec_res);
+                    assert_eq!(res, output);
+                }
+            }
+        }
+    }
+
+    test_bop!(i8x16[i8; 16] | add[i8x16_add_test]:
+              ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1],
+               [8, i8::min_value(), 10, 11, 12, 13, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1]) =>
+              [8, i8::max_value(), 12, 14, 16, 18, 20, i8::min_value(), 2, 2, 2, 2, 2, 2, 2, 2]);
+    test_bop!(i8x16[i8; 16] | sub[i8x16_sub_test]:
+              ([0, -1, 2, 3, 4, 5, 6, -1, 1, 1, 1, 1, 1, 1, 1, 1],
+               [8, i8::min_value(), 10, 11, 12, 13, 14, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1]) =>
+              [-8, i8::max_value(), -8, -8, -8, -8, -8, i8::min_value(), 0, 0, 0, 0, 0, 0, 0, 0]);
+    test_bop!(i8x16[i8; 16] | mul[i8x16_mul_test]:
+              ([0, -2, 2, 3, 4, 5, 6, 2, 1, 1, 1, 1, 1, 1, 1, 1],
+               [8, i8::min_value(), 10, 11, 12, 13, 14, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1]) =>
+              [0, 0, 20, 33, 48, 65, 84, -2, 1, 1, 1, 1, 1, 1, 1, 1]);
+    test_uop!(i8x16[i8; 16] | neg[i8x16_neg_test]:
+              [8, i8::min_value(), 10, 11, 12, 13, 14, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1] =>
+              [-8, i8::min_value(), -10, -11, -12, -13, -14, i8::min_value() + 1, -1, -1, -1, -1, -1, -1, -1, -1]);
+
+    test_bop!(i16x8[i16; 8] | add[i16x8_add_test]:
+              ([0, -1, 2, 3, 4, 5, 6, i16::max_value()],
+               [8, i16::min_value(), 10, 11, 12, 13, 14, 1]) =>
+              [8, i16::max_value(), 12, 14, 16, 18, 20, i16::min_value()]);
+    test_bop!(i16x8[i16; 8] | sub[i16x8_sub_test]:
+              ([0, -1, 2, 3, 4, 5, 6, -1],
+               [8, i16::min_value(), 10, 11, 12, 13, 14, i16::max_value()]) =>
+              [-8, i16::max_value(), -8, -8, -8, -8, -8, i16::min_value()]);
+    test_bop!(i16x8[i16; 8] | mul[i16x8_mul_test]:
+              ([0, -2, 2, 3, 4, 5, 6, 2],
+               [8, i16::min_value(), 10, 11, 12, 13, 14, i16::max_value()]) =>
+              [0, 0, 20, 33, 48, 65, 84, -2]);
+    test_uop!(i16x8[i16; 8] | neg[i16x8_neg_test]:
+              [8, i16::min_value(), 10, 11, 12, 13, 14, i16::max_value()] =>
+              [-8, i16::min_value(), -10, -11, -12, -13, -14, i16::min_value() + 1]);
+
+    test_bop!(i32x4[i32; 4] | add[i32x4_add_test]:
+              ([0, -1, 2, i32::max_value()],
+               [8, i32::min_value(), 10, 1]) =>
+              [8, i32::max_value(), 12, i32::min_value()]);
+    test_bop!(i32x4[i32; 4] | sub[i32x4_sub_test]:
+              ([0, -1, 2, -1],
+               [8, i32::min_value(), 10, i32::max_value()]) =>
+              [-8, i32::max_value(), -8, i32::min_value()]);
+    test_bop!(i32x4[i32; 4] | mul[i32x4_mul_test]:
+              ([0, -2, 2, 2],
+               [8, i32::min_value(), 10, i32::max_value()]) =>
+              [0, 0, 20, -2]);
+    test_uop!(i32x4[i32; 4] | neg[i32x4_neg_test]:
+              [8, i32::min_value(), 10, i32::max_value()] =>
+              [-8, i32::min_value(), -10, i32::min_value() + 1]);
+
+    test_bop!(i64x2[i64; 2] | add[i64x2_add_test]:
+              ([-1, i64::max_value()],
+               [i64::min_value(), 1]) =>
+              [i64::max_value(), i64::min_value()]);
+    test_bop!(i64x2[i64; 2] | sub[i64x2_sub_test]:
+              ([-1, -1],
+               [i64::min_value(), i64::max_value()]) =>
+              [ i64::max_value(), i64::min_value()]);
+    // note: mul for i64x2 is not part of the spec
+    test_uop!(i64x2[i64; 2] | neg[i64x2_neg_test]:
+              [i64::min_value(), i64::max_value()] =>
+              [i64::min_value(), i64::min_value() + 1]);
+
+    test_bops!(i8x16[i8; 16] | shl[i8x16_shl_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+               [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]);
+    test_bops!(i16x8[i16; 8] | shl[i16x8_shl_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i16::max_value()], 1) =>
+               [0, -2, 4, 6, 8, 10, 12, -2]);
+    test_bops!(i32x4[i32; 4] | shl[i32x4_shl_test]:
+               ([0, -1, 2, 3], 1) => [0, -2, 4, 6]);
+    test_bops!(i64x2[i64; 2] | shl[i64x2_shl_test]:
+               ([0, -1], 1) => [0, -2]);
+
+    test_bops!(i8x16[i8; 16] | shr_s[i8x16_shr_s_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+               [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+    test_bops!(i16x8[i16; 8] | shr_s[i16x8_shr_s_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i16::max_value()], 1) =>
+               [0, -1, 1, 1, 2, 2, 3, i16::max_value() / 2]);
+    test_bops!(i32x4[i32; 4] | shr_s[i32x4_shr_s_test]:
+               ([0, -1, 2, 3], 1) => [0, -1, 1, 1]);
+    test_bops!(i64x2[i64; 2] | shr_s[i64x2_shr_s_test]:
+               ([0, -1], 1) => [0, -1]);
+
+    test_bops!(i8x16[i8; 16] | shr_u[i8x16_shr_u_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i8::max_value(), 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+               [0, i8::max_value(), 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+    test_bops!(i16x8[i16; 8] | shr_u[i16x8_shr_u_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i16::max_value()], 1) =>
+               [0, i16::max_value(), 1, 1, 2, 2, 3, i16::max_value() / 2]);
+    test_bops!(i32x4[i32; 4] | shr_u[i32x4_shr_u_test]:
+               ([0, -1, 2, 3], 1) => [0, i32::max_value(), 1, 1]);
+    test_bops!(i64x2[i64; 2] | shr_u[i64x2_shr_u_test]:
+               ([0, -1], 1) => [0, i64::max_value()]);
+
+    #[wasm_bindgen_test]
+    fn v128_bitwise_logical_ops() {
+        unsafe {
+            let a: [u32; 4] = [u32::max_value(), 0, u32::max_value(), 0];
+            let b: [u32; 4] = [u32::max_value(); 4];
+            let c: [u32; 4] = [0; 4];
+
+            let vec_a: v128 = mem::transmute(a);
+            let vec_b: v128 = mem::transmute(b);
+            let vec_c: v128 = mem::transmute(c);
+
+            let r: v128 = v128::and(vec_a, vec_a);
+            compare_bytes(r, vec_a);
+            let r: v128 = v128::and(vec_a, vec_b);
+            compare_bytes(r, vec_a);
+            let r: v128 = v128::or(vec_a, vec_b);
+            compare_bytes(r, vec_b);
+            let r: v128 = v128::not(vec_b);
+            compare_bytes(r, vec_c);
+            let r: v128 = v128::xor(vec_a, vec_c);
+            compare_bytes(r, vec_a);
+
+            let r: v128 = v128::bitselect(vec_b, vec_c, vec_b);
+            compare_bytes(r, vec_b);
+            let r: v128 = v128::bitselect(vec_b, vec_c, vec_c);
+            compare_bytes(r, vec_c);
+            let r: v128 = v128::bitselect(vec_b, vec_c, vec_a);
+            compare_bytes(r, vec_a);
+        }
+    }
+
+    macro_rules! test_bool_red {
+        ($id:ident[$test_id:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => {
+            #[wasm_bindgen_test]
+            fn $test_id() {
+                unsafe {
+                    let vec_a: v128 = mem::transmute([$($true),*]); // true
+                    let vec_b: v128 = mem::transmute([$($false),*]); // false
+                    let vec_c: v128 = mem::transmute([$($alt),*]); // alternating
+
+                    assert_eq!($id::any_true(vec_a), 1);
+                    assert_eq!($id::any_true(vec_b), 0);
+                    assert_eq!($id::any_true(vec_c), 1);
+
+                    assert_eq!($id::all_true(vec_a), 1);
+                    assert_eq!($id::all_true(vec_b), 0);
+                    assert_eq!($id::all_true(vec_c), 0);
+                }
+            }
+        }
+    }
+
+    test_bool_red!(
+        i8x16[i8x16_boolean_reductions]
+            | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+            | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+            | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
+    );
+    test_bool_red!(
+        i16x8[i16x8_boolean_reductions]
+            | [1_i16, 1, 1, 1, 1, 1, 1, 1]
+            | [0_i16, 0, 0, 0, 0, 0, 0, 0]
+            | [1_i16, 0, 1, 0, 1, 0, 1, 0]
+    );
+    test_bool_red!(
+        i32x4[i32x4_boolean_reductions]
+            | [1_i32, 1, 1, 1]
+            | [0_i32, 0, 0, 0]
+            | [1_i32, 0, 1, 0]
+    );
+    test_bool_red!(
+        i64x2[i64x2_boolean_reductions] | [1_i64, 1] | [0_i64, 0] | [1_i64, 0]
+    );
+
+    test_bop!(i8x16[i8; 16] | eq[i8x16_eq_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+               [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]);
+    test_bop!(i16x8[i16; 8] | eq[i16x8_eq_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [-1, 0, -1, 0 ,-1, 0, -1, -1]);
+    test_bop!(i32x4[i32; 4] | eq[i32x4_eq_test]:
+              ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]);
+    test_bop!(i64x2[i64; 2] | eq[i64x2_eq_test]: ([0, 1], [0, 2]) => [-1, 0]);
+    test_bop!(f32x4[f32; 4] => i32 | eq[f32x4_eq_test]:
+              ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]);
+    test_bop!(f64x2[f64; 2] => i64 | eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]);
+
+    test_bop!(i8x16[i8; 16] | ne[i8x16_ne_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+               [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]);
+    test_bop!(i16x8[i16; 8] | ne[i16x8_ne_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [0, -1, 0, -1 ,0, -1, 0, 0]);
+    test_bop!(i32x4[i32; 4] | ne[i32x4_ne_test]:
+              ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]);
+    test_bop!(i64x2[i64; 2] | ne[i64x2_ne_test]: ([0, 1], [0, 2]) => [0, -1]);
+    test_bop!(f32x4[f32; 4] => i32 | ne[f32x4_ne_test]:
+              ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]);
+    test_bop!(f64x2[f64; 2] => i64 | ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]);
+
+    test_bop!(i8x16[i8; 16] | lt[i8x16_lt_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+               [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]);
+    test_bop!(i16x8[i16; 8] | lt[i16x8_lt_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [0, -1, 0, -1 ,0, -1, 0, 0]);
+    test_bop!(i32x4[i32; 4] | lt[i32x4_lt_test]:
+              ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]);
+    test_bop!(i64x2[i64; 2] | lt[i64x2_lt_test]: ([0, 1], [0, 2]) => [0, -1]);
+    test_bop!(f32x4[f32; 4] => i32 | lt[f32x4_lt_test]:
+              ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]);
+    test_bop!(f64x2[f64; 2] => i64 | lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]);
+
+    test_bop!(i8x16[i8; 16] | gt[i8x16_gt_test]:
+          ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15],
+           [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) =>
+              [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]);
+    test_bop!(i16x8[i16; 8] | gt[i16x8_gt_test]:
+              ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+              [0, -1, 0, -1 ,0, -1, 0, 0]);
+    test_bop!(i32x4[i32; 4] | gt[i32x4_gt_test]:
+              ([0, 2, 2, 4], [0, 1, 2, 3]) => [0, -1, 0, -1]);
+    test_bop!(i64x2[i64; 2] | gt[i64x2_gt_test]: ([0, 2], [0, 1]) => [0, -1]);
+    test_bop!(f32x4[f32; 4] => i32 | gt[f32x4_gt_test]:
+              ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]);
+    test_bop!(f64x2[f64; 2] => i64 | gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]);
+
+    test_bop!(i8x16[i8; 16] | ge[i8x16_ge_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+               [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) =>
+              [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]);
+    test_bop!(i16x8[i16; 8] | ge[i16x8_ge_test]:
+              ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) =>
+              [-1, 0, -1, 0 ,-1, 0, -1, -1]);
+    test_bop!(i32x4[i32; 4] | ge[i32x4_ge_test]:
+              ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]);
+    test_bop!(i64x2[i64; 2] | ge[i64x2_ge_test]: ([0, 1], [0, 2]) => [-1, 0]);
+    test_bop!(f32x4[f32; 4] => i32 | ge[f32x4_ge_test]:
+              ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]);
+    test_bop!(f64x2[f64; 2] => i64 | ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]);
+
+    test_bop!(i8x16[i8; 16] | le[i8x16_le_test]:
+              ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15],
+               [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+              ) =>
+              [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]);
+    test_bop!(i16x8[i16; 8] | le[i16x8_le_test]:
+              ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) =>
+              [-1, 0, -1, 0 ,-1, 0, -1, -1]);
+    test_bop!(i32x4[i32; 4] | le[i32x4_le_test]:
+              ([0, 2, 2, 4], [0, 1, 2, 3]) => [-1, 0, -1, 0]);
+    test_bop!(i64x2[i64; 2] | le[i64x2_le_test]: ([0, 2], [0, 1]) => [-1, 0]);
+    test_bop!(f32x4[f32; 4] => i32 | le[f32x4_le_test]:
+              ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, 0]);
+    test_bop!(f64x2[f64; 2] => i64 | le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]);
+
+    #[wasm_bindgen_test]
+    fn v128_bitwise_load_store() {
+        unsafe {
+            let mut arr: [i32; 4] = [0, 1, 2, 3];
+
+            let vec = v128::load(arr.as_ptr() as *const v128);
+            let vec = i32x4::add(vec, vec);
+            v128::store(arr.as_mut_ptr() as *mut v128, vec);
+
+            assert_eq!(arr, [0, 2, 4, 6]);
+        }
+    }
+
+    test_uop!(f32x4[f32; 4] | neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]);
+    test_uop!(f32x4[f32; 4] | abs[f32x4_abs_test]: [0., -1., 2., -3.] => [ 0., 1., 2., 3.]);
+    test_bop!(f32x4[f32; 4] | min[f32x4_min_test]:
+              ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]);
+    test_bop!(f32x4[f32; 4] | min[f32x4_min_test_nan]:
+              ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN])
+              => [0., -3., -4., std::f32::NAN]);
+    test_bop!(f32x4[f32; 4] | max[f32x4_max_test]:
+              ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]);
+    test_bop!(f32x4[f32; 4] | max[f32x4_max_test_nan]:
+              ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN])
+              => [1., -1., 7., std::f32::NAN]);
+    test_bop!(f32x4[f32; 4] | add[f32x4_add_test]:
+              ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]);
+    test_bop!(f32x4[f32; 4] | sub[f32x4_sub_test]:
+              ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]);
+    test_bop!(f32x4[f32; 4] | mul[f32x4_mul_test]:
+              ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]);
+    test_bop!(f32x4[f32; 4] | div[f32x4_div_test]:
+              ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]);
+
+    test_uop!(f64x2[f64; 2] | neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]);
+    test_uop!(f64x2[f64; 2] | abs[f64x2_abs_test]: [0., -1.] => [ 0., 1.]);
+    test_bop!(f64x2[f64; 2] | min[f64x2_min_test]:
+              ([0., -1.], [1., -3.]) => [0., -3.]);
+    test_bop!(f64x2[f64; 2] | min[f64x2_min_test_nan]:
+              ([7., 8.], [-4., std::f64::NAN])
+              => [ -4., std::f64::NAN]);
+    test_bop!(f64x2[f64; 2] | max[f64x2_max_test]:
+              ([0., -1.], [1., -3.]) => [1., -1.]);
+    test_bop!(f64x2[f64; 2] | max[f64x2_max_test_nan]:
+              ([7., 8.], [ -4., std::f64::NAN])
+              => [7., std::f64::NAN]);
+    test_bop!(f64x2[f64; 2] | add[f64x2_add_test]:
+              ([0., -1.], [1., -3.]) => [1., -4.]);
+    test_bop!(f64x2[f64; 2] | sub[f64x2_sub_test]:
+              ([0., -1.], [1., -3.]) => [-1., 2.]);
+    test_bop!(f64x2[f64; 2] | mul[f64x2_mul_test]:
+              ([0., -1.], [1., -3.]) => [0., 3.]);
+    test_bop!(f64x2[f64; 2] | div[f64x2_div_test]:
+              ([0., -8.], [1., 4.]) => [0., -2.]);
+
+    macro_rules! test_conv {
+        ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr,  $to:expr) => {
+            #[wasm_bindgen_test]
+            fn $test_id() {
+                unsafe {
+                    let from: v128 = mem::transmute($from);
+                    let to: v128 = mem::transmute($to);
+
+                    let r: v128 = $to_ty::$conv_id(from);
+
+                    compare_bytes(r, to);
+                }
+            }
+        };
+    }
+
+    test_conv!(
+        f32x4_convert_s_i32x4 | convert_s_i32x4 | f32x4 | [1_i32, 2, 3, 4],
+        [1_f32, 2., 3., 4.]
+    );
+    test_conv!(
+        f32x4_convert_u_i32x4
+            | convert_u_i32x4
+            | f32x4
+            | [u32::max_value(), 2, 3, 4],
+        [u32::max_value() as f32, 2., 3., 4.]
+    );
+    test_conv!(
+        f64x2_convert_s_i64x2 | convert_s_i64x2 | f64x2 | [1_i64, 2],
+        [1_f64, 2.]
+    );
+    test_conv!(
+        f64x2_convert_u_i64x2
+            | convert_u_i64x2
+            | f64x2
+            | [u64::max_value(), 2],
+        [18446744073709552000.0, 2.]
+    );
+
+    // FIXME: add other saturating tests. The one below is disabled because it
+    // fails, producing -2147483648 instead of saturating at i32::max_value():
+    // test_conv!(i32x4_trunc_s_f32x4_sat | trunc_s_f32x4_sat | i32x4 |
+    // [1_f32, 2., (i32::max_value() as f32 + 1.), 4.], [1_i32, 2, i32::max_value(), 4]);
+}
diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs
index 25b5572ad8..e5575e85a3 100644
--- a/crates/assert-instr-macro/src/lib.rs
+++ b/crates/assert-instr-macro/src/lib.rs
@@ -38,17 +38,9 @@ pub fn assert_instr(
     // testing for.
     let disable_assert_instr =
         std::env::var("STDSIMD_DISABLE_ASSERT_INSTR").is_ok();
-    let maybe_ignore = if cfg!(optimized) && !disable_assert_instr {
-        TokenStream::new()
-    } else {
-        (quote! { #[ignore] }).into()
-    };
 
     use quote::ToTokens;
     let instr_str = instr
-        .clone()
-        .into_token_stream()
-        .to_string()
         .replace('.', "_")
         .replace(|c: char| c.is_whitespace(), "");
     let assert_name = syn::Ident::new(
@@ -124,16 +116,22 @@ pub fn assert_instr(
         }
     };
 
+    // If instruction tests are disabled, avoid emitting this shim at all and
+    // just return the original item without our attribute.
+    if !cfg!(optimized) || disable_assert_instr {
+        return (quote! { #item }).into();
+    }
+
     let tts: TokenStream = quote! {
-        #[test]
+        #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+        #[cfg_attr(not(target_arch = "wasm32"), test)]
         #[allow(non_snake_case)]
-        #maybe_ignore
         fn #assert_name() {
             #to_test
 
             ::stdsimd_test::assert(#shim_name as usize,
                                    stringify!(#shim_name),
-                                   stringify!(#instr));
+                                   #instr);
         }
     }.into();
     // why? necessary now to get tests to work?
@@ -148,13 +146,17 @@ pub fn assert_instr(
 }
 
 struct Invoc {
-    instr: syn::Expr,
+    instr: String,
     args: Vec<(syn::Ident, syn::Expr)>,
 }
 
 impl syn::synom::Synom for Invoc {
     named!(parse -> Self, do_parse!(
-        instr: syn!(syn::Expr) >>
+        instr: alt!(
+            map!(syn!(syn::Ident), |s| s.to_string())
+            |
+            map!(syn!(syn::LitStr), |s| s.value())
+        ) >>
         args: many0!(do_parse!(
             syn!(syn::token::Comma) >>
             name: syn!(syn::Ident) >>
diff --git a/crates/coresimd/Cargo.toml b/crates/coresimd/Cargo.toml
index 5bc2e5d7ef..dac4d916da 100644
--- a/crates/coresimd/Cargo.toml
+++ b/crates/coresimd/Cargo.toml
@@ -22,9 +22,14 @@ maintenance = { status = "experimental" }
 stdsimd-test = { version = "0.*", path = "../stdsimd-test" }
 stdsimd = { version = "0.0.3", path = "../stdsimd" }
 
+[target.wasm32-unknown-unknown.dev-dependencies]
+wasm-bindgen-test = "0.2.16"
+
 [features]
 # Internal-usage only: denies all warnings.
 strict = []
 # Internal-usage only: enables only those intrinsics supported by Intel's
 # Software Development Environment (SDE).
 intel_sde = []
+# Enables wasm simd128 intrinsics
+wasm_simd128 = []
diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs
index 1c5f185a8a..8d892e3b47 100644
--- a/crates/coresimd/src/lib.rs
+++ b/crates/coresimd/src/lib.rs
@@ -11,6 +11,7 @@
 #![allow(unused_features)]
 #![feature(
     const_fn,
+    const_fn_union,
     link_llvm_intrinsics,
     platform_intrinsics,
     repr_simd,
@@ -34,7 +35,7 @@
     arm_target_feature,
     aarch64_target_feature,
     mips_target_feature,
-    powerpc_target_feature
+    powerpc_target_feature,
 )]
 #![cfg_attr(
     test,
@@ -81,6 +82,9 @@ extern crate stdsimd_test;
 #[cfg(test)]
 extern crate test;
 
+#[cfg(all(test, target_arch = "wasm32"))]
+extern crate wasm_bindgen_test;
+
 #[path = "../../../coresimd/mod.rs"]
 mod coresimd;
 
diff --git a/crates/stdsimd-test/Cargo.toml b/crates/stdsimd-test/Cargo.toml
index e2fc6e30d3..cf459b6e04 100644
--- a/crates/stdsimd-test/Cargo.toml
+++ b/crates/stdsimd-test/Cargo.toml
@@ -10,3 +10,7 @@ backtrace = "0.3"
 cc = "1.0"
 lazy_static = "1.0"
 rustc-demangle = "0.1.8"
+wasm-bindgen = "0.2.16"
+
+[features]
+default = []
diff --git a/crates/stdsimd-test/src/lib.rs b/crates/stdsimd-test/src/lib.rs
index 06d1db5136..9f56363835 100644
--- a/crates/stdsimd-test/src/lib.rs
+++ b/crates/stdsimd-test/src/lib.rs
@@ -17,12 +17,16 @@ extern crate cc;
 extern crate lazy_static;
 extern crate rustc_demangle;
 extern crate simd_test_macro;
+extern crate wasm_bindgen;
 
 use std::collections::HashMap;
 use std::env;
+use std::path::Path;
 use std::process::Command;
 use std::str;
 
+use wasm_bindgen::prelude::*;
+
 pub use assert_instr_macro::*;
 pub use simd_test_macro::*;
 
@@ -32,6 +36,7 @@ lazy_static! {
 }
 
 struct Function {
+    addr: Option<usize>, // function-pointer value from the wasm `elem` table
     instrs: Vec<Instruction>,
 }
 
@@ -40,6 +45,10 @@ struct Instruction {
 }
 
 fn disassemble_myself() -> HashMap<String, Vec<Function>> {
+    if cfg!(target_arch = "wasm32") {
+        return parse_wasm2wat();
+    }
+
     let me = env::current_exe().expect("failed to get current exe");
 
     if cfg!(target_arch = "x86_64")
@@ -145,6 +154,7 @@ fn parse_objdump(output: &str) -> HashMap<String, Vec<Function>> {
         ret.entry(normalize(symbol))
             .or_insert_with(Vec::new)
             .push(Function {
+                addr: None,
                 instrs: instructions,
             });
     }
@@ -189,6 +199,7 @@ fn parse_otool(output: &str) -> HashMap<String, Vec<Function>> {
         ret.entry(normalize(symbol))
             .or_insert_with(Vec::new)
             .push(Function {
+                addr: None,
                 instrs: instructions,
             });
     }
@@ -239,6 +250,7 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
         ret.entry(normalize(symbol))
             .or_insert_with(Vec::new)
             .push(Function {
+                addr: None,
                 instrs: instructions,
             });
     }
@@ -246,6 +258,100 @@ fn parse_dumpbin(output: &str) -> HashMap<String, Vec<Function>> {
     ret
 }
 
+
+#[wasm_bindgen(module = "child_process")]
+extern "C" {
+    #[wasm_bindgen(js_name = execSync)]
+    fn exec_sync(cmd: &str) -> Buffer; // JS `execSync(cmd)` from this module
+}
+
+#[wasm_bindgen(module = "buffer")]
+extern "C" {
+    type Buffer; // JS type of this name, as returned by `exec_sync`
+    #[wasm_bindgen(method, js_name = toString)]
+    fn to_string(this: &Buffer) -> String; // JS `this.toString()`
+}
+
+#[wasm_bindgen]
+extern "C" {
+    #[wasm_bindgen(js_namespace = require)]
+    fn resolve(module: &str) -> String; // JS `require.resolve(module)`
+    #[wasm_bindgen(js_namespace = console, js_name = log)]
+    fn js_console_log(s: &str); // JS `console.log(s)`
+}
+
+// `println!` does not work on wasm32 yet, so on that target we shadow it with
+// a macro of the same name that sends the formatted text to `console.log`.
+#[cfg(target_arch = "wasm32")]
+macro_rules! println {
+    ($($fmt:tt)*) => { js_console_log(&format!($($fmt)*)) };
+}
+
+/// Runs `wasm2wat` on the wasm-bindgen test harness's wasm file and parses the
+/// text output into a map from function name to the functions found under
+/// that name, recording `elem` table positions in `Function::addr`.
+fn parse_wasm2wat() -> HashMap<String, Vec<Function>> {
+    // Our wasm module in the wasm-bindgen test harness is called
+    // "wasm-bindgen-test_bg". When running in node this is actually a shim JS
+    // file. Ask node where that JS file is, and then we use that with a wasm
+    // extension to find the wasm file itself.
+    let js_shim = resolve("wasm-bindgen-test_bg");
+    let wasm_file = Path::new(&js_shim).with_extension("wasm");
+
+    // Execute `wasm2wat` synchronously, waiting for and capturing all of its
+    // output.
+    let cmd = format!("wasm2wat {}", wasm_file.display());
+    let output = exec_sync(&cmd).to_string();
+
+    let mut ret: HashMap<String, Vec<Function>> = HashMap::new();
+    let mut lines = output.lines().map(|s| s.trim());
+    while let Some(line) = lines.next() {
+        // If we found the table of function pointers, fill in the known
+        // address for all our `Function` instances. Names start at the fourth
+        // token, and the first one is given address 1.
+        if line.starts_with("(elem") {
+            for (i, name) in line.split_whitespace().skip(3).enumerate() {
+                let name = name.trim_right_matches(")");
+                let list = ret.get_mut(name).unwrap_or_else(|| {
+                    panic!("`elem` entry `{}` has no `(func` definition", name)
+                });
+                for f in list {
+                    f.addr = Some(i + 1);
+                }
+            }
+            continue;
+        }
+
+        // If this isn't a function, we don't care about it.
+        if !line.starts_with("(func ") {
+            continue;
+        }
+
+        // Empty functions will end in `))` so there's nothing to do, otherwise
+        // we'll have a bunch of following lines which are instructions.
+        //
+        // Lines that have an imbalanced `)` mark the end of a function.
+        let mut instrs = Vec::new();
+        if !line.ends_with("))") {
+            for body in lines.by_ref() {
+                instrs.push(Instruction {
+                    parts: body.split_whitespace().map(String::from).collect(),
+                });
+                if !body.starts_with("(") && body.ends_with(")") {
+                    break;
+                }
+            }
+        }
+
+        // The second element here split on whitespace should be the name of
+        // the function, skipping the type/params/results
+        ret.entry(line.split_whitespace().nth(1).unwrap().to_string())
+            .or_insert_with(Vec::new)
+            .push(Function { addr: None, instrs });
+    }
+    ret
+}
+
 fn normalize(symbol: &str) -> String {
     let symbol = rustc_demangle::demangle(symbol).to_string();
     match symbol.rfind("::h") {
@@ -259,27 +365,8 @@ fn normalize(symbol: &str) -> String {
 /// This asserts that the function at `fnptr` contains the instruction
 /// `expected` provided.
 pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
-    // Translate this function pointer to a symbolic name that we'd have found
-    // in the disassembly.
-    let mut sym = None;
-    backtrace::resolve(fnptr as *mut _, |name| {
-        sym = name.name().and_then(|s| s.as_str()).map(normalize);
-    });
-
-    let functions =
-        if let Some(s) = sym.as_ref().and_then(|s| DISASSEMBLY.get(s)) {
-            s
-        } else {
-            if let Some(sym) = sym {
-                println!("assumed symbol name: `{}`", sym);
-            }
-            println!("maybe related functions");
-            for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) {
-                println!("\t- {}", f);
-            }
-            panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname);
-        };
-
+    let mut fnname = fnname.to_string();
+    let functions = get_functions(fnptr, &mut fnname);
     assert_eq!(functions.len(), 1);
     let function = &functions[0];
 
@@ -362,16 +449,14 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
 
     // Help debug by printing out the found disassembly, and then panic as we
     // didn't find the instruction.
-    println!(
-        "disassembly for {}: ",
-        sym.as_ref().expect("symbol not found")
-    );
+    println!("disassembly for {}: ", fnname,);
     for (i, instr) in instrs.iter().enumerate() {
-        print!("\t{:2}: ", i);
+        let mut s = format!("\t{:2}: ", i);
         for part in &instr.parts {
-            print!("{} ", part);
+            s.push_str(part);
+            s.push_str(" ");
         }
-        println!();
+        println!("{}", s);
     }
 
     if !found {
@@ -394,6 +479,39 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
     }
 }
 
+/// Finds the disassembly for `fnptr`: by resolved symbol name first, then by
+/// a recorded `addr` equal to `fnptr`; sets `fnname` to the key or panics.
+fn get_functions(fnptr: usize, fnname: &mut String) -> &'static [Function] {
+    // Translate this function pointer to a symbolic name that we'd have found
+    // in the disassembly.
+    let mut resolved = None;
+    backtrace::resolve(fnptr as *mut _, |symbol| {
+        resolved = symbol.name().and_then(|s| s.as_str()).map(normalize);
+    });
+
+    let by_symbol = resolved.as_ref().and_then(|s| DISASSEMBLY.get(s));
+    if let (Some(sym), Some(list)) = (resolved.as_ref(), by_symbol) {
+        *fnname = sym.to_string();
+        return list;
+    }
+
+    for (name, list) in DISASSEMBLY.iter() {
+        if list.iter().any(|f| f.addr == Some(fnptr)) {
+            *fnname = name.to_string();
+            return list;
+        }
+    }
+
+    if let Some(ref sym) = resolved {
+        println!("assumed symbol name: `{}`", sym);
+    }
+    println!("maybe related functions");
+    for key in DISASSEMBLY.keys().filter(|k| k.contains(fnname.as_str())) {
+        println!("\t- {}", key);
+    }
+    panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname);
+}
+
 pub fn assert_skip_test_ok(name: &str) {
     if env::var("STDSIMD_TEST_EVERYTHING").is_err() {
         return;
diff --git a/crates/stdsimd/Cargo.toml b/crates/stdsimd/Cargo.toml
index 4ab553db48..3db3ed1187 100644
--- a/crates/stdsimd/Cargo.toml
+++ b/crates/stdsimd/Cargo.toml
@@ -37,3 +37,7 @@ path = "../../examples/hex.rs"
 name = "wasm"
 crate-type = ["cdylib"]
 path = "../../examples/wasm.rs"
+
+[features]
+default = []
+wasm_simd128 = ["coresimd/wasm_simd128"]
\ No newline at end of file
diff --git a/crates/stdsimd/src/lib.rs b/crates/stdsimd/src/lib.rs
index 65871cc5eb..021dc06ae3 100644
--- a/crates/stdsimd/src/lib.rs
+++ b/crates/stdsimd/src/lib.rs
@@ -17,6 +17,7 @@ extern crate libc;
 extern crate std as __do_not_use_this_import;
 
 #[cfg(test)]
+#[allow(unused_imports)]
 #[macro_use(println, print)]
 extern crate std;