diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h index ccb51042068..cbceca70686 100644 --- a/src/arm/assembler-arm.h +++ b/src/arm/assembler-arm.h @@ -366,6 +366,34 @@ struct QwNeonRegister { return r; } + static int ToAllocationIndex(QwNeonRegister reg) { + ASSERT(reg.code() < kMaxNumRegisters); + return reg.code(); + } + + static const char* AllocationIndexToString(int index) { + ASSERT(index >= 0 && index < kMaxNumRegisters); + const char* const names[] = { + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + }; + return names[index]; + } + bool is_valid() const { return (0 <= code_) && (code_ < kMaxNumRegisters); } @@ -385,6 +413,7 @@ struct QwNeonRegister { typedef QwNeonRegister QuadRegister; +typedef QwNeonRegister SIMD128Register; // Support for the VFP registers s0 to s31 (d0 to d15). diff --git a/src/arm/cpu-arm.cc b/src/arm/cpu-arm.cc index 20c6a5dcce3..a04f1413b88 100644 --- a/src/arm/cpu-arm.cc +++ b/src/arm/cpu-arm.cc @@ -56,6 +56,12 @@ bool CPU::SupportsCrankshaft() { } +bool CPU::SupportsSIMD128InCrankshaft() { + // Not Implemented. + return false; +} + + void CPU::FlushICache(void* start, size_t size) { // Nothing to do flushing no instructions. if (size == 0) { diff --git a/src/arm/deoptimizer-arm.cc b/src/arm/deoptimizer-arm.cc index 6031499dbd1..f3f9fd1ee68 100644 --- a/src/arm/deoptimizer-arm.cc +++ b/src/arm/deoptimizer-arm.cc @@ -113,7 +113,7 @@ void Deoptimizer::SetPlatformCompiledStubRegisters( } -void Deoptimizer::CopyDoubleRegisters(FrameDescription* output_frame) { +void Deoptimizer::CopySIMD128Registers(FrameDescription* output_frame) { for (int i = 0; i < DwVfpRegister::kMaxNumRegisters; ++i) { double double_value = input_->GetDoubleRegister(i); output_frame->SetDoubleRegister(i, double_value); @@ -210,7 +210,7 @@ void Deoptimizer::EntryGenerator::Generate() { // Copy VFP registers to // double_registers_[DoubleRegister::kMaxNumAllocatableRegisters] - int double_regs_offset = FrameDescription::double_registers_offset(); + int double_regs_offset = FrameDescription::simd128_registers_offset(); for (int i = 0; i < DwVfpRegister::kMaxNumAllocatableRegisters; ++i) { int dst_offset = i * kDoubleSize + double_regs_offset; int src_offset = i * kDoubleSize + kNumberOfRegisters * kPointerSize; @@ -284,7 +284,7 @@ void Deoptimizer::EntryGenerator::Generate() { __ CheckFor32DRegs(ip); __ ldr(r1, MemOperand(r0, Deoptimizer::input_offset())); - int src_offset = FrameDescription::double_registers_offset(); + int src_offset = FrameDescription::simd128_registers_offset(); for (int i = 0; i < DwVfpRegister::kMaxNumRegisters; ++i) { if (i == kDoubleRegZero.code()) continue; if (i == kScratchDoubleReg.code()) continue; @@ -350,6 +350,18 @@ void FrameDescription::SetCallerFp(unsigned offset, intptr_t value) { } +double FrameDescription::GetDoubleRegister(unsigned n) const { + ASSERT(n < 2 * ARRAY_SIZE(simd128_registers_)); + return simd128_registers_[n / 2].d[n % 2]; +} + + +void FrameDescription::SetDoubleRegister(unsigned n, double value) { + ASSERT(n < 2 * ARRAY_SIZE(simd128_registers_)); + simd128_registers_[n / 2].d[n % 2] = value; +} + + #undef __ } } // namespace v8::internal diff --git a/src/cpu.h b/src/cpu.h index b2e9f7da7ee..b43e618a0d9 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -107,6 +107,8 @@ class CPU V8_FINAL BASE_EMBEDDED { static bool SupportsCrankshaft(); + static bool SupportsSIMD128InCrankshaft(); + // Flush instruction cache. 
static void FlushICache(void* start, size_t size); diff --git a/src/deoptimizer.cc b/src/deoptimizer.cc index 9e7e113ed9a..4584990ed53 100644 --- a/src/deoptimizer.cc +++ b/src/deoptimizer.cc @@ -1711,7 +1711,7 @@ void Deoptimizer::DoComputeCompiledStubFrame(TranslationIterator* iterator, } // Copy the double registers from the input into the output frame. - CopyDoubleRegisters(output_frame); + CopySIMD128Registers(output_frame); // Fill registers containing handler and number of parameters. SetPlatformCompiledStubRegisters(output_frame, descriptor); @@ -1864,6 +1864,43 @@ void Deoptimizer::MaterializeHeapObjects(JavaScriptFrameIterator* it) { Memory::Object_at(d.destination()) = *num; } + // Materialize all float32x4 before looking at arguments because when the + // output frames are used to materialize arguments objects later on they need + // to already contain valid float32x4 values. + for (int i = 0; i < deferred_float32x4s_.length(); i++) { + SIMD128MaterializationDescriptor
<Address> d = deferred_float32x4s_[i]; + float32x4_value_t x4 = d.value().f4; + Handle<Object> float32x4 = isolate_->factory()->NewFloat32x4(x4); + if (trace_scope_ != NULL) { + PrintF(trace_scope_->file(), + "Materialized a new float32x4 %p " + "[float32x4(%e, %e, %e, %e)] in slot %p\n", + reinterpret_cast<void*>(*float32x4), + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + d.destination()); + } + Memory::Object_at(d.destination()) = *float32x4; + } + + // Materialize all int32x4 before looking at arguments because when the + // output frames are used to materialize arguments objects later on they need + // to already contain valid int32x4 values. + for (int i = 0; i < deferred_int32x4s_.length(); i++) { + SIMD128MaterializationDescriptor<Address>
d = deferred_int32x4s_[i]; + int32x4_value_t x4 = d.value().i4; + Handle<Object> int32x4 = isolate_->factory()->NewInt32x4(x4); + if (trace_scope_ != NULL) { + PrintF(trace_scope_->file(), + "Materialized a new int32x4 %p " + "[int32x4(%u, %u, %u, %u)] in slot %p\n", + reinterpret_cast<void*>(*int32x4), + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + d.destination()); + } + Memory::Object_at(d.destination()) = *int32x4; + } + + // Materialize all heap numbers required for arguments/captured objects. for (int i = 0; i < deferred_objects_double_values_.length(); i++) { HeapNumberMaterializationDescriptor<int> d = @@ -1883,6 +1920,48 @@ void Deoptimizer::MaterializeHeapObjects(JavaScriptFrameIterator* it) { // Play it safe and clear all object double values before we continue. deferred_objects_double_values_.Clear(); + // Materialize all float32x4 values required for arguments/captured objects. + for (int i = 0; i < deferred_objects_float32x4_values_.length(); i++) { + SIMD128MaterializationDescriptor<int> d = + deferred_objects_float32x4_values_[i]; + float32x4_value_t x4 = d.value().f4; + Handle<Object> float32x4 = isolate_->factory()->NewFloat32x4(x4); + if (trace_scope_ != NULL) { + PrintF(trace_scope_->file(), + "Materialized a new float32x4 %p " + "[float32x4(%e, %e, %e, %e)] for object at %d\n", + reinterpret_cast<void*>(*float32x4), + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + d.destination()); + } + ASSERT(values.at(d.destination())->IsTheHole()); + values.Set(d.destination(), float32x4); + } + + // Play it safe and clear all object float32x4 values before we continue. + deferred_objects_float32x4_values_.Clear(); + + // Materialize all int32x4 values required for arguments/captured objects. + for (int i = 0; i < deferred_objects_int32x4_values_.length(); i++) { + SIMD128MaterializationDescriptor<int> d = + deferred_objects_int32x4_values_[i]; + int32x4_value_t x4 = d.value().i4; + Handle<Object> int32x4 = isolate_->factory()->NewInt32x4(x4); + if (trace_scope_ != NULL) { + PrintF(trace_scope_->file(), + "Materialized a new int32x4 %p " + "[int32x4(%u, %u, %u, %u)] for object at %d\n", + reinterpret_cast<void*>(*int32x4), + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + d.destination()); + } + ASSERT(values.at(d.destination())->IsTheHole()); + values.Set(d.destination(), int32x4); + } + + // Play it safe and clear all object int32x4 values before we continue. + deferred_objects_int32x4_values_.Clear(); + // Materialize arguments/captured objects. 
if (!deferred_objects_.is_empty()) { List<Handle<Object> > materialized_objects(deferred_objects_.length()); @@ -2094,6 +2173,38 @@ void Deoptimizer::DoTranslateObject(TranslationIterator* iterator, return; } + case Translation::FLOAT32x4_REGISTER: + case Translation::INT32x4_REGISTER: { + int input_reg = iterator->Next(); + simd128_value_t value = input_->GetSIMD128Register(input_reg); + if (trace_scope_ != NULL) { + if (opcode == Translation::FLOAT32x4_REGISTER) { + float32x4_value_t x4 = value.f4; + PrintF(trace_scope_->file(), + " object @0x%08" V8PRIxPTR ": [field #%d] <- ", + reinterpret_cast<intptr_t>(object_slot), + field_index); + PrintF(trace_scope_->file(), + "float32x4(%e, %e, %e, %e) ; %s\n", + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + SIMD128Register::AllocationIndexToString(input_reg)); + } else { + ASSERT(opcode == Translation::INT32x4_REGISTER); + int32x4_value_t x4 = value.i4; + PrintF(trace_scope_->file(), + " object @0x%08" V8PRIxPTR ": [field #%d] <- ", + reinterpret_cast<intptr_t>(object_slot), + field_index); + PrintF(trace_scope_->file(), + "int32x4(%u, %u, %u, %u) ; %s\n", + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + SIMD128Register::AllocationIndexToString(input_reg)); + } + } + AddObjectSIMD128Value(value, opcode); + return; + } + case Translation::STACK_SLOT: { int input_slot_index = iterator->Next(); unsigned input_offset = input_->GetOffsetFromSlotIndex(input_slot_index); @@ -2181,6 +2292,39 @@ void Deoptimizer::DoTranslateObject(TranslationIterator* iterator, return; } + case Translation::FLOAT32x4_STACK_SLOT: + case Translation::INT32x4_STACK_SLOT: { + int input_slot_index = iterator->Next(); + unsigned input_offset = input_->GetOffsetFromSlotIndex(input_slot_index); + simd128_value_t value = input_->GetSIMD128FrameSlot(input_offset); + if (trace_scope_ != NULL) { + if (opcode == Translation::FLOAT32x4_STACK_SLOT) { + float32x4_value_t x4 = value.f4; + PrintF(trace_scope_->file(), + " object @0x%08" V8PRIxPTR ": [field #%d] <- ", + reinterpret_cast<intptr_t>(object_slot), + field_index); + PrintF(trace_scope_->file(), + "float32x4(%e, %e, %e, %e) ; [sp + %d]\n", + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + input_offset); + } else { + ASSERT(opcode == Translation::INT32x4_STACK_SLOT); + int32x4_value_t x4 = value.i4; + PrintF(trace_scope_->file(), + " object @0x%08" V8PRIxPTR ": [field #%d] <- ", + reinterpret_cast<intptr_t>(object_slot), + field_index); + PrintF(trace_scope_->file(), + "int32x4(%u, %u, %u, %u) ; [sp + %d]\n", + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + input_offset); + } + } + AddObjectSIMD128Value(value, opcode); + return; + } + case Translation::LITERAL: { Object* literal = ComputeLiteral(iterator->Next()); if (trace_scope_ != NULL) { @@ -2363,6 +2507,40 @@ void Deoptimizer::DoTranslateCommand(TranslationIterator* iterator, return; } + case Translation::FLOAT32x4_REGISTER: + case Translation::INT32x4_REGISTER: { + int input_reg = iterator->Next(); + simd128_value_t value = input_->GetSIMD128Register(input_reg); + if (trace_scope_ != NULL) { + if (opcode == Translation::FLOAT32x4_REGISTER) { + float32x4_value_t x4 = value.f4; + PrintF(trace_scope_->file(), + " 0x%08" V8PRIxPTR ":" + " [top + %d] <- float32x4(%e, %e, %e, %e) ; %s\n", + output_[frame_index]->GetTop() + output_offset, + output_offset, + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + SIMD128Register::AllocationIndexToString(input_reg)); + } else { + ASSERT(opcode == Translation::INT32x4_REGISTER); + int32x4_value_t x4 = value.i4; + 
PrintF(trace_scope_->file(), + " 0x%08" V8PRIxPTR ":" + " [top + %d] <- int32x4(%u, %u, %u, %u) ; %s\n", + output_[frame_index]->GetTop() + output_offset, + output_offset, + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + SIMD128Register::AllocationIndexToString(input_reg)); + } + } + // We save the untagged value on the side and store a GC-safe + // temporary placeholder in the frame. + AddSIMD128Value(output_[frame_index]->GetTop() + output_offset, value, + opcode); + output_[frame_index]->SetFrameSlot(output_offset, kPlaceholder); + return; + } + case Translation::STACK_SLOT: { int input_slot_index = iterator->Next(); unsigned input_offset = input_->GetOffsetFromSlotIndex(input_slot_index); @@ -2464,6 +2642,41 @@ void Deoptimizer::DoTranslateCommand(TranslationIterator* iterator, return; } + case Translation::FLOAT32x4_STACK_SLOT: + case Translation::INT32x4_STACK_SLOT: { + int input_slot_index = iterator->Next(); + unsigned input_offset = input_->GetOffsetFromSlotIndex(input_slot_index); + simd128_value_t value = input_->GetSIMD128FrameSlot(input_offset); + if (trace_scope_ != NULL) { + if (opcode == Translation::FLOAT32x4_STACK_SLOT) { + float32x4_value_t x4 = value.f4; + PrintF(trace_scope_->file(), + " 0x%08" V8PRIxPTR ": " + "[top + %d] <- float32x4(%e, %e, %e, %e) ; [sp + %d]\n", + output_[frame_index]->GetTop() + output_offset, + output_offset, + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + input_offset); + } else { + ASSERT(opcode == Translation::INT32x4_STACK_SLOT); + int32x4_value_t x4 = value.i4; + PrintF(trace_scope_->file(), + " 0x%08" V8PRIxPTR ": " + "[top + %d] <- int32x4(%u, %u, %u, %u) ; [sp + %d]\n", + output_[frame_index]->GetTop() + output_offset, + output_offset, + x4.storage[0], x4.storage[1], x4.storage[2], x4.storage[3], + input_offset); + } + } + // We save the untagged value on the side and store a GC-safe + // temporary placeholder in the frame. + AddSIMD128Value(output_[frame_index]->GetTop() + output_offset, value, + opcode); + output_[frame_index]->SetFrameSlot(output_offset, kPlaceholder); + return; + } + case Translation::LITERAL: { Object* literal = ComputeLiteral(iterator->Next()); if (trace_scope_ != NULL) { @@ -2612,6 +2825,24 @@ void Deoptimizer::AddObjectDoubleValue(double value) { } +void Deoptimizer::AddObjectSIMD128Value(simd128_value_t value, + int translation_opcode) { + deferred_objects_tagged_values_.Add(isolate()->heap()->the_hole_value()); + SIMD128MaterializationDescriptor<int> value_desc( + deferred_objects_tagged_values_.length() - 1, value); + Translation::Opcode opcode = + static_cast<Translation::Opcode>(translation_opcode); + if (opcode == Translation::FLOAT32x4_REGISTER || + opcode == Translation::FLOAT32x4_STACK_SLOT) { + deferred_objects_float32x4_values_.Add(value_desc); + } else { + ASSERT(opcode == Translation::INT32x4_REGISTER || + opcode == Translation::INT32x4_STACK_SLOT); + deferred_objects_int32x4_values_.Add(value_desc); + } +} + +
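The FLOAT32x4/INT32x4 cases above follow the same two-phase scheme that the existing double handling uses, and it is worth spelling out. During translation the untagged 128-bit payload cannot be written into the output frame directly (the GC would misinterpret it), so it is parked in a side table and the frame slot receives a safe filler; materialization later boxes the payload and patches the slot. A minimal sketch, using only names that appear in this patch:

// Phase 1 (DoTranslateCommand): record the raw bits, keep the frame GC-safe.
AddSIMD128Value(slot_address, value, Translation::FLOAT32x4_REGISTER);
output_[frame_index]->SetFrameSlot(output_offset, kPlaceholder);
// Phase 2 (MaterializeHeapObjects): box the bits and patch the slot.
Handle<Object> boxed = isolate_->factory()->NewFloat32x4(value.f4);
Memory::Object_at(slot_address) = *boxed;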
void Deoptimizer::AddDoubleValue(intptr_t slot_address, double value) { HeapNumberMaterializationDescriptor<Address> value_desc( reinterpret_cast<Address>
(slot_address), value); @@ -2619,6 +2850,24 @@ void Deoptimizer::AddDoubleValue(intptr_t slot_address, double value) { } +void Deoptimizer::AddSIMD128Value(intptr_t slot_address, + simd128_value_t value, + int translation_opcode) { + SIMD128MaterializationDescriptor
<Address> value_desc( + reinterpret_cast<Address>
(slot_address), value); + Translation::Opcode opcode = + static_cast<Translation::Opcode>(translation_opcode); + if (opcode == Translation::FLOAT32x4_REGISTER || + opcode == Translation::FLOAT32x4_STACK_SLOT) { + deferred_float32x4s_.Add(value_desc); + } else { + ASSERT(opcode == Translation::INT32x4_REGISTER || + opcode == Translation::INT32x4_STACK_SLOT); + deferred_int32x4s_.Add(value_desc); + } +} + + void Deoptimizer::EnsureCodeForDeoptimizationEntry(Isolate* isolate, BailoutType type, int max_entry_id) { @@ -2860,6 +3109,12 @@ void Translation::StoreDoubleRegister(DoubleRegister reg) { } +void Translation::StoreSIMD128Register(SIMD128Register reg, Opcode opcode) { + buffer_->Add(opcode, zone()); + buffer_->Add(SIMD128Register::ToAllocationIndex(reg), zone()); +} + + void Translation::StoreStackSlot(int index) { buffer_->Add(STACK_SLOT, zone()); buffer_->Add(index, zone()); @@ -2884,6 +3139,12 @@ void Translation::StoreDoubleStackSlot(int index) { } +void Translation::StoreSIMD128StackSlot(int index, Opcode opcode) { + buffer_->Add(opcode, zone()); + buffer_->Add(index, zone()); +} + + void Translation::StoreLiteral(int literal_id) { buffer_->Add(LITERAL, zone()); buffer_->Add(literal_id, zone()); @@ -2911,10 +3172,14 @@ int Translation::NumberOfOperandsFor(Opcode opcode) { case INT32_REGISTER: case UINT32_REGISTER: case DOUBLE_REGISTER: + case FLOAT32x4_REGISTER: + case INT32x4_REGISTER: case STACK_SLOT: case INT32_STACK_SLOT: case UINT32_STACK_SLOT: case DOUBLE_STACK_SLOT: + case FLOAT32x4_STACK_SLOT: + case INT32x4_STACK_SLOT: case LITERAL: case COMPILED_STUB_FRAME: return 1; @@ -2974,6 +3239,8 @@ SlotRef SlotRef::ComputeSlotForNextArgument(TranslationIterator* iterator, case Translation::INT32_REGISTER: case Translation::UINT32_REGISTER: case Translation::DOUBLE_REGISTER: + case Translation::FLOAT32x4_REGISTER: + case Translation::INT32x4_REGISTER: // We are at safepoint which corresponds to call. All registers are // saved by caller so there would be no live registers at this // point. Thus these translation commands should not be used. 
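Each SIMD128 location is encoded in the translation buffer exactly like the scalar ones: one opcode followed by a single operand (an allocation index for registers, a slot index for stack slots), which is why NumberOfOperandsFor() above returns 1 for all four new opcodes. One round trip through the buffer, in sketch form:

// Writer side (LCodeGen::AddToTranslation):
translation->StoreSIMD128Register(reg, Translation::FLOAT32x4_REGISTER);
// Reader side (Deoptimizer::DoTranslateCommand), after matching the opcode:
int input_reg = iterator->Next();  // the allocation index stored above
simd128_value_t value = input_->GetSIMD128Register(input_reg);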
@@ -3003,6 +3270,18 @@ SlotRef SlotRef::ComputeSlotForNextArgument(TranslationIterator* iterator, return SlotRef(slot_addr, SlotRef::DOUBLE); } + case Translation::FLOAT32x4_STACK_SLOT: { + int slot_index = iterator->Next(); + Address slot_addr = SlotAddress(frame, slot_index); + return SlotRef(slot_addr, SlotRef::FLOAT32x4); + } + + case Translation::INT32x4_STACK_SLOT: { + int slot_index = iterator->Next(); + Address slot_addr = SlotAddress(frame, slot_index); + return SlotRef(slot_addr, SlotRef::INT32x4); + } + case Translation::LITERAL: { int literal_index = iterator->Next(); return SlotRef(data->GetIsolate(), diff --git a/src/deoptimizer.h b/src/deoptimizer.h index aace2208673..5bffff7feb4 100644 --- a/src/deoptimizer.h +++ b/src/deoptimizer.h @@ -55,6 +55,9 @@ static inline double read_double_value(Address p) { #endif // V8_HOST_CAN_READ_UNALIGNED } +static inline simd128_value_t read_simd128_value(Address p) { + return *reinterpret_cast<simd128_value_t*>(p); +} class FrameDescription; class TranslationIterator; @@ -75,6 +78,21 @@ class HeapNumberMaterializationDescriptor BASE_EMBEDDED { }; +template<typename T> +class SIMD128MaterializationDescriptor BASE_EMBEDDED { + public: + SIMD128MaterializationDescriptor(T destination, simd128_value_t value) + : destination_(destination), value_(value) { } + + T destination() const { return destination_; } + simd128_value_t value() const { return value_; } + + private: + T destination_; + simd128_value_t value_; +}; + +
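The descriptor is instantiated with two destination types in this patch: T = Address when it names a frame slot to patch directly, and T = int when it names an index into deferred_objects_tagged_values_ for a captured object. Both uses appear in deoptimizer.cc above; the variable names here are illustrative only:

// Frame-slot variant (Deoptimizer::AddSIMD128Value):
SIMD128MaterializationDescriptor<Address> slot_desc(slot_address, value);
// Captured-object variant (Deoptimizer::AddObjectSIMD128Value):
SIMD128MaterializationDescriptor<int> field_desc(tagged_list_index, value);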
class ObjectMaterializationDescriptor BASE_EMBEDDED { public: ObjectMaterializationDescriptor( @@ -349,7 +367,10 @@ class Deoptimizer : public Malloced { void AddObjectDuplication(intptr_t slot, int object_index); void AddObjectTaggedValue(intptr_t value); void AddObjectDoubleValue(double value); + void AddObjectSIMD128Value(simd128_value_t value, int translation_opcode); void AddDoubleValue(intptr_t slot_address, double value); + void AddSIMD128Value(intptr_t slot_address, simd128_value_t value, + int translation_opcode); bool ArgumentsObjectIsAdapted(int object_index) { ObjectMaterializationDescriptor desc = deferred_objects_.at(object_index); @@ -398,9 +419,9 @@ void SetPlatformCompiledStubRegisters(FrameDescription* output_frame, CodeStubInterfaceDescriptor* desc); - // Fill the given output frame's double registers with the original values - // from the input frame's double registers. - void CopyDoubleRegisters(FrameDescription* output_frame); + // Fill the given output frame's simd128 registers with the original values + // from the input frame's simd128 registers. + void CopySIMD128Registers(FrameDescription* output_frame); // Determines whether the input frame contains alignment padding by looking // at the dynamic alignment state slot inside the frame. @@ -432,8 +453,14 @@ List<Object*> deferred_objects_tagged_values_; List<HeapNumberMaterializationDescriptor<int> > deferred_objects_double_values_; + List<SIMD128MaterializationDescriptor<int> > + deferred_objects_float32x4_values_; + List<SIMD128MaterializationDescriptor<int> > + deferred_objects_int32x4_values_; List<ObjectMaterializationDescriptor> deferred_objects_; List<HeapNumberMaterializationDescriptor<Address> > deferred_heap_numbers_; + List<SIMD128MaterializationDescriptor<Address> > deferred_float32x4s_; + List<SIMD128MaterializationDescriptor<Address> > deferred_int32x4s_; // Output frame information. Only used during heap object materialization. List<Handle<JSFunction> > jsframe_functions_; @@ -495,6 +522,11 @@ class FrameDescription { return read_double_value(reinterpret_cast<Address>(ptr)); } + simd128_value_t GetSIMD128FrameSlot(unsigned offset) { + intptr_t* ptr = GetFrameSlotPointer(offset); + return read_simd128_value(reinterpret_cast<Address>
(ptr)); + } + void SetFrameSlot(unsigned offset, intptr_t value) { *GetFrameSlotPointer(offset) = value; } @@ -516,9 +548,11 @@ class FrameDescription { return registers_[n]; } - double GetDoubleRegister(unsigned n) const { - ASSERT(n < ARRAY_SIZE(double_registers_)); - return double_registers_[n]; + double GetDoubleRegister(unsigned n) const; + + simd128_value_t GetSIMD128Register(unsigned n) const { + ASSERT(n < ARRAY_SIZE(simd128_registers_)); + return simd128_registers_[n]; } void SetRegister(unsigned n, intptr_t value) { @@ -526,9 +560,11 @@ class FrameDescription { registers_[n] = value; } - void SetDoubleRegister(unsigned n, double value) { - ASSERT(n < ARRAY_SIZE(double_registers_)); - double_registers_[n] = value; + void SetDoubleRegister(unsigned n, double value); + + void SetSIMD128Register(unsigned n, simd128_value_t value) { + ASSERT(n < ARRAY_SIZE(simd128_registers_)); + simd128_registers_[n] = value; } intptr_t GetTop() const { return top_; } @@ -572,8 +608,8 @@ class FrameDescription { return OFFSET_OF(FrameDescription, registers_); } - static int double_registers_offset() { - return OFFSET_OF(FrameDescription, double_registers_); + static int simd128_registers_offset() { + return OFFSET_OF(FrameDescription, simd128_registers_); } static int frame_size_offset() { @@ -605,7 +641,7 @@ class FrameDescription { uintptr_t frame_size_; // Number of bytes. JSFunction* function_; intptr_t registers_[Register::kNumRegisters]; - double double_registers_[DoubleRegister::kMaxNumRegisters]; + simd128_value_t simd128_registers_[SIMD128Register::kMaxNumRegisters]; intptr_t top_; intptr_t pc_; intptr_t fp_; @@ -708,10 +744,14 @@ class TranslationIterator BASE_EMBEDDED { V(INT32_REGISTER) \ V(UINT32_REGISTER) \ V(DOUBLE_REGISTER) \ + V(FLOAT32x4_REGISTER) \ + V(INT32x4_REGISTER) \ V(STACK_SLOT) \ V(INT32_STACK_SLOT) \ V(UINT32_STACK_SLOT) \ V(DOUBLE_STACK_SLOT) \ + V(FLOAT32x4_STACK_SLOT) \ + V(INT32x4_STACK_SLOT) \ V(LITERAL) @@ -750,10 +790,12 @@ class Translation BASE_EMBEDDED { void StoreInt32Register(Register reg); void StoreUint32Register(Register reg); void StoreDoubleRegister(DoubleRegister reg); + void StoreSIMD128Register(SIMD128Register reg, Opcode opcode); void StoreStackSlot(int index); void StoreInt32StackSlot(int index); void StoreUint32StackSlot(int index); void StoreDoubleStackSlot(int index); + void StoreSIMD128StackSlot(int index, Opcode opcode); void StoreLiteral(int literal_id); void StoreArgumentsObject(bool args_known, int args_index, int args_length); @@ -783,6 +825,8 @@ class SlotRef BASE_EMBEDDED { INT32, UINT32, DOUBLE, + FLOAT32x4, + INT32x4, LITERAL }; @@ -823,6 +867,14 @@ class SlotRef BASE_EMBEDDED { return isolate->factory()->NewNumber(value); } + case FLOAT32x4: { + return isolate->factory()->NewFloat32x4(read_simd128_value(addr_).f4); + } + + case INT32x4: { + return isolate->factory()->NewInt32x4(read_simd128_value(addr_).i4); + } + case LITERAL: return literal_; diff --git a/src/globals.h b/src/globals.h index c9d2326a7f9..90ca9478610 100644 --- a/src/globals.h +++ b/src/globals.h @@ -227,6 +227,11 @@ typedef byte* Address; struct float32x4_value_t { float storage[4]; }; struct int32x4_value_t { int32_t storage[4]; }; +union simd128_value_t { + double d[2]; + float32x4_value_t f4; + int32x4_value_t i4; +}; const int KB = 1024; const int MB = KB * KB; @@ -253,6 +258,7 @@ const int kDoubleSize = sizeof(double); // NOLINT const int kFloatSize = sizeof(float); // NOLINT const int kFloat32x4Size = sizeof(float32x4_value_t); // NOLINT const int kInt32x4Size = 
sizeof(int32x4_value_t); // NOLINT +const int kSIMD128Size = sizeof(simd128_value_t); // NOLINT const int kIntptrSize = sizeof(intptr_t); // NOLINT const int kPointerSize = sizeof(void*); // NOLINT const int kRegisterSize = kPointerSize; diff --git a/src/hydrogen-instructions.cc b/src/hydrogen-instructions.cc index 89399db8e41..f7f0c78117a 100644 --- a/src/hydrogen-instructions.cc +++ b/src/hydrogen-instructions.cc @@ -336,6 +336,8 @@ const char* HType::ToString() { case kTaggedNumber: return "number"; case kSmi: return "smi"; case kHeapNumber: return "heap-number"; + case kFloat32x4: return "float32x4"; + case kInt32x4: return "int32x4"; case kString: return "string"; case kBoolean: return "boolean"; case kNonPrimitive: return "non-primitive"; @@ -353,6 +355,10 @@ HType HType::TypeFromValue(Handle<Object> value) { result = HType::Smi(); } else if (value->IsHeapNumber()) { result = HType::HeapNumber(); + } else if (value->IsFloat32x4()) { + result = HType::Float32x4(); + } else if (value->IsInt32x4()) { + result = HType::Int32x4(); } else if (value->IsString()) { result = HType::String(); } else if (value->IsBoolean()) { @@ -1205,7 +1211,22 @@ bool HTypeofIsAndBranch::KnownSuccessorBlock(HBasicBlock** block) { *block = SecondSuccessor(); } return true; + } else if (value()->representation().IsFloat32x4()) { + if (compares_float32x4_type()) { + *block = FirstSuccessor(); + } else { + *block = SecondSuccessor(); + } + return true; + } else if (value()->representation().IsInt32x4()) { + if (compares_int32x4_type()) { + *block = FirstSuccessor(); + } else { + *block = SecondSuccessor(); + } + return true; } + *block = NULL; return false; } diff --git a/src/hydrogen-instructions.h b/src/hydrogen-instructions.h index 208cbd24ca9..54cbd9ad96b 100644 --- a/src/hydrogen-instructions.h +++ b/src/hydrogen-instructions.h @@ -314,6 +314,8 @@ class HType V8_FINAL { static HType TaggedNumber() { return HType(kTaggedNumber); } static HType Smi() { return HType(kSmi); } static HType HeapNumber() { return HType(kHeapNumber); } + static HType Float32x4() { return HType(kFloat32x4); } + static HType Int32x4() { return HType(kInt32x4); } static HType String() { return HType(kString); } static HType Boolean() { return HType(kBoolean); } static HType NonPrimitive() { return HType(kNonPrimitive); } @@ -349,12 +351,24 @@ class HType V8_FINAL { return ((type_ & kHeapNumber) == kHeapNumber); } + bool IsFloat32x4() const { + return ((type_ & kFloat32x4) == kFloat32x4); + } + + bool IsInt32x4() const { + return ((type_ & kInt32x4) == kInt32x4); + } + + bool IsSIMD128() const { + return IsFloat32x4() || IsInt32x4(); + } + bool IsString() const { return ((type_ & kString) == kString); } bool IsNonString() const { - return IsTaggedPrimitive() || IsSmi() || IsHeapNumber() || + return IsTaggedPrimitive() || IsSmi() || IsHeapNumber() || IsSIMD128() || IsBoolean() || IsJSArray(); } @@ -375,7 +389,8 @@ } bool IsHeapObject() const { - return IsHeapNumber() || IsString() || IsBoolean() || IsNonPrimitive(); + return IsHeapNumber() || IsSIMD128() || IsString() || + IsBoolean() || IsNonPrimitive(); } bool ToStringOrToNumberCanBeObserved(Representation representation) { @@ -384,6 +399,8 @@ case kTaggedNumber: // fallthru case kSmi: // fallthru case kHeapNumber: // fallthru + case kFloat32x4: // fallthru + case kInt32x4: // fallthru case kString: // fallthru case kBoolean: return false; @@ -408,11 +425,13 @@ kTaggedNumber = 0xd, // 0000 0000 0000 1101 kSmi = 0x1d, // 0000 
0000 0001 1101 kHeapNumber = 0x2d, // 0000 0000 0010 1101 - kString = 0x45, // 0000 0000 0100 0101 - kBoolean = 0x85, // 0000 0000 1000 0101 - kNonPrimitive = 0x101, // 0000 0001 0000 0001 - kJSObject = 0x301, // 0000 0011 0000 0001 - kJSArray = 0x701 // 0000 0111 0000 0001 + kFloat32x4 = 0x45, // 0000 0000 0100 0101 + kInt32x4 = 0x85, // 0000 0000 1000 0101 + kString = 0x105, // 0000 0001 0000 0101 + kBoolean = 0x205, // 0000 0010 0000 0101 + kNonPrimitive = 0x401, // 0000 0100 0000 0001 + kJSObject = 0xc01, // 0000 1100 0000 0001 + kJSArray = 0x1c01 // 0001 1100 0000 0001 }; // Make sure type fits in int16. @@ -673,6 +692,8 @@ class HValue : public ZoneObject { HType t = type(); if (t.IsSmi()) return Representation::Smi(); if (t.IsHeapNumber()) return Representation::Double(); + if (t.IsFloat32x4()) return Representation::Float32x4(); + if (t.IsInt32x4()) return Representation::Int32x4(); if (t.IsHeapObject()) return r; return Representation::None(); } @@ -1729,7 +1750,13 @@ class HChange V8_FINAL : public HUnaryOperation { if (value->representation().IsSmi() || value->type().IsSmi()) { set_type(HType::Smi()); } else { - set_type(HType::TaggedNumber()); + if (to.IsFloat32x4()) { + set_type(HType::Float32x4()); + } else if (to.IsInt32x4()) { + set_type(HType::Int32x4()); + } else { + set_type(HType::TaggedNumber()); + } if (to.IsTagged()) SetGVNFlag(kChangesNewSpacePromotion); } } @@ -4628,6 +4655,8 @@ class HTypeofIsAndBranch V8_FINAL : public HUnaryControlInstruction { Handle<String> type_literal() { return type_literal_; } bool compares_number_type() { return compares_number_type_; } + bool compares_float32x4_type() { return compares_float32x4_type_; } + bool compares_int32x4_type() { return compares_int32x4_type_; } virtual void PrintDataTo(StringStream* stream) V8_OVERRIDE; DECLARE_CONCRETE_INSTRUCTION(TypeofIsAndBranch) @@ -4644,10 +4673,14 @@ type_literal_(type_literal) { Heap* heap = type_literal->GetHeap(); compares_number_type_ = type_literal->Equals(heap->number_string()); + compares_float32x4_type_ = type_literal->Equals(heap->float32x4_string()); + compares_int32x4_type_ = type_literal->Equals(heap->int32x4_string()); } Handle<String> type_literal_; bool compares_number_type_ : 1; + bool compares_float32x4_type_ : 1; + bool compares_int32x4_type_ : 1; }; @@ -6431,13 +6464,12 @@ class HLoadKeyed V8_FINAL elements_kind == FLOAT32_ELEMENTS || elements_kind == FLOAT64_ELEMENTS) { set_representation(Representation::Double()); - } else if (elements_kind == EXTERNAL_FLOAT32x4_ELEMENTS || - elements_kind == FLOAT32x4_ELEMENTS || - elements_kind == EXTERNAL_INT32x4_ELEMENTS || - elements_kind == INT32x4_ELEMENTS) { - // TODO(haitao): Set the representation to Float32x4 or Int32x4 after - // SIMD instructions are added. - set_representation(Representation::Tagged()); + } else if (IsFloat32x4ElementsKind(elements_kind)) { + set_representation(CPU::SupportsSIMD128InCrankshaft() ? + Representation::Float32x4() : Representation::Tagged()); + } else if (IsInt32x4ElementsKind(elements_kind)) { + set_representation(CPU::SupportsSIMD128InCrankshaft() ? + Representation::Int32x4() : Representation::Tagged()); } else { set_representation(Representation::Integer32()); } @@ -6721,16 +6753,17 @@ class HStoreKeyed V8_FINAL } ASSERT_EQ(index, 2); + if (IsDoubleOrFloatElementsKind(elements_kind())) { return Representation::Double(); } - if (IsExternalFloat32x4ElementsKind(elements_kind()) || - IsFixedFloat32x4ElementsKind(elements_kind()) || - IsExternalInt32x4ElementsKind(elements_kind()) || - IsFixedInt32x4ElementsKind(elements_kind())) { - // TODO(haitao): Set the required input representation to Float32x4 or - // Int32x4 after SIMD instructions are added. - return Representation::Tagged(); + if (IsFloat32x4ElementsKind(elements_kind())) { + return CPU::SupportsSIMD128InCrankshaft() ? + Representation::Float32x4() : Representation::Tagged(); + } + if (IsInt32x4ElementsKind(elements_kind())) { + return CPU::SupportsSIMD128InCrankshaft() ? + Representation::Int32x4() : Representation::Tagged(); } if (SmiValuesAre32Bits() && store_mode_ == STORE_TO_INITIALIZED_ENTRY) { return Representation::Integer32(); @@ -6764,13 +6797,13 @@ if (IsDoubleOrFloatElementsKind(elements_kind())) { return Representation::Double(); } - if (IsExternalFloat32x4ElementsKind(elements_kind()) || - IsFixedFloat32x4ElementsKind(elements_kind()) || - IsExternalInt32x4ElementsKind(elements_kind()) || - IsFixedInt32x4ElementsKind(elements_kind())) { - // TODO(haitao): Set the required input representation to Float32x4 or - // Int32x4 after SIMD instructions are added. - return Representation::Tagged(); + if (IsFloat32x4ElementsKind(elements_kind())) { + return CPU::SupportsSIMD128InCrankshaft() ? + Representation::Float32x4() : Representation::Tagged(); + } + if (IsInt32x4ElementsKind(elements_kind())) { + return CPU::SupportsSIMD128InCrankshaft() ? + Representation::Int32x4() : Representation::Tagged(); } if (SmiValuesAre32Bits() && store_mode_ == STORE_TO_INITIALIZED_ENTRY) { return Representation::Integer32();
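The encodings preserve the lattice property that a type's bit pattern is a superset of each of its ancestors', so every Is*() predicate stays a single mask-and-compare. The two new leaves sit under the tagged-primitive bits (0x5), and renumbering kString through kJSArray keeps the invariant intact. For instance:

// kFloat32x4 (0x45) includes the tagged-primitive bits (0x5):
//   IsFloat32x4():       (0x45  & 0x45) == 0x45  -> true
//   for a plain string:  (0x105 & 0x45) == 0x45  -> false
// kBoolean (0x205) likewise adds a single new bit (0x200) on top of 0x5.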
diff --git a/src/hydrogen.cc b/src/hydrogen.cc index 3698a322c04..0daf83c9400 100644 --- a/src/hydrogen.cc +++ b/src/hydrogen.cc @@ -10952,6 +10952,12 @@ void HTracer::TraceLiveRange(LiveRange* range, const char* type, if (op->IsDoubleRegister()) { trace_.Add(" \"%s\"", DoubleRegister::AllocationIndexToString(assigned_reg)); + } else if (op->IsFloat32x4Register()) { + trace_.Add(" \"%s\"", + SIMD128Register::AllocationIndexToString(assigned_reg)); + } else if (op->IsInt32x4Register()) { + trace_.Add(" \"%s\"", + SIMD128Register::AllocationIndexToString(assigned_reg)); } else { ASSERT(op->IsRegister()); trace_.Add(" \"%s\"", Register::AllocationIndexToString(assigned_reg)); @@ -10960,6 +10966,10 @@ LOperand* op = range->TopLevel()->GetSpillOperand(); if (op->IsDoubleStackSlot()) { trace_.Add(" \"double_stack:%d\"", op->index()); + } else if (op->IsFloat32x4StackSlot()) { + trace_.Add(" \"float32x4_stack:%d\"", op->index()); + } else if (op->IsInt32x4StackSlot()) { + trace_.Add(" \"int32x4_stack:%d\"", op->index()); } else { ASSERT(op->IsStackSlot()); trace_.Add(" \"stack:%d\"", op->index()); diff --git a/src/ia32/assembler-ia32.cc b/src/ia32/assembler-ia32.cc index 733432028af..f99f202535e 100644 --- a/src/ia32/assembler-ia32.cc +++ b/src/ia32/assembler-ia32.cc @@ -262,6 +262,52 @@ Operand::Operand(Register index, } +Operand::Operand(const Operand& operand, int32_t offset) { + ASSERT(operand.len_ >= 1); + // Operand encodes [ModR/M] [SIB] [Disp]. 
+ byte modrm = operand.buf_[0]; + ASSERT(modrm < 0xC0); // Disallow mode 3 (register target). + bool has_sib = ((modrm & 0x07) == 0x04); + byte mode = modrm & 0xC0; + int disp_offset = has_sib ? 2 : 1; + int base_reg = (has_sib ? operand.buf_[1] : modrm) & 0x07; + // Mode 0 with ebp as ModR/M or SIB base register always has a 32-bit + // displacement. + bool is_baseless = (mode == 0) && (base_reg == 0x05); // No base register. + int32_t disp_value = 0; + if (mode == 0x80 || is_baseless) { + // Mode 2 or mode 0 with ebp as base: 32-bit displacement. + disp_value = *BitCast<const int32_t*>(&operand.buf_[disp_offset]); + } else if (mode == 0x40) { + // Mode 1: Byte displacement. + disp_value = static_cast<signed char>(operand.buf_[disp_offset]); + } + + // Write new operand with same registers, but with modified displacement. + ASSERT(offset >= 0 ? disp_value + offset >= disp_value : disp_value + offset < disp_value); // No overflow. + disp_value += offset; + if (!is_int8(disp_value) || is_baseless) { + // Need 32 bits of displacement, mode 2 or mode 0 with ebp as base. + buf_[0] = (modrm & 0x3f) | (is_baseless ? 0x00 : 0x80); + len_ = disp_offset + 4; + Memory::int32_at(&buf_[disp_offset]) = disp_value; + } else if (disp_value != 0 || (base_reg == 0x05)) { + // Need 8 bits of displacement. + buf_[0] = (modrm & 0x3f) | 0x40; // Mode 1. + len_ = disp_offset + 1; + buf_[disp_offset] = static_cast<byte>(disp_value); + } else { + // Need no displacement. + buf_[0] = (modrm & 0x3f); // Mode 0. + len_ = disp_offset; + } + if (has_sib) { + buf_[1] = operand.buf_[1]; + } +} + + bool Operand::is_reg(Register reg) const { return ((buf_[0] & 0xF8) == 0xC0) // addressing mode is register only. && ((buf_[0] & 0x07) == reg.code()); // register codes match. @@ -2232,6 +2278,24 @@ void Assembler::movaps(XMMRegister dst, XMMRegister src) { } +void Assembler::movups(XMMRegister dst, const Operand& src) { + ASSERT(IsEnabled(SSE2)); + EnsureSpace ensure_space(this); + EMIT(0x0F); + EMIT(0x10); + emit_sse_operand(dst, src); +} + + +void Assembler::movups(const Operand& dst, XMMRegister src) { + ASSERT(IsEnabled(SSE2)); + EnsureSpace ensure_space(this); + EMIT(0x0F); + EMIT(0x11); + emit_sse_operand(src, dst); +} + + void Assembler::shufps(XMMRegister dst, XMMRegister src, byte imm8) { ASSERT(IsEnabled(SSE2)); ASSERT(is_uint8(imm8));
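A worked example of the displacement rewriting, assuming the source operand was built with the byte-displacement form (mod = 1); the variable names are illustrative only:

// [esp + 0x10]: mod = 1, 8-bit displacement 0x10.
Operand slot(esp, 0x10);
// [esp + 0x20]: 0x10 + 16 still fits in int8, so mod 1 is kept.
Operand next_slot(slot, kSIMD128Size);
// [esp + 0x90]: 0x10 + 0x80 = 0x90 no longer fits in int8, so the
// constructor switches to mod 2 and emits a 32-bit displacement.
Operand far_slot(slot, 0x80);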
diff --git a/src/ia32/assembler-ia32.h b/src/ia32/assembler-ia32.h index 43d6d6868a9..fb58bdc0991 100644 --- a/src/ia32/assembler-ia32.h +++ b/src/ia32/assembler-ia32.h @@ -220,6 +220,9 @@ struct XMMRegister : IntelDoubleRegister { }; +typedef XMMRegister SIMD128Register; + + #define xmm0 (static_cast<const XMMRegister&>(double_register_0)) #define xmm1 (static_cast<const XMMRegister&>(double_register_1)) #define xmm2 (static_cast<const XMMRegister&>(double_register_2)) @@ -411,6 +414,11 @@ class Operand BASE_EMBEDDED { int32_t disp, RelocInfo::Mode rmode = RelocInfo::NONE32); + // Offset from existing memory operand. + // Offset is added to existing displacement as 32-bit signed values and + // this must not overflow. + Operand(const Operand& base, int32_t offset); + static Operand StaticVariable(const ExternalReference& ext) { return Operand(reinterpret_cast<int32_t>(ext.address()), RelocInfo::EXTERNAL_REFERENCE); } @@ -1014,6 +1022,8 @@ class Assembler : public AssemblerBase { // SSE instructions void movaps(XMMRegister dst, XMMRegister src); + void movups(XMMRegister dst, const Operand& src); + void movups(const Operand& dst, XMMRegister src); void shufps(XMMRegister dst, XMMRegister src, byte imm8); void andps(XMMRegister dst, const Operand& src); diff --git a/src/ia32/cpu-ia32.cc b/src/ia32/cpu-ia32.cc index 5fb04fc7272..72bdc96b003 100644 --- a/src/ia32/cpu-ia32.cc +++ b/src/ia32/cpu-ia32.cc @@ -51,6 +51,11 @@ bool CPU::SupportsCrankshaft() { } +bool CPU::SupportsSIMD128InCrankshaft() { + return CpuFeatures::IsSupported(SSE2); +} + + void CPU::FlushICache(void* start, size_t size) { // No need to flush the instruction cache on Intel. On Intel instruction // cache flushing is only necessary when multiple cores running the same diff --git a/src/ia32/deoptimizer-ia32.cc b/src/ia32/deoptimizer-ia32.cc index 5300dde9a21..38090087edf 100644 --- a/src/ia32/deoptimizer-ia32.cc +++ b/src/ia32/deoptimizer-ia32.cc @@ -187,8 +187,9 @@ void Deoptimizer::FillInputFrame(Address tos, JavaScriptFrame* frame) { } input_->SetRegister(esp.code(), reinterpret_cast<intptr_t>(frame->sp())); input_->SetRegister(ebp.code(), reinterpret_cast<intptr_t>(frame->fp())); + simd128_value_t zero = {{0.0, 0.0}}; for (int i = 0; i < DoubleRegister::NumAllocatableRegisters(); i++) { - input_->SetDoubleRegister(i, 0.0); + input_->SetSIMD128Register(i, zero); } // Fill the frame content from the actual data on the frame. @@ -208,11 +209,11 @@ void Deoptimizer::SetPlatformCompiledStubRegisters( } -void Deoptimizer::CopyDoubleRegisters(FrameDescription* output_frame) { +void Deoptimizer::CopySIMD128Registers(FrameDescription* output_frame) { if (!CpuFeatures::IsSupported(SSE2)) return; for (int i = 0; i < XMMRegister::kNumAllocatableRegisters; ++i) { - double double_value = input_->GetDoubleRegister(i); - output_frame->SetDoubleRegister(i, double_value); + simd128_value_t xmm_value = input_->GetSIMD128Register(i); + output_frame->SetSIMD128Register(i, xmm_value); } } @@ -246,22 +247,22 @@ void Deoptimizer::EntryGenerator::Generate() { // Save all general purpose registers before messing with them. const int kNumberOfRegisters = Register::kNumRegisters; - const int kDoubleRegsSize = kDoubleSize * - XMMRegister::kNumAllocatableRegisters; - __ sub(esp, Immediate(kDoubleRegsSize)); + const int kXMMRegsSize = kSIMD128Size * + XMMRegister::kNumAllocatableRegisters; + __ sub(esp, Immediate(kXMMRegsSize)); if (CpuFeatures::IsSupported(SSE2)) { CpuFeatureScope scope(masm(), SSE2); for (int i = 0; i < XMMRegister::kNumAllocatableRegisters; ++i) { XMMRegister xmm_reg = XMMRegister::FromAllocationIndex(i); - int offset = i * kDoubleSize; - __ movsd(Operand(esp, offset), xmm_reg); + int offset = i * kSIMD128Size; + __ movups(Operand(esp, offset), xmm_reg); } } __ pushad(); const int kSavedRegistersAreaSize = kNumberOfRegisters * kPointerSize + kXMMRegsSize; // Get the bailout id from the stack. 
__ mov(ebx, Operand(esp, kSavedRegistersAreaSize)); @@ -299,15 +300,15 @@ __ pop(Operand(ebx, offset)); } - int double_regs_offset = FrameDescription::double_registers_offset(); + int xmm_regs_offset = FrameDescription::simd128_registers_offset(); if (CpuFeatures::IsSupported(SSE2)) { CpuFeatureScope scope(masm(), SSE2); - // Fill in the double input registers. + // Fill in the xmm input registers. for (int i = 0; i < XMMRegister::kNumAllocatableRegisters; ++i) { - int dst_offset = i * kDoubleSize + double_regs_offset; - int src_offset = i * kDoubleSize; - __ movsd(xmm0, Operand(esp, src_offset)); - __ movsd(Operand(ebx, dst_offset), xmm0); + int dst_offset = i * kSIMD128Size + xmm_regs_offset; + int src_offset = i * kSIMD128Size; + __ movups(xmm0, Operand(esp, src_offset)); + __ movups(Operand(ebx, dst_offset), xmm0); } } @@ -317,7 +318,7 @@ __ fnclex(); // Remove the bailout id, return address and the double registers. - __ add(esp, Immediate(kDoubleRegsSize + 2 * kPointerSize)); + __ add(esp, Immediate(kXMMRegsSize + 2 * kPointerSize)); // Compute a pointer to the unwinding limit in register ecx; that is // the first stack slot not part of the input frame. @@ -391,8 +392,8 @@ CpuFeatureScope scope(masm(), SSE2); for (int i = 0; i < XMMRegister::kNumAllocatableRegisters; ++i) { XMMRegister xmm_reg = XMMRegister::FromAllocationIndex(i); - int src_offset = i * kDoubleSize + double_regs_offset; - __ movsd(xmm_reg, Operand(ebx, src_offset)); + int src_offset = i * kSIMD128Size + xmm_regs_offset; + __ movups(xmm_reg, Operand(ebx, src_offset)); } } @@ -440,6 +441,18 @@ void FrameDescription::SetCallerFp(unsigned offset, intptr_t value) { } +double FrameDescription::GetDoubleRegister(unsigned n) const { + ASSERT(n < ARRAY_SIZE(simd128_registers_)); + return simd128_registers_[n].d[0]; +} + + +void FrameDescription::SetDoubleRegister(unsigned n, double value) { + ASSERT(n < ARRAY_SIZE(simd128_registers_)); + simd128_registers_[n].d[0] = value; +} + + #undef __
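Note how the two ports overlay doubles on the widened simd128_registers_ array differently: on ia32 double and XMM allocation indices coincide, so a double occupies lane 0 of its own 128-bit slot, while on ARM two d-registers pack into each q-register (see deoptimizer-arm.cc above). Side by side, as a sketch:

// ia32: double n aliases xmm_n.
double ia32_get(const simd128_value_t* regs, unsigned n) { return regs[n].d[0]; }
// ARM: d-register n is half of q-register n/2.
double arm_get(const simd128_value_t* regs, unsigned n) { return regs[n / 2].d[n % 2]; }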
diff --git a/src/ia32/disasm-ia32.cc b/src/ia32/disasm-ia32.cc index 6a7f3bc8377..81746ef3d99 100644 --- a/src/ia32/disasm-ia32.cc +++ b/src/ia32/disasm-ia32.cc @@ -1043,6 +1043,19 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer, NameOfXMMRegister(regop), NameOfXMMRegister(rm)); data++; + } else if (f0byte == 0x10) { + data += 2; + int mod, regop, rm; + get_modrm(*data, &mod, &regop, &rm); + AppendToBuffer("movups %s,", NameOfXMMRegister(regop)); + data += PrintRightXMMOperand(data); + } else if (f0byte == 0x11) { + AppendToBuffer("movups "); + data += 2; + int mod, regop, rm; + get_modrm(*data, &mod, &regop, &rm); + data += PrintRightXMMOperand(data); + AppendToBuffer(",%s", NameOfXMMRegister(regop)); } else if (f0byte >= 0x53 && f0byte <= 0x5F) { const char* const pseudo_op[] = { "rcpps", diff --git a/src/ia32/lithium-codegen-ia32.cc b/src/ia32/lithium-codegen-ia32.cc index 2588e900407..66bab87fea2 100644 --- a/src/ia32/lithium-codegen-ia32.cc +++ b/src/ia32/lithium-codegen-ia32.cc @@ -547,6 +547,11 @@ XMMRegister LCodeGen::ToDoubleRegister(int index) const { } +XMMRegister LCodeGen::ToSIMD128Register(int index) const { + return XMMRegister::FromAllocationIndex(index); +} + + void LCodeGen::X87LoadForUsage(X87Register reg) { ASSERT(x87_stack_.Contains(reg)); x87_stack_.Fxch(reg); @@ -769,6 +774,24 @@ XMMRegister LCodeGen::ToDoubleRegister(LOperand* op) const { } + +XMMRegister LCodeGen::ToFloat32x4Register(LOperand* op) const { + ASSERT(op->IsFloat32x4Register()); + return ToSIMD128Register(op->index()); +} + + +XMMRegister LCodeGen::ToInt32x4Register(LOperand* op) const { + ASSERT(op->IsInt32x4Register()); + return ToSIMD128Register(op->index()); +} + + +XMMRegister LCodeGen::ToSIMD128Register(LOperand* op) const { + ASSERT(op->IsFloat32x4Register() || op->IsInt32x4Register()); + return ToSIMD128Register(op->index()); +} + + int32_t LCodeGen::ToInteger32(LConstantOperand* op) const { return ToRepresentation(op, Representation::Integer32()); } @@ -824,7 +847,10 @@ static int ArgumentsOffsetWithoutFrame(int index) { Operand LCodeGen::ToOperand(LOperand* op) const { if (op->IsRegister()) return Operand(ToRegister(op)); if (op->IsDoubleRegister()) return Operand(ToDoubleRegister(op)); + if (op->IsFloat32x4Register()) return Operand(ToFloat32x4Register(op)); + if (op->IsInt32x4Register()) return Operand(ToInt32x4Register(op)); - ASSERT(op->IsStackSlot() || op->IsDoubleStackSlot()); + ASSERT(op->IsStackSlot() || op->IsDoubleStackSlot() || + op->IsFloat32x4StackSlot() || op->IsInt32x4StackSlot()); if (NeedsEagerFrame()) { return Operand(ebp, StackSlotOffset(op->index())); } else { @@ -951,6 +977,12 @@ void LCodeGen::AddToTranslation(LEnvironment* environment, } } else if (op->IsDoubleStackSlot()) { translation->StoreDoubleStackSlot(op->index()); + } else if (op->IsFloat32x4StackSlot()) { + translation->StoreSIMD128StackSlot(op->index(), + Translation::FLOAT32x4_STACK_SLOT); + } else if (op->IsInt32x4StackSlot()) { + translation->StoreSIMD128StackSlot(op->index(), + Translation::INT32x4_STACK_SLOT); } else if (op->IsArgument()) { ASSERT(is_tagged); int src_index = GetStackSlotCount() + op->index(); @@ -967,6 +999,12 @@ } else if (op->IsDoubleRegister()) { XMMRegister reg = ToDoubleRegister(op); translation->StoreDoubleRegister(reg); + } else if (op->IsFloat32x4Register()) { + XMMRegister reg = ToFloat32x4Register(op); + translation->StoreSIMD128Register(reg, Translation::FLOAT32x4_REGISTER); + } else if (op->IsInt32x4Register()) { + XMMRegister reg = ToInt32x4Register(op); + translation->StoreSIMD128Register(reg, Translation::INT32x4_REGISTER); } else if (op->IsConstantOperand()) { HConstant* constant = chunk()->LookupConstant(LConstantOperand::cast(op)); int src_index = DefineDeoptimizationLiteral(constant->handle(isolate())); @@ -3447,55 +3485,52 @@ void LCodeGen::DoLoadKeyedSIMD128ExternalArray(LLoadKeyed* instr) { Runtime::FunctionId id_; }; - // Allocate a SIMD128 object on the heap. - Register reg = ToRegister(instr->result()); - Register tmp = ToRegister(instr->temp()); - DeferredSIMD128ToTagged* deferred = new(zone()) DeferredSIMD128ToTagged( - this, instr, static_cast<Runtime::FunctionId>(T::kRuntimeAllocatorId()), - x87_stack_); - if (FLAG_inline_new) { - __ AllocateSIMDHeapObject(T::kSize, reg, tmp, deferred->entry(), - static_cast<Heap::RootListIndex>(T::kMapRootIndex())); - } else { - __ jmp(deferred->entry()); - } - __ bind(deferred->exit()); - - // Copy the SIMD128 value from the external array to the heap object. 
- STATIC_ASSERT(T::kValueSize % kPointerSize == 0); LOperand* key = instr->key(); ElementsKind elements_kind = instr->elements_kind(); - for (int offset = 0; offset < T::kValueSize; offset += kPointerSize) { + + if (CpuFeatures::IsSupported(SSE2)) { + CpuFeatureScope scope(masm(), SSE2); Operand operand(BuildFastArrayOperand( instr->elements(), key, instr->hydrogen()->key()->representation(), elements_kind, - offset, + 0, instr->additional_index())); - __ mov(tmp, operand); - __ mov(FieldOperand(reg, T::kValueOffset + offset), tmp); + __ movups(ToSIMD128Register(instr->result()), operand); + } else { + // Allocate a SIMD128 object on the heap. + Register reg = ToRegister(instr->result()); + Register tmp = ToRegister(instr->temp()); + DeferredSIMD128ToTagged* deferred = new(zone()) DeferredSIMD128ToTagged( + this, instr, static_cast<Runtime::FunctionId>(T::kRuntimeAllocatorId()), + x87_stack_); + if (FLAG_inline_new) { + __ AllocateSIMDHeapObject(T::kSize, reg, tmp, deferred->entry(), + static_cast<Heap::RootListIndex>(T::kMapRootIndex())); + } else { + __ jmp(deferred->entry()); + } + __ bind(deferred->exit()); + + // Copy the SIMD128 value from the external array to the heap object. + STATIC_ASSERT(T::kValueSize % kPointerSize == 0); + for (int offset = 0; offset < T::kValueSize; offset += kPointerSize) { + Operand operand(BuildFastArrayOperand( + instr->elements(), + key, + instr->hydrogen()->key()->representation(), + elements_kind, + offset, + instr->additional_index())); + __ mov(tmp, operand); + __ mov(FieldOperand(reg, T::kValueOffset + offset), tmp); + } } } void LCodeGen::DoLoadKeyedExternalArray(LLoadKeyed* instr) { - class DeferredSIMD128ToTagged V8_FINAL : public LDeferredCode { - public: - DeferredSIMD128ToTagged(LCodeGen* codegen, - LInstruction* instr, - Runtime::FunctionId id, - const X87Stack& x87_stack) - : LDeferredCode(codegen, x87_stack), instr_(instr), id_(id) { } - virtual void Generate() V8_OVERRIDE { - codegen()->DoDeferredSIMD128ToTagged(instr_, id_); - } - virtual LInstruction* instr() V8_OVERRIDE { return instr_; } - private: - LInstruction* instr_; - Runtime::FunctionId id_; - }; - ElementsKind elements_kind = instr->elements_kind(); LOperand* key = instr->key(); if (!key->IsConstantOperand() && @@ -3504,6 +3539,7 @@ void LCodeGen::DoLoadKeyedExternalArray(LLoadKeyed* instr) { HandleExternalArrayOpRequiresTemp(key, instr->hydrogen()->key()->representation(), elements_kind); } + Operand operand(BuildFastArrayOperand( instr->elements(), key, @@ -4639,28 +4675,41 @@ void LCodeGen::DoBoundsCheck(LBoundsCheck* instr) { template <class T> void LCodeGen::DoStoreKeyedSIMD128ExternalArray(LStoreKeyed* instr) { - ASSERT(instr->value()->IsRegister()); - Register temp = ToRegister(instr->temp()); - Register input_reg = ToRegister(instr->value()); - __ test(input_reg, Immediate(kSmiTagMask)); - DeoptimizeIf(zero, instr->environment()); - __ CmpObjectType(input_reg, T::kInstanceType, temp); - DeoptimizeIf(not_equal, instr->environment()); - - // Copy the SIMD128 value from the heap object to the external array. 
- STATIC_ASSERT(T::kValueSize % kPointerSize == 0); LOperand* key = instr->key(); ElementsKind elements_kind = instr->elements_kind(); - for (int offset = 0; offset < T::kValueSize; offset += kPointerSize) { + + if (CpuFeatures::IsSafeForSnapshot(SSE2)) { + CpuFeatureScope scope(masm(), SSE2); Operand operand(BuildFastArrayOperand( instr->elements(), key, instr->hydrogen()->key()->representation(), elements_kind, - offset, + 0, instr->additional_index())); + __ movups(operand, ToSIMD128Register(instr->value())); + } else { + ASSERT(instr->value()->IsRegister()); + Register temp = ToRegister(instr->temp()); + Register input_reg = ToRegister(instr->value()); + __ test(input_reg, Immediate(kSmiTagMask)); + DeoptimizeIf(zero, instr->environment()); + __ CmpObjectType(input_reg, T::kInstanceType, temp); + DeoptimizeIf(not_equal, instr->environment()); + + // Copy the SIMD128 value from the heap object to the external array. + STATIC_ASSERT(T::kValueSize % kPointerSize == 0); + for (int offset = 0; offset < T::kValueSize; offset += kPointerSize) { + Operand operand(BuildFastArrayOperand( + instr->elements(), + key, + instr->hydrogen()->key()->representation(), + elements_kind, + offset, + instr->additional_index())); + __ mov(temp, FieldOperand(input_reg, T::kValueOffset + offset)); + __ mov(operand, temp); + } } } @@ -4674,6 +4723,7 @@ void LCodeGen::DoStoreKeyedExternalArray(LStoreKeyed* instr) { HandleExternalArrayOpRequiresTemp(key, instr->hydrogen()->key()->representation(), elements_kind); } + Operand operand(BuildFastArrayOperand( instr->elements(), key, @@ -6518,6 +6568,83 @@ void LCodeGen::DoLoadFieldByIndex(LLoadFieldByIndex* instr) { } +template <class T> +void LCodeGen::HandleSIMD128ToTagged(LSIMD128ToTagged* instr) { + class DeferredSIMD128ToTagged V8_FINAL : public LDeferredCode { + public: + DeferredSIMD128ToTagged(LCodeGen* codegen, + LInstruction* instr, + Runtime::FunctionId id, + const X87Stack& x87_stack) + : LDeferredCode(codegen, x87_stack), instr_(instr), id_(id) { } + virtual void Generate() V8_OVERRIDE { + codegen()->DoDeferredSIMD128ToTagged(instr_, id_); + } + virtual LInstruction* instr() V8_OVERRIDE { return instr_; } + private: + LInstruction* instr_; + Runtime::FunctionId id_; + }; + + CpuFeatureScope scope(masm(), SSE2); + XMMRegister input_reg = ToSIMD128Register(instr->value()); + Register reg = ToRegister(instr->result()); + Register tmp = ToRegister(instr->temp()); + + DeferredSIMD128ToTagged* deferred = new(zone()) DeferredSIMD128ToTagged( + this, instr, static_cast<Runtime::FunctionId>(T::kRuntimeAllocatorId()), + x87_stack_); + if (FLAG_inline_new) { + __ AllocateSIMDHeapObject(T::kSize, reg, tmp, deferred->entry(), + static_cast<Heap::RootListIndex>(T::kMapRootIndex())); + } else { + __ jmp(deferred->entry()); + } + __ bind(deferred->exit()); + __ movups(FieldOperand(reg, T::kValueOffset), input_reg); +} + + +void LCodeGen::DoSIMD128ToTagged(LSIMD128ToTagged* instr) { + if (instr->value()->IsFloat32x4Register()) { + HandleSIMD128ToTagged<Float32x4>(instr); + } else { + ASSERT(instr->value()->IsInt32x4Register()); + HandleSIMD128ToTagged<Int32x4>(instr); + } +} + +
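Boxing and unboxing are deliberately symmetric, and HandleTaggedToSIMD128 below is the inverse of the function above: where boxing allocates (inline when FLAG_inline_new allows it, otherwise through the deferred runtime call) and stores the XMM value into the object's payload, unboxing performs a smi check and a type check before loading the payload back. In outline, assuming Float32x4/Int32x4 are the heap classes bound to T:

// box:   reg = allocate T;  movups [reg + T::kValueOffset], input_xmm
// unbox: test value, kSmiTagMask            -> deopt if smi
//        CmpObjectType(value, T::kInstanceType) -> deopt if wrong type
//        movups result_xmm, [value + T::kValueOffset]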
+template <class T> +void LCodeGen::HandleTaggedToSIMD128(LTaggedToSIMD128* instr) { + LOperand* input = instr->value(); + ASSERT(input->IsRegister()); + LOperand* result = instr->result(); + ASSERT(result->IsSIMD128Register()); + + Register input_reg = ToRegister(input); + Register temp_reg = ToRegister(instr->temp()); + XMMRegister result_reg = ToSIMD128Register(result); + + CpuFeatureScope scope(masm(), SSE2); + __ test(input_reg, Immediate(kSmiTagMask)); + DeoptimizeIf(zero, instr->environment()); + __ CmpObjectType(input_reg, T::kInstanceType, temp_reg); + DeoptimizeIf(not_equal, instr->environment()); + __ movups(result_reg, FieldOperand(input_reg, T::kValueOffset)); +} + + +void LCodeGen::DoTaggedToSIMD128(LTaggedToSIMD128* instr) { + if (instr->representation().IsFloat32x4()) { + HandleTaggedToSIMD128<Float32x4>(instr); + } else { + ASSERT(instr->representation().IsInt32x4()); + HandleTaggedToSIMD128<Int32x4>(instr); + } +} + + #undef __ } } // namespace v8::internal diff --git a/src/ia32/lithium-codegen-ia32.h b/src/ia32/lithium-codegen-ia32.h index 0269ab872cf..e05acc9d3f1 100644 --- a/src/ia32/lithium-codegen-ia32.h +++ b/src/ia32/lithium-codegen-ia32.h @@ -90,6 +90,9 @@ class LCodeGen: public LCodeGenBase { Operand ToOperand(LOperand* op) const; Register ToRegister(LOperand* op) const; XMMRegister ToDoubleRegister(LOperand* op) const; + XMMRegister ToFloat32x4Register(LOperand* op) const; + XMMRegister ToInt32x4Register(LOperand* op) const; + XMMRegister ToSIMD128Register(LOperand* op) const; X87Register ToX87Register(LOperand* op) const; bool IsInteger32(LConstantOperand* op) const; @@ -163,6 +166,11 @@ void DoDeferredInstanceMigration(LCheckMaps* instr, Register object); void DoDeferredSIMD128ToTagged(LInstruction* instr, Runtime::FunctionId id); + template <class T> + void HandleTaggedToSIMD128(LTaggedToSIMD128* instr); + template <class T> + void HandleSIMD128ToTagged(LSIMD128ToTagged* instr); + // Parallel move support. void DoParallelMove(LParallelMove* move); void DoGap(LGap* instr); @@ -289,6 +297,9 @@ Register ToRegister(int index) const; XMMRegister ToDoubleRegister(int index) const; + XMMRegister ToFloat32x4Register(int index) const; + XMMRegister ToInt32x4Register(int index) const; + XMMRegister ToSIMD128Register(int index) const; X87Register ToX87Register(int index) const; int32_t ToRepresentation(LConstantOperand* op, const Representation& r) const; int32_t ToInteger32(LConstantOperand* op) const; diff --git a/src/ia32/lithium-gap-resolver-ia32.cc b/src/ia32/lithium-gap-resolver-ia32.cc index d621bd261d6..095d48e1eda 100644 --- a/src/ia32/lithium-gap-resolver-ia32.cc +++ b/src/ia32/lithium-gap-resolver-ia32.cc @@ -405,6 +405,27 @@ void LGapResolver::EmitMove(int index) { cgen_->X87Mov(dst, src); } } + } else if (source->IsSIMD128Register()) { + ASSERT(CpuFeatures::IsSupported(SSE2)); + CpuFeatureScope scope(cgen_->masm(), SSE2); + XMMRegister src = cgen_->ToSIMD128Register(source); + if (destination->IsSIMD128Register()) { + __ movaps(cgen_->ToSIMD128Register(destination), src); + } else { + ASSERT(destination->IsSIMD128StackSlot()); + __ movups(cgen_->ToOperand(destination), src); + } + } else if (source->IsSIMD128StackSlot()) { + ASSERT(CpuFeatures::IsSupported(SSE2)); + CpuFeatureScope scope(cgen_->masm(), SSE2); + Operand src = cgen_->ToOperand(source); + if (destination->IsSIMD128Register()) { + __ movups(cgen_->ToSIMD128Register(destination), src); + } else { + ASSERT(destination->IsSIMD128StackSlot()); + __ movups(xmm0, src); + __ movups(cgen_->ToOperand(destination), xmm0); + } } else { UNREACHABLE(); }
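A note on instruction selection in these moves: the unaligned form movups is used whenever one side is memory, since neither spill slots nor FieldOperand payloads inside heap objects are guaranteed 16-byte aligned on ia32, and an aligned movaps would fault. Register-to-register copies carry no alignment constraint, so those use movaps, as EmitSwap below does as well:

__ movups(xmm0, stack_operand);  // memory operand: alignment unknown
__ movaps(reg_a, reg_b);         // register move: always safe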
+ Operand src = cgen_->ToOperand(source); + Operand dst = cgen_->ToOperand(destination); + Register tmp = EnsureTempRegister(); + __ movups(xmm0, src); + for (int offset = 0; offset < kSIMD128Size; offset += kPointerSize) { + __ mov(tmp, Operand(dst, offset)); + __ mov(Operand(src, offset), tmp); + } + __ movups(dst, xmm0); + + } else if (source->IsSIMD128Register() && destination->IsSIMD128Register()) { + // Swap two XMM registers. + XMMRegister source_reg = cgen_->ToSIMD128Register(source); + XMMRegister destination_reg = cgen_->ToSIMD128Register(destination); + __ movaps(xmm0, source_reg); + __ movaps(source_reg, destination_reg); + __ movaps(destination_reg, xmm0); + + } else if (source->IsSIMD128Register() || destination->IsSIMD128Register()) { + // Swap an XMM register and an XMM stack slot. + ASSERT((source->IsSIMD128Register() && + destination->IsSIMD128StackSlot()) || + (source->IsSIMD128StackSlot() && + destination->IsSIMD128Register())); + XMMRegister reg = cgen_->ToSIMD128Register(source->IsSIMD128Register() + ? source + : destination); + LOperand* other = source->IsSIMD128Register() ? destination : source; + ASSERT(other->IsSIMD128StackSlot()); + Operand other_operand = cgen_->ToOperand(other); + __ movups(xmm0, other_operand); + __ movups(other_operand, reg); + __ movaps(reg, xmm0); + } else { // No other combinations are possible. UNREACHABLE(); diff --git a/src/ia32/lithium-ia32.cc b/src/ia32/lithium-ia32.cc index 26e84975638..4580af11b6c 100644 --- a/src/ia32/lithium-ia32.cc +++ b/src/ia32/lithium-ia32.cc @@ -377,23 +377,39 @@ void LAccessArgumentsAt::PrintDataTo(StringStream* stream) { int LPlatformChunk::GetNextSpillIndex(RegisterKind kind) { - // Skip a slot if for a double-width slot. - if (kind == DOUBLE_REGISTERS) { - spill_slot_count_++; - spill_slot_count_ |= 1; - num_double_slots_++; + switch (kind) { + case GENERAL_REGISTERS: return spill_slot_count_++; + case DOUBLE_REGISTERS: { + // Skip a slot if necessary so the double-width slot is aligned. + spill_slot_count_++; + spill_slot_count_ |= 1; + num_double_slots_++; + return spill_slot_count_++; + } + case FLOAT32x4_REGISTERS: + case INT32x4_REGISTERS: { + // Skip three slots to reserve room for the quad-width slot.
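+ // Four consecutive pointer-sized slots back one 128-bit value; the + // returned index is the highest of the four, mirroring the double case.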
+ spill_slot_count_ += 3; + num_double_slots_ += 2; // for dynamic frame alignment + return spill_slot_count_++; + } + default: + UNREACHABLE(); + return -1; } - return spill_slot_count_++; } LOperand* LPlatformChunk::GetNextSpillSlot(RegisterKind kind) { int index = GetNextSpillIndex(kind); - if (kind == DOUBLE_REGISTERS) { - return LDoubleStackSlot::Create(index, zone()); - } else { - ASSERT(kind == GENERAL_REGISTERS); - return LStackSlot::Create(index, zone()); + switch (kind) { + case GENERAL_REGISTERS: return LStackSlot::Create(index, zone()); + case DOUBLE_REGISTERS: return LDoubleStackSlot::Create(index, zone()); + case FLOAT32x4_REGISTERS: return LFloat32x4StackSlot::Create(index, zone()); + case INT32x4_REGISTERS: return LInt32x4StackSlot::Create(index, zone()); + default: + UNREACHABLE(); + return NULL; } } @@ -1835,6 +1851,11 @@ LInstruction* LChunkBuilder::DoChange(HChange* instr) { LOperand* temp = TempRegister(); LNumberUntagD* res = new(zone()) LNumberUntagD(value, temp); return AssignEnvironment(DefineAsRegister(res)); + } else if (to.IsSIMD128()) { + LOperand* value = UseRegister(instr->value()); + LOperand* temp = TempRegister(); + LTaggedToSIMD128* res = new(zone()) LTaggedToSIMD128(value, temp, to); + return AssignEnvironment(DefineAsRegister(res)); } else if (to.IsSmi()) { HValue* val = instr->value(); LOperand* value = UseRegister(val); @@ -1918,6 +1939,16 @@ LInstruction* LChunkBuilder::DoChange(HChange* instr) { new(zone()) LInteger32ToDouble(Use(instr->value()))); } } + } else if (from.IsSIMD128()) { + ASSERT(to.IsTagged()); + info()->MarkAsDeferredCalling(); + LOperand* value = UseRegister(instr->value()); + LOperand* temp = TempRegister(); + + // Make sure that temp and result_temp are different registers. + LUnallocated* result_temp = TempRegister(); + LSIMD128ToTagged* result = new(zone()) LSIMD128ToTagged(value, temp); + return AssignPointerMap(Define(result, result_temp)); } UNREACHABLE(); return NULL; @@ -2124,7 +2155,8 @@ LInstruction* LChunkBuilder::DoLoadKeyed(HLoadKeyed* instr) { : UseRegisterOrConstantAtStart(instr->key()); LLoadKeyed* result = NULL; - bool load_128bits_without_sse2 = IsSIMD128ElementsKind(elements_kind); + bool load_128bits_without_sse2 = IsSIMD128ElementsKind(elements_kind) && + !CPU::SupportsSIMD128InCrankshaft(); if (!instr->is_typed_elements()) { LOperand* obj = UseRegisterAtStart(instr->elements()); result = new(zone()) LLoadKeyed(obj, key, NULL); @@ -2134,8 +2166,14 @@ LInstruction* LChunkBuilder::DoLoadKeyed(HLoadKeyed* instr) { !(IsDoubleOrFloatElementsKind(instr->elements_kind()))) || (instr->representation().IsDouble() && (IsDoubleOrFloatElementsKind(instr->elements_kind()))) || - (instr->representation().IsTagged() && - (IsSIMD128ElementsKind(instr->elements_kind())))); + (CPU::SupportsSIMD128InCrankshaft() + ? instr->representation().IsFloat32x4() + : instr->representation().IsTagged() && + (IsFloat32x4ElementsKind(instr->elements_kind()))) || + (CPU::SupportsSIMD128InCrankshaft() + ? instr->representation().IsInt32x4() + : instr->representation().IsTagged() && + (IsInt32x4ElementsKind(instr->elements_kind())))); LOperand* backing_store = UseRegister(instr->elements()); result = new(zone()) LLoadKeyed(backing_store, key, load_128bits_without_sse2 ? 
TempRegister() : NULL); @@ -2225,8 +2263,14 @@ LInstruction* LChunkBuilder::DoStoreKeyed(HStoreKeyed* instr) { !IsDoubleOrFloatElementsKind(elements_kind)) || (instr->value()->representation().IsDouble() && IsDoubleOrFloatElementsKind(elements_kind)) || - (instr->value()->representation().IsTagged() && - IsSIMD128ElementsKind(elements_kind))); + (CPU::SupportsSIMD128InCrankshaft() + ? instr->value()->representation().IsFloat32x4() + : instr->value()->representation().IsTagged() && + IsFloat32x4ElementsKind(elements_kind)) || + (CPU::SupportsSIMD128InCrankshaft() + ? instr->value()->representation().IsInt32x4() + : instr->value()->representation().IsTagged() && + IsInt32x4ElementsKind(elements_kind))); ASSERT((instr->is_fixed_typed_array() && instr->elements()->representation().IsTagged()) || (instr->is_external() && @@ -2239,7 +2283,8 @@ LInstruction* LChunkBuilder::DoStoreKeyed(HStoreKeyed* instr) { LOperand* key = clobbers_key ? UseTempRegister(instr->key()) : UseRegisterOrConstantAtStart(instr->key()); - bool store_128bits_without_sse2 = IsSIMD128ElementsKind(elements_kind); + bool store_128bits_without_sse2 = IsSIMD128ElementsKind(elements_kind) && + !CPU::SupportsSIMD128InCrankshaft(); LStoreKeyed* result = new(zone()) LStoreKeyed(backing_store, key, val, store_128bits_without_sse2 ? TempRegister() : NULL); diff --git a/src/ia32/lithium-ia32.h b/src/ia32/lithium-ia32.h index 7cb7da0364f..1dd54061f11 100644 --- a/src/ia32/lithium-ia32.h +++ b/src/ia32/lithium-ia32.h @@ -144,6 +144,8 @@ class LCodeGen; V(NumberTagI) \ V(NumberTagU) \ V(NumberUntagD) \ + V(SIMD128ToTagged) \ + V(TaggedToSIMD128) \ V(OsrEntry) \ V(OuterContext) \ V(Parameter) \ @@ -2076,6 +2078,21 @@ class LNumberTagD V8_FINAL : public LTemplateInstruction<1, 1, 1> { }; +class LSIMD128ToTagged V8_FINAL : public LTemplateInstruction<1, 1, 1> { + public: + explicit LSIMD128ToTagged(LOperand* value, LOperand* temp) { + inputs_[0] = value; + temps_[0] = temp; + } + + LOperand* value() { return inputs_[0]; } + LOperand* temp() { return temps_[0]; } + + DECLARE_CONCRETE_INSTRUCTION(SIMD128ToTagged, "simd128-tag") + DECLARE_HYDROGEN_ACCESSOR(Change) +}; + + // Sometimes truncating conversion from a tagged value to an int32. class LDoubleToI V8_FINAL : public LTemplateInstruction<1, 1, 1> { public: @@ -2152,6 +2169,25 @@ class LNumberUntagD V8_FINAL : public LTemplateInstruction<1, 1, 1> { }; +class LTaggedToSIMD128 V8_FINAL : public LTemplateInstruction<1, 1, 1> { + public: + explicit LTaggedToSIMD128(LOperand* value, LOperand* temp, + Representation representation) : representation_(representation) { + inputs_[0] = value; + temps_[0] = temp; + } + + LOperand* value() { return inputs_[0]; } + LOperand* temp() { return temps_[0]; } + Representation representation() const { return representation_; } + + DECLARE_CONCRETE_INSTRUCTION(TaggedToSIMD128, "simd128-untag") + DECLARE_HYDROGEN_ACCESSOR(Change); + private: + Representation representation_; +}; + + class LSmiUntag V8_FINAL : public LTemplateInstruction<1, 1, 0> { public: LSmiUntag(LOperand* value, bool needs_check) diff --git a/src/ia32/macro-assembler-ia32.cc b/src/ia32/macro-assembler-ia32.cc index edb68813ac5..b20db74dfad 100644 --- a/src/ia32/macro-assembler-ia32.cc +++ b/src/ia32/macro-assembler-ia32.cc @@ -1117,12 +1117,13 @@ void MacroAssembler::EnterExitFrameEpilogue(int argc, bool save_doubles) { // Optionally save all XMM registers. 
if (save_doubles) { CpuFeatureScope scope(this, SSE2); - int space = XMMRegister::kNumRegisters * kDoubleSize + argc * kPointerSize; + int space = XMMRegister::kNumRegisters * kSIMD128Size + + argc * kPointerSize; sub(esp, Immediate(space)); const int offset = -2 * kPointerSize; for (int i = 0; i < XMMRegister::kNumRegisters; i++) { XMMRegister reg = XMMRegister::from_code(i); - movsd(Operand(ebp, offset - ((i + 1) * kDoubleSize)), reg); + movups(Operand(ebp, offset - ((i + 1) * kSIMD128Size)), reg); } } else { sub(esp, Immediate(argc * kPointerSize)); @@ -1166,7 +1167,7 @@ void MacroAssembler::LeaveExitFrame(bool save_doubles) { const int offset = -2 * kPointerSize; for (int i = 0; i < XMMRegister::kNumRegisters; i++) { XMMRegister reg = XMMRegister::from_code(i); - movsd(reg, Operand(ebp, offset - ((i + 1) * kDoubleSize))); + movups(reg, Operand(ebp, offset - ((i + 1) * kSIMD128Size))); } } diff --git a/src/ia32/macro-assembler-ia32.h b/src/ia32/macro-assembler-ia32.h index 102adb6a32e..522bd5eb553 100644 --- a/src/ia32/macro-assembler-ia32.h +++ b/src/ia32/macro-assembler-ia32.h @@ -651,6 +651,10 @@ class MacroAssembler: public Assembler { Register scratch1, Register scratch2, Label* gc_required); + + // Allocate a float32x4 or int32x4 object in new space with undefined value. + // Returns tagged pointer in result register, or jumps to gc_required if new + // space is full. void AllocateSIMDHeapObject(int size, Register result, Register scratch, diff --git a/src/lithium-allocator-inl.h b/src/lithium-allocator-inl.h index deee98877d6..9a6d08b1683 100644 --- a/src/lithium-allocator-inl.h +++ b/src/lithium-allocator-inl.h @@ -146,7 +146,8 @@ void UseIterator::Advance() { void LAllocator::SetLiveRangeAssignedRegister(LiveRange* range, int reg) { - if (range->Kind() == DOUBLE_REGISTERS) { + if (range->Kind() == DOUBLE_REGISTERS || + IsSIMD128RegisterKind(range->Kind())) { assigned_double_registers_->Add(reg); } else { ASSERT(range->Kind() == GENERAL_REGISTERS); diff --git a/src/lithium-allocator.cc b/src/lithium-allocator.cc index 48fa862c90d..fd8f847e0f4 100644 --- a/src/lithium-allocator.cc +++ b/src/lithium-allocator.cc @@ -238,6 +238,12 @@ LOperand* LiveRange::CreateAssignedOperand(Zone* zone) { case DOUBLE_REGISTERS: op = LDoubleRegister::Create(assigned_register(), zone); break; + case FLOAT32x4_REGISTERS: + op = LFloat32x4Register::Create(assigned_register(), zone); + break; + case INT32x4_REGISTERS: + op = LInt32x4Register::Create(assigned_register(), zone); + break; default: UNREACHABLE(); } @@ -488,7 +494,7 @@ void LiveRange::ConvertOperands(Zone* zone) { if (use_pos->HasOperand()) { ASSERT(op->IsRegister() || op->IsDoubleRegister() || - !use_pos->RequiresRegister()); + op->IsSIMD128Register() || !use_pos->RequiresRegister()); use_pos->operand()->ConvertTo(op->kind(), op->index()); } use_pos = use_pos->next(); @@ -554,6 +560,7 @@ LAllocator::LAllocator(int num_values, HGraph* graph) active_live_ranges_(8, zone()), inactive_live_ranges_(8, zone()), reusable_slots_(8, zone()), + reusable_simd128_slots_(8, zone()), next_virtual_register_(num_values), first_artificial_register_(num_values), mode_(UNALLOCATED_REGISTERS), @@ -873,6 +880,16 @@ void LAllocator::MeetConstraintsBetween(LInstruction* first, double_artificial_registers_.Add( cur_input->virtual_register() - first_artificial_register_, zone()); + } else if (RequiredRegisterKind(input_copy->virtual_register()) == + FLOAT32x4_REGISTERS) { + float32x4_artificial_registers_.Add( + cur_input->virtual_register() - 
first_artificial_register_, + zone()); + } else if (RequiredRegisterKind(input_copy->virtual_register()) == + INT32x4_REGISTERS) { + int32x4_artificial_registers_.Add( + cur_input->virtual_register() - first_artificial_register_, + zone()); } AddConstraintsGapMove(gap_index, input_copy, cur_input); @@ -1185,8 +1202,10 @@ void LAllocator::ResolveControlFlow(LiveRange* range, if (branch->HasPointerMap()) { if (HasTaggedValue(range->id())) { branch->pointer_map()->RecordPointer(cur_op, chunk()->zone()); - } else if (!cur_op->IsDoubleStackSlot() && - !cur_op->IsDoubleRegister()) { + } else if (!cur_op->IsDoubleStackSlot() && + !cur_op->IsDoubleRegister() && + !cur_op->IsSIMD128StackSlot() && + !cur_op->IsSIMD128Register()) { branch->pointer_map()->RemovePointer(cur_op); } } @@ -1512,6 +1531,9 @@ void LAllocator::AllocateRegisters() { if (live_ranges_[i] != NULL) { if (live_ranges_[i]->Kind() == mode_) { AddToUnhandledUnsorted(live_ranges_[i]); + } else if (mode_ == DOUBLE_REGISTERS && + IsSIMD128RegisterKind(live_ranges_[i]->Kind())) { + AddToUnhandledUnsorted(live_ranges_[i]); } } } @@ -1519,6 +1541,7 @@ ASSERT(UnhandledIsSorted()); ASSERT(reusable_slots_.is_empty()); + ASSERT(reusable_simd128_slots_.is_empty()); ASSERT(active_live_ranges_.is_empty()); ASSERT(inactive_live_ranges_.is_empty()); @@ -1610,6 +1633,7 @@ } reusable_slots_.Rewind(0); + reusable_simd128_slots_.Rewind(0); active_live_ranges_.Rewind(0); inactive_live_ranges_.Rewind(0); } @@ -1646,10 +1670,20 @@ RegisterKind LAllocator::RequiredRegisterKind(int virtual_register) const { HValue* value = graph_->LookupValue(virtual_register); if (value != NULL && value->representation().IsDouble()) { return DOUBLE_REGISTERS; + } else if (value != NULL && (value->representation().IsFloat32x4())) { + return FLOAT32x4_REGISTERS; + } else if (value != NULL && (value->representation().IsInt32x4())) { + return INT32x4_REGISTERS; } } else if (double_artificial_registers_.Contains( virtual_register - first_artificial_register_)) { return DOUBLE_REGISTERS; + } else if (float32x4_artificial_registers_.Contains( virtual_register - first_artificial_register_)) { return FLOAT32x4_REGISTERS; + } else if (int32x4_artificial_registers_.Contains( virtual_register - first_artificial_register_)) { return INT32x4_REGISTERS; } return GENERAL_REGISTERS; @@ -1732,19 +1766,26 @@ void LAllocator::FreeSpillSlot(LiveRange* range) { int index = range->TopLevel()->GetSpillOperand()->index(); if (index >= 0) { - reusable_slots_.Add(range, zone()); + if (IsSIMD128RegisterKind(range->Kind())) { + reusable_simd128_slots_.Add(range, zone()); + } else { + reusable_slots_.Add(range, zone()); + } } } LOperand* LAllocator::TryReuseSpillSlot(LiveRange* range) { - if (reusable_slots_.is_empty()) return NULL; - if (reusable_slots_.first()->End().Value() > + ZoneList<LiveRange*>* reusable_slots = IsSIMD128RegisterKind(range->Kind()) + ? 
&reusable_simd128_slots_ + : &reusable_slots_; + if (reusable_slots->is_empty()) return NULL; + if (reusable_slots->first()->End().Value() > range->TopLevel()->Start().Value()) { return NULL; } - LOperand* result = reusable_slots_.first()->TopLevel()->GetSpillOperand(); - reusable_slots_.Remove(0); + LOperand* result = reusable_slots->first()->TopLevel()->GetSpillOperand(); + reusable_slots->Remove(0); return result; } @@ -1811,7 +1852,8 @@ bool LAllocator::TryAllocateFreeReg(LiveRange* current) { } LOperand* hint = current->FirstHint(); - if (hint != NULL && (hint->IsRegister() || hint->IsDoubleRegister())) { + if (hint != NULL && (hint->IsRegister() || hint->IsDoubleRegister() || + hint->IsSIMD128Register())) { int register_index = hint->index(); TraceAlloc( "Found reg hint %s (free until [%d) for live range %d (end %d[).\n", @@ -2162,7 +2204,17 @@ void LAllocator::Spill(LiveRange* range) { if (!first->HasAllocatedSpillOperand()) { LOperand* op = TryReuseSpillSlot(range); - if (op == NULL) op = chunk_->GetNextSpillSlot(range->Kind()); + if (op == NULL) { + op = chunk_->GetNextSpillSlot(range->Kind()); + } else if (range->Kind() == FLOAT32x4_REGISTERS && + op->kind() != LOperand::FLOAT32x4_STACK_SLOT) { + // Convert to Float32x4StackSlot. + op = LFloat32x4StackSlot::Create(op->index(), zone()); + } else if (range->Kind() == INT32x4_REGISTERS && + op->kind() != LOperand::INT32x4_STACK_SLOT) { + // Convert to Int32x4StackSlot. + op = LInt32x4StackSlot::Create(op->index(), zone()); + } first->SetSpillOperand(op); } range->MakeSpilled(chunk()->zone()); diff --git a/src/lithium-allocator.h b/src/lithium-allocator.h index 9908ea823d3..d87b689f669 100644 --- a/src/lithium-allocator.h +++ b/src/lithium-allocator.h @@ -51,12 +51,9 @@ class LArgument; class LPlatformChunk; class LOperand; class LUnallocated; -class LConstantOperand; class LGap; class LParallelMove; class LPointerMap; -class LStackSlot; -class LRegister; // This class represents a single point of a LOperand's lifetime. @@ -148,10 +145,17 @@ class LifetimePosition { enum RegisterKind { UNALLOCATED_REGISTERS, GENERAL_REGISTERS, - DOUBLE_REGISTERS + DOUBLE_REGISTERS, + FLOAT32x4_REGISTERS, + INT32x4_REGISTERS }; +inline bool IsSIMD128RegisterKind(RegisterKind kind) { + return kind == FLOAT32x4_REGISTERS || kind == INT32x4_REGISTERS; +} + + // A register-allocator view of a Lithium instruction. It contains the id of // the output operand and a list of input operand uses. @@ -616,11 +620,15 @@ class LAllocator BASE_EMBEDDED { ZoneList<LiveRange*> active_live_ranges_; ZoneList<LiveRange*> inactive_live_ranges_; ZoneList<LiveRange*> reusable_slots_; + // Slots reusable for both float32x4 and int32x4 register spilling. + ZoneList<LiveRange*> reusable_simd128_slots_; // Next virtual register number to be assigned to temporaries.
int next_virtual_register_; int first_artificial_register_; GrowableBitVector double_artificial_registers_; + GrowableBitVector float32x4_artificial_registers_; + GrowableBitVector int32x4_artificial_registers_; RegisterKind mode_; int num_registers_; diff --git a/src/lithium.cc b/src/lithium.cc index b2fb4ead7cc..96b0eba359e 100644 --- a/src/lithium.cc +++ b/src/lithium.cc @@ -102,45 +102,63 @@ void LOperand::PrintTo(StringStream* stream) { case DOUBLE_STACK_SLOT: stream->Add("[double_stack:%d]", index()); break; + case FLOAT32x4_STACK_SLOT: + stream->Add("[float32x4_stack:%d]", index()); + break; + case INT32x4_STACK_SLOT: + stream->Add("[int32x4_stack:%d]", index()); + break; case REGISTER: stream->Add("[%s|R]", Register::AllocationIndexToString(index())); break; case DOUBLE_REGISTER: stream->Add("[%s|R]", DoubleRegister::AllocationIndexToString(index())); break; + case FLOAT32x4_REGISTER: + stream->Add("[%s|R]", + SIMD128Register::AllocationIndexToString(index())); + break; + case INT32x4_REGISTER: + stream->Add("[%s|R]", + SIMD128Register::AllocationIndexToString(index())); + break; case ARGUMENT: stream->Add("[arg:%d]", index()); break; } } -#define DEFINE_OPERAND_CACHE(name, type) \ - L##name* L##name::cache = NULL; \ - \ - void L##name::SetUpCache() { \ - if (cache) return; \ - cache = new L##name[kNumCachedOperands]; \ - for (int i = 0; i < kNumCachedOperands; i++) { \ - cache[i].ConvertTo(type, i); \ - } \ - } \ - \ - void L##name::TearDownCache() { \ - delete[] cache; \ + +template<LOperand::Kind kOperandKind, int kNumCachedOperands> +LSubKindOperand<kOperandKind, kNumCachedOperands>* +LSubKindOperand<kOperandKind, kNumCachedOperands>::cache = NULL; + + +template<LOperand::Kind kOperandKind, int kNumCachedOperands> +void LSubKindOperand<kOperandKind, kNumCachedOperands>::SetUpCache() { + if (cache) return; + cache = new LSubKindOperand[kNumCachedOperands]; + for (int i = 0; i < kNumCachedOperands; i++) { + cache[i].ConvertTo(kOperandKind, i); + } +} + + +template<LOperand::Kind kOperandKind, int kNumCachedOperands> +void LSubKindOperand<kOperandKind, kNumCachedOperands>::TearDownCache() { + delete[] cache; +} -LITHIUM_OPERAND_LIST(DEFINE_OPERAND_CACHE) -#undef DEFINE_OPERAND_CACHE void LOperand::SetUpCaches() { -#define LITHIUM_OPERAND_SETUP(name, type) L##name::SetUpCache(); +#define LITHIUM_OPERAND_SETUP(name, type, number) L##name::SetUpCache(); LITHIUM_OPERAND_LIST(LITHIUM_OPERAND_SETUP) #undef LITHIUM_OPERAND_SETUP } void LOperand::TearDownCaches() { -#define LITHIUM_OPERAND_TEARDOWN(name, type) L##name::TearDownCache(); +#define LITHIUM_OPERAND_TEARDOWN(name, type, number) L##name::TearDownCache(); LITHIUM_OPERAND_LIST(LITHIUM_OPERAND_TEARDOWN) #undef LITHIUM_OPERAND_TEARDOWN } @@ -197,7 +215,9 @@ void LEnvironment::PrintTo(StringStream* stream) { void LPointerMap::RecordPointer(LOperand* op, Zone* zone) { // Do not record arguments as pointers. if (op->IsStackSlot() && op->index() < 0) return; - ASSERT(!op->IsDoubleRegister() && !op->IsDoubleStackSlot()); + ASSERT(!op->IsDoubleRegister() && !op->IsDoubleStackSlot() && + !op->IsFloat32x4Register() && !op->IsFloat32x4StackSlot() && + !op->IsInt32x4Register() && !op->IsInt32x4StackSlot()); pointer_operands_.Add(op, zone); } @@ -205,7 +225,9 @@ void LPointerMap::RemovePointer(LOperand* op) { // Do not record arguments as pointers.
if (op->IsStackSlot() && op->index() < 0) return; - ASSERT(!op->IsDoubleRegister() && !op->IsDoubleStackSlot()); + ASSERT(!op->IsDoubleRegister() && !op->IsDoubleStackSlot() && + !op->IsFloat32x4Register() && !op->IsFloat32x4StackSlot() && + !op->IsInt32x4Register() && !op->IsInt32x4StackSlot()); for (int i = 0; i < pointer_operands_.length(); ++i) { if (pointer_operands_[i]->Equals(op)) { pointer_operands_.Remove(i); @@ -218,7 +240,9 @@ void LPointerMap::RemovePointer(LOperand* op) { void LPointerMap::RecordUntagged(LOperand* op, Zone* zone) { // Do not record arguments as pointers. if (op->IsStackSlot() && op->index() < 0) return; - ASSERT(!op->IsDoubleRegister() && !op->IsDoubleStackSlot()); + ASSERT(!op->IsDoubleRegister() && !op->IsDoubleStackSlot() && + !op->IsFloat32x4Register() && !op->IsFloat32x4StackSlot() && + !op->IsInt32x4Register() && !op->IsInt32x4StackSlot()); untagged_operands_.Add(op, zone); } diff --git a/src/lithium.h b/src/lithium.h index 754f88da821..f70a60e36dc 100644 --- a/src/lithium.h +++ b/src/lithium.h @@ -35,12 +35,16 @@ namespace v8 { namespace internal { -#define LITHIUM_OPERAND_LIST(V) \ - V(ConstantOperand, CONSTANT_OPERAND) \ - V(StackSlot, STACK_SLOT) \ - V(DoubleStackSlot, DOUBLE_STACK_SLOT) \ - V(Register, REGISTER) \ - V(DoubleRegister, DOUBLE_REGISTER) +#define LITHIUM_OPERAND_LIST(V) \ + V(ConstantOperand, CONSTANT_OPERAND, 128) \ + V(StackSlot, STACK_SLOT, 128) \ + V(DoubleStackSlot, DOUBLE_STACK_SLOT, 128) \ + V(Float32x4StackSlot, FLOAT32x4_STACK_SLOT, 128) \ + V(Int32x4StackSlot, INT32x4_STACK_SLOT, 128) \ + V(Register, REGISTER, 16) \ + V(DoubleRegister, DOUBLE_REGISTER, 16) \ + V(Float32x4Register, FLOAT32x4_REGISTER, 16) \ + V(Int32x4Register, INT32x4_REGISTER, 16) class LOperand : public ZoneObject { @@ -51,8 +55,12 @@ class LOperand : public ZoneObject { CONSTANT_OPERAND, STACK_SLOT, DOUBLE_STACK_SLOT, + FLOAT32x4_STACK_SLOT, + INT32x4_STACK_SLOT, REGISTER, DOUBLE_REGISTER, + FLOAT32x4_REGISTER, + INT32x4_REGISTER, ARGUMENT }; @@ -60,14 +68,24 @@ Kind kind() const { return KindField::decode(value_); } int index() const { return static_cast<int>(value_) >> kKindFieldWidth; } -#define LITHIUM_OPERAND_PREDICATE(name, type) \ +#define LITHIUM_OPERAND_PREDICATE(name, type, number) \ bool Is##name() const { return kind() == type; } LITHIUM_OPERAND_LIST(LITHIUM_OPERAND_PREDICATE) - LITHIUM_OPERAND_PREDICATE(Argument, ARGUMENT) - LITHIUM_OPERAND_PREDICATE(Unallocated, UNALLOCATED) - LITHIUM_OPERAND_PREDICATE(Ignored, INVALID) + LITHIUM_OPERAND_PREDICATE(Argument, ARGUMENT, 0) + LITHIUM_OPERAND_PREDICATE(Unallocated, UNALLOCATED, 0) + LITHIUM_OPERAND_PREDICATE(Ignored, INVALID, 0) #undef LITHIUM_OPERAND_PREDICATE - bool Equals(LOperand* other) const { return value_ == other->value_; } + bool IsSIMD128Register() const { + return kind() == FLOAT32x4_REGISTER || kind() == INT32x4_REGISTER; + } + bool IsSIMD128StackSlot() const { + return kind() == FLOAT32x4_STACK_SLOT || kind() == INT32x4_STACK_SLOT; + } + bool Equals(LOperand* other) const { + return value_ == other->value_ || (index() == other->index() && + ((IsSIMD128Register() && other->IsSIMD128Register()) || + (IsSIMD128StackSlot() && other->IsSIMD128StackSlot()))); + } void PrintTo(StringStream* stream); void ConvertTo(Kind kind, int index) { @@ -81,7 +99,7 @@ static void TearDownCaches(); protected: - static const int kKindFieldWidth = 3; + static const int kKindFieldWidth = 4; class KindField : public BitField<Kind, 0, kKindFieldWidth> { };
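+ // An operand packs (index << kKindFieldWidth) | kind into value_; the + // extra kind bit (3 -> 4) makes room for the four new SIMD128 kinds.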
LOperand(Kind kind, int index) { ConvertTo(kind, index); } @@ -165,32 +183,32 @@ class LUnallocated : public LOperand { // because it accommodates a larger pay-load. // // For FIXED_SLOT policy: - // +------------------------------------------+ - // | slot_index | vreg | 0 | 001 | - // +------------------------------------------+ + // +-------------------------------------------+ + // | slot_index | vreg | 0 | 0001 | + // +-------------------------------------------+ // // For all other (extended) policies: - // +------------------------------------------+ - // | reg_index | L | PPP | vreg | 1 | 001 | L ... Lifetime - // +------------------------------------------+ P ... Policy + // +-------------------------------------------+ + // | reg_index | L | PPP | vreg | 1 | 0001 | L ... Lifetime + // +-------------------------------------------+ P ... Policy // // The slot index is a signed value which requires us to decode it manually // instead of using the BitField utility class. // The superclass has a KindField. - STATIC_ASSERT(kKindFieldWidth == 3); + STATIC_ASSERT(kKindFieldWidth == 4); // BitFields for all unallocated operands. - class BasicPolicyField : public BitField<BasicPolicy, 3, 1> {}; - class VirtualRegisterField : public BitField<unsigned, 4, 18> {}; + class BasicPolicyField : public BitField<BasicPolicy, 4, 1> {}; + class VirtualRegisterField : public BitField<unsigned, 5, 18> {}; // BitFields specific to BasicPolicy::FIXED_SLOT. - class FixedSlotIndexField : public BitField<int, 22, 10> {}; + class FixedSlotIndexField : public BitField<int, 23, 9> {}; // BitFields specific to BasicPolicy::EXTENDED_POLICY. - class ExtendedPolicyField : public BitField<ExtendedPolicy, 22, 3> {}; - class LifetimeField : public BitField<Lifetime, 25, 1> {}; - class FixedRegisterField : public BitField<int, 26, 6> {}; + class ExtendedPolicyField : public BitField<ExtendedPolicy, 23, 3> {}; + class LifetimeField : public BitField<Lifetime, 26, 1> {}; + class FixedRegisterField : public BitField<int, 27, 5> {}; static const int kMaxVirtualRegisters = VirtualRegisterField::kMax + 1; static const int kFixedSlotIndexWidth = FixedSlotIndexField::kSize; @@ -317,31 +335,37 @@ class LMoveOperands V8_FINAL BASE_EMBEDDED { }; -class LConstantOperand V8_FINAL : public LOperand { +template<LOperand::Kind kOperandKind, int kNumCachedOperands> +class LSubKindOperand V8_FINAL : public LOperand { public: - static LConstantOperand* Create(int index, Zone* zone) { + static LSubKindOperand* Create(int index, Zone* zone) { ASSERT(index >= 0); if (index < kNumCachedOperands) return &cache[index]; - return new(zone) LConstantOperand(index); + return new(zone) LSubKindOperand(index); } - static LConstantOperand* cast(LOperand* op) { - ASSERT(op->IsConstantOperand()); - return reinterpret_cast<LConstantOperand*>(op); + static LSubKindOperand* cast(LOperand* op) { + ASSERT(op->kind() == kOperandKind); + return reinterpret_cast<LSubKindOperand*>(op); } static void SetUpCache(); static void TearDownCache(); private: - static const int kNumCachedOperands = 128; - static LConstantOperand* cache; + static LSubKindOperand* cache; - LConstantOperand() : LOperand() { } - explicit LConstantOperand(int index) : LOperand(CONSTANT_OPERAND, index) { } + LSubKindOperand() : LOperand() { } + explicit LSubKindOperand(int index) : LOperand(kOperandKind, index) { } }; +#define LITHIUM_TYPEDEF_SUBKIND_OPERAND_CLASS(name, type, number) \ +typedef LSubKindOperand<LOperand::type, number> L##name; +LITHIUM_OPERAND_LIST(LITHIUM_TYPEDEF_SUBKIND_OPERAND_CLASS) +#undef LITHIUM_TYPEDEF_SUBKIND_OPERAND_CLASS + + class LArgument V8_FINAL : public LOperand { public: explicit LArgument(int index) : LOperand(ARGUMENT, index) { } @@ -353,106 +377,6 @@ class LArgument V8_FINAL : public LOperand { }; -class LStackSlot V8_FINAL : public LOperand { - public: - static 
LStackSlot* Create(int index, Zone* zone) { - ASSERT(index >= 0); - if (index < kNumCachedOperands) return &cache[index]; - return new(zone) LStackSlot(index); - } - - static LStackSlot* cast(LOperand* op) { - ASSERT(op->IsStackSlot()); - return reinterpret_cast<LStackSlot*>(op); - } - - static void SetUpCache(); - static void TearDownCache(); - - private: - static const int kNumCachedOperands = 128; - static LStackSlot* cache; - - LStackSlot() : LOperand() { } - explicit LStackSlot(int index) : LOperand(STACK_SLOT, index) { } -}; - - -class LDoubleStackSlot V8_FINAL : public LOperand { - public: - static LDoubleStackSlot* Create(int index, Zone* zone) { - ASSERT(index >= 0); - if (index < kNumCachedOperands) return &cache[index]; - return new(zone) LDoubleStackSlot(index); - } - - static LDoubleStackSlot* cast(LOperand* op) { - ASSERT(op->IsStackSlot()); - return reinterpret_cast<LDoubleStackSlot*>(op); - } - - static void SetUpCache(); - static void TearDownCache(); - - private: - static const int kNumCachedOperands = 128; - static LDoubleStackSlot* cache; - - LDoubleStackSlot() : LOperand() { } - explicit LDoubleStackSlot(int index) : LOperand(DOUBLE_STACK_SLOT, index) { } -}; - - -class LRegister V8_FINAL : public LOperand { - public: - static LRegister* Create(int index, Zone* zone) { - ASSERT(index >= 0); - if (index < kNumCachedOperands) return &cache[index]; - return new(zone) LRegister(index); - } - - static LRegister* cast(LOperand* op) { - ASSERT(op->IsRegister()); - return reinterpret_cast<LRegister*>(op); - } - - static void SetUpCache(); - static void TearDownCache(); - - private: - static const int kNumCachedOperands = 16; - static LRegister* cache; - - LRegister() : LOperand() { } - explicit LRegister(int index) : LOperand(REGISTER, index) { } -}; - - -class LDoubleRegister V8_FINAL : public LOperand { - public: - static LDoubleRegister* Create(int index, Zone* zone) { - ASSERT(index >= 0); - if (index < kNumCachedOperands) return &cache[index]; - return new(zone) LDoubleRegister(index); - } - - static LDoubleRegister* cast(LOperand* op) { - ASSERT(op->IsDoubleRegister()); - return reinterpret_cast<LDoubleRegister*>(op); - } - - static void SetUpCache(); - static void TearDownCache(); - - private: - static const int kNumCachedOperands = 16; - static LDoubleRegister* cache; - - LDoubleRegister() : LOperand() { } - explicit LDoubleRegister(int index) : LOperand(DOUBLE_REGISTER, index) { } -}; - - class LParallelMove V8_FINAL : public ZoneObject { public: explicit LParallelMove(Zone* zone) : move_operands_(4, zone) { } diff --git a/src/objects-inl.h b/src/objects-inl.h index 8c919601b4b..f5ed7aafba7 100644 --- a/src/objects-inl.h +++ b/src/objects-inl.h @@ -1427,6 +1427,11 @@ int HeapNumber::get_sign() { } +const char* Float32x4::Name() { + return "float32x4"; +} + + int Float32x4::kRuntimeAllocatorId() { return Runtime::kAllocateFloat32x4; } @@ -1453,6 +1458,11 @@ float Float32x4::getAt(int index) { } +const char* Int32x4::Name() { + return "int32x4"; +} + + int Int32x4::kRuntimeAllocatorId() { return Runtime::kAllocateInt32x4; } diff --git a/src/objects.cc b/src/objects.cc index 1f08b31d489..6e8874db38e 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -2330,6 +2330,8 @@ const char* Representation::Mnemonic() const { case kTagged: return "t"; case kSmi: return "s"; case kDouble: return "d"; + case kFloat32x4: return "float32x4"; + case kInt32x4: return "int32x4"; case kInteger32: return "i"; case kHeapObject: return "h"; case kExternal: return "x"; @@ -11015,6 +11017,20 @@ void 
DeoptimizationInputData::DeoptimizationInputDataPrint(FILE* out) { break; } + case Translation::FLOAT32x4_REGISTER: { + int reg_code = iterator.Next(); + PrintF(out, "{input=%s}", + SIMD128Register::AllocationIndexToString(reg_code)); + break; + } + + case Translation::INT32x4_REGISTER: { + int reg_code = iterator.Next(); + PrintF(out, "{input=%s}", + SIMD128Register::AllocationIndexToString(reg_code)); + break; + } + case Translation::STACK_SLOT: { int input_slot_index = iterator.Next(); PrintF(out, "{input=%d}", input_slot_index); @@ -11039,6 +11055,18 @@ void DeoptimizationInputData::DeoptimizationInputDataPrint(FILE* out) { break; } + case Translation::FLOAT32x4_STACK_SLOT: { + int input_slot_index = iterator.Next(); + PrintF(out, "{input=%d}", input_slot_index); + break; + } + + case Translation::INT32x4_STACK_SLOT: { + int input_slot_index = iterator.Next(); + PrintF(out, "{input=%d}", input_slot_index); + break; + } + case Translation::LITERAL: { unsigned literal_index = iterator.Next(); PrintF(out, "{literal_id=%u}", literal_index); diff --git a/src/objects.h b/src/objects.h index 73923b9aca8..e518b3bfe3f 100644 --- a/src/objects.h +++ b/src/objects.h @@ -1972,6 +1972,7 @@ class Float32x4: public HeapObject { static const int kLanes = 4; static const int kValueSize = kFloat32x4Size; static const InstanceType kInstanceType = FLOAT32x4_TYPE; + static inline const char* Name(); static inline int kRuntimeAllocatorId(); static inline int kMapRootIndex(); @@ -2009,6 +2010,7 @@ class Int32x4: public HeapObject { typedef int32x4_value_t value_t; static const int kValueSize = kInt32x4Size; static const InstanceType kInstanceType = INT32x4_TYPE; + static inline const char* Name(); static inline int kRuntimeAllocatorId(); static inline int kMapRootIndex(); diff --git a/src/property-details.h b/src/property-details.h index 99dd1211b24..11d7063455b 100644 --- a/src/property-details.h +++ b/src/property-details.h @@ -148,6 +148,7 @@ class Representation { } if (kind_ == kUInteger8 && other.kind_ == kInteger8) return false; if (kind_ == kUInteger16 && other.kind_ == kInteger16) return false; + if (IsSIMD128() && other.IsSIMD128()) return false; return kind_ > other.kind_; } @@ -189,6 +190,7 @@ class Representation { bool IsDouble() const { return kind_ == kDouble; } bool IsFloat32x4() const { return kind_ == kFloat32x4; } bool IsInt32x4() const { return kind_ == kInt32x4; } + bool IsSIMD128() const { return IsFloat32x4() || IsInt32x4(); } bool IsHeapObject() const { return kind_ == kHeapObject; } bool IsExternal() const { return kind_ == kExternal; } bool IsSpecialization() const { diff --git a/src/x64/assembler-x64.cc b/src/x64/assembler-x64.cc index e7c20bb1508..bc6121a07ef 100644 --- a/src/x64/assembler-x64.cc +++ b/src/x64/assembler-x64.cc @@ -2739,6 +2739,24 @@ void Assembler::movaps(XMMRegister dst, XMMRegister src) { } +void Assembler::movups(XMMRegister dst, const Operand& src) { + EnsureSpace ensure_space(this); + emit_optional_rex_32(dst, src); + emit(0x0F); + emit(0x10); + emit_sse_operand(dst, src); +} + + +void Assembler::movups(const Operand& dst, XMMRegister src) { + EnsureSpace ensure_space(this); + emit_optional_rex_32(src, dst); + emit(0x0F); + emit(0x11); + emit_sse_operand(src, dst); +} + + void Assembler::shufps(XMMRegister dst, XMMRegister src, byte imm8) { ASSERT(is_uint8(imm8)); EnsureSpace ensure_space(this); diff --git a/src/x64/assembler-x64.h b/src/x64/assembler-x64.h index d4a51cd4dd9..c2411ba1477 100644 --- a/src/x64/assembler-x64.h +++ b/src/x64/assembler-x64.h @@ 
-300,6 +300,7 @@ const XMMRegister xmm15 = { 15 }; typedef XMMRegister DoubleRegister; +typedef XMMRegister SIMD128Register; enum Condition { @@ -1359,6 +1360,8 @@ class Assembler : public AssemblerBase { // SSE instructions void movaps(XMMRegister dst, XMMRegister src); + void movups(XMMRegister dst, const Operand& src); + void movups(const Operand& dst, XMMRegister src); void movss(XMMRegister dst, const Operand& src); void movss(const Operand& dst, XMMRegister src); void shufps(XMMRegister dst, XMMRegister src, byte imm8); diff --git a/src/x64/cpu-x64.cc b/src/x64/cpu-x64.cc index 4fa290a8b5f..427d3b0aad8 100644 --- a/src/x64/cpu-x64.cc +++ b/src/x64/cpu-x64.cc @@ -51,6 +51,11 @@ bool CPU::SupportsCrankshaft() { } +bool CPU::SupportsSIMD128InCrankshaft() { + return true; // SSE2 is part of the x64 baseline. +} + + void CPU::FlushICache(void* start, size_t size) { // No need to flush the instruction cache on Intel. On Intel instruction // cache flushing is only necessary when multiple cores running the same diff --git a/src/x64/deoptimizer-x64.cc b/src/x64/deoptimizer-x64.cc index fd26cf5265e..8eadb6f4ae9 100644 --- a/src/x64/deoptimizer-x64.cc +++ b/src/x64/deoptimizer-x64.cc @@ -91,8 +91,9 @@ void Deoptimizer::FillInputFrame(Address tos, JavaScriptFrame* frame) { } input_->SetRegister(rsp.code(), reinterpret_cast<intptr_t>(frame->sp())); input_->SetRegister(rbp.code(), reinterpret_cast<intptr_t>(frame->fp())); + simd128_value_t zero = {{0.0, 0.0}}; for (int i = 0; i < DoubleRegister::NumAllocatableRegisters(); i++) { - input_->SetDoubleRegister(i, 0.0); + input_->SetSIMD128Register(i, zero); } // Fill the frame content from the actual data on the frame. @@ -112,10 +113,10 @@ void Deoptimizer::SetPlatformCompiledStubRegisters( } -void Deoptimizer::CopyDoubleRegisters(FrameDescription* output_frame) { +void Deoptimizer::CopySIMD128Registers(FrameDescription* output_frame) { for (int i = 0; i < XMMRegister::NumAllocatableRegisters(); ++i) { - double double_value = input_->GetDoubleRegister(i); - output_frame->SetDoubleRegister(i, double_value); + simd128_value_t xmm_value = input_->GetSIMD128Register(i); + output_frame->SetSIMD128Register(i, xmm_value); } } @@ -139,14 +140,14 @@ void Deoptimizer::EntryGenerator::Generate() { // Save all general purpose registers before messing with them. const int kNumberOfRegisters = Register::kNumRegisters; - const int kDoubleRegsSize = kDoubleSize * + const int kXMMRegsSize = kSIMD128Size * XMMRegister::NumAllocatableRegisters(); - __ subq(rsp, Immediate(kDoubleRegsSize)); + __ subq(rsp, Immediate(kXMMRegsSize)); for (int i = 0; i < XMMRegister::NumAllocatableRegisters(); ++i) { XMMRegister xmm_reg = XMMRegister::FromAllocationIndex(i); - int offset = i * kDoubleSize; - __ movsd(Operand(rsp, offset), xmm_reg); + int offset = i * kSIMD128Size; + __ movups(Operand(rsp, offset), xmm_reg); } // We push all registers onto the stack, even though we do not need @@ -157,7 +158,7 @@ } const int kSavedRegistersAreaSize = kNumberOfRegisters * kRegisterSize + - kDoubleRegsSize; + kXMMRegsSize; // We use this to keep the value of the fifth argument temporarily. // Unfortunately we can't store it directly in r8 (used for passing @@ -207,11 +208,13 @@ __ pop(Operand(rbx, offset)); } - // Fill in the double input registers. - int double_regs_offset = FrameDescription::double_registers_offset(); + // Fill in the xmm input registers.
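+ // Each register was pushed with a single 16-byte movups above; pop it back + // as two 8-byte words into the frame's simd128_registers_ entry.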
+ STATIC_ASSERT(kSIMD128Size == 2 * kDoubleSize); + int xmm_regs_offset = FrameDescription::simd128_registers_offset(); for (int i = 0; i < XMMRegister::NumAllocatableRegisters(); i++) { - int dst_offset = i * kDoubleSize + double_regs_offset; + int dst_offset = i * kSIMD128Size + xmm_regs_offset; __ pop(Operand(rbx, dst_offset)); + __ pop(Operand(rbx, dst_offset + kDoubleSize)); } // Remove the bailout id and return address from the stack. @@ -275,8 +278,8 @@ for (int i = 0; i < XMMRegister::NumAllocatableRegisters(); ++i) { XMMRegister xmm_reg = XMMRegister::FromAllocationIndex(i); - int src_offset = i * kDoubleSize + double_regs_offset; - __ movsd(xmm_reg, Operand(rbx, src_offset)); + int src_offset = i * kSIMD128Size + xmm_regs_offset; + __ movups(xmm_reg, Operand(rbx, src_offset)); } // Push state, pc, and continuation from the last output frame. @@ -335,6 +338,18 @@ void FrameDescription::SetCallerFp(unsigned offset, intptr_t value) { } +double FrameDescription::GetDoubleRegister(unsigned n) const { + ASSERT(n < ARRAY_SIZE(simd128_registers_)); + return simd128_registers_[n].d[0]; +} + + +void FrameDescription::SetDoubleRegister(unsigned n, double value) { + ASSERT(n < ARRAY_SIZE(simd128_registers_)); + simd128_registers_[n].d[0] = value; +} + + #undef __ diff --git a/src/x64/disasm-x64.cc b/src/x64/disasm-x64.cc index 76b541c0100..0f6b9971c2f 100644 --- a/src/x64/disasm-x64.cc +++ b/src/x64/disasm-x64.cc @@ -1249,6 +1249,21 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) { current += PrintRightXMMOperand(current); AppendToBuffer(",%s", NameOfXMMRegister(regop)); + } else if (opcode == 0x10) { + // movups xmm, xmm/m128 + int mod, regop, rm; + get_modrm(*current, &mod, &regop, &rm); + AppendToBuffer("movups %s, ", NameOfXMMRegister(regop)); + current += PrintRightXMMOperand(current); + + } else if (opcode == 0x11) { + // movups xmm/m128, xmm + int mod, regop, rm; + get_modrm(*current, &mod, &regop, &rm); + AppendToBuffer("movups "); + current += PrintRightXMMOperand(current); + AppendToBuffer(", %s", NameOfXMMRegister(regop)); + } else if (opcode == 0xA2) { // CPUID AppendToBuffer("%s", mnemonic); diff --git a/src/x64/lithium-codegen-x64.cc b/src/x64/lithium-codegen-x64.cc index d25971af7ee..c0f14e1fff9 100644 --- a/src/x64/lithium-codegen-x64.cc +++ b/src/x64/lithium-codegen-x64.cc @@ -385,6 +385,11 @@ XMMRegister LCodeGen::ToDoubleRegister(int index) const { } +XMMRegister LCodeGen::ToSIMD128Register(int index) const { + return XMMRegister::FromAllocationIndex(index); +} + + Register LCodeGen::ToRegister(LOperand* op) const { ASSERT(op->IsRegister()); return ToRegister(op->index()); } @@ -397,6 +402,24 @@ XMMRegister LCodeGen::ToDoubleRegister(LOperand* op) const { } +XMMRegister LCodeGen::ToFloat32x4Register(LOperand* op) const { + ASSERT(op->IsFloat32x4Register()); + return ToSIMD128Register(op->index()); +} + + +XMMRegister LCodeGen::ToInt32x4Register(LOperand* op) const { + ASSERT(op->IsInt32x4Register()); + return ToSIMD128Register(op->index()); +} + + +XMMRegister LCodeGen::ToSIMD128Register(LOperand* op) const { + ASSERT(op->IsFloat32x4Register() || op->IsInt32x4Register()); + return ToSIMD128Register(op->index()); +} + + bool LCodeGen::IsInteger32Constant(LConstantOperand* op) const { return op->IsConstantOperand() && chunk_->LookupLiteralRepresentation(op).IsSmiOrInteger32(); @@ -457,7 +480,8 @@ static int ArgumentsOffsetWithoutFrame(int index) { Operand LCodeGen::ToOperand(LOperand* op) const { // Does not handle
registers. In X64 assembler, plain registers are not // representable as an Operand. - ASSERT(op->IsStackSlot() || op->IsDoubleStackSlot()); + ASSERT(op->IsStackSlot() || op->IsDoubleStackSlot() || + op->IsFloat32x4StackSlot() || op->IsInt32x4StackSlot()); if (NeedsEagerFrame()) { return Operand(rbp, StackSlotOffset(op->index())); } else { @@ -570,6 +594,12 @@ void LCodeGen::AddToTranslation(LEnvironment* environment, } } else if (op->IsDoubleStackSlot()) { translation->StoreDoubleStackSlot(op->index()); + } else if (op->IsFloat32x4StackSlot()) { + translation->StoreSIMD128StackSlot(op->index(), + Translation::FLOAT32x4_STACK_SLOT); + } else if (op->IsInt32x4StackSlot()) { + translation->StoreSIMD128StackSlot(op->index(), + Translation::INT32x4_STACK_SLOT); } else if (op->IsArgument()) { ASSERT(is_tagged); int src_index = GetStackSlotCount() + op->index(); @@ -586,6 +616,12 @@ void LCodeGen::AddToTranslation(LEnvironment* environment, } else if (op->IsDoubleRegister()) { XMMRegister reg = ToDoubleRegister(op); translation->StoreDoubleRegister(reg); + } else if (op->IsFloat32x4Register()) { + XMMRegister reg = ToFloat32x4Register(op); + translation->StoreSIMD128Register(reg, Translation::FLOAT32x4_REGISTER); + } else if (op->IsInt32x4Register()) { + XMMRegister reg = ToInt32x4Register(op); + translation->StoreSIMD128Register(reg, Translation::INT32x4_REGISTER); } else if (op->IsConstantOperand()) { HConstant* constant = chunk()->LookupConstant(LConstantOperand::cast(op)); int src_index = DefineDeoptimizationLiteral(constant->handle(isolate())); @@ -2936,26 +2972,6 @@ void LCodeGen::DoAccessArgumentsAt(LAccessArgumentsAt* instr) { } -void LCodeGen::DoDeferredSIMD128ToTagged(LInstruction* instr, - Runtime::FunctionId id) { - // TODO(3095996): Get rid of this. For now, we need to make the - // result register contain a valid pointer because it is already - // contained in the register pointer map. - Register reg = ToRegister(instr->result()); - __ Move(reg, Smi::FromInt(0)); - - { - PushSafepointRegistersScope scope(this); - __ movp(rsi, Operand(rbp, StandardFrameConstants::kContextOffset)); - __ CallRuntimeSaveDoubles(id); - RecordSafepointWithRegisters( - instr->pointer_map(), 0, Safepoint::kNoLazyDeopt); - __ movp(kScratchRegister, rax); - } - __ movp(reg, kScratchRegister); -} - - void LCodeGen::HandleExternalArrayOpRequiresPreScale( LOperand* key, ElementsKind elements_kind) { @@ -2968,62 +2984,6 @@ void LCodeGen::HandleExternalArrayOpRequiresPreScale( } -template -void LCodeGen::DoLoadKeyedSIMD128ExternalArray(LLoadKeyed* instr) { - class DeferredSIMD128ToTagged V8_FINAL : public LDeferredCode { - public: - DeferredSIMD128ToTagged(LCodeGen* codegen, - LInstruction* instr, - Runtime::FunctionId id) - : LDeferredCode(codegen), instr_(instr), id_(id) { } - virtual void Generate() V8_OVERRIDE { - codegen()->DoDeferredSIMD128ToTagged(instr_, id_); - } - virtual LInstruction* instr() V8_OVERRIDE { return instr_; } - private: - LInstruction* instr_; - Runtime::FunctionId id_; - }; - - // Pre scale key if necessary. - LOperand* key = instr->key(); - ElementsKind elements_kind = instr->elements_kind(); - if (!key->IsConstantOperand()) { - HandleExternalArrayOpRequiresPreScale(key, elements_kind); - } - - // Allocate a SIMD128 object on the heap. 
- Register reg = ToRegister(instr->result()); - Register tmp = ToRegister(instr->temp()); - DeferredSIMD128ToTagged* deferred = - new(zone()) DeferredSIMD128ToTagged(this, instr, - static_cast(T::kRuntimeAllocatorId())); - if (FLAG_inline_new) { - __ AllocateSIMDHeapObject(T::kSize, reg, tmp, deferred->entry(), - static_cast(T::kMapRootIndex())); - } else { - __ jmp(deferred->entry()); - } - __ bind(deferred->exit()); - - // Copy the SIMD128 value from the external array to the heap object. - STATIC_ASSERT(T::kValueSize % kPointerSize == 0); - int base_offset = instr->is_fixed_typed_array() - ? FixedTypedArrayBase::kDataOffset - kHeapObjectTag - : 0; - for (int offset = 0; offset < T::kValueSize; offset += kPointerSize) { - Operand operand(BuildFastArrayOperand( - instr->elements(), - key, - elements_kind, - base_offset + offset, - instr->additional_index())); - __ movp(tmp, operand); - __ movp(FieldOperand(reg, T::kValueOffset + offset), tmp); - } -} - - void LCodeGen::DoLoadKeyedExternalArray(LLoadKeyed* instr) { ElementsKind elements_kind = instr->elements_kind(); LOperand* key = instr->key(); @@ -3039,6 +2999,8 @@ void LCodeGen::DoLoadKeyedExternalArray(LLoadKeyed* instr) { // and the dehoisted address computation happens in 64 bits __ movsxlq(key_reg, key_reg); } + + HandleExternalArrayOpRequiresPreScale(key, elements_kind); } int base_offset = instr->is_fixed_typed_array() ? FixedTypedArrayBase::kDataOffset - kHeapObjectTag @@ -3058,10 +3020,8 @@ void LCodeGen::DoLoadKeyedExternalArray(LLoadKeyed* instr) { } else if (elements_kind == EXTERNAL_FLOAT64_ELEMENTS || elements_kind == FLOAT64_ELEMENTS) { __ movsd(ToDoubleRegister(instr->result()), operand); - } else if (IsFloat32x4ElementsKind(elements_kind)) { - DoLoadKeyedSIMD128ExternalArray(instr); - } else if (IsInt32x4ElementsKind(elements_kind)) { - DoLoadKeyedSIMD128ExternalArray(instr); + } else if (IsSIMD128ElementsKind(elements_kind)) { + __ movups(ToSIMD128Register(instr->result()), operand); } else { Register result(ToRegister(instr->result())); switch (elements_kind) { @@ -3233,6 +3193,7 @@ Operand LCodeGen::BuildFastArrayOperand( if (constant_value & 0xF0000000) { Abort(kArrayIndexConstantValueTooBig); } + return Operand(elements_pointer_reg, ((constant_value + additional_index) << shift_size) + offset); @@ -4228,42 +4189,6 @@ void LCodeGen::DoBoundsCheck(LBoundsCheck* instr) { } -template -void LCodeGen::DoStoreKeyedSIMD128ExternalArray(LStoreKeyed* instr) { - ASSERT(instr->value()->IsRegister()); - Register input_reg = ToRegister(instr->value()); - Condition cc = masm()->CheckSmi(input_reg); - DeoptimizeIf(cc, instr->environment()); - __ CompareRoot(FieldOperand(input_reg, HeapObject::kMapOffset), - static_cast(T::kMapRootIndex())); - DeoptimizeIf(not_equal, instr->environment()); - - // Pre scale key if necessary. - LOperand* key = instr->key(); - ElementsKind elements_kind = instr->elements_kind(); - if (!key->IsConstantOperand()) { - HandleExternalArrayOpRequiresPreScale(key, elements_kind); - } - - // Copy the SIMD128 value from the heap object to the external array. - STATIC_ASSERT(T::kValueSize % kPointerSize == 0); - int base_offset = instr->is_fixed_typed_array() - ? 
FixedTypedArrayBase::kDataOffset - kHeapObjectTag - : 0; - for (int offset = 0; offset < T::kValueSize; offset += kPointerSize) { - Operand operand(BuildFastArrayOperand( - instr->elements(), - key, - elements_kind, - base_offset + offset, - instr->additional_index())); - __ movp(kScratchRegister, - FieldOperand(input_reg, T::kValueOffset + offset)); - __ movp(operand, kScratchRegister); - } -} - - void LCodeGen::DoStoreKeyedExternalArray(LStoreKeyed* instr) { ElementsKind elements_kind = instr->elements_kind(); LOperand* key = instr->key(); @@ -4279,6 +4204,8 @@ void LCodeGen::DoStoreKeyedExternalArray(LStoreKeyed* instr) { // and the dehoisted address computation happens in 64 bits __ movsxlq(key_reg, key_reg); } + + HandleExternalArrayOpRequiresPreScale(key, elements_kind); } int base_offset = instr->is_fixed_typed_array() ? FixedTypedArrayBase::kDataOffset - kHeapObjectTag @@ -4298,10 +4225,8 @@ void LCodeGen::DoStoreKeyedExternalArray(LStoreKeyed* instr) { } else if (elements_kind == EXTERNAL_FLOAT64_ELEMENTS || elements_kind == FLOAT64_ELEMENTS) { __ movsd(operand, ToDoubleRegister(instr->value())); - } else if (IsFloat32x4ElementsKind(elements_kind)) { - DoStoreKeyedSIMD128ExternalArray(instr); - } else if (IsInt32x4ElementsKind(elements_kind)) { - DoStoreKeyedSIMD128ExternalArray(instr); + } else if (IsSIMD128ElementsKind(elements_kind)) { + __ movups(operand, ToSIMD128Register(instr->value())); } else { Register value(ToRegister(instr->value())); switch (elements_kind) { @@ -4835,6 +4760,71 @@ void LCodeGen::DoDeferredNumberTagD(LNumberTagD* instr) { } +void LCodeGen::DoDeferredSIMD128ToTagged(LSIMD128ToTagged* instr, + Runtime::FunctionId id) { + // TODO(3095996): Get rid of this. For now, we need to make the + // result register contain a valid pointer because it is already + // contained in the register pointer map. 
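+ // A smi zero keeps the result register valid for the GC while the runtime + // call below allocates the actual SIMD128 heap object.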
+ Register reg = ToRegister(instr->result()); + __ Move(reg, Smi::FromInt(0)); + + { + PushSafepointRegistersScope scope(this); + __ movp(rsi, Operand(rbp, StandardFrameConstants::kContextOffset)); + __ CallRuntimeSaveDoubles(id); + RecordSafepointWithRegisters( + instr->pointer_map(), 0, Safepoint::kNoLazyDeopt); + __ movp(kScratchRegister, rax); + } + __ movp(reg, kScratchRegister); +} + + +template<class T> +void LCodeGen::HandleSIMD128ToTagged(LSIMD128ToTagged* instr) { + class DeferredSIMD128ToTagged V8_FINAL : public LDeferredCode { + public: + DeferredSIMD128ToTagged(LCodeGen* codegen, + LSIMD128ToTagged* instr, + Runtime::FunctionId id) + : LDeferredCode(codegen), instr_(instr), id_(id) { } + virtual void Generate() V8_OVERRIDE { + codegen()->DoDeferredSIMD128ToTagged(instr_, id_); + } + virtual LInstruction* instr() V8_OVERRIDE { return instr_; } + private: + LSIMD128ToTagged* instr_; + Runtime::FunctionId id_; + }; + + XMMRegister input_reg = ToSIMD128Register(instr->value()); + Register reg = ToRegister(instr->result()); + Register tmp = ToRegister(instr->temp()); + + DeferredSIMD128ToTagged* deferred = + new(zone()) DeferredSIMD128ToTagged(this, instr, + static_cast<Runtime::FunctionId>(T::kRuntimeAllocatorId())); + if (FLAG_inline_new) { + __ AllocateSIMDHeapObject(T::kSize, reg, tmp, deferred->entry(), + static_cast<Heap::RootListIndex>(T::kMapRootIndex())); + } else { + __ jmp(deferred->entry()); + } + __ bind(deferred->exit()); + __ movups(FieldOperand(reg, T::kValueOffset), input_reg); +} + + +void LCodeGen::DoSIMD128ToTagged(LSIMD128ToTagged* instr) { + if (instr->value()->IsFloat32x4Register()) { + HandleSIMD128ToTagged<Float32x4>(instr); + } else { + ASSERT(instr->value()->IsInt32x4Register()); + HandleSIMD128ToTagged<Int32x4>(instr); + } +} + + void LCodeGen::DoSmiTag(LSmiTag* instr) { ASSERT(instr->value()->Equals(instr->result())); Register input = ToRegister(instr->value()); @@ -5012,6 +5002,35 @@ void LCodeGen::DoNumberUntagD(LNumberUntagD* instr) { } +template<class T> +void LCodeGen::HandleTaggedToSIMD128(LTaggedToSIMD128* instr) { + LOperand* input = instr->value(); + ASSERT(input->IsRegister()); + LOperand* result = instr->result(); + ASSERT(result->IsSIMD128Register()); + + Register input_reg = ToRegister(input); + XMMRegister result_reg = ToSIMD128Register(result); + + Condition cc = masm()->CheckSmi(input_reg); + DeoptimizeIf(cc, instr->environment()); + __ CompareRoot(FieldOperand(input_reg, HeapObject::kMapOffset), + static_cast<Heap::RootListIndex>(T::kMapRootIndex())); + DeoptimizeIf(not_equal, instr->environment()); + __ movups(result_reg, FieldOperand(input_reg, T::kValueOffset)); +} + + +void LCodeGen::DoTaggedToSIMD128(LTaggedToSIMD128* instr) { + if (instr->representation().IsFloat32x4()) { + HandleTaggedToSIMD128<Float32x4>(instr); + } else { + ASSERT(instr->representation().IsInt32x4()); + HandleTaggedToSIMD128<Int32x4>(instr); + } +} + + void LCodeGen::DoDoubleToI(LDoubleToI* instr) { LOperand* input = instr->value(); ASSERT(input->IsDoubleRegister()); diff --git a/src/x64/lithium-codegen-x64.h b/src/x64/lithium-codegen-x64.h index 0c20c4b2026..db2f15bf3c5 100644 --- a/src/x64/lithium-codegen-x64.h +++ b/src/x64/lithium-codegen-x64.h @@ -85,6 +85,9 @@ class LCodeGen: public LCodeGenBase { // Support for converting LOperands to assembler types.
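+ // The Float32x4/Int32x4 variants below only differ in the operand kind + // they assert; all three return the same underlying XMM register.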
Register ToRegister(LOperand* op) const; XMMRegister ToDoubleRegister(LOperand* op) const; + XMMRegister ToFloat32x4Register(LOperand* op) const; + XMMRegister ToInt32x4Register(LOperand* op) const; + XMMRegister ToSIMD128Register(LOperand* op) const; bool IsInteger32Constant(LConstantOperand* op) const; bool IsSmiConstant(LConstantOperand* op) const; int32_t ToInteger32(LConstantOperand* op) const; @@ -116,7 +119,13 @@ void DoDeferredInstanceOfKnownGlobal(LInstanceOfKnownGlobal* instr, Label* map_check); void DoDeferredInstanceMigration(LCheckMaps* instr, Register object); - void DoDeferredSIMD128ToTagged(LInstruction* instr, Runtime::FunctionId id); + void DoDeferredSIMD128ToTagged(LSIMD128ToTagged* instr, + Runtime::FunctionId id); + + template<class T> + void HandleTaggedToSIMD128(LTaggedToSIMD128* instr); + template<class T> + void HandleSIMD128ToTagged(LSIMD128ToTagged* instr); // Parallel move support. void DoParallelMove(LParallelMove* move); @@ -245,6 +254,7 @@ Register ToRegister(int index) const; XMMRegister ToDoubleRegister(int index) const; + XMMRegister ToSIMD128Register(int index) const; Operand BuildFastArrayOperand( LOperand* elements_pointer, LOperand* key, @@ -327,13 +337,9 @@ void DoLoadKeyedExternalArray(LLoadKeyed* instr); void HandleExternalArrayOpRequiresPreScale(LOperand* key, ElementsKind elements_kind); - template<class T> - void DoLoadKeyedSIMD128ExternalArray(LLoadKeyed* instr); void DoLoadKeyedFixedDoubleArray(LLoadKeyed* instr); void DoLoadKeyedFixedArray(LLoadKeyed* instr); void DoStoreKeyedExternalArray(LStoreKeyed* instr); - template<class T> - void DoStoreKeyedSIMD128ExternalArray(LStoreKeyed* instr); void DoStoreKeyedFixedDoubleArray(LStoreKeyed* instr); void DoStoreKeyedFixedArray(LStoreKeyed* instr); #ifdef _MSC_VER diff --git a/src/x64/lithium-gap-resolver-x64.cc b/src/x64/lithium-gap-resolver-x64.cc index 5b4e32d2c44..f1c112a699a 100644 --- a/src/x64/lithium-gap-resolver-x64.cc +++ b/src/x64/lithium-gap-resolver-x64.cc @@ -244,6 +244,23 @@ void LGapResolver::EmitMove(int index) { __ movsd(xmm0, src); __ movsd(cgen_->ToOperand(destination), xmm0); } + } else if (source->IsSIMD128Register()) { + XMMRegister src = cgen_->ToSIMD128Register(source); + if (destination->IsSIMD128Register()) { + __ movaps(cgen_->ToSIMD128Register(destination), src); + } else { + ASSERT(destination->IsSIMD128StackSlot()); + __ movups(cgen_->ToOperand(destination), src); + } + } else if (source->IsSIMD128StackSlot()) { + Operand src = cgen_->ToOperand(source); + if (destination->IsSIMD128Register()) { + __ movups(cgen_->ToSIMD128Register(destination), src); + } else { + ASSERT(destination->IsSIMD128StackSlot()); + __ movups(xmm0, src); + __ movups(cgen_->ToOperand(destination), xmm0); + } } else { UNREACHABLE(); } @@ -285,6 +302,19 @@ void LGapResolver::EmitSwap(int index) { __ movsd(dst, xmm0); __ movp(src, kScratchRegister); + } else if (source->IsSIMD128StackSlot() && + destination->IsSIMD128StackSlot()) { + // Swap two XMM stack slots.
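+ // xmm0 buffers one slot; the other 16 bytes go through kScratchRegister + // as two 8-byte words, so no second XMM scratch is needed.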
diff --git a/src/x64/lithium-gap-resolver-x64.cc b/src/x64/lithium-gap-resolver-x64.cc
index 5b4e32d2c44..f1c112a699a 100644
--- a/src/x64/lithium-gap-resolver-x64.cc
+++ b/src/x64/lithium-gap-resolver-x64.cc
@@ -244,6 +244,23 @@ void LGapResolver::EmitMove(int index) {
       __ movsd(xmm0, src);
       __ movsd(cgen_->ToOperand(destination), xmm0);
     }
+  } else if (source->IsSIMD128Register()) {
+    XMMRegister src = cgen_->ToSIMD128Register(source);
+    if (destination->IsSIMD128Register()) {
+      __ movaps(cgen_->ToSIMD128Register(destination), src);
+    } else {
+      ASSERT(destination->IsSIMD128StackSlot());
+      __ movups(cgen_->ToOperand(destination), src);
+    }
+  } else if (source->IsSIMD128StackSlot()) {
+    Operand src = cgen_->ToOperand(source);
+    if (destination->IsSIMD128Register()) {
+      __ movups(cgen_->ToSIMD128Register(destination), src);
+    } else {
+      ASSERT(destination->IsSIMD128StackSlot());
+      __ movups(xmm0, src);
+      __ movups(cgen_->ToOperand(destination), xmm0);
+    }
   } else {
     UNREACHABLE();
   }
@@ -285,6 +302,19 @@ void LGapResolver::EmitSwap(int index) {
     __ movsd(dst, xmm0);
     __ movp(src, kScratchRegister);
 
+  } else if (source->IsSIMD128StackSlot() &&
+             destination->IsSIMD128StackSlot()) {
+    // Swap two XMM stack slots.
+    STATIC_ASSERT(kSIMD128Size == 2 * kDoubleSize);
+    Operand src = cgen_->ToOperand(source);
+    Operand dst = cgen_->ToOperand(destination);
+    __ movups(xmm0, src);
+    __ movq(kScratchRegister, dst);
+    __ movq(src, kScratchRegister);
+    __ movq(kScratchRegister, Operand(dst, kDoubleSize));
+    __ movq(Operand(src, kDoubleSize), kScratchRegister);
+    __ movups(dst, xmm0);
+
   } else if (source->IsDoubleRegister() && destination->IsDoubleRegister()) {
     // Swap two double registers.
     XMMRegister source_reg = cgen_->ToDoubleRegister(source);
@@ -293,6 +323,14 @@ void LGapResolver::EmitSwap(int index) {
     __ movaps(source_reg, destination_reg);
     __ movaps(destination_reg, xmm0);
 
+  } else if (source->IsSIMD128Register() && destination->IsSIMD128Register()) {
+    // Swap two XMM registers.
+    XMMRegister source_reg = cgen_->ToSIMD128Register(source);
+    XMMRegister destination_reg = cgen_->ToSIMD128Register(destination);
+    __ movaps(xmm0, source_reg);
+    __ movaps(source_reg, destination_reg);
+    __ movaps(destination_reg, xmm0);
+
   } else if (source->IsDoubleRegister() || destination->IsDoubleRegister()) {
     // Swap a double register and a double stack slot.
     ASSERT((source->IsDoubleRegister() && destination->IsDoubleStackSlot()) ||
@@ -307,6 +345,22 @@ void LGapResolver::EmitSwap(int index) {
     __ movsd(other_operand, reg);
     __ movaps(reg, xmm0);
 
+  } else if (source->IsSIMD128Register() || destination->IsSIMD128Register()) {
+    // Swap an XMM register and an XMM stack slot.
+    ASSERT((source->IsSIMD128Register() &&
+            destination->IsSIMD128StackSlot()) ||
+           (source->IsSIMD128StackSlot() &&
+            destination->IsSIMD128Register()));
+    XMMRegister reg = cgen_->ToSIMD128Register(
+        source->IsSIMD128Register() ? source : destination);
+    LOperand* other = source->IsSIMD128Register() ? destination : source;
+    ASSERT(other->IsSIMD128StackSlot());
+    Operand other_operand = cgen_->ToOperand(other);
+    __ movups(xmm0, other_operand);
+    __ movups(other_operand, reg);
+    __ movaps(reg, xmm0);
+
   } else {
     // No other combinations are possible.
     UNREACHABLE();
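Two details of the resolver code are easy to miss: register-to-register moves use movaps, which has no alignment constraint between registers, while every access to a spill slot uses movups, presumably because Lithium stack slots are only guaranteed pointer alignment; the slot-to-slot swap additionally parks one operand in xmm0 while shuttling the other through kScratchRegister eight bytes at a time. An intrinsics sketch of the unaligned-access rule (x86 only; the helper names are mine):

    #include <xmmintrin.h>

    // Spill slots are only 8-byte aligned, so 128-bit slot traffic must use
    // the unaligned forms; an aligned movaps could fault on such a slot.
    // _mm_loadu_ps/_mm_storeu_ps compile to exactly the movups used above.
    __m128 LoadSpillSlot(const float* slot) {
      return _mm_loadu_ps(slot);     // movups load
    }

    void StoreSpillSlot(float* slot, __m128 value) {
      _mm_storeu_ps(slot, value);    // movups store
    }

    int main() {
      float buffer[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      // buffer + 2 is 8-byte aligned at best: fine for movups, not movaps.
      __m128 v = LoadSpillSlot(buffer + 2);
      StoreSpillSlot(buffer + 4, v);
      return buffer[4] == 3.0f ? 0 : 1;
    }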
diff --git a/src/x64/lithium-x64.cc b/src/x64/lithium-x64.cc
index a0e94d1738e..ba5860c6379 100644
--- a/src/x64/lithium-x64.cc
+++ b/src/x64/lithium-x64.cc
@@ -344,6 +344,19 @@ void LAccessArgumentsAt::PrintDataTo(StringStream* stream) {
 
 
 int LPlatformChunk::GetNextSpillIndex(RegisterKind kind) {
+  switch (kind) {
+    case GENERAL_REGISTERS: return spill_slot_count_++;
+    case DOUBLE_REGISTERS: return spill_slot_count_++;
+    case FLOAT32x4_REGISTERS:
+    case INT32x4_REGISTERS: {
+      // 128-bit values take a pair of 64-bit spill slots.
+      spill_slot_count_++;
+      return spill_slot_count_++;
+    }
+    default:
+      UNREACHABLE();
+      return -1;
+  }
   return spill_slot_count_++;
 }
 
 
@@ -353,11 +366,14 @@ LOperand* LPlatformChunk::GetNextSpillSlot(RegisterKind kind) {
   // Alternatively, at some point, start using half-size
   // stack slots for int32 values.
   int index = GetNextSpillIndex(kind);
-  if (kind == DOUBLE_REGISTERS) {
-    return LDoubleStackSlot::Create(index, zone());
-  } else {
-    ASSERT(kind == GENERAL_REGISTERS);
-    return LStackSlot::Create(index, zone());
+  switch (kind) {
+    case GENERAL_REGISTERS: return LStackSlot::Create(index, zone());
+    case DOUBLE_REGISTERS: return LDoubleStackSlot::Create(index, zone());
+    case FLOAT32x4_REGISTERS:
+      return LFloat32x4StackSlot::Create(index, zone());
+    case INT32x4_REGISTERS: return LInt32x4StackSlot::Create(index, zone());
+    default:
+      UNREACHABLE();
+      return NULL;
   }
 }
 
@@ -1726,6 +1742,10 @@ LInstruction* LChunkBuilder::DoChange(HChange* instr) {
       LOperand* value = UseRegister(instr->value());
       LNumberUntagD* res = new(zone()) LNumberUntagD(value);
       return AssignEnvironment(DefineAsRegister(res));
+    } else if (to.IsSIMD128()) {
+      LOperand* value = UseRegister(instr->value());
+      LTaggedToSIMD128* res = new(zone()) LTaggedToSIMD128(value, to);
+      return AssignEnvironment(DefineAsRegister(res));
     } else if (to.IsSmi()) {
       HValue* val = instr->value();
       LOperand* value = UseRegister(val);
@@ -1809,6 +1829,16 @@ LInstruction* LChunkBuilder::DoChange(HChange* instr) {
         return DefineAsRegister(new(zone()) LInteger32ToDouble(value));
       }
     }
+  } else if (from.IsSIMD128()) {
+    ASSERT(to.IsTagged());
+    info()->MarkAsDeferredCalling();
+    LOperand* value = UseRegister(instr->value());
+    LOperand* temp = TempRegister();
+
+    // Make sure that temp and result_temp are different registers.
+    LUnallocated* result_temp = TempRegister();
+    LSIMD128ToTagged* result = new(zone()) LSIMD128ToTagged(value, temp);
+    return AssignPointerMap(Define(result, result_temp));
   }
   UNREACHABLE();
   return NULL;
@@ -2007,25 +2037,21 @@ LInstruction* LChunkBuilder::DoLoadKeyed(HLoadKeyed* instr) {
       : UseRegisterOrConstantAtStart(instr->key());
   LLoadKeyed* result = NULL;
 
-  bool load_128bits_without_sse2 = IsSIMD128ElementsKind(elements_kind);
   if (!instr->is_typed_elements()) {
     LOperand* obj = UseRegisterAtStart(instr->elements());
-    result = new(zone()) LLoadKeyed(obj, key, NULL);
+    result = new(zone()) LLoadKeyed(obj, key);
   } else {
     ASSERT(
         (instr->representation().IsInteger32() &&
         !(IsDoubleOrFloatElementsKind(instr->elements_kind()))) ||
         (instr->representation().IsDouble() &&
         (IsDoubleOrFloatElementsKind(instr->elements_kind()))) ||
-        (instr->representation().IsTagged() &&
-        (IsSIMD128ElementsKind(instr->elements_kind()))));
+        (instr->representation().IsFloat32x4() &&
+         IsFloat32x4ElementsKind(instr->elements_kind())) ||
+        (instr->representation().IsInt32x4() &&
+         IsInt32x4ElementsKind(instr->elements_kind())));
     LOperand* backing_store = UseRegister(instr->elements());
-    result = new(zone()) LLoadKeyed(backing_store, key,
-        load_128bits_without_sse2 ? TempRegister() : NULL);
-    if (load_128bits_without_sse2) {
-      info()->MarkAsDeferredCalling();
-      AssignPointerMap(result);
-    }
+    result = new(zone()) LLoadKeyed(backing_store, key);
   }
   DefineAsRegister(result);
@@ -2086,8 +2112,10 @@ LInstruction* LChunkBuilder::DoStoreKeyed(HStoreKeyed* instr) {
          !IsDoubleOrFloatElementsKind(elements_kind)) ||
         (instr->value()->representation().IsDouble() &&
          IsDoubleOrFloatElementsKind(elements_kind)) ||
-        (instr->value()->representation().IsTagged() &&
-         IsSIMD128ElementsKind(elements_kind)));
+        (instr->value()->representation().IsFloat32x4() &&
+         IsFloat32x4ElementsKind(elements_kind)) ||
+        (instr->value()->representation().IsInt32x4() &&
+         IsInt32x4ElementsKind(elements_kind)));
     ASSERT((instr->is_fixed_typed_array() &&
             instr->elements()->representation().IsTagged()) ||
            (instr->is_external() &&
@@ -2103,9 +2131,7 @@ LInstruction* LChunkBuilder::DoStoreKeyed(HStoreKeyed* instr) {
       ? UseTempRegisterOrConstant(instr->key())
       : UseRegisterOrConstantAtStart(instr->key());
   LOperand* backing_store = UseRegister(instr->elements());
-  LStoreKeyed* result = new(zone()) LStoreKeyed(backing_store, key, val);
-  bool store_128bits_without_sse2 = IsSIMD128ElementsKind(elements_kind);
-  return store_128bits_without_sse2 ? AssignEnvironment(result) : result;
+  return new(zone()) LStoreKeyed(backing_store, key, val);
 }
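GetNextSpillIndex consumes two consecutive 64-bit slot indices for a 128-bit value and returns the second, so the pair [index - 1, index] spans the full 16 bytes that LFloat32x4StackSlot/LInt32x4StackSlot address as one slot. A self-contained model of that arithmetic (the allocator class is my sketch, not v8's):

    #include <cassert>

    // Minimal model of LPlatformChunk's spill-index arithmetic.
    class SpillAllocatorSketch {
     public:
      int NextDouble() { return count_++; }   // one 8-byte slot
      int NextSIMD128() {                     // two adjacent 8-byte slots
        count_++;                             // burn one slot...
        return count_++;                      // ...and return the second
      }
     private:
      int count_ = 0;
    };

    int main() {
      SpillAllocatorSketch alloc;
      assert(alloc.NextDouble() == 0);
      int simd = alloc.NextSIMD128();         // consumes indices 1 and 2
      assert(simd == 2);
      assert(alloc.NextDouble() == 3);        // allocation resumes after pair
      return 0;
    }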
diff --git a/src/x64/lithium-x64.h b/src/x64/lithium-x64.h
index e628243e6fd..c084d06322d 100644
--- a/src/x64/lithium-x64.h
+++ b/src/x64/lithium-x64.h
@@ -139,9 +139,11 @@ class LCodeGen;
   V(ModI)                                      \
   V(MulI)                                      \
   V(NumberTagD)                                \
+  V(SIMD128ToTagged)                           \
   V(NumberTagI)                                \
   V(NumberTagU)                                \
   V(NumberUntagD)                              \
+  V(TaggedToSIMD128)                           \
   V(OsrEntry)                                  \
   V(OuterContext)                              \
   V(Parameter)                                 \
@@ -1526,12 +1528,11 @@ class LLoadExternalArrayPointer V8_FINAL
 
 
-class LLoadKeyed V8_FINAL : public LTemplateInstruction<1, 2, 1> {
+class LLoadKeyed V8_FINAL : public LTemplateInstruction<1, 2, 0> {
  public:
-  LLoadKeyed(LOperand* elements, LOperand* key, LOperand* temp) {
+  LLoadKeyed(LOperand* elements, LOperand* key) {
     inputs_[0] = elements;
     inputs_[1] = key;
-    temps_[0] = temp;
   }
 
   DECLARE_CONCRETE_INSTRUCTION(LoadKeyed, "load-keyed")
@@ -1548,7 +1549,6 @@ class LLoadKeyed V8_FINAL : public LTemplateInstruction<1, 2, 1> {
   }
   LOperand* elements() { return inputs_[0]; }
   LOperand* key() { return inputs_[1]; }
-  LOperand* temp() { return temps_[0]; }
   virtual void PrintDataTo(StringStream* stream) V8_OVERRIDE;
   uint32_t additional_index() const { return hydrogen()->index_offset(); }
   ElementsKind elements_kind() const {
@@ -2011,6 +2011,21 @@ class LNumberTagD V8_FINAL : public LTemplateInstruction<1, 1, 1> {
 };
 
 
+class LSIMD128ToTagged V8_FINAL : public LTemplateInstruction<1, 1, 1> {
+ public:
+  LSIMD128ToTagged(LOperand* value, LOperand* temp) {
+    inputs_[0] = value;
+    temps_[0] = temp;
+  }
+
+  LOperand* value() { return inputs_[0]; }
+  LOperand* temp() { return temps_[0]; }
+
+  DECLARE_CONCRETE_INSTRUCTION(SIMD128ToTagged, "simd128-tag")
+  DECLARE_HYDROGEN_ACCESSOR(Change)
+};
+
+
 // Sometimes truncating conversion from a tagged value to an int32.
 class LDoubleToI V8_FINAL : public LTemplateInstruction<1, 1, 0> {
  public:
@@ -2083,6 +2098,23 @@ class LNumberUntagD V8_FINAL : public LTemplateInstruction<1, 1, 0> {
 };
 
 
+class LTaggedToSIMD128 V8_FINAL : public LTemplateInstruction<1, 1, 0> {
+ public:
+  LTaggedToSIMD128(LOperand* value, Representation representation)
+      : representation_(representation) {
+    inputs_[0] = value;
+  }
+
+  LOperand* value() { return inputs_[0]; }
+  Representation representation() const { return representation_; }
+
+  DECLARE_CONCRETE_INSTRUCTION(TaggedToSIMD128, "simd128-untag")
+  DECLARE_HYDROGEN_ACCESSOR(Change)
+
+ private:
+  Representation representation_;
+};
+
+
 class LSmiUntag V8_FINAL : public LTemplateInstruction<1, 1, 0> {
  public:
   LSmiUntag(LOperand* value, bool needs_check)
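The LTemplateInstruction parameters read as <results, inputs, temps>: LSIMD128ToTagged is <1, 1, 1> because tagging needs a scratch GPR for AllocateSIMDHeapObject, while LTaggedToSIMD128 is <1, 1, 0> since the map check and movups need no scratch. A stripped-down model of that convention (my reduction, not v8's template):

    // Sketch of the <R results, I inputs, T temps> convention behind
    // LTemplateInstruction; LOperandSketch stands in as an opaque type.
    struct LOperandSketch {};

    template <int R, int I, int T>
    struct LTemplateInstructionSketch {
      // Minimum array size of 1 keeps T == 0 legal ISO C++.
      LOperandSketch* results_[R == 0 ? 1 : R];
      LOperandSketch* inputs_[I == 0 ? 1 : I];
      LOperandSketch* temps_[T == 0 ? 1 : T];
    };

    // simd128-tag: one result, one input, one allocation scratch register.
    typedef LTemplateInstructionSketch<1, 1, 1> SIMD128ToTaggedShape;
    // simd128-untag: one result, one input, no temps.
    typedef LTemplateInstructionSketch<1, 1, 0> TaggedToSIMD128Shape;

    int main() {
      SIMD128ToTaggedShape tag_shape = SIMD128ToTaggedShape();
      TaggedToSIMD128Shape untag_shape = TaggedToSIMD128Shape();
      (void)tag_shape;
      (void)untag_shape;
      return 0;
    }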
diff --git a/src/x64/macro-assembler-x64.cc b/src/x64/macro-assembler-x64.cc
index a11a6a5cb3f..66ef3f027d7 100644
--- a/src/x64/macro-assembler-x64.cc
+++ b/src/x64/macro-assembler-x64.cc
@@ -898,10 +898,10 @@ void MacroAssembler::PushCallerSaved(SaveFPRegsMode fp_mode,
   }
   // R12 to r15 are callee save on all platforms.
   if (fp_mode == kSaveFPRegs) {
-    subq(rsp, Immediate(kDoubleSize * XMMRegister::kMaxNumRegisters));
+    subq(rsp, Immediate(kSIMD128Size * XMMRegister::kMaxNumRegisters));
     for (int i = 0; i < XMMRegister::kMaxNumRegisters; i++) {
       XMMRegister reg = XMMRegister::from_code(i);
-      movsd(Operand(rsp, i * kDoubleSize), reg);
+      movups(Operand(rsp, i * kSIMD128Size), reg);
     }
   }
 }
@@ -914,9 +914,9 @@ void MacroAssembler::PopCallerSaved(SaveFPRegsMode fp_mode,
   if (fp_mode == kSaveFPRegs) {
     for (int i = 0; i < XMMRegister::kMaxNumRegisters; i++) {
       XMMRegister reg = XMMRegister::from_code(i);
-      movsd(reg, Operand(rsp, i * kDoubleSize));
+      movups(reg, Operand(rsp, i * kSIMD128Size));
     }
-    addq(rsp, Immediate(kDoubleSize * XMMRegister::kMaxNumRegisters));
+    addq(rsp, Immediate(kSIMD128Size * XMMRegister::kMaxNumRegisters));
   }
   for (int i = kNumberOfSavedRegs - 1; i >= 0; i--) {
     Register reg = saved_regs[i];
@@ -3715,13 +3715,13 @@ void MacroAssembler::EnterExitFrameEpilogue(int arg_stack_space,
 #endif
   // Optionally save all XMM registers.
   if (save_doubles) {
-    int space = XMMRegister::kMaxNumAllocatableRegisters * kDoubleSize +
+    int space = XMMRegister::kMaxNumAllocatableRegisters * kSIMD128Size +
         arg_stack_space * kRegisterSize;
     subq(rsp, Immediate(space));
     int offset = -2 * kPointerSize;
     for (int i = 0; i < XMMRegister::NumAllocatableRegisters(); i++) {
       XMMRegister reg = XMMRegister::FromAllocationIndex(i);
-      movsd(Operand(rbp, offset - ((i + 1) * kDoubleSize)), reg);
+      movups(Operand(rbp, offset - ((i + 1) * kSIMD128Size)), reg);
     }
   } else if (arg_stack_space > 0) {
     subq(rsp, Immediate(arg_stack_space * kRegisterSize));
@@ -3765,7 +3765,7 @@ void MacroAssembler::LeaveExitFrame(bool save_doubles) {
     int offset = -2 * kPointerSize;
     for (int i = 0; i < XMMRegister::NumAllocatableRegisters(); i++) {
       XMMRegister reg = XMMRegister::FromAllocationIndex(i);
-      movsd(reg, Operand(rbp, offset - ((i + 1) * kDoubleSize)));
+      movups(reg, Operand(rbp, offset - ((i + 1) * kSIMD128Size)));
     }
   }
   // Get the return address from the stack and restore the frame pointer.
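The PushCallerSaved/PopCallerSaved change widens each register's save slot from kDoubleSize to kSIMD128Size, doubling the reserved block from 128 to 256 bytes for xmm0 through xmm15. The arithmetic, checked in a small sketch (the constant values are v8's actual sizes; the helper function is mine):

    #include <cassert>

    const int kDoubleSize = 8;
    const int kSIMD128Size = 16;      // == 2 * kDoubleSize, per STATIC_ASSERT
    const int kMaxNumRegisters = 16;  // xmm0..xmm15 on x64

    // Bytes reserved below rsp for the caller-saved XMM block.
    int SaveAreaBytes(bool full_simd) {
      return kMaxNumRegisters * (full_simd ? kSIMD128Size : kDoubleSize);
    }

    int main() {
      assert(SaveAreaBytes(false) == 128);  // old: 16 low double halves
      assert(SaveAreaBytes(true) == 256);   // new: 16 full XMM registers
      // After the subq, register i lives at [rsp + i * kSIMD128Size].
      return 0;
    }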
diff --git a/src/x64/macro-assembler-x64.h b/src/x64/macro-assembler-x64.h
index eafeba6cc51..2fb86216d85 100644
--- a/src/x64/macro-assembler-x64.h
+++ b/src/x64/macro-assembler-x64.h
@@ -1146,6 +1146,10 @@ class MacroAssembler: public Assembler {
   void AllocateHeapNumber(Register result,
                           Register scratch,
                           Label* gc_required);
+
+  // Allocate a float32x4 or int32x4 object in new space with undefined value.
+  // Returns tagged pointer in result register, or jumps to gc_required if new
+  // space is full.
   void AllocateSIMDHeapObject(int size,
                               Register result,
                               Register scratch,