From 0240696d13964adc1e4db0dcc39316ad4caef38d Mon Sep 17 00:00:00 2001 From: gcolvin Date: Thu, 6 Jul 2017 22:33:48 -0600 Subject: [PATCH 1/6] new branch for PR, attempted vcc fix, other cleanup --- libevm/CMakeLists.txt | 1 + libevm/VM.cpp | 563 ++++++++++++++++---- libevm/VM.h | 71 ++- libevm/VMConfig.h | 153 +++--- libevm/VMOpt.cpp | 2 +- libevm/VMSIMD.cpp | 729 ++++++++++++++++++++++++++ libevm/VMValidate.cpp | 2 +- libevmcore/Instruction.cpp | 39 +- libevmcore/Instruction.h | 36 +- test/tools/fuzzTesting/fuzzHelper.cpp | 37 +- 10 files changed, 1454 insertions(+), 179 deletions(-) create mode 100755 libevm/VMSIMD.cpp mode change 100644 => 100755 test/tools/fuzzTesting/fuzzHelper.cpp diff --git a/libevm/CMakeLists.txt b/libevm/CMakeLists.txt index f91e83f9637..2dc0c6e28ed 100755 --- a/libevm/CMakeLists.txt +++ b/libevm/CMakeLists.txt @@ -4,6 +4,7 @@ set(SOURCES VM.cpp VMOpt.cpp VMCalls.cpp + VMSIMD.cpp VMValidate.cpp VMFactory.cpp ) diff --git a/libevm/VM.cpp b/libevm/VM.cpp index 401d3e11742..12793f710f7 100755 --- a/libevm/VM.cpp +++ b/libevm/VM.cpp @@ -115,6 +115,19 @@ void VM::adjustStack(unsigned _removed, unsigned _added) #endif } +void VM::updateSSGas() +{ + if (!m_ext->store(m_SP[0]) && m_SP[1]) + m_runGas = toInt63(m_schedule->sstoreSetGas); + else if (m_ext->store(m_SP[0]) && !m_SP[1]) + { + m_runGas = toInt63(m_schedule->sstoreResetGas); + m_ext->sub.refunds += m_schedule->sstoreRefundGas; + } + else + m_runGas = toInt63(m_schedule->sstoreResetGas); +} + uint64_t VM::gasForMem(u512 _size) { @@ -172,7 +185,6 @@ void VM::fetchInstruction() #define updateIOGas() #endif - /////////////////////////////////////////////////////////////////////////////// // // interpreter entry point @@ -643,7 +655,460 @@ void VM::interpretCases() number &= mask; } } + NEXT + +#if EIP_615 + CASE(JUMPTO) + { + ON_OP(); + updateIOGas(); + + m_PC = decodeJumpDest(m_code.data(), m_PC); + } + CONTINUE + + CASE(JUMPIF) + { + ON_OP(); + updateIOGas(); + + if (m_SP[0]) + m_PC = decodeJumpDest(m_code.data(), m_PC); + else + ++m_PC; + } + CONTINUE + + CASE(JUMPV) + { + ON_OP(); + updateIOGas(); + m_PC = decodeJumpvDest(m_code.data(), m_PC, byte(m_SP[0])); + } + CONTINUE + + CASE(JUMPSUB) + { + ON_OP(); + updateIOGas(); + *m_RP++ = m_PC++; + m_PC = decodeJumpDest(m_code.data(), m_PC); + } + } + CONTINUE + + CASE(JUMPSUBV) + { + ON_OP(); + updateIOGas(); + *m_RP++ = m_PC; + m_PC = decodeJumpvDest(m_code.data(), m_PC, byte(m_SP[0])); + } + CONTINUE + + CASE(RETURNSUB) + { + ON_OP(); + updateIOGas(); + + m_PC = *m_RP--; + } + NEXT + + CASE(BEGINSUB) + { + ON_OP(); + updateIOGas(); + } NEXT + + + CASE(BEGINDATA) + { + ON_OP(); + updateIOGas(); + } + NEXT + + CASE(GETLOCAL) + { + ON_OP(); + updateIOGas(); + } + NEXT + + CASE(PUTLOCAL) + { + ON_OP(); + updateIOGas(); + } + NEXT + +#else + CASE(JUMPTO) + CASE(JUMPIF) + CASE(JUMPV) + CASE(JUMPSUB) + CASE(JUMPSUBV) + CASE(RETURNSUB) + CASE(BEGINSUB) + CASE(BEGINDATA) + CASE(GETLOCAL) + CASE(PUTLOCAL) + { + throwBadInstruction(); + } + CONTINUE +#endif + +#if EIP_616 + + CASE(XADD) + { + ON_OP(); + updateIOGas(); + + xadd(simdType()); + } + CONTINUE + + CASE(XMUL) + { + ON_OP(); + updateIOGas(); + + xmul(simdType()); + } + CONTINUE + + CASE(XSUB) + { + ON_OP(); + updateIOGas(); + + xsub(simdType()); + } + CONTINUE + + CASE(XDIV) + { + ON_OP(); + updateIOGas(); + + xdiv(simdType()); + } + CONTINUE + + CASE(XSDIV) + { + ON_OP(); + updateIOGas(); + + xsdiv(simdType()); + } + CONTINUE + + CASE(XMOD) + { + ON_OP(); + updateIOGas(); + + xmod(simdType()); + } + CONTINUE + + 
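// Review note (sketch, not part of the patch): every X* case in this EIP_616 block has the
// same shape — charge gas via ON_OP()/updateIOGas(), then pass the inline type-descriptor
// byte that follows the opcode (read by simdType(), which also steps m_PC past it) to the
// matching handler in VMSIMD.cpp. Assuming the nElem()/elemType() helpers added later in
// this patch are the intended encoding, the descriptor decodes roughly as:
//
//     uint8_t laneCount = uint8_t(1u << (type & 0xf));   // low nibble: log2 of lane count
//     uint8_t laneBits  = uint8_t(8u << (type >> 4));    // high nibble: 0..3 -> 8/16/32/64
//
//     // e.g. a hypothetical descriptor 0x32 would describe 4 lanes of 64-bit elements,
//     // i.e. one full 256-bit stack item
//
// Note that the EVALXOP macro in VMSIMD.cpp keys the element width off the low nibble
// instead, so the two conventions appear to disagree.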
CASE(XSMOD) + { + ON_OP(); + updateIOGas(); + + xsmod(simdType()); + } + CONTINUE + + CASE(XLT) + { + ON_OP(); + updateIOGas(); + + xlt(simdType()); + } + CONTINUE + + CASE(XGT) + { + ON_OP(); + updateIOGas(); + + xgt(simdType()); + } + CONTINUE + + CASE(XSLT) + { + ON_OP(); + updateIOGas(); + + xslt(simdType()); + } + CONTINUE + + CASE(XSGT) + { + ON_OP(); + updateIOGas(); + + xsgt(simdType()); + } + CONTINUE + + CASE(XEQ) + { + ON_OP(); + updateIOGas(); + + xeq(simdType()); + } + CONTINUE + + CASE(XISZERO) + { + ON_OP(); + updateIOGas(); + + xzero(simdType()); + } + CONTINUE + + CASE(XAND) + { + ON_OP(); + updateIOGas(); + + xand(simdType()); + } + CONTINUE + + CASE(XOOR) + { + ON_OP(); + updateIOGas(); + + xoor(simdType()); + } + CONTINUE + + CASE(XXOR) + { + ON_OP(); + updateIOGas(); + + xxor(simdType()); + } + CONTINUE + + CASE(XNOT) + { + ON_OP(); + updateIOGas(); + + xnot(simdType()); + } + CONTINUE + + CASE(XSHL) + { + ON_OP(); + updateIOGas(); + + xshl(simdType()); + } + CONTINUE + + CASE(XSHR) + { + ON_OP(); + updateIOGas(); + + xshr(simdType()); + } + CONTINUE + + CASE(XSAR) + { + ON_OP(); + updateIOGas(); + + xsar(simdType()); + } + CONTINUE + + CASE(XROL) + { + ON_OP(); + updateIOGas(); + + xrol(simdType()); + } + CONTINUE + + CASE(XROR) + { + ON_OP(); + updateIOGas(); + + xror(simdType()); + } + CONTINUE + + CASE(XMLOAD) + { + updateMem(toInt63(m_SP[0]) + 32); + ON_OP(); + updateIOGas(); + + xmload(simdType()); + } + CONTINUE + + CASE(XMSTORE) + { + updateMem(toInt63(m_SP[0]) + 32); + ON_OP(); + updateIOGas(); + + xmstore(simdType()); + } + CONTINUE + + CASE(XSLOAD) + { + m_runGas = toInt63(m_schedule->sloadGas); + ON_OP(); + updateIOGas(); + + xsload(simdType()); + } + CONTINUE + + CASE(XSSTORE) + { + if (m_ext->staticCall) + throwDisallowedStateChange(); + + updateSSGas(); + ON_OP(); + updateIOGas(); + + xsstore(simdType()); + } + CONTINUE + + CASE(XVTOWIDE) + { + ON_OP(); + updateIOGas(); + + xvtowide(simdType()); + } + CONTINUE + + CASE(XWIDETOV) + { + ON_OP(); + updateIOGas(); + + xwidetov(simdType()); + } + CONTINUE + + CASE(XPUSH) + { + ON_OP(); + updateIOGas(); + + xpush(simdType()); + } + CONTINUE + + CASE(XPUT) + { + ON_OP(); + updateIOGas(); + + uint8_t b = ++m_PC; + uint8_t c = ++m_PC; + xput(m_code[b], m_code[c]); ++m_PC; + } + CONTINUE + + CASE(XGET) + { + ON_OP(); + updateIOGas(); + + uint8_t b = ++m_PC; + uint8_t c = ++m_PC; + xget(m_code[b], m_code[c]); ++m_PC; + } + CONTINUE + + CASE(XSWIZZLE) + { + ON_OP(); + updateIOGas(); + + xswizzle(simdType()); + } + CONTINUE + + CASE(XSHUFFLE) + { + ON_OP(); + updateIOGas(); + + xshuffle(simdType()); + } + CONTINUE +#else + CASE(XADD) + CASE(XMUL) + CASE(XSUB) + CASE(XDIV) + CASE(XSDIV) + CASE(XMOD) + CASE(XSMOD) + CASE(XLT) + CASE(XGT) + CASE(XSLT) + CASE(XSGT) + CASE(XEQ) + CASE(XISZERO) + CASE(XAND) + CASE(XOOR) + CASE(XXOR) + CASE(XNOT) + CASE(XSHL) + CASE(XSHR) + CASE(XSAR) + CASE(XROL) + CASE(XROR) + CASE(XMLOAD) + CASE(XMSTORE) + CASE(XSLOAD) + CASE(XSSTORE) + CASE(XVTOWIDE) + CASE(XWIDETOV) + CASE(XPUSH) + CASE(XPUT) + CASE(XGET) + CASE(XSWIZZLE) + CASE(XSHUFFLE) + { + throwBadInstruction(); + } + CONTINUE +#endif CASE(ADDRESS) { @@ -973,76 +1438,6 @@ void VM::interpretCases() } CONTINUE -#if EVM_JUMPS_AND_SUBS - CASE(JUMPTO) - { - ON_OP(); - updateIOGas(); - - m_PC = decodeJumpDest(m_code.data(), m_PC); - } - CONTINUE - - CASE(JUMPIF) - { - ON_OP(); - updateIOGas(); - - if (m_SP[0]) - m_PC = decodeJumpDest(m_code.data(), m_PC); - else - ++m_PC; - } - CONTINUE - - CASE(JUMPV) - { - ON_OP(); - updateIOGas(); - 
m_PC = decodeJumpvDest(m_code.data(), m_PC, byte(m_SP[0])); - } - CONTINUE - - CASE(JUMPSUB) - { - ON_OP(); - updateIOGas(); - *m_RP++ = m_PC++; - m_PC = decodeJumpDest(m_code.data(), m_PC); - } - } - CONTINUE - - CASE(JUMPSUBV) - { - ON_OP(); - updateIOGas(); - *m_RP++ = m_PC; - m_PC = decodeJumpvDest(m_code.data(), m_PC, byte(m_SP[0])); - } - CONTINUE - - CASE(RETURNSUB) - { - ON_OP(); - updateIOGas(); - - m_PC = *m_RP--; - } - NEXT -#else - CASE(JUMPTO) - CASE(JUMPIF) - CASE(JUMPV) - CASE(JUMPSUB) - CASE(JUMPSUBV) - CASE(RETURNSUB) - { - throwBadInstruction(); - } - CONTINUE -#endif - CASE(JUMPC) { #if EVM_REPLACE_CONST_JUMP @@ -1096,7 +1491,7 @@ void VM::interpretCases() #if EVM_HACK_DUP_64 *(uint64_t*)m_SPP = *(uint64_t*)(m_SP + n); #else - m_SPP[0] = m_SP[n]; + new(m_SPP) u256(m_SP[n]); #endif } NEXT @@ -1142,16 +1537,8 @@ void VM::interpretCases() { if (m_ext->staticCall) throwDisallowedStateChange(); - - if (!m_ext->store(m_SP[0]) && m_SP[1]) - m_runGas = toInt63(m_schedule->sstoreSetGas); - else if (m_ext->store(m_SP[0]) && !m_SP[1]) - { - m_runGas = toInt63(m_schedule->sstoreResetGas); - m_ext->sub.refunds += m_schedule->sstoreRefundGas; - } - else - m_runGas = toInt63(m_schedule->sstoreResetGas); + + updateSSGas(); ON_OP(); updateIOGas(); @@ -1194,18 +1581,6 @@ void VM::interpretCases() } NEXT -#if EVM_JUMPS_AND_SUBS - CASE(BEGINSUB) - { - m_runGas = 1; - ON_OP(); - updateIOGas(); - } - NEXT -#else - CASE(BEGINSUB) -#endif - CASE(BEGINDATA) CASE(INVALID) DEFAULT { diff --git a/libevm/VM.h b/libevm/VM.h index 61046c44b11..e4b858da45f 100755 --- a/libevm/VM.h +++ b/libevm/VM.h @@ -59,7 +59,7 @@ class VM: public VMFace public: virtual owning_bytes_ref exec(u256& _io_gas, ExtVMFace& _ext, OnOpFunc const& _onOp) override final; -#if EVM_JUMPS_AND_SUBS +#if EIP_615 // invalid code will throw an exeption void validate(ExtVMFace& _ext); void validateSubroutine(uint64_t _PC, uint64_t* _rp, u256* _sp); @@ -108,7 +108,7 @@ class VM: public VMFace u256 *m_stackEnd = &m_stack[1024]; size_t stackSize() { return m_stackEnd - m_SP; } -#if EVM_JUMPS_AND_SUBS +#if EIP_615 // space for return stack uint64_t m_return[1024]; @@ -124,7 +124,7 @@ class VM: public VMFace uint64_t m_PC = 0; // program counter u256* m_SP = m_stackEnd; // stack pointer u256* m_SPP = m_SP; // stack pointer prime (next SP) -#if EVM_JUMPS_AND_SUBS +#if EIP_615 uint64_t* m_RP = m_return - 1; // return pointer #endif @@ -167,6 +167,7 @@ class VM: public VMFace void onOperation(); void adjustStack(unsigned _removed, unsigned _added); uint64_t gasForMem(u512 _size); + void updateSSGas(); void updateIOGas(); void updateGas(); void updateMem(uint64_t _newMem); @@ -184,7 +185,69 @@ class VM: public VMFace uint64_t w = uint64_t(v); return w; } - }; + + template uint64_t toInt15(T v) + { + // check for overflow + if (v > 0x7FFF) + throwOutOfGas(); + uint64_t w = uint64_t(v); + return w; + } + + // + // implementations of simd opcodes + // + // input bytes are the inline simd type descriptors for the operand vectors on the stack + // +#if EIP_616 + + void xadd (uint8_t); + void xmul (uint8_t); + void xsub (uint8_t); + void xdiv (uint8_t); + void xsdiv (uint8_t); + void xmod (uint8_t); + void xsmod (uint8_t); + void xlt (uint8_t); + void xslt (uint8_t); + void xgt (uint8_t); + void xsgt (uint8_t); + void xeq (uint8_t); + void xzero (uint8_t); + void xand (uint8_t); + void xoor (uint8_t); + void xxor (uint8_t); + void xnot (uint8_t); + void xshr (uint8_t); + void xsar (uint8_t); + void xshl (uint8_t); + void xrol (uint8_t); + void xror 
(uint8_t); + void xmload (uint8_t); + void xmstore (uint8_t); + void xsload (uint8_t); + void xsstore (uint8_t); + void xvtowide(uint8_t); + void xwidetov(uint8_t); + void xpush (uint8_t); + void xput (uint8_t, uint8_t); + void xget (uint8_t, uint8_t); + void xswizzle(uint8_t); + void xshuffle(uint8_t); + + u256 vtow(uint8_t _b, const u256& _in); + void wtov(uint8_t _b, u256 _in, u256& _o_out); + + uint8_t simdType() + { + uint8_t nt = m_code[++m_PC]; // advance PC and get simd type from code + ++m_PC; // advance PC to next opcode, ready to continue + return nt; + } + +#endif +}; } } diff --git a/libevm/VMConfig.h b/libevm/VMConfig.h index cb0760982a7..854c840502d 100755 --- a/libevm/VMConfig.h +++ b/libevm/VMConfig.h @@ -22,17 +22,28 @@ namespace eth /////////////////////////////////////////////////////////////////////////////// // -// interpreter configuration macros for optimizations and tracing +// interpreter configuration macros for development, optimizations and tracing +// +// EIP_615 - subroutines and static jumps +// EIP_616 - SIMD // // EVM_SWITCH_DISPATCH - dispatch via loop and switch // EVM_JUMP_DISPATCH - dispatch via a jump table - available only on GCC // -// EVM_USE_CONSTANT_POOL - 256 constants unpacked and ready to assign to stack +// EVM_USE_CONSTANT_POOL - constants unpacked and ready to assign to stack // -// EVM_REPLACE_CONST_JUMP - with pre-verified jumps to save runtime lookup +// EVM_REPLACE_CONST_JUMP - pre-verified jumps to save runtime lookup // // EVM_TRACE - provides various levels of tracing +#ifndef EIP_615 + #define EIP_615 false +#endif + +#ifndef EIP_616 + #define EIP_616 false +#endif + #ifndef EVM_JUMP_DISPATCH #ifdef __GNUC__ #define EVM_JUMP_DISPATCH true @@ -42,10 +53,10 @@ namespace eth #endif #if EVM_JUMP_DISPATCH #ifndef __GNUC__ - #error "address of label extension avaiable only on Gnu" + #error "address of label extension available only on Gnu" #endif #else - #define EVM_SWITCH_DISPATCH + #define EVM_SWITCH_DISPATCH true #endif #ifndef EVM_OPTIMIZE @@ -60,8 +71,6 @@ namespace eth ) #endif -#define EVM_JUMPS_AND_SUBS false - /////////////////////////////////////////////////////////////////////////////// // @@ -125,7 +134,7 @@ namespace eth // // build a simple loop-and-switch interpreter // -#if defined(EVM_SWITCH_DISPATCH) +#if EVM_SWITCH_DISPATCH #define INIT_CASES if (!m_caseInit) { m_PC = 0; m_caseInit = true; return; } #define DO_CASES for(;;) { fetchInstruction(); switch(m_OP) { @@ -148,7 +157,7 @@ namespace eth \ static const void * const jumpTable[256] = \ { \ - &&STOP, /* 00 */ \ + &&STOP, /* 00 */ \ &&ADD, \ &&MUL, \ &&SUB, \ @@ -164,7 +173,7 @@ namespace eth &&INVALID, \ &&INVALID, \ &&INVALID, \ - &<, /* 10, */ \ + &<, /* 10, */ \ &>, \ &&SLT, \ &&SGT, \ @@ -180,7 +189,7 @@ namespace eth &&INVALID, \ &&INVALID, \ &&INVALID, \ - &&SHA3, /* 20, */ \ + &&SHA3, /* 20, */ \ &&INVALID, \ &&INVALID, \ &&INVALID, \ @@ -196,7 +205,7 @@ namespace eth &&INVALID, \ &&INVALID, \ &&INVALID, \ - &&ADDRESS, /* 30, */ \ + &&ADDRESS, /* 30, */ \ &&BALANCE, \ &&ORIGIN, \ &&CALLER, \ @@ -212,23 +221,23 @@ namespace eth &&RETURNDATASIZE, \ &&RETURNDATACOPY, \ &&INVALID, \ - &&BLOCKHASH, /* 40, */ \ + &&BLOCKHASH, /* 40, */ \ &&COINBASE, \ &&TIMESTAMP, \ &&NUMBER, \ &&DIFFICULTY, \ &&GASLIMIT, \ - &&JUMPTO, \ - &&JUMPIF, \ - &&JUMPV, \ - &&JUMPSUB, \ - &&JUMPSUBV, \ - &&RETURNSUB, \ - &&BEGINSUB, \ - &&BEGINDATA, \ &&INVALID, \ &&INVALID, \ - &&POP, /* 50, */ \ + &&INVALID, \ + &&INVALID, \ + &&INVALID, \ + &&INVALID, \ + &&INVALID, \ + &&INVALID, \ + 
&&INVALID, \ + &&INVALID, \ + &&POP, /* 50, */ \ &&MLOAD, \ &&MSTORE, \ &&MSTORE8, \ @@ -244,7 +253,7 @@ namespace eth &&BEGINSUB, \ &&INVALID, \ &&INVALID, \ - &&PUSH1, /* 60, */ \ + &&PUSH1, /* 60, */ \ &&PUSH2, \ &&PUSH3, \ &&PUSH4, \ @@ -260,7 +269,7 @@ namespace eth &&PUSH14, \ &&PUSH15, \ &&PUSH16, \ - &&PUSH17, /* 70, */ \ + &&PUSH17, /* 70, */ \ &&PUSH18, \ &&PUSH19, \ &&PUSH20, \ @@ -276,7 +285,7 @@ namespace eth &&PUSH30, \ &&PUSH31, \ &&PUSH32, \ - &&DUP1, /* 80, */ \ + &&DUP1, /* 80, */ \ &&DUP2, \ &&DUP3, \ &&DUP4, \ @@ -292,7 +301,7 @@ namespace eth &&DUP14, \ &&DUP15, \ &&DUP16, \ - &&SWAP1, /* 90, */ \ + &&SWAP1, /* 90, */ \ &&SWAP2, \ &&SWAP3, \ &&SWAP4, \ @@ -308,7 +317,7 @@ namespace eth &&SWAP14, \ &&SWAP15, \ &&SWAP16, \ - &&LOG0, /* A0, */ \ + &&LOG0, /* A0, */ \ &&LOG1, \ &&LOG2, \ &&LOG3, \ @@ -324,56 +333,30 @@ namespace eth &&JUMPC, \ &&JUMPCI, \ &&INVALID, \ - &&INVALID, /* B0, */ \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, /* C0, */ \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, /* D0, */ \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ - &&INVALID, \ + &&JUMPTO, /* B0, */ \ + &&JUMPIF, \ + &&JUMPSUB, \ + &&JUMPV, \ + &&JUMPSUBV, \ + &&BEGINSUB, \ + &&BEGINDATA, \ + &&RETURNSUB, \ + &&PUTLOCAL, \ + &&GETLOCAL, \ &&INVALID, \ &&INVALID, \ &&INVALID, \ &&INVALID, \ &&INVALID, \ - &&INVALID, /* E0, */ \ &&INVALID, \ + &&INVALID, /* C0, */ \ + &&XADD, \ + &&XMUL, \ + &&XSUB, \ + &&XDIV, \ + &&XSDIV, \ + &&XMOD, \ + &&XSMOD, \ &&INVALID, \ &&INVALID, \ &&INVALID, \ @@ -382,13 +365,39 @@ namespace eth &&INVALID, \ &&INVALID, \ &&INVALID, \ + &&XLT, /* D0 */ \ + &&XGT, \ + &&XSLT, \ + &&XSGT, \ + &&XEQ, \ + &&XISZERO, \ + &&XAND, \ + &&XOOR, \ + &&XXOR, \ + &&XNOT, \ &&INVALID, \ + &&XSHL, \ + &&XSHR, \ + &&XSAR, \ + &&XROL, \ + &&XROR, \ + &&XPUSH, /* E0, */ \ + &&XMLOAD, \ + &&XMSTORE, \ &&INVALID, \ + &&XSLOAD, \ + &&XSSTORE, \ + &&XVTOWIDE, \ + &&XWIDETOV, \ + &&XGET, \ + &&XPUT, \ + &&XSWIZZLE, \ + &&XSHUFFLE, \ &&INVALID, \ &&INVALID, \ &&INVALID, \ &&INVALID, \ - &&CREATE, /* F0, */ \ + &&CREATE, /* F0, */ \ &&CALL, \ &&CALLCODE, \ &&RETURN, \ diff --git a/libevm/VMOpt.cpp b/libevm/VMOpt.cpp index 1782a8654cb..aab516c37d2 100755 --- a/libevm/VMOpt.cpp +++ b/libevm/VMOpt.cpp @@ -97,7 +97,7 @@ void VM::optimize() { pc += (byte)op - (byte)Instruction::PUSH1 + 1; } -#if EVM_JUMPS_AND_SUBS +#if EIP_615 else if ( op == Instruction::JUMPTO || op == Instruction::JUMPIF || diff --git a/libevm/VMSIMD.cpp b/libevm/VMSIMD.cpp new file mode 100755 index 00000000000..1a9c26791e0 --- /dev/null +++ b/libevm/VMSIMD.cpp @@ -0,0 +1,729 @@ +/* + This file is part of cpp-ethereum. + + cpp-ethereum is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + cpp-ethereum is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with cpp-ethereum. If not, see . +*/ +#include +#include "VMConfig.h" +#include "VM.h" +#if EIP_616 + +namespace dev +{ +namespace eth +{ + +// conversion functions to overlay vectors on storage for u256 stack slots +// a dirty trick but it keeps the SIMD types from polluting the rest of the VM +// so at least assert there's room for the trick, and use wrappers for some safety +static_assert(sizeof(uint64_t[4]) <= sizeof(u256), "stack slot too narrow for SIMD"); +using a64x4 = uint64_t[4]; +using a32x8 = uint32_t[8]; +using a16x16 = uint16_t[16]; +using a8x32 = uint8_t [32]; +inline a64x4 & v64x4 (u256 & _stack_item) { return (a64x4&) *(a64x4*) &_stack_item; } +inline a32x8 & v32x8 (u256 & _stack_item) { return (a32x8&) *(a32x8*) &_stack_item; } +inline a16x16 & v16x16(u256 & _stack_item) { return (a16x16&)*(a16x16*)&_stack_item; } +inline a8x32 & v8x32 (u256 & _stack_item) { return (a8x32&) *(a8x32*) &_stack_item; } +inline a64x4 const& v64x4 (u256 const& _stack_item) { return (a64x4&) *(a64x4*) &_stack_item; } +inline a32x8 const& v32x8 (u256 const& _stack_item) { return (a32x8&) *(a32x8*) &_stack_item; } +inline a16x16 const& v16x16(u256 const& _stack_item) { return (a16x16&)*(a16x16*)&_stack_item; } +inline a8x32 const& v8x32 (u256 const& _stack_item) { return (a8x32&) *(a8x32*) &_stack_item; } + +// tried using template template functions, gave up fighting the compiler after a day +#define EVALXOPS(OP, _b) EVALXOP(OP, int8_t, int16_t, int32_t, int64_t, _b) +#define EVALXOPU(OP, _b) EVALXOP(OP, uint8_t, uint16_t, uint32_t, uint64_t, _b) +#define EVALXOP(OP, T8, T16, T32, T64, _b) \ +{ \ + uint8_t const t = (_b) & 0xf; \ + m_SPP[0] = 0; \ + switch (t) \ + { \ + case 0: \ + for (int i = 0; i < 32; ++i) \ + v8x32(m_SPP[0])[i] = (uint8_t) OP((T8) v8x32(m_SP[0])[i], (T8) v8x32(m_SP[1])[i]); \ + break; \ + case 1: \ + for (int i = 0; i < 16; ++i) \ + v16x16(m_SPP[0])[i] = (uint16_t)OP((T16)v16x16(m_SP[0])[i], (T16)v16x16(m_SP[1])[i]); \ + break; \ + case 2: \ + for (int i = 0; i < 8; ++i) \ + v32x8(m_SPP[0])[i] = (uint32_t)OP((T32)v32x8(m_SP[0])[i], (T32)v32x8(m_SP[1])[i]); \ + break; \ + case 3: \ + for (int i = 0; i < 4; ++i) \ + v64x4(m_SPP[0])[i] = (uint64_t)OP((T64)v64x4(m_SP[0])[i], (T64)v64x4(m_SP[1])[i]); \ + break; \ + default: throwBadInstruction(); \ + } \ +} +#define ADD( x1, x2) ((x1) + (x2)) +#define MUL( x1, x2) ((x1) * (x2)) +#define SUB( x1, x2) ((x1) - (x2)) +#define DIV( x1, x2) ((x1) / (x2)) +#define MOD( x1, x2) ((x1) % (x2)) +#define LT( x1, x2) ((x1) < (x2)) +#define GT( x1, x2) ((x1) > (x2)) +#define EQ( x1, x2) ((x1) == (x2)) +#define ZERO(x1, x2) ((x1) == 0) +#define AND( x1, x2) ((x1) & (x2)) +#define OR( x1, x2) ((x1) | (x2)) +#define XOR( x1, x2) ((x1) ^ (x2)) +#define NOT( x1, x2) (~x1) +#define SHR( x1, x2) ((x1) >> (x2)) +#define SHL( x1, x2) ((x1) << (x2)) +#define ROL( x1, x2) (((x1) << x2)|((x1) >> (sizeof(x1) * 8 - (x2)))) +#define ROR( x1, x2) (((x1) >> x2)|((x1) << (sizeof(x1) * 8 - (x2)))) + +void VM::xadd (uint8_t _b) { EVALXOPU(ADD, _b); } +void VM::xmul (uint8_t _b) { EVALXOPU(MUL, _b); } +void VM::xsub (uint8_t _b) { EVALXOPU(SUB, _b); } +void VM::xdiv (uint8_t _b) { EVALXOPU(DIV, _b); } +void VM::xsdiv(uint8_t _b) { EVALXOPS(DIV, _b); } +void VM::xmod (uint8_t _b) { EVALXOPU(MOD, _b); } +void VM::xsmod(uint8_t _b) { EVALXOPS(MOD, _b); } +void VM::xlt (uint8_t _b) { EVALXOPU(LT, _b); } +void VM::xslt (uint8_t _b) { 
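// Review note: EVALXOPU/EVALXOPS expand to a lane-wise loop over the two vectors on top of
// the stack — for the 8-bit case, roughly:
//
//     for (int i = 0; i < 32; ++i)
//         v8x32(m_SPP[0])[i] = uint8_t(OP(v8x32(m_SP[0])[i], v8x32(m_SP[1])[i]));
//
// The signed variants (EVALXOPS) differ only in casting each lane to the signed type of the
// same width before applying OP, which is what distinguishes e.g. xlt from xslt here.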
EVALXOPS(LT, _b); } +void VM::xgt (uint8_t _b) { EVALXOPU(GT, _b); } +void VM::xsgt (uint8_t _b) { EVALXOPS(GT, _b); } +void VM::xeq (uint8_t _b) { EVALXOPU(EQ, _b); } +void VM::xzero(uint8_t _b) { EVALXOPU(ZERO,_b); } +void VM::xand (uint8_t _b) { EVALXOPU(AND, _b); } +void VM::xoor (uint8_t _b) { EVALXOPU(OR, _b); } +void VM::xxor (uint8_t _b) { EVALXOPU(XOR, _b); } +void VM::xnot (uint8_t _b) { EVALXOPU(NOT, _b); } +void VM::xshr (uint8_t _b) { EVALXOPU(SHR, _b); } +void VM::xsar (uint8_t _b) { EVALXOPS(SHR, _b); } +void VM::xshl (uint8_t _b) { EVALXOPU(SHL, _b); } +void VM::xrol (uint8_t _b) { EVALXOPU(ROL, _b); } +void VM::xror (uint8_t _b) { EVALXOPU(ROR, _b); } + +inline uint8_t pow2N(uint8_t n) +{ + static uint8_t exp[6] = { 1, 2, 4, 8, 16, 32 }; + return exp[n]; +} + +inline uint8_t nElem(uint8_t _b) +{ + return pow2N((_b) & 0xf); +} + +inline uint8_t elemType(uint8_t _b) +{ + return (_b) >> 4; +} + +// in must be by reference because it is really just memory for a vector +u256 VM::vtow(uint8_t _b, const u256& _in) +{ + u256 out; + uint8_t const n = nElem(_b); + uint8_t const t = elemType(_b); + switch (t) + { + case 0: + for (int i = n-1; 0 <= i; --i) + { + out << 8; + out |= v8x32(_in) [i]; + } + break; + case 1: + for (int i = n-1; 0 <= i; --i) + { + out << 16; + out |= v16x16(_in)[i]; + } + break; + case 2: + for (int i = n-1; 0 <= i; --i) + { + out << 32; + out |= v32x8(_in) [i]; + } + break; + case 3: + for (int i = n-1; 0 <= i; --i) + { + out << 64; + out |= v64x4(_in) [i]; + } + break; + default: + throwBadInstruction(); + } + return out; +} + +// out must be by reference because it is really just memory for a vector +void VM::wtov(uint8_t _b, u256 _in, u256& _o_out) +{ + uint8_t const n = nElem(_b); + uint8_t const t = elemType(_b); + switch (t) + { + case 0: + for (int i = n-1; 0 <= i; --i) + { + v8x32(_o_out) [i] = (uint8_t )(_in & 0xff); + _in >>= 8; + } + break; + case 1: + for (int i = n-1; 0 <= i; --i) + { + v16x16(_o_out)[i] = (uint16_t)(_in & 0xffff); + _in >>= 16; + } + break; + case 2: + for (int i = n-1; 0 <= i; --i) + { + v32x8(_o_out) [i] = (uint32_t)(_in & 0xffffff); + _in >>= 32; + } + break; + case 3: + for (int i = n-1; 0 <= i; --i) + { + v64x4(_o_out) [i] = (uint64_t)(_in & 0xffffffff); + _in >>= 64; + } + break; + default: + throwBadInstruction(); + } +} + +void VM::xmload (uint8_t _b) +{ + // n bytes of type t elements in memory vector + // goes onto stack element by element, LSB first + uint8_t* p = m_mem.data() + toInt15(m_SP[0]); + uint8_t const n = nElem(_b); + uint8_t const t = elemType(_b); + + switch (t) + { + case 0: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v |= p[--j]; + v8x32(m_SPP[0])[i] = v; + } + break; + case 1: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v16x16(m_SPP[0])[i] = v; + } + break; + case 2: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v32x8(m_SPP[0])[i] = v; + } + break; + case 3: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v <<= 8; + v |= p[--j]; + v64x4(m_SPP[0])[i] = v; + } + break; + default: + throwBadInstruction(); + } +} + +void VM::xmstore(uint8_t _b) +{ + // n bytes of type t elements in stack vector + // goes onto memory by element, LSB first + 
uint8_t *p = m_mem.data() + toInt15(m_SP[0]); + uint8_t const n = nElem(_b); + uint8_t const t = elemType(_b); + + switch (t) + { + case 0: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v = v8x32(m_SPP[0])[i]; + p[--j] = (uint8_t)v; + } + break; + case 1: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v = v8x32(m_SPP[0])[i]; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + } + break; + case 2: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v = v8x32(m_SPP[0])[i]; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + } + break; + case 3: + for (int j = n, i = n - 1; 0 <= i; --i) + { + int v = 0; + v = v8x32(m_SPP[0])[i]; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + v >>= 8; + p[--j] = (uint8_t)v; + } + break; + default: + throwBadInstruction(); + } +} + +void VM::xsload(uint8_t _b) +{ + u256 w = m_ext->store(m_SP[0]); + wtov(_b, w, m_SPP[0]); +} + +void VM::xsstore(uint8_t _b) +{ + u256 w = vtow(_b, m_SP[1]); + m_ext->setStore(m_SP[0], w); +} + +void VM::xvtowide(uint8_t _b) +{ + m_SPP[0] = vtow(_b, m_SP[0]); +} + +void VM::xwidetov(uint8_t _b) +{ + wtov(_b, m_SP[0], m_SPP[0]); +} + +void VM::xpush(uint8_t _b) +{ + // n type t elements in destination vector + uint8_t const n = nElem(_b); + uint8_t const t = elemType(_b); + + // Construct a vector out of n bytes following XPUSH. + // This requires the code has been copied and extended by 32 zero + // bytes to handle "out of code" push data here. + + // given the type of the vector + // mask and shift in the inline bytes + m_SPP[0] = 0; + switch (t) + { + case 0: + for (int i = 0; i < n; ++i) + { + v8x32(m_SPP[0])[i] = m_code[++m_PC]; + } + break; + case 1: + for (int i = 0; i < n; ++i) + { + uint16_t v = m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v16x16(m_SPP[0])[i] = v; + } + break; + case 2: + for (int i = 0; i < n; ++i) + { + uint32_t v = m_code[m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v32x8(m_SPP[0])[i] = v; + } + break; + case 3: + for (int i = 0; i < n; ++i) + { + uint64_t v = m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v = (v << 8) | m_code[++m_PC]; + v64x4(m_SPP[0])[i] = v; + } + break; + default: + throwBadInstruction(); + } +} + +void VM::xget(uint8_t _b, uint8_t _c) +{ + // n type t elements in source vector, m type u in get indexes + uint8_t const t = elemType(_b); + uint8_t const m = nElem(_c); + uint8_t const u = elemType(_c); + + // given the type of the source and index + // for every element of the index get the indexed element from the source + switch (t) + { + case 0: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[i] = v8x32(m_SP[0])[v8x32 (m_SP[1])[i] % 32]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[i] = v8x32(m_SP[0])[v16x16(m_SP[1])[i] % 16]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8 (m_SPP[0])[i] = v8x32(m_SP[0])[v32x8 (m_SP[1])[i] % 8]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4 (m_SPP[0])[i] = v8x32(m_SP[0])[v64x4 (m_SP[1])[i] % 4]; + break; + default: + throwBadInstruction(); + } + + 
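// Review note: each outer case of xget() is a gather — for every lane i of the index vector,
// the destination lane is loaded from the indexed lane of the source, with the index reduced
// modulo a lane count, i.e. roughly dst[i] = src[idx[i] % lanes]. Two things worth checking
// here: the source/index operands appear swapped between this case (m_SP[0]/m_SP[1]) and the
// cases below (m_SP[1]/m_SP[0]), and there is no break after each inner switch, so control
// falls through into the next outer case.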
case 1: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[i] = v16x16(m_SP[1])[v8x32 (m_SP[0])[i] % 32]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % 16]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8 (m_SPP[0])[i] = v16x16(m_SP[1])[v32x8 (m_SP[0])[i] % 8]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4 (m_SPP[0])[i] = v16x16(m_SP[1])[v64x4 (m_SP[0])[i] % 4]; + break; + default: + throwBadInstruction(); + } + + case 2: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[i] = v32x8(m_SP[1])[v8x32 (m_SP[0])[i] % 32]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[i] = v32x8(m_SP[1])[v16x16(m_SP[0])[i] % 16]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8 (m_SPP[0])[i] = v32x8(m_SP[1])[v32x8 (m_SP[0])[i] % 8]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4 (m_SPP[0])[i] = v32x8(m_SP[1])[v64x4 (m_SP[0])[i] % 4]; + break; + default: + throwBadInstruction(); + } + + case 3: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[i] = v64x4(m_SP[1])[v8x32 (m_SP[0])[i] % 32]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[i] = v64x4(m_SP[1])[v16x16(m_SP[0])[i] % 16]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8 (m_SPP[0])[i] = v64x4(m_SP[1])[v32x8 (m_SP[0])[i] % 8]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4 (m_SPP[0])[i] = v64x4(m_SP[1])[v64x4 (m_SP[0])[i] % 4]; + break; + default: + throwBadInstruction(); + } + + default: + throwBadInstruction(); + } +} + +void VM::xput(uint8_t _b, uint8_t _c) +{ + // n type t elements in source and destination vectors, m type u elements in put index + uint8_t const t = elemType(_b); + uint8_t const m = nElem(_c); + uint8_t const u = elemType(_c); + + // given the type of the source, destination and index + // for every element of the index put the indexed replacement in the destination + switch (t) + { + case 0: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[v8x32(m_SP[1])[i] % 32] = v8x32(m_SP[0])[i]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[v8x32(m_SP[1])[i] % 16] = v8x32(m_SP[0])[i]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8 (m_SPP[0])[v8x32(m_SP[1])[i] % 8] = v8x32(m_SP[0])[i]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4 (m_SPP[0])[v8x32(m_SP[1])[i] % 4] = v8x32(m_SP[0])[i]; + break; + default: + throwBadInstruction(); + } + + case 1: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[v16x16(m_SP[1])[i] % 32] = v16x16(m_SP[0])[i]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[v16x16(m_SP[1])[i] % 16] = v16x16(m_SP[0])[i]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8(m_SPP[0])[v16x16(m_SP[1])[i] % 8] = v16x16(m_SP[0])[i]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4(m_SPP[0])[v16x16(m_SP[1])[i] % 4] = v16x16(m_SP[0])[i]; + break; + default: + throwBadInstruction(); + } + + case 2: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[v32x8(m_SP[1])[i] % 32] = v32x8(m_SP[0])[i]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[v32x8(m_SP[1])[i] % 16] = v32x8(m_SP[0])[i]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8 (m_SPP[0])[v32x8(m_SP[1])[i] % 8] = v32x8(m_SP[0])[i]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4 (m_SPP[0])[v32x8(m_SP[1])[i] % 4] = v32x8(m_SP[0])[i]; + break; + default: + 
throwBadInstruction(); + } + + case 3: + + switch (u) + { + case 0: + for (int i = 0; i < m; ++i) + v8x32 (m_SPP[0])[v64x4(m_SP[1])[i] % 32] = v64x4(m_SP[0])[i]; + break; + case 1: + for (int i = 0; i < m; ++i) + v16x16(m_SPP[0])[v64x4(m_SP[1])[i] % 16] = v64x4(m_SP[0])[i]; + break; + case 2: + for (int i = 0; i < m; ++i) + v32x8 (m_SPP[0])[v64x4(m_SP[1])[i] % 8] = v64x4(m_SP[0])[i]; + break; + case 3: + for (int i = 0; i < m; ++i) + v64x4 (m_SPP[0])[v64x4(m_SP[1])[i] % 4] = v64x4(m_SP[0])[i]; + break; + default: + throwBadInstruction(); + } + + default: + throwBadInstruction(); + } +} + +void VM::xswizzle(uint8_t _b) +{ + // n type t elements in source and mask vectors + uint8_t const n = nElem(_b); + uint8_t const t = elemType(_b); + + // given the type of the source and mask + // for every index in the mask copy out the indexed value in the source + switch (t) + { + case 0: + for (int i = 0; i < n; ++i) + v8x32 (m_SPP[0])[i] = v8x32(m_SP[1]) [v8x32 (m_SP[0])[i] % n]; + break; + case 1: + for (int i = 0; i < n; ++i) + v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % n]; + break; + case 2: + for (int i = 0; i < n; ++i) + v32x8 (m_SPP[0])[i] = v32x8(m_SP[1]) [v32x8 (m_SP[0])[i] % n]; + break; + case 3: + for (int i = 0; i < n; ++i) + v64x4 (m_SPP[0])[i] = v64x4(m_SP[1]) [v64x4 (m_SP[0])[i] % n]; + break; + default: + throwBadInstruction(); + } +} + +void VM::xshuffle(uint8_t _b) +{ + // n type t elements in source and mask vectors + uint8_t const n = nElem(_b); + uint8_t const t = elemType(_b); + + // given the type of the sources and mask + // for every index in the mask copy out the indexed value in one of the sources + switch (t) + { + case 0: + for (int i = 0; i < n; ++i) + { + int j = v8x32(m_SP[0]) [i]; + v8x32 (m_SPP[0])[i] = j < n ? v8x32(m_SP[2]) [j] : v8x32 (m_SP[2])[j % n]; + } + break; + case 1: + for (int i = 0; i < n; ++i) + { + int j = v16x16(m_SP[0])[i]; + v16x16(m_SPP[0])[i] = j < n ? v16x16(m_SP[2])[j] : v16x16(m_SP[2])[j % n]; + } + break; + case 2: + for (int i = 0; i < n; ++i) + { + int j = v32x8(m_SP[0]) [i]; + v32x8 (m_SPP[0])[i] = j < n ? v32x8(m_SP[2]) [j] : v32x8 (m_SP[2])[j % n]; + } + break; + case 3: + for (int i = 0; i < n; ++i) + { + int j = v64x4(m_SP[0]) [i]; + v64x4 (m_SPP[0])[i] = j < n ? v64x4(m_SP[2]) [j] : v64x4 (m_SP[2])[j % n]; + } + break; + default: + throwBadInstruction(); + } +} + +}} + +#endif diff --git a/libevm/VMValidate.cpp b/libevm/VMValidate.cpp index 6053ec393ce..9dc8ef9743b 100755 --- a/libevm/VMValidate.cpp +++ b/libevm/VMValidate.cpp @@ -24,7 +24,7 @@ using namespace std; using namespace dev; using namespace dev::eth; -#if EVM_JUMPS_AND_SUBS +#if EIP_615 /////////////////////////////////////////////////////////////////////////////// // diff --git a/libevmcore/Instruction.cpp b/libevmcore/Instruction.cpp index 5bf244c16ac..f1c4cdf9b39 100755 --- a/libevmcore/Instruction.cpp +++ b/libevmcore/Instruction.cpp @@ -15,8 +15,6 @@ along with cpp-ethereum. If not, see . 
*/ /** @file Instruction.cpp - * @author Gav Wood - * @date 2014 */ #include "Instruction.h" @@ -88,8 +86,6 @@ static const std::map c_instructionInfo = { Instruction::MSIZE, { "MSIZE", 0, 0, 1, false, Tier::Base } }, { Instruction::GAS, { "GAS", 0, 0, 1, false, Tier::Base } }, { Instruction::JUMPDEST, { "JUMPDEST", 0, 0, 0, true, Tier::Special } }, - { Instruction::BEGINDATA, { "BEGINDATA", 0, 0, 0, true, Tier::Special } }, - { Instruction::BEGINSUB, { "BEGINSUB", 0, 0, 0, true, Tier::Special } }, { Instruction::PUSH1, { "PUSH1", 1, 0, 1, false, Tier::VeryLow } }, { Instruction::PUSH2, { "PUSH2", 2, 0, 1, false, Tier::VeryLow } }, { Instruction::PUSH3, { "PUSH3", 3, 0, 1, false, Tier::VeryLow } }, @@ -165,10 +161,45 @@ static const std::map c_instructionInfo = { Instruction::JUMPV, { "JUMPV", 2, 1, 0, true, Tier::Mid } }, { Instruction::JUMPSUB, { "JUMPSUB", 2, 1, 0, true, Tier::Low } }, { Instruction::JUMPSUBV, { "JUMPSUBV", 2, 1, 0, true, Tier::Mid } }, + { Instruction::BEGINSUB, { "BEGINSUB", 0, 0, 0, true, Tier::Special } }, + { Instruction::BEGINDATA, { "BEGINDATA", 0, 0, 0, true, Tier::Special } }, { Instruction::RETURNSUB, { "RETURNSUB", 0, 1, 0, true, Tier::Mid } }, { Instruction::PUTLOCAL, { "PUTLOCAL", 2, 1, 0, true, Tier::VeryLow } }, { Instruction::GETLOCAL, { "GETLOCAL", 2, 0, 1, true, Tier::VeryLow } }, + { Instruction::XADD, { "XADD", 1, 0, 0, true, Tier::Special } }, + { Instruction::XMUL, { "XMUL", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSUB, { "XSUB", 1, 2, 1, false, Tier::Special } }, + { Instruction::XDIV, { "XDIV", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSDIV, { "XSDIV", 1, 2, 1, false, Tier::Special } }, + { Instruction::XMOD, { "XMOD", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSMOD, { "XSMOD", 1, 2, 1, false, Tier::Special } }, + { Instruction::XLT, { "XLT", 1, 2, 1, false, Tier::Special } }, + { Instruction::XGT, { "XGT", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSLT, { "XSLT", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSGT, { "XSGT", 1, 2, 1, false, Tier::Special } }, + { Instruction::XEQ, { "XEQ", 1, 2, 1, false, Tier::Special } }, + { Instruction::XISZERO, { "XISZERO", 1, 2, 1, false, Tier::Special } }, + { Instruction::XAND, { "XAND", 1, 1, 1, false, Tier::Special } }, + { Instruction::XOR, { "XOR", 1, 2, 1, false, Tier::Special } }, + { Instruction::XXOR, { "XXOR", 1, 2, 1, false, Tier::Special } }, + { Instruction::XNOT, { "XNOT", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSHL, { "XSHL", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSHR, { "XSHR", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSAR, { "XSAR", 1, 2, 1, false, Tier::Special } }, + { Instruction::XROL, { "XROL", 1, 2, 1, false, Tier::Special } }, + { Instruction::XROR, { "XROR", 1, 2, 1, false, Tier::Special } }, + { Instruction::XPUSH, { "XPUSH", 1, 1, 1, false, Tier::VeryLow } }, + { Instruction::XMLOAD, { "XMLOAD", 1, 1, 1, false, Tier::VeryLow } }, + { Instruction::XMSTORE, { "XMSTORE", 1, 2, 0, false, Tier::VeryLow } }, + { Instruction::XSLOAD, { "XSLOAD", 1, 1, 1, false, Tier::Special } }, + { Instruction::XSSTORE, { "XSSTORE", 1, 2, 0, false, Tier::Special } }, + { Instruction::XVTOWIDE, { "XVTOWIDE", 1, 1, 1, false, Tier::VeryLow } }, + { Instruction::XWIDETOV, { "XWIDETOV", 1, 1, 1, false, Tier::VeryLow } }, + { Instruction::XPUT, { "XPUT", 1, 3, 1, false, Tier::Special } }, + { Instruction::XGET, { "XGET", 1, 2, 1, false, Tier::Special } }, + { Instruction::XSWIZZLE, { "XSWIZZLE", 1, 2, 1, false, 
Tier::Special } }, + { Instruction::XSHUFFLE, { "XSHUFFLE", 1, 3, 1, false, Tier::Special } }, { Instruction::CREATE, { "CREATE", 0, 3, 1, true, Tier::Special } }, { Instruction::CREATE2, { "CREATE2", 0, 4, 1, true, Tier::Special } }, { Instruction::CALL, { "CALL", 0, 7, 1, true, Tier::Special } }, diff --git a/libevmcore/Instruction.h b/libevmcore/Instruction.h index 2344774e2a5..b0bf0b80a7e 100755 --- a/libevmcore/Instruction.h +++ b/libevmcore/Instruction.h @@ -169,7 +169,7 @@ enum class Instruction: uint8_t LOG2, ///< Makes a log entry; 2 topics. LOG3, ///< Makes a log entry; 3 topics. LOG4, ///< Makes a log entry; 4 topics. - + // these are generated by the interpreter - should never be in user code PUSHC = 0xac, ///< push value from constant pool JUMPC, ///< alter the program counter - pre-verified @@ -186,6 +186,40 @@ enum class Instruction: uint8_t PUTLOCAL, ///< pop top of stack to local variable GETLOCAL, ///< push local variable to top of stack + XADD = 0xc1, ///< addition operation + XMUL, ///< mulitplication operation + XSUB, ///< subtraction operation + XDIV, ///< integer division operation + XSDIV, ///< signed integer division operation + XMOD, ///< modulo remainder operation + XSMOD, ///< signed modulo remainder operation + XLT = 0xd0, ///< less-than comparision + XGT, ///< greater-than comparision + XSLT, ///< signed less-than comparision + XSGT, ///< signed greater-than comparision + XEQ, ///< equality comparision + XISZERO, ///< simple not operator + XAND, ///< bitwise AND operation + XOOR, ///< bitwise OR operation + XXOR, ///< bitwise XOR operation + XNOT, ///< bitwise NOT opertation + XSHL = 0xdb, ///< shift left opertation + XSHR, ///< shift right opertation + XSAR, ///< shift arithmetic right opertation + XROL, ///< rotate left opertation + XROR, ///< rotate right opertation + XPUSH = 0xe0, ///< push vector to stack + XMLOAD, ///< load vector from memory + XMSTORE, ///< save vector to memory + XSLOAD = 0xe4, ///< load vector from storage + XSSTORE, ///< save vector to storage + XVTOWIDE, ///< convert vector to wide integer + XWIDETOV, ///< convert wide integer to vector + XGET, ///< get data from vector + XPUT, ///< put data in vector + XSWIZZLE, ///< permute data in vector + XSHUFFLE, ///< permute data in two vectors + CREATE = 0xf0, ///< create a new account with associated code CALL, ///< message-call into an account CALLCODE, ///< message-call with another account's code only diff --git a/test/tools/fuzzTesting/fuzzHelper.cpp b/test/tools/fuzzTesting/fuzzHelper.cpp old mode 100644 new mode 100755 index 26241553f76..312c9537fde --- a/test/tools/fuzzTesting/fuzzHelper.cpp +++ b/test/tools/fuzzTesting/fuzzHelper.cpp @@ -27,7 +27,7 @@ #include using namespace dev; -const static std::array invalidOpcodes{{ +const static std::array invalidOpcodes {{ eth::Instruction::INVALID, eth::Instruction::PUSHC, eth::Instruction::JUMPC, @@ -41,7 +41,40 @@ const static std::array invalidOpcodes{{ eth::Instruction::BEGINDATA, eth::Instruction::RETURNSUB, eth::Instruction::PUTLOCAL, - eth::Instruction::GETLOCAL + eth::Instruction::GETLOCAL, + eth::Instruction::XADD, + eth::Instruction::XMUL, + eth::Instruction::XSUB, + eth::Instruction::XDIV, + eth::Instruction::XSDIV, + eth::Instruction::XMOD, + eth::Instruction::XSMOD, + eth::Instruction::XLT, + eth::Instruction::XGT, + eth::Instruction::XSLT, + eth::Instruction::XSGT, + eth::Instruction::XEQ, + eth::Instruction::XISZERO, + eth::Instruction::XAND, + eth::Instruction::XOR, + eth::Instruction::XXOR, + eth::Instruction::XNOT, + 
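// Review note: like the EIP-615 opcodes above, the X* SIMD opcodes are excluded from
// fuzz-generated code because the interpreter's stub cases throw BadInstruction whenever
// EIP_616 is disabled at build time, so random programs containing them would simply fail.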
eth::Instruction::XSHL, + eth::Instruction::XSHR, + eth::Instruction::XSAR, + eth::Instruction::XROL, + eth::Instruction::XROR, + eth::Instruction::XPUSH, + eth::Instruction::XMLOAD, + eth::Instruction::XMSTORE, + eth::Instruction::XSLOAD, + eth::Instruction::XSSTORE, + eth::Instruction::XVTOWIDE, + eth::Instruction::XWIDETOV, + eth::Instruction::XPUT, + eth::Instruction::XGET, + eth::Instruction::XSWIZZLE, + eth::Instruction::XSHUFFLE, }}; namespace dev From 1e9027736febe774e8d7df5885499f0e1670fb24 Mon Sep 17 00:00:00 2001 From: gcolvin Date: Fri, 7 Jul 2017 10:18:38 -0600 Subject: [PATCH 2/6] more review by Andrei --- libevm/VM.cpp | 9 ++++++--- libevm/VMSIMD.cpp | 16 ++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/libevm/VM.cpp b/libevm/VM.cpp index 12793f710f7..f0e1d27f649 100755 --- a/libevm/VM.cpp +++ b/libevm/VM.cpp @@ -693,7 +693,6 @@ void VM::interpretCases() updateIOGas(); *m_RP++ = m_PC++; m_PC = decodeJumpDest(m_code.data(), m_PC); - } } CONTINUE @@ -1038,7 +1037,8 @@ void VM::interpretCases() uint8_t b = ++m_PC; uint8_t c = ++m_PC; - xput(m_code[b], m_code[c]); ++m_PC; + xput(m_code[b], m_code[c]); + ++m_PC; } CONTINUE @@ -1049,7 +1049,8 @@ void VM::interpretCases() uint8_t b = ++m_PC; uint8_t c = ++m_PC; - xget(m_code[b], m_code[c]); ++m_PC; + xget(m_code[b], m_code[c]); + ++m_PC; } CONTINUE @@ -1491,6 +1492,8 @@ void VM::interpretCases() #if EVM_HACK_DUP_64 *(uint64_t*)m_SPP = *(uint64_t*)(m_SP + n); #else + // the stack slot being copied into may no longer hold a u256 + // so we construct a new one in the memory, rather than assign new(m_SPP) u256(m_SP[n]); #endif } diff --git a/libevm/VMSIMD.cpp b/libevm/VMSIMD.cpp index 1a9c26791e0..c290beca7da 100755 --- a/libevm/VMSIMD.cpp +++ b/libevm/VMSIMD.cpp @@ -81,11 +81,11 @@ inline a8x32 const& v8x32 (u256 const& _stack_item) { return (a8x32&) *(a8x32*) #define AND( x1, x2) ((x1) & (x2)) #define OR( x1, x2) ((x1) | (x2)) #define XOR( x1, x2) ((x1) ^ (x2)) -#define NOT( x1, x2) (~x1) +#define NOT( x1, x2) (~(x1)) #define SHR( x1, x2) ((x1) >> (x2)) #define SHL( x1, x2) ((x1) << (x2)) -#define ROL( x1, x2) (((x1) << x2)|((x1) >> (sizeof(x1) * 8 - (x2)))) -#define ROR( x1, x2) (((x1) >> x2)|((x1) << (sizeof(x1) * 8 - (x2)))) +#define ROL( x1, x2) (((x1) << (x2))|((x1) >> (sizeof(x1) * 8 - (x2)))) +#define ROR( x1, x2) (((x1) >> (x2))|((x1) << (sizeof(x1) * 8 - (x2)))) void VM::xadd (uint8_t _b) { EVALXOPU(ADD, _b); } void VM::xmul (uint8_t _b) { EVALXOPU(MUL, _b); } @@ -110,10 +110,10 @@ void VM::xshl (uint8_t _b) { EVALXOPU(SHL, _b); } void VM::xrol (uint8_t _b) { EVALXOPU(ROL, _b); } void VM::xror (uint8_t _b) { EVALXOPU(ROR, _b); } -inline uint8_t pow2N(uint8_t n) +inline uint8_t pow2N(uint8_t _n) { static uint8_t exp[6] = { 1, 2, 4, 8, 16, 32 }; - return exp[n]; + return exp[_n]; } inline uint8_t nElem(uint8_t _b) @@ -299,7 +299,7 @@ void VM::xmstore(uint8_t _b) for (int j = n, i = n - 1; 0 <= i; --i) { int v = 0; - v = v8x32(m_SPP[0])[i]; + v = v16x16(m_SPP[0])[i]; p[--j] = (uint8_t)v; v >>= 8; p[--j] = (uint8_t)v; @@ -309,7 +309,7 @@ void VM::xmstore(uint8_t _b) for (int j = n, i = n - 1; 0 <= i; --i) { int v = 0; - v = v8x32(m_SPP[0])[i]; + v = v32x8(m_SPP[0])[i]; p[--j] = (uint8_t)v; v >>= 8; p[--j] = (uint8_t)v; @@ -323,7 +323,7 @@ void VM::xmstore(uint8_t _b) for (int j = n, i = n - 1; 0 <= i; --i) { int v = 0; - v = v8x32(m_SPP[0])[i]; + v = v64x4(m_SPP[0])[i]; p[--j] = (uint8_t)v; v >>= 8; p[--j] = (uint8_t)v; From 09ea0604346dbd21bb101064aa4df05ae5ffc806 Mon Sep 17 00:00:00 2001 From: 
gcolvin Date: Sat, 8 Jul 2017 20:16:41 -0600 Subject: [PATCH 3/6] everything compiles in all configs --- libevm/VMSIMD.cpp | 2 +- libevm/VMValidate.cpp | 45 ++++++++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/libevm/VMSIMD.cpp b/libevm/VMSIMD.cpp index c290beca7da..811ef46823a 100755 --- a/libevm/VMSIMD.cpp +++ b/libevm/VMSIMD.cpp @@ -212,7 +212,7 @@ void VM::xmload (uint8_t _b) { // n bytes of type t elements in memory vector // goes onto stack element by element, LSB first - uint8_t* p = m_mem.data() + toInt15(m_SP[0]); + uint8_t const* p = m_mem.data() + toInt15(m_SP[0]); uint8_t const n = nElem(_b); uint8_t const t = elemType(_b); diff --git a/libevm/VMValidate.cpp b/libevm/VMValidate.cpp index 9dc8ef9743b..4f819757c0d 100755 --- a/libevm/VMValidate.cpp +++ b/libevm/VMValidate.cpp @@ -18,7 +18,11 @@ */ #include + +// validator is not a full interpreter, canot support optimized dispatch +#define EVM_JUMP_DISPATCH false #include "VMConfig.h" + #include "VM.h" using namespace std; using namespace dev; @@ -36,22 +40,23 @@ void VM::validate(ExtVMFace& _ext) m_ext = &_ext; initEntry(); size_t PC; - byte OP; - for (PC = 0; (OP = m_code[PC]); ++PC) - if (OP == byte(Instruction::BEGINSUB)) + Instruction OP; + for (PC = 0; bool(OP = Instruction(m_code[PC])); ++PC) + { + if (OP == Instruction::BEGINSUB) validateSubroutine(PC, m_return, m_stack); - else if (OP == byte(Instruction::BEGINDATA)) + else if (OP == Instruction::BEGINDATA) break; else if ( - (byte)Instruction::PUSH1 <= (byte)OP && - (byte)PC <= (byte)Instruction::PUSH32) - PC += (byte)OP - (byte)Instruction::PUSH1; + Instruction::PUSH1 <= OP && + PC <= (size_t)Instruction::PUSH32) + PC += (size_t)OP - (size_t)Instruction::PUSH1; else if ( OP == Instruction::JUMPTO || OP == Instruction::JUMPIF || OP == Instruction::JUMPSUB) PC += 4; - else if (OP == Instruction::JUMPV || op == Instruction::JUMPSUBV) + else if (OP == Instruction::JUMPV || OP == Instruction::JUMPSUBV) PC += 4 * m_code[PC]; // number of 4-byte dests followed by table } } @@ -71,12 +76,12 @@ void VM::validateSubroutine(uint64_t _pc, uint64_t* _rp, u256* _sp) CASE(JUMPDEST) { // if frame size is set then we have been here before - ptrdiff_t frameSize = m_frameSize[m_PC]; - if (0 <= frameSize) + size_t frameSize = m_frameSize[m_PC]; + if (0 != frameSize) { // check for constant frame size if (stackSize() != frameSize) - throwBadStack(stackSize(), frameSize, 0); + throwBadStack(stackSize(), frameSize); // return to break cycle in control flow graph return; @@ -90,7 +95,7 @@ void VM::validateSubroutine(uint64_t _pc, uint64_t* _rp, u256* _sp) CASE(JUMPTO) { // extract jump destination from bytecode - m_PC = decodeJumpDest(m_code, m_PC); + m_PC = decodeJumpDest(m_code.data(), m_PC); } NEXT @@ -99,7 +104,7 @@ void VM::validateSubroutine(uint64_t _pc, uint64_t* _rp, u256* _sp) // recurse to validate code to jump to, saving and restoring // interpreter state around call _pc = m_PC, _rp = m_RP, _sp = m_SP; - validateSubroutine(decodeJumpvDest(m_code, m_PC, m_SP), _rp, _sp); + validateSubroutine(decodeJumpvDest(m_code.data(), m_PC, byte(m_SP[0])), _rp, _sp); m_PC = _pc, m_RP = _rp, m_SP = _sp; ++m_PC; } @@ -113,19 +118,19 @@ void VM::validateSubroutine(uint64_t _pc, uint64_t* _rp, u256* _sp) // recurse to validate code to jump to, saving and // restoring interpreter state around call _pc = m_PC, _rp = m_RP, _sp = m_SP; - validateSubroutine(decodeJumpDest(m_code, m_PC), _rp, _sp); + validateSubroutine(decodeJumpDest(m_code.data(), 
m_PC), _rp, _sp); m_PC = _pc, m_RP = _rp, m_SP = _sp; } } - RETURN + BREAK CASE(JUMPSUB) { // check for enough arguments on stack - size_t destPC = decodeJumpDest(m_code, m_PC); + size_t destPC = decodeJumpDest(m_code.data(), m_PC); byte nArgs = m_code[destPC+1]; if (stackSize() < nArgs) - throwBadStack(stackSize(), nArgs, 0); + throwBadStack(stackSize(), nArgs); } NEXT @@ -138,10 +143,10 @@ void VM::validateSubroutine(uint64_t _pc, uint64_t* _rp, u256* _sp) // check for enough arguments on stack u256 slot = sub; _sp = &slot; - size_t destPC = decodeJumpvDest(m_code, _pc, _sp); + size_t destPC = decodeJumpvDest(m_code.data(), _pc, byte(m_SP[0])); byte nArgs = m_code[destPC+1]; if (stackSize() < nArgs) - throwBadStack(stackSize(), nArgs, 0); + throwBadStack(stackSize(), nArgs); } m_PC = _pc; } @@ -164,7 +169,7 @@ void VM::validateSubroutine(uint64_t _pc, uint64_t* _rp, u256* _sp) throwBadInstruction(); } } - END_CASES + WHILE_CASES } #endif From 68e4a3b5d5e45695d90100b91a5767ad3b685696 Mon Sep 17 00:00:00 2001 From: gcolvin Date: Wed, 12 Jul 2017 17:07:33 -0600 Subject: [PATCH 4/6] meaningful names, clean up arguments --- libevm/VMSIMD.cpp | 491 +++++++++++++++++++++++----------------------- 1 file changed, 244 insertions(+), 247 deletions(-) diff --git a/libevm/VMSIMD.cpp b/libevm/VMSIMD.cpp index 811ef46823a..183e51cbf95 100755 --- a/libevm/VMSIMD.cpp +++ b/libevm/VMSIMD.cpp @@ -41,28 +41,30 @@ inline a32x8 const& v32x8 (u256 const& _stack_item) { return (a32x8&) *(a32x8*) inline a16x16 const& v16x16(u256 const& _stack_item) { return (a16x16&)*(a16x16*)&_stack_item; } inline a8x32 const& v8x32 (u256 const& _stack_item) { return (a8x32&) *(a8x32*) &_stack_item; } +class Pow2Bits enum { BITS_8, BITS_16, BITS_32, BITS_64 }; + // tried using template template functions, gave up fighting the compiler after a day -#define EVALXOPS(OP, _b) EVALXOP(OP, int8_t, int16_t, int32_t, int64_t, _b) -#define EVALXOPU(OP, _b) EVALXOP(OP, uint8_t, uint16_t, uint32_t, uint64_t, _b) -#define EVALXOP(OP, T8, T16, T32, T64, _b) \ +#define EVALXOPS(OP, _type) EVALXOP(OP, int8_t, int16_t, int32_t, int64_t, _type) +#define EVALXOPU(OP, _type) EVALXOP(OP, uint8_t, uint16_t, uint32_t, uint64_t, _type) +#define EVALXOP(OP, T8, T16, T32, T64, _type) \ { \ - uint8_t const t = (_b) & 0xf; \ + uint8_t const t = (_type) & 0xf; \ m_SPP[0] = 0; \ switch (t) \ { \ - case 0: \ + case BITS_8: \ for (int i = 0; i < 32; ++i) \ v8x32(m_SPP[0])[i] = (uint8_t) OP((T8) v8x32(m_SP[0])[i], (T8) v8x32(m_SP[1])[i]); \ break; \ - case 1: \ + case BITS_16: \ for (int i = 0; i < 16; ++i) \ v16x16(m_SPP[0])[i] = (uint16_t)OP((T16)v16x16(m_SP[0])[i], (T16)v16x16(m_SP[1])[i]); \ break; \ - case 2: \ + case BITS_32: \ for (int i = 0; i < 8; ++i) \ v32x8(m_SPP[0])[i] = (uint32_t)OP((T32)v32x8(m_SP[0])[i], (T32)v32x8(m_SP[1])[i]); \ break; \ - case 3: \ + case BITS_64: \ for (int i = 0; i < 4; ++i) \ v64x4(m_SPP[0])[i] = (uint64_t)OP((T64)v64x4(m_SP[0])[i], (T64)v64x4(m_SP[1])[i]); \ break; \ @@ -87,28 +89,28 @@ inline a8x32 const& v8x32 (u256 const& _stack_item) { return (a8x32&) *(a8x32*) #define ROL( x1, x2) (((x1) << (x2))|((x1) >> (sizeof(x1) * 8 - (x2)))) #define ROR( x1, x2) (((x1) >> (x2))|((x1) << (sizeof(x1) * 8 - (x2)))) -void VM::xadd (uint8_t _b) { EVALXOPU(ADD, _b); } -void VM::xmul (uint8_t _b) { EVALXOPU(MUL, _b); } -void VM::xsub (uint8_t _b) { EVALXOPU(SUB, _b); } -void VM::xdiv (uint8_t _b) { EVALXOPU(DIV, _b); } -void VM::xsdiv(uint8_t _b) { EVALXOPS(DIV, _b); } -void VM::xmod (uint8_t _b) { EVALXOPU(MOD, _b); } -void 
VM::xsmod(uint8_t _b) { EVALXOPS(MOD, _b); } -void VM::xlt (uint8_t _b) { EVALXOPU(LT, _b); } -void VM::xslt (uint8_t _b) { EVALXOPS(LT, _b); } -void VM::xgt (uint8_t _b) { EVALXOPU(GT, _b); } -void VM::xsgt (uint8_t _b) { EVALXOPS(GT, _b); } -void VM::xeq (uint8_t _b) { EVALXOPU(EQ, _b); } -void VM::xzero(uint8_t _b) { EVALXOPU(ZERO,_b); } -void VM::xand (uint8_t _b) { EVALXOPU(AND, _b); } -void VM::xoor (uint8_t _b) { EVALXOPU(OR, _b); } -void VM::xxor (uint8_t _b) { EVALXOPU(XOR, _b); } -void VM::xnot (uint8_t _b) { EVALXOPU(NOT, _b); } -void VM::xshr (uint8_t _b) { EVALXOPU(SHR, _b); } -void VM::xsar (uint8_t _b) { EVALXOPS(SHR, _b); } -void VM::xshl (uint8_t _b) { EVALXOPU(SHL, _b); } -void VM::xrol (uint8_t _b) { EVALXOPU(ROL, _b); } -void VM::xror (uint8_t _b) { EVALXOPU(ROR, _b); } +void VM::xadd (uint8_t _type) { EVALXOPU(ADD, _type); } +void VM::xmul (uint8_t _type) { EVALXOPU(MUL, _type); } +void VM::xsub (uint8_t _type) { EVALXOPU(SUB, _type); } +void VM::xdiv (uint8_t _type) { EVALXOPU(DIV, _type); } +void VM::xsdiv(uint8_t _type) { EVALXOPS(DIV, _type); } +void VM::xmod (uint8_t _type) { EVALXOPU(MOD, _type); } +void VM::xsmod(uint8_t _type) { EVALXOPS(MOD, _type); } +void VM::xlt (uint8_t _type) { EVALXOPU(LT, _type); } +void VM::xslt (uint8_t _type) { EVALXOPS(LT, _type); } +void VM::xgt (uint8_t _type) { EVALXOPU(GT, _type); } +void VM::xsgt (uint8_t _type) { EVALXOPS(GT, _type); } +void VM::xeq (uint8_t _type) { EVALXOPU(EQ, _type); } +void VM::xzero(uint8_t _type) { EVALXOPU(ZERO,_type); } +void VM::xand (uint8_t _type) { EVALXOPU(AND, _type); } +void VM::xoor (uint8_t _type) { EVALXOPU(OR, _type); } +void VM::xxor (uint8_t _type) { EVALXOPU(XOR, _type); } +void VM::xnot (uint8_t _type) { EVALXOPU(NOT, _type); } +void VM::xshr (uint8_t _type) { EVALXOPU(SHR, _type); } +void VM::xsar (uint8_t _type) { EVALXOPS(SHR, _type); } +void VM::xshl (uint8_t _type) { EVALXOPU(SHL, _type); } +void VM::xrol (uint8_t _type) { EVALXOPU(ROL, _type); } +void VM::xror (uint8_t _type) { EVALXOPU(ROR, _type); } inline uint8_t pow2N(uint8_t _n) { @@ -116,47 +118,47 @@ inline uint8_t pow2N(uint8_t _n) return exp[_n]; } -inline uint8_t nElem(uint8_t _b) +inline uint8_t laneCount(uint8_t _type) { - return pow2N((_b) & 0xf); + return pow2N((_type) & 0xf); } -inline uint8_t elemType(uint8_t _b) +inline uint8_t laneWidth(uint8_t _type) { - return (_b) >> 4; + return (_type) >> 4; } // in must be by reference because it is really just memory for a vector -u256 VM::vtow(uint8_t _b, const u256& _in) +u256 VM::vtow(uint8_t _type, const u256& _in) { u256 out; - uint8_t const n = nElem(_b); - uint8_t const t = elemType(_b); - switch (t) + uint8_t const count = laneCount(_type); + uint8_t const width = laneWidth(_type); + switch (width) { - case 0: - for (int i = n-1; 0 <= i; --i) + case BITS_8: + for (int i = count - 1; 0 <= i; --i) { out << 8; out |= v8x32(_in) [i]; } break; - case 1: - for (int i = n-1; 0 <= i; --i) + case BITS_16: + for (int i = count - 1; 0 <= i; --i) { out << 16; out |= v16x16(_in)[i]; } break; - case 2: - for (int i = n-1; 0 <= i; --i) + case BITS_32: + for (int i = count - 1; 0 <= i; --i) { out << 32; out |= v32x8(_in) [i]; } break; - case 3: - for (int i = n-1; 0 <= i; --i) + case BITS_64: + for (int i = count - 1; 0 <= i; --i) { out << 64; out |= v64x4(_in) [i]; @@ -169,35 +171,35 @@ u256 VM::vtow(uint8_t _b, const u256& _in) } // out must be by reference because it is really just memory for a vector -void VM::wtov(uint8_t _b, u256 _in, u256& _o_out) +void VM::wtov(uint8_t 
_type, u256 _in, u256& _o_out) { - uint8_t const n = nElem(_b); - uint8_t const t = elemType(_b); - switch (t) + uint8_t const count = laneCount(_type); + uint8_t const width = laneWidth(_type); + switch (width) { - case 0: - for (int i = n-1; 0 <= i; --i) + case BITS_8: + for (int i = count - 1; 0 <= i; --i) { v8x32(_o_out) [i] = (uint8_t )(_in & 0xff); _in >>= 8; } break; - case 1: - for (int i = n-1; 0 <= i; --i) + case BITS_16: + for (int i = count - 1; 0 <= i; --i) { v16x16(_o_out)[i] = (uint16_t)(_in & 0xffff); _in >>= 16; } break; - case 2: - for (int i = n-1; 0 <= i; --i) + case BITS_32: + for (int i = count - 1; 0 <= i; --i) { v32x8(_o_out) [i] = (uint32_t)(_in & 0xffffff); _in >>= 32; } break; - case 3: - for (int i = n-1; 0 <= i; --i) + case BITS_64: + for (int i = count - 1; 0 <= i; --i) { v64x4(_o_out) [i] = (uint64_t)(_in & 0xffffffff); _in >>= 64; @@ -208,26 +210,25 @@ void VM::wtov(uint8_t _b, u256 _in, u256& _o_out) } } -void VM::xmload (uint8_t _b) +void VM::xmload (uint8_t _type) { - // n bytes of type t elements in memory vector // goes onto stack element by element, LSB first uint8_t const* p = m_mem.data() + toInt15(m_SP[0]); - uint8_t const n = nElem(_b); - uint8_t const t = elemType(_b); + uint8_t const count = laneCount(_type); + uint8_t const width = laneWidth(_type); - switch (t) + switch (width) { - case 0: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_8: + for (int j = 1, i = count - 1; 0 <= i; --i) { int v = 0; v |= p[--j]; v8x32(m_SPP[0])[i] = v; } break; - case 1: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_16: + for (int j = 2, i = count - 1; 0 <= i; --i) { int v = 0; v |= p[--j]; @@ -236,8 +237,8 @@ void VM::xmload (uint8_t _b) v16x16(m_SPP[0])[i] = v; } break; - case 2: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_32: + for (int j = 4, i = count - 1; 0 <= i; --i) { int v = 0; v |= p[--j]; @@ -250,8 +251,8 @@ void VM::xmload (uint8_t _b) v32x8(m_SPP[0])[i] = v; } break; - case 3: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_64: + for (int j = 8, i = count - 1; 0 <= i; --i) { int v = 0; v |= p[--j]; @@ -277,38 +278,38 @@ void VM::xmload (uint8_t _b) } } -void VM::xmstore(uint8_t _b) +void VM::xmstore(uint8_t _type) { // n bytes of type t elements in stack vector // goes onto memory by element, LSB first uint8_t *p = m_mem.data() + toInt15(m_SP[0]); - uint8_t const n = nElem(_b); - uint8_t const t = elemType(_b); + uint8_t const count = laneCount(_type); + uint8_t const width = laneWidth(_type); - switch (t) + switch (width) { - case 0: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_8: + for (int j = 1, i = count - 1; 0 <= i; --i) { int v = 0; v = v8x32(m_SPP[0])[i]; p[--j] = (uint8_t)v; } break; - case 1: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_16: + for (int j = 2, i = count - 1; 0 <= i; --i) { - int v = 0; + int v = 2; v = v16x16(m_SPP[0])[i]; p[--j] = (uint8_t)v; v >>= 8; p[--j] = (uint8_t)v; } break; - case 2: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_32: + for (int j = 4, i = count - 1; 0 <= i; --i) { - int v = 0; + int v = 4; v = v32x8(m_SPP[0])[i]; p[--j] = (uint8_t)v; v >>= 8; @@ -319,8 +320,8 @@ void VM::xmstore(uint8_t _b) p[--j] = (uint8_t)v; } break; - case 3: - for (int j = n, i = n - 1; 0 <= i; --i) + case BITS_64: + for (int j = 8, i = count - 1; 0 <= i; --i) { int v = 0; v = v64x4(m_SPP[0])[i]; @@ -346,33 +347,32 @@ void VM::xmstore(uint8_t _b) } } -void VM::xsload(uint8_t _b) +void VM::xsload(uint8_t _type) { u256 w = m_ext->store(m_SP[0]); - wtov(_b, w, m_SPP[0]); + 
wtov(_type, w, m_SPP[0]); } -void VM::xsstore(uint8_t _b) +void VM::xsstore(uint8_t _type) { - u256 w = vtow(_b, m_SP[1]); + u256 w = vtow(_type, m_SP[1]); m_ext->setStore(m_SP[0], w); } -void VM::xvtowide(uint8_t _b) +void VM::xvtowide(uint8_t _type) { - m_SPP[0] = vtow(_b, m_SP[0]); + m_SPP[0] = vtow(_type, m_SP[0]); } -void VM::xwidetov(uint8_t _b) +void VM::xwidetov(uint8_t _type) { - wtov(_b, m_SP[0], m_SPP[0]); + wtov(_type, m_SP[0], m_SPP[0]); } -void VM::xpush(uint8_t _b) +void VM::xpush(uint8_t _type) { - // n type t elements in destination vector - uint8_t const n = nElem(_b); - uint8_t const t = elemType(_b); + uint8_t const count = laneCount(_type); + uint8_t const width = laneWidth(_type); // Construct a vector out of n bytes following XPUSH. // This requires the code has been copied and extended by 32 zero @@ -381,24 +381,24 @@ void VM::xpush(uint8_t _b) // given the type of the vector // mask and shift in the inline bytes m_SPP[0] = 0; - switch (t) + switch (width) { - case 0: - for (int i = 0; i < n; ++i) + case BITS_8: + for (int i = 0; i < count ; ++i) { v8x32(m_SPP[0])[i] = m_code[++m_PC]; } break; - case 1: - for (int i = 0; i < n; ++i) + case BITS_16: + for (int i = 0; i < count; ++i) { uint16_t v = m_code[++m_PC]; v = (v << 8) | m_code[++m_PC]; v16x16(m_SPP[0])[i] = v; } break; - case 2: - for (int i = 0; i < n; ++i) + case BITS_32: + for (int i = 0; i < count; ++i) { uint32_t v = m_code[m_PC]; v = (v << 8) | m_code[++m_PC]; @@ -407,8 +407,8 @@ void VM::xpush(uint8_t _b) v32x8(m_SPP[0])[i] = v; } break; - case 3: - for (int i = 0; i < n; ++i) + case BITS_64: + for (int i = 0; i < count; ++i) { uint64_t v = m_code[++m_PC]; v = (v << 8) | m_code[++m_PC]; @@ -426,108 +426,107 @@ void VM::xpush(uint8_t _b) } } -void VM::xget(uint8_t _b, uint8_t _c) +void VM::xget(uint8_t _src_type, uint8_t _idx_type) { - // n type t elements in source vector, m type u in get indexes - uint8_t const t = elemType(_b); - uint8_t const m = nElem(_c); - uint8_t const u = elemType(_c); + uint8_t const srcWidth = laneWidth(_src_type); + uint8_t const idxCount = laneCount(_idx_type); + uint8_t const idxWidth = laneWidth(_idx_type); // given the type of the source and index // for every element of the index get the indexed element from the source - switch (t) + switch (srcWidth) { - case 0: + case BITS_8: - switch (u) + switch (idxWidth) { - case 0: - for (int i = 0; i < m; ++i) - v8x32 (m_SPP[0])[i] = v8x32(m_SP[0])[v8x32 (m_SP[1])[i] % 32]; + case BITS_8: + for (int i = 0; i < idxCount; ++i) + v8x32 (m_SPP[1])[i] = v8x32(m_SP[0])[v8x32 (m_SP[1])[i] % idxCount]; break; - case 1: - for (int i = 0; i < m; ++i) - v16x16(m_SPP[0])[i] = v8x32(m_SP[0])[v16x16(m_SP[1])[i] % 16]; + case BITS_16: + for (int i = 0; i< idxCount; ++i) + v16x16(m_SPP[1])[i] = v8x32(m_SP[0])[v16x16(m_SP[1])[i] % idxCount]; break; - case 2: - for (int i = 0; i < m; ++i) - v32x8 (m_SPP[0])[i] = v8x32(m_SP[0])[v32x8 (m_SP[1])[i] % 8]; + case BITS_32: + for (int i = 0; i< idxCount; ++i) + v32x8 (m_SPP[1])[i] = v8x32(m_SP[0])[v32x8 (m_SP[1])[i] % idxCount]; break; - case 3: - for (int i = 0; i < m; ++i) - v64x4 (m_SPP[0])[i] = v8x32(m_SP[0])[v64x4 (m_SP[1])[i] % 4]; + case BITS_64: + for (int i = 0; i< idxCount; ++i) + v64x4 (m_SPP[1])[i] = v8x32(m_SP[0])[v64x4 (m_SP[1])[i] % idxCount]; break; default: throwBadInstruction(); } - case 1: + case BITS_16: - switch (u) + switch (idxWidth) { - case 0: - for (int i = 0; i < m; ++i) - v8x32 (m_SPP[0])[i] = v16x16(m_SP[1])[v8x32 (m_SP[0])[i] % 32]; + case BITS_8: + for (int i = 0; i < 
idxCount; ++i) + v8x32 (m_SPP[0])[i] = v16x16(m_SP[1])[v8x32 (m_SP[0])[i] % idxCount]; break; - case 1: - for (int i = 0; i < m; ++i) - v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % 16]; + case BITS_16: + for (int i = 0; i < idxCount; ++i) + v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % idxCount]; break; - case 2: - for (int i = 0; i < m; ++i) - v32x8 (m_SPP[0])[i] = v16x16(m_SP[1])[v32x8 (m_SP[0])[i] % 8]; + case BITS_32: + for (int i = 0; i < idxCount; ++i) + v32x8 (m_SPP[0])[i] = v16x16(m_SP[1])[v32x8 (m_SP[0])[i] % idxCount]; break; - case 3: - for (int i = 0; i < m; ++i) - v64x4 (m_SPP[0])[i] = v16x16(m_SP[1])[v64x4 (m_SP[0])[i] % 4]; + case BITS_64: + for (int i = 0; i < idxCount; ++i) + v64x4 (m_SPP[0])[i] = v16x16(m_SP[1])[v64x4 (m_SP[0])[i] % idxCount]; break; default: throwBadInstruction(); } - case 2: + case BITS_32: - switch (u) + switch (idxWidth) { - case 0: - for (int i = 0; i < m; ++i) - v8x32 (m_SPP[0])[i] = v32x8(m_SP[1])[v8x32 (m_SP[0])[i] % 32]; + case BITS_8: + for (int i = 0; i < idxCount; ++i) + v8x32 (m_SPP[0])[i] = v32x8(m_SP[1])[v8x32 (m_SP[0])[i] % idxCount]; break; - case 1: - for (int i = 0; i < m; ++i) - v16x16(m_SPP[0])[i] = v32x8(m_SP[1])[v16x16(m_SP[0])[i] % 16]; + case BITS_16: + for (int i = 0; i < idxCount; ++i) + v16x16(m_SPP[0])[i] = v32x8(m_SP[1])[v16x16(m_SP[0])[i] % idxCount]; break; - case 2: - for (int i = 0; i < m; ++i) - v32x8 (m_SPP[0])[i] = v32x8(m_SP[1])[v32x8 (m_SP[0])[i] % 8]; + case BITS_32: + for (int i = 0; i < idxCount; ++i) + v32x8 (m_SPP[0])[i] = v32x8(m_SP[1])[v32x8 (m_SP[0])[i] % idxCount]; break; - case 3: - for (int i = 0; i < m; ++i) - v64x4 (m_SPP[0])[i] = v32x8(m_SP[1])[v64x4 (m_SP[0])[i] % 4]; + case BITS_64: + for (int i = 0; i < idxCount; ++i) + v64x4 (m_SPP[0])[i] = v32x8(m_SP[1])[v64x4 (m_SP[0])[i] % idxCount]; break; default: throwBadInstruction(); } - case 3: + case BITS_64: - switch (u) + switch (idxWidth) { - case 0: - for (int i = 0; i < m; ++i) - v8x32 (m_SPP[0])[i] = v64x4(m_SP[1])[v8x32 (m_SP[0])[i] % 32]; + case BITS_8: + for (int i = 0; i < idxCount; ++i) + v8x32 (m_SPP[0])[i] = v64x4(m_SP[1])[v8x32 (m_SP[0])[i] % idxCount]; break; - case 1: - for (int i = 0; i < m; ++i) - v16x16(m_SPP[0])[i] = v64x4(m_SP[1])[v16x16(m_SP[0])[i] % 16]; + case BITS_16: + for (int i = 0; i < idxCount; ++i) + v16x16(m_SPP[0])[i] = v64x4(m_SP[1])[v16x16(m_SP[0])[i] % idxCount]; break; - case 2: - for (int i = 0; i < m; ++i) - v32x8 (m_SPP[0])[i] = v64x4(m_SP[1])[v32x8 (m_SP[0])[i] % 8]; + case BITS_32: + for (int i = 0; i < idxCount; ++i) + v32x8 (m_SPP[0])[i] = v64x4(m_SP[1])[v32x8 (m_SP[0])[i] % idxCount]; break; - case 3: - for (int i = 0; i < m; ++i) - v64x4 (m_SPP[0])[i] = v64x4(m_SP[1])[v64x4 (m_SP[0])[i] % 4]; + case BITS_64: + for (int i = 0; i < idxCount; ++i) + v64x4 (m_SPP[0])[i] = v64x4(m_SP[1])[v64x4 (m_SP[0])[i] % idxCount]; break; default: throwBadInstruction(); @@ -538,107 +537,106 @@ void VM::xget(uint8_t _b, uint8_t _c) } } -void VM::xput(uint8_t _b, uint8_t _c) +void VM::xput(uint8_t _src_type, uint8_t _dst_type) { - // n type t elements in source and destination vectors, m type u elements in put index - uint8_t const t = elemType(_b); - uint8_t const m = nElem(_c); - uint8_t const u = elemType(_c); + uint8_t const srcWidth = laneWidth(_src_type); + uint8_t const dstCount = laneCount(_dst_type); + uint8_t const dstWidth = laneWidth(_dst_type); // given the type of the source, destination and index // for every element of the index put the indexed replacement in the destination - switch (t) 
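
As a reading aid for the XGET and XPUT loops above, here is a minimal scalar sketch of the gather/scatter pattern they implement; it is illustrative only and not part of the patch series. The names gather8, scatter8 and laneCountN are invented for this example, the fixed 8-bit lanes and the wrap of out-of-range indices by the lane count are simplifying assumptions, and the real opcodes of course operate on VM stack items rather than plain arrays.

// Illustrative sketch only, not part of the patch series.
// XGET-style gather:  dst[i] = src[idx[i] % laneCountN]
// XPUT-style scatter: dst[idx[i] % laneCountN] = src[i]
#include <cstdint>
#include <cstdio>

static void gather8(uint8_t dst[], uint8_t const src[], uint8_t const idx[], int laneCountN)
{
    for (int i = 0; i < laneCountN; ++i)
        dst[i] = src[idx[i] % laneCountN];   // out-of-range indices wrap (assumption)
}

static void scatter8(uint8_t dst[], uint8_t const src[], uint8_t const idx[], int laneCountN)
{
    for (int i = 0; i < laneCountN; ++i)
        dst[idx[i] % laneCountN] = src[i];
}

int main()
{
    uint8_t src[4] = {10, 20, 30, 40};
    uint8_t idx[4] = {3, 2, 1, 0};
    uint8_t out[4] = {0, 0, 0, 0};
    gather8(out, src, idx, 4);               // out becomes {40, 30, 20, 10}
    scatter8(out, src, idx, 4);              // reverse scatter gives {40, 30, 20, 10} again
    for (int i = 0; i < 4; ++i)
        std::printf("%u ", out[i]);
    std::printf("\n");
    return 0;
}
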
+ switch (srcWidth) { - case 0: + case BITS_8: - switch (u) + switch (dstWidth) { - case 0: - for (int i = 0; i < m; ++i) + case BITS_8: + for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v8x32(m_SP[1])[i] % 32] = v8x32(m_SP[0])[i]; break; - case 1: - for (int i = 0; i < m; ++i) + case BITS_16: + for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v8x32(m_SP[1])[i] % 16] = v8x32(m_SP[0])[i]; break; - case 2: - for (int i = 0; i < m; ++i) + case BITS_32: + for (int i = 0; i < dstCount; ++i) v32x8 (m_SPP[0])[v8x32(m_SP[1])[i] % 8] = v8x32(m_SP[0])[i]; break; - case 3: - for (int i = 0; i < m; ++i) + case BITS_64: + for (int i = 0; i < dstCount; ++i) v64x4 (m_SPP[0])[v8x32(m_SP[1])[i] % 4] = v8x32(m_SP[0])[i]; break; default: throwBadInstruction(); } - case 1: + case BITS_16: - switch (u) + switch (dstWidth) { - case 0: - for (int i = 0; i < m; ++i) + case BITS_8: + for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v16x16(m_SP[1])[i] % 32] = v16x16(m_SP[0])[i]; break; - case 1: - for (int i = 0; i < m; ++i) + case BITS_16: + for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v16x16(m_SP[1])[i] % 16] = v16x16(m_SP[0])[i]; break; - case 2: - for (int i = 0; i < m; ++i) + case BITS_32: + for (int i = 0; i < dstCount; ++i) v32x8(m_SPP[0])[v16x16(m_SP[1])[i] % 8] = v16x16(m_SP[0])[i]; break; - case 3: - for (int i = 0; i < m; ++i) + case BITS_64: + for (int i = 0; i < dstCount; ++i) v64x4(m_SPP[0])[v16x16(m_SP[1])[i] % 4] = v16x16(m_SP[0])[i]; break; default: throwBadInstruction(); } - case 2: + case BITS_32: - switch (u) + switch (dstWidth) { - case 0: - for (int i = 0; i < m; ++i) + case BITS_8: + for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v32x8(m_SP[1])[i] % 32] = v32x8(m_SP[0])[i]; break; - case 1: - for (int i = 0; i < m; ++i) + case BITS_16: + for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v32x8(m_SP[1])[i] % 16] = v32x8(m_SP[0])[i]; break; - case 2: - for (int i = 0; i < m; ++i) + case BITS_32: + for (int i = 0; i < dstCount; ++i) v32x8 (m_SPP[0])[v32x8(m_SP[1])[i] % 8] = v32x8(m_SP[0])[i]; break; - case 3: - for (int i = 0; i < m; ++i) + case BITS_64: + for (int i = 0; i < dstCount; ++i) v64x4 (m_SPP[0])[v32x8(m_SP[1])[i] % 4] = v32x8(m_SP[0])[i]; break; default: throwBadInstruction(); } - case 3: + case BITS_64: - switch (u) + switch (dstWidth) { - case 0: - for (int i = 0; i < m; ++i) + case BITS_8: + for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v64x4(m_SP[1])[i] % 32] = v64x4(m_SP[0])[i]; break; - case 1: - for (int i = 0; i < m; ++i) + case BITS_16: + for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v64x4(m_SP[1])[i] % 16] = v64x4(m_SP[0])[i]; break; - case 2: - for (int i = 0; i < m; ++i) + case BITS_32: + for (int i = 0; i < dstCount; ++i) v32x8 (m_SPP[0])[v64x4(m_SP[1])[i] % 8] = v64x4(m_SP[0])[i]; break; - case 3: - for (int i = 0; i < m; ++i) + case BITS_64: + for (int i = 0; i < dstCount; ++i) v64x4 (m_SPP[0])[v64x4(m_SP[1])[i] % 4] = v64x4(m_SP[0])[i]; break; default: @@ -650,73 +648,72 @@ void VM::xput(uint8_t _b, uint8_t _c) } } -void VM::xswizzle(uint8_t _b) +void VM::xswizzle(uint8_t _type) { - // n type t elements in source and mask vectors - uint8_t const n = nElem(_b); - uint8_t const t = elemType(_b); + uint8_t const count = laneCount(_type); + uint8_t const width = laneWidth(_type); // given the type of the source and mask // for every index in the mask copy out the indexed value in the source - switch (t) + switch (width) { - case 0: - for (int i = 0; i < n; ++i) - v8x32 (m_SPP[0])[i] = v8x32(m_SP[1]) [v8x32 (m_SP[0])[i] % n]; + case 
BITS_8: + for (int i = 0; i < count; ++i) + v8x32 (m_SPP[0])[i] = v8x32(m_SP[1]) [v8x32 (m_SP[0])[i] % count]; break; - case 1: - for (int i = 0; i < n; ++i) - v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % n]; + case BITS_16: + for (int i = 0; i < count; ++i) + v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % count]; break; - case 2: - for (int i = 0; i < n; ++i) - v32x8 (m_SPP[0])[i] = v32x8(m_SP[1]) [v32x8 (m_SP[0])[i] % n]; + case BITS_32: + for (int i = 0; i < count; ++i) + v32x8 (m_SPP[0])[i] = v32x8(m_SP[1]) [v32x8 (m_SP[0])[i] % count]; break; - case 3: - for (int i = 0; i < n; ++i) - v64x4 (m_SPP[0])[i] = v64x4(m_SP[1]) [v64x4 (m_SP[0])[i] % n]; + case BITS_64: + for (int i = 0; i < count; ++i) + v64x4 (m_SPP[0])[i] = v64x4(m_SP[1]) [v64x4 (m_SP[0])[i] % count]; break; default: throwBadInstruction(); } } -void VM::xshuffle(uint8_t _b) +void VM::xshuffle(uint8_t _type) { // n type t elements in source and mask vectors - uint8_t const n = nElem(_b); - uint8_t const t = elemType(_b); + uint8_t const count = laneCount(_type); + uint8_t const width = laneWidth(_type); // given the type of the sources and mask // for every index in the mask copy out the indexed value in one of the sources - switch (t) + switch (width) { - case 0: - for (int i = 0; i < n; ++i) + case BITS_8: + for (int i = 0; i < count; ++i) { int j = v8x32(m_SP[0]) [i]; - v8x32 (m_SPP[0])[i] = j < n ? v8x32(m_SP[2]) [j] : v8x32 (m_SP[2])[j % n]; + v8x32 (m_SPP[0])[i] = j < count ? v8x32(m_SP[1]) [j] : v8x32 (m_SP[2])[(j - count) % count]; } break; - case 1: - for (int i = 0; i < n; ++i) + case BITS_16: + for (int i = 0; i < count; ++i) { int j = v16x16(m_SP[0])[i]; - v16x16(m_SPP[0])[i] = j < n ? v16x16(m_SP[2])[j] : v16x16(m_SP[2])[j % n]; + v16x16(m_SPP[0])[i] = j < count ? v16x16(m_SP[1])[j] : v16x16(m_SP[2])[(j - count) % count]; } break; - case 2: - for (int i = 0; i < n; ++i) + case BITS_32: + for (int i = 0; i < count; ++i) { int j = v32x8(m_SP[0]) [i]; - v32x8 (m_SPP[0])[i] = j < n ? v32x8(m_SP[2]) [j] : v32x8 (m_SP[2])[j % n]; + v32x8 (m_SPP[0])[i] = j < count ? v32x8(m_SP[1]) [j] : v32x8 (m_SP[2])[(j - count) % count]; } break; - case 3: - for (int i = 0; i < n; ++i) + case BITS_64: + for (int i = 0; i < count; ++i) { int j = v64x4(m_SP[0]) [i]; - v64x4 (m_SPP[0])[i] = j < n ? v64x4(m_SP[2]) [j] : v64x4 (m_SP[2])[j % n]; + v64x4 (m_SPP[0])[i] = j < count ? 
v64x4(m_SP[1]) [j] : v64x4 (m_SP[2])[(j - count) % count]; } break; default: From 4764f1440337c4a0abde1568efea87fe82f46b7e Mon Sep 17 00:00:00 2001 From: gcolvin Date: Thu, 13 Jul 2017 23:07:32 -0600 Subject: [PATCH 5/6] name nits --- libevm/VMSIMD.cpp | 188 +++++++++++++++++++++++----------------------- 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/libevm/VMSIMD.cpp b/libevm/VMSIMD.cpp index 183e51cbf95..ea4e241ca7f 100755 --- a/libevm/VMSIMD.cpp +++ b/libevm/VMSIMD.cpp @@ -32,16 +32,16 @@ using a64x4 = uint64_t[4]; using a32x8 = uint32_t[8]; using a16x16 = uint16_t[16]; using a8x32 = uint8_t [32]; -inline a64x4 & v64x4 (u256 & _stack_item) { return (a64x4&) *(a64x4*) &_stack_item; } -inline a32x8 & v32x8 (u256 & _stack_item) { return (a32x8&) *(a32x8*) &_stack_item; } -inline a16x16 & v16x16(u256 & _stack_item) { return (a16x16&)*(a16x16*)&_stack_item; } -inline a8x32 & v8x32 (u256 & _stack_item) { return (a8x32&) *(a8x32*) &_stack_item; } -inline a64x4 const& v64x4 (u256 const& _stack_item) { return (a64x4&) *(a64x4*) &_stack_item; } -inline a32x8 const& v32x8 (u256 const& _stack_item) { return (a32x8&) *(a32x8*) &_stack_item; } -inline a16x16 const& v16x16(u256 const& _stack_item) { return (a16x16&)*(a16x16*)&_stack_item; } -inline a8x32 const& v8x32 (u256 const& _stack_item) { return (a8x32&) *(a8x32*) &_stack_item; } +inline a64x4 & v64x4 (u256 & _stackItem) { return (a64x4&) *(a64x4*) &_stackItem; } +inline a32x8 & v32x8 (u256 & _stackItem) { return (a32x8&) *(a32x8*) &_stackItem; } +inline a16x16 & v16x16(u256 & _stackItem) { return (a16x16&)*(a16x16*)&_stackItem; } +inline a8x32 & v8x32 (u256 & _stackItem) { return (a8x32&) *(a8x32*) &_stackItem; } +inline a64x4 const& v64x4 (u256 const& _stackItem) { return (a64x4&) *(a64x4*) &_stackItem; } +inline a32x8 const& v32x8 (u256 const& _stackItem) { return (a32x8&) *(a32x8*) &_stackItem; } +inline a16x16 const& v16x16(u256 const& _stackItem) { return (a16x16&)*(a16x16*)&_stackItem; } +inline a8x32 const& v8x32 (u256 const& _stackItem) { return (a8x32&) *(a8x32*) &_stackItem; } -class Pow2Bits enum { BITS_8, BITS_16, BITS_32, BITS_64 }; +enum { Bits8, Bits16, Bits32, Bits64 }; // tried using template template functions, gave up fighting the compiler after a day #define EVALXOPS(OP, _type) EVALXOP(OP, int8_t, int16_t, int32_t, int64_t, _type) @@ -52,19 +52,19 @@ class Pow2Bits enum { BITS_8, BITS_16, BITS_32, BITS_64 }; m_SPP[0] = 0; \ switch (t) \ { \ - case BITS_8: \ + case Bits8: \ for (int i = 0; i < 32; ++i) \ v8x32(m_SPP[0])[i] = (uint8_t) OP((T8) v8x32(m_SP[0])[i], (T8) v8x32(m_SP[1])[i]); \ break; \ - case BITS_16: \ + case Bits16: \ for (int i = 0; i < 16; ++i) \ v16x16(m_SPP[0])[i] = (uint16_t)OP((T16)v16x16(m_SP[0])[i], (T16)v16x16(m_SP[1])[i]); \ break; \ - case BITS_32: \ + case Bits32: \ for (int i = 0; i < 8; ++i) \ v32x8(m_SPP[0])[i] = (uint32_t)OP((T32)v32x8(m_SP[0])[i], (T32)v32x8(m_SP[1])[i]); \ break; \ - case BITS_64: \ + case Bits64: \ for (int i = 0; i < 4; ++i) \ v64x4(m_SPP[0])[i] = (uint64_t)OP((T64)v64x4(m_SP[0])[i], (T64)v64x4(m_SP[1])[i]); \ break; \ @@ -136,28 +136,28 @@ u256 VM::vtow(uint8_t _type, const u256& _in) uint8_t const width = laneWidth(_type); switch (width) { - case BITS_8: + case Bits8: for (int i = count - 1; 0 <= i; --i) { out << 8; out |= v8x32(_in) [i]; } break; - case BITS_16: + case Bits16: for (int i = count - 1; 0 <= i; --i) { out << 16; out |= v16x16(_in)[i]; } break; - case BITS_32: + case Bits32: for (int i = count - 1; 0 <= i; --i) { out << 32; out |= v32x8(_in) 
[i]; } break; - case BITS_64: + case Bits64: for (int i = count - 1; 0 <= i; --i) { out << 64; @@ -171,37 +171,37 @@ u256 VM::vtow(uint8_t _type, const u256& _in) } // out must be by reference because it is really just memory for a vector -void VM::wtov(uint8_t _type, u256 _in, u256& _o_out) +void VM::wtov(uint8_t _type, u256 _in, u256& o_out) { uint8_t const count = laneCount(_type); uint8_t const width = laneWidth(_type); switch (width) { - case BITS_8: + case Bits8: for (int i = count - 1; 0 <= i; --i) { - v8x32(_o_out) [i] = (uint8_t )(_in & 0xff); + v8x32(o_out) [i] = (uint8_t )(_in & 0xff); _in >>= 8; } break; - case BITS_16: + case Bits16: for (int i = count - 1; 0 <= i; --i) { - v16x16(_o_out)[i] = (uint16_t)(_in & 0xffff); + v16x16(o_out)[i] = (uint16_t)(_in & 0xffff); _in >>= 16; } break; - case BITS_32: + case Bits32: for (int i = count - 1; 0 <= i; --i) { - v32x8(_o_out) [i] = (uint32_t)(_in & 0xffffff); + v32x8(o_out) [i] = (uint32_t)(_in & 0xffffff); _in >>= 32; } break; - case BITS_64: + case Bits64: for (int i = count - 1; 0 <= i; --i) { - v64x4(_o_out) [i] = (uint64_t)(_in & 0xffffffff); + v64x4(o_out) [i] = (uint64_t)(_in & 0xffffffff); _in >>= 64; } break; @@ -219,7 +219,7 @@ void VM::xmload (uint8_t _type) switch (width) { - case BITS_8: + case Bits8: for (int j = 1, i = count - 1; 0 <= i; --i) { int v = 0; @@ -227,7 +227,7 @@ void VM::xmload (uint8_t _type) v8x32(m_SPP[0])[i] = v; } break; - case BITS_16: + case Bits16: for (int j = 2, i = count - 1; 0 <= i; --i) { int v = 0; @@ -237,7 +237,7 @@ void VM::xmload (uint8_t _type) v16x16(m_SPP[0])[i] = v; } break; - case BITS_32: + case Bits32: for (int j = 4, i = count - 1; 0 <= i; --i) { int v = 0; @@ -251,7 +251,7 @@ void VM::xmload (uint8_t _type) v32x8(m_SPP[0])[i] = v; } break; - case BITS_64: + case Bits64: for (int j = 8, i = count - 1; 0 <= i; --i) { int v = 0; @@ -288,7 +288,7 @@ void VM::xmstore(uint8_t _type) switch (width) { - case BITS_8: + case Bits8: for (int j = 1, i = count - 1; 0 <= i; --i) { int v = 0; @@ -296,7 +296,7 @@ void VM::xmstore(uint8_t _type) p[--j] = (uint8_t)v; } break; - case BITS_16: + case Bits16: for (int j = 2, i = count - 1; 0 <= i; --i) { int v = 2; @@ -306,7 +306,7 @@ void VM::xmstore(uint8_t _type) p[--j] = (uint8_t)v; } break; - case BITS_32: + case Bits32: for (int j = 4, i = count - 1; 0 <= i; --i) { int v = 4; @@ -320,7 +320,7 @@ void VM::xmstore(uint8_t _type) p[--j] = (uint8_t)v; } break; - case BITS_64: + case Bits64: for (int j = 8, i = count - 1; 0 <= i; --i) { int v = 0; @@ -383,13 +383,13 @@ void VM::xpush(uint8_t _type) m_SPP[0] = 0; switch (width) { - case BITS_8: + case Bits8: for (int i = 0; i < count ; ++i) { v8x32(m_SPP[0])[i] = m_code[++m_PC]; } break; - case BITS_16: + case Bits16: for (int i = 0; i < count; ++i) { uint16_t v = m_code[++m_PC]; @@ -397,7 +397,7 @@ void VM::xpush(uint8_t _type) v16x16(m_SPP[0])[i] = v; } break; - case BITS_32: + case Bits32: for (int i = 0; i < count; ++i) { uint32_t v = m_code[m_PC]; @@ -407,7 +407,7 @@ void VM::xpush(uint8_t _type) v32x8(m_SPP[0])[i] = v; } break; - case BITS_64: + case Bits64: for (int i = 0; i < count; ++i) { uint64_t v = m_code[++m_PC]; @@ -426,33 +426,33 @@ void VM::xpush(uint8_t _type) } } -void VM::xget(uint8_t _src_type, uint8_t _idx_type) +void VM::xget(uint8_t _srcType, uint8_t _idxType) { - uint8_t const srcWidth = laneWidth(_src_type); - uint8_t const idxCount = laneCount(_idx_type); - uint8_t const idxWidth = laneWidth(_idx_type); + uint8_t const srcWidth = laneWidth(_srcType); + uint8_t const 
idxCount = laneCount(_idxType); + uint8_t const idxWidth = laneWidth(_idxType); // given the type of the source and index // for every element of the index get the indexed element from the source switch (srcWidth) { - case BITS_8: + case Bits8: switch (idxWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < idxCount; ++i) v8x32 (m_SPP[1])[i] = v8x32(m_SP[0])[v8x32 (m_SP[1])[i] % idxCount]; break; - case BITS_16: + case Bits16: for (int i = 0; i< idxCount; ++i) v16x16(m_SPP[1])[i] = v8x32(m_SP[0])[v16x16(m_SP[1])[i] % idxCount]; break; - case BITS_32: + case Bits32: for (int i = 0; i< idxCount; ++i) v32x8 (m_SPP[1])[i] = v8x32(m_SP[0])[v32x8 (m_SP[1])[i] % idxCount]; break; - case BITS_64: + case Bits64: for (int i = 0; i< idxCount; ++i) v64x4 (m_SPP[1])[i] = v8x32(m_SP[0])[v64x4 (m_SP[1])[i] % idxCount]; break; @@ -460,23 +460,23 @@ void VM::xget(uint8_t _src_type, uint8_t _idx_type) throwBadInstruction(); } - case BITS_16: + case Bits16: switch (idxWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < idxCount; ++i) v8x32 (m_SPP[0])[i] = v16x16(m_SP[1])[v8x32 (m_SP[0])[i] % idxCount]; break; - case BITS_16: + case Bits16: for (int i = 0; i < idxCount; ++i) v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % idxCount]; break; - case BITS_32: + case Bits32: for (int i = 0; i < idxCount; ++i) v32x8 (m_SPP[0])[i] = v16x16(m_SP[1])[v32x8 (m_SP[0])[i] % idxCount]; break; - case BITS_64: + case Bits64: for (int i = 0; i < idxCount; ++i) v64x4 (m_SPP[0])[i] = v16x16(m_SP[1])[v64x4 (m_SP[0])[i] % idxCount]; break; @@ -484,23 +484,23 @@ void VM::xget(uint8_t _src_type, uint8_t _idx_type) throwBadInstruction(); } - case BITS_32: + case Bits32: switch (idxWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < idxCount; ++i) v8x32 (m_SPP[0])[i] = v32x8(m_SP[1])[v8x32 (m_SP[0])[i] % idxCount]; break; - case BITS_16: + case Bits16: for (int i = 0; i < idxCount; ++i) v16x16(m_SPP[0])[i] = v32x8(m_SP[1])[v16x16(m_SP[0])[i] % idxCount]; break; - case BITS_32: + case Bits32: for (int i = 0; i < idxCount; ++i) v32x8 (m_SPP[0])[i] = v32x8(m_SP[1])[v32x8 (m_SP[0])[i] % idxCount]; break; - case BITS_64: + case Bits64: for (int i = 0; i < idxCount; ++i) v64x4 (m_SPP[0])[i] = v32x8(m_SP[1])[v64x4 (m_SP[0])[i] % idxCount]; break; @@ -508,23 +508,23 @@ void VM::xget(uint8_t _src_type, uint8_t _idx_type) throwBadInstruction(); } - case BITS_64: + case Bits64: switch (idxWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < idxCount; ++i) v8x32 (m_SPP[0])[i] = v64x4(m_SP[1])[v8x32 (m_SP[0])[i] % idxCount]; break; - case BITS_16: + case Bits16: for (int i = 0; i < idxCount; ++i) v16x16(m_SPP[0])[i] = v64x4(m_SP[1])[v16x16(m_SP[0])[i] % idxCount]; break; - case BITS_32: + case Bits32: for (int i = 0; i < idxCount; ++i) v32x8 (m_SPP[0])[i] = v64x4(m_SP[1])[v32x8 (m_SP[0])[i] % idxCount]; break; - case BITS_64: + case Bits64: for (int i = 0; i < idxCount; ++i) v64x4 (m_SPP[0])[i] = v64x4(m_SP[1])[v64x4 (m_SP[0])[i] % idxCount]; break; @@ -537,33 +537,33 @@ void VM::xget(uint8_t _src_type, uint8_t _idx_type) } } -void VM::xput(uint8_t _src_type, uint8_t _dst_type) +void VM::xput(uint8_t _srcType, uint8_t _dstType) { - uint8_t const srcWidth = laneWidth(_src_type); - uint8_t const dstCount = laneCount(_dst_type); - uint8_t const dstWidth = laneWidth(_dst_type); + uint8_t const srcWidth = laneWidth(_srcType); + uint8_t const dstCount = laneCount(_dstType); + uint8_t const dstWidth = laneWidth(_dstType); // given the type of the source, destination and index // for every element of the index put the 
indexed replacement in the destination switch (srcWidth) { - case BITS_8: + case Bits8: switch (dstWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v8x32(m_SP[1])[i] % 32] = v8x32(m_SP[0])[i]; break; - case BITS_16: + case Bits16: for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v8x32(m_SP[1])[i] % 16] = v8x32(m_SP[0])[i]; break; - case BITS_32: + case Bits32: for (int i = 0; i < dstCount; ++i) v32x8 (m_SPP[0])[v8x32(m_SP[1])[i] % 8] = v8x32(m_SP[0])[i]; break; - case BITS_64: + case Bits64: for (int i = 0; i < dstCount; ++i) v64x4 (m_SPP[0])[v8x32(m_SP[1])[i] % 4] = v8x32(m_SP[0])[i]; break; @@ -571,23 +571,23 @@ void VM::xput(uint8_t _src_type, uint8_t _dst_type) throwBadInstruction(); } - case BITS_16: + case Bits16: switch (dstWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v16x16(m_SP[1])[i] % 32] = v16x16(m_SP[0])[i]; break; - case BITS_16: + case Bits16: for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v16x16(m_SP[1])[i] % 16] = v16x16(m_SP[0])[i]; break; - case BITS_32: + case Bits32: for (int i = 0; i < dstCount; ++i) v32x8(m_SPP[0])[v16x16(m_SP[1])[i] % 8] = v16x16(m_SP[0])[i]; break; - case BITS_64: + case Bits64: for (int i = 0; i < dstCount; ++i) v64x4(m_SPP[0])[v16x16(m_SP[1])[i] % 4] = v16x16(m_SP[0])[i]; break; @@ -595,23 +595,23 @@ void VM::xput(uint8_t _src_type, uint8_t _dst_type) throwBadInstruction(); } - case BITS_32: + case Bits32: switch (dstWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v32x8(m_SP[1])[i] % 32] = v32x8(m_SP[0])[i]; break; - case BITS_16: + case Bits16: for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v32x8(m_SP[1])[i] % 16] = v32x8(m_SP[0])[i]; break; - case BITS_32: + case Bits32: for (int i = 0; i < dstCount; ++i) v32x8 (m_SPP[0])[v32x8(m_SP[1])[i] % 8] = v32x8(m_SP[0])[i]; break; - case BITS_64: + case Bits64: for (int i = 0; i < dstCount; ++i) v64x4 (m_SPP[0])[v32x8(m_SP[1])[i] % 4] = v32x8(m_SP[0])[i]; break; @@ -619,23 +619,23 @@ void VM::xput(uint8_t _src_type, uint8_t _dst_type) throwBadInstruction(); } - case BITS_64: + case Bits64: switch (dstWidth) { - case BITS_8: + case Bits8: for (int i = 0; i < dstCount; ++i) v8x32 (m_SPP[0])[v64x4(m_SP[1])[i] % 32] = v64x4(m_SP[0])[i]; break; - case BITS_16: + case Bits16: for (int i = 0; i < dstCount; ++i) v16x16(m_SPP[0])[v64x4(m_SP[1])[i] % 16] = v64x4(m_SP[0])[i]; break; - case BITS_32: + case Bits32: for (int i = 0; i < dstCount; ++i) v32x8 (m_SPP[0])[v64x4(m_SP[1])[i] % 8] = v64x4(m_SP[0])[i]; break; - case BITS_64: + case Bits64: for (int i = 0; i < dstCount; ++i) v64x4 (m_SPP[0])[v64x4(m_SP[1])[i] % 4] = v64x4(m_SP[0])[i]; break; @@ -657,19 +657,19 @@ void VM::xswizzle(uint8_t _type) // for every index in the mask copy out the indexed value in the source switch (width) { - case BITS_8: + case Bits8: for (int i = 0; i < count; ++i) v8x32 (m_SPP[0])[i] = v8x32(m_SP[1]) [v8x32 (m_SP[0])[i] % count]; break; - case BITS_16: + case Bits16: for (int i = 0; i < count; ++i) v16x16(m_SPP[0])[i] = v16x16(m_SP[1])[v16x16(m_SP[0])[i] % count]; break; - case BITS_32: + case Bits32: for (int i = 0; i < count; ++i) v32x8 (m_SPP[0])[i] = v32x8(m_SP[1]) [v32x8 (m_SP[0])[i] % count]; break; - case BITS_64: + case Bits64: for (int i = 0; i < count; ++i) v64x4 (m_SPP[0])[i] = v64x4(m_SP[1]) [v64x4 (m_SP[0])[i] % count]; break; @@ -688,28 +688,28 @@ void VM::xshuffle(uint8_t _type) // for every index in the mask copy out the indexed value in one of the sources switch (width) { - 
case BITS_8: + case Bits8: for (int i = 0; i < count; ++i) { int j = v8x32(m_SP[0]) [i]; v8x32 (m_SPP[0])[i] = j < count ? v8x32(m_SP[1]) [j] : v8x32 (m_SP[2])[(j - count) % count]; } break; - case BITS_16: + case Bits16: for (int i = 0; i < count; ++i) { int j = v16x16(m_SP[0])[i]; v16x16(m_SPP[0])[i] = j < count ? v16x16(m_SP[1])[j] : v16x16(m_SP[2])[(j - count) % count]; } break; - case BITS_32: + case Bits32: for (int i = 0; i < count; ++i) { int j = v32x8(m_SP[0]) [i]; v32x8 (m_SPP[0])[i] = j < count ? v32x8(m_SP[1]) [j] : v32x8 (m_SP[2])[(j - count) % count]; } break; - case BITS_64: + case Bits64: for (int i = 0; i < count; ++i) { int j = v64x4(m_SP[0]) [i]; From e7ca6e72424119fc8cbdcd67f71f285a7ca71496 Mon Sep 17 00:00:00 2001 From: gcolvin Date: Fri, 14 Jul 2017 16:28:10 -0600 Subject: [PATCH 6/6] more name nits --- libevm/VMSIMD.cpp | 87 ++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/libevm/VMSIMD.cpp b/libevm/VMSIMD.cpp index ea4e241ca7f..b3614e49fe6 100755 --- a/libevm/VMSIMD.cpp +++ b/libevm/VMSIMD.cpp @@ -32,6 +32,7 @@ using a64x4 = uint64_t[4]; using a32x8 = uint32_t[8]; using a16x16 = uint16_t[16]; using a8x32 = uint8_t [32]; + inline a64x4 & v64x4 (u256 & _stackItem) { return (a64x4&) *(a64x4*) &_stackItem; } inline a32x8 & v32x8 (u256 & _stackItem) { return (a32x8&) *(a32x8*) &_stackItem; } inline a16x16 & v16x16(u256 & _stackItem) { return (a16x16&)*(a16x16*)&_stackItem; } @@ -112,6 +113,8 @@ void VM::xshl (uint8_t _type) { EVALXOPU(SHL, _type); } void VM::xrol (uint8_t _type) { EVALXOPU(ROL, _type); } void VM::xror (uint8_t _type) { EVALXOPU(ROR, _type); } +// SIMD type encodes log base 2 of lane width and count - one in each nibble +// inline uint8_t pow2N(uint8_t _n) { static uint8_t exp[6] = { 1, 2, 4, 8, 16, 32 }; @@ -120,12 +123,12 @@ inline uint8_t pow2N(uint8_t _n) inline uint8_t laneCount(uint8_t _type) { - return pow2N((_type) & 0xf); + return pow2N(_type & 0xf); } inline uint8_t laneWidth(uint8_t _type) { - return (_type) >> 4; + return pow2N(_type >> 4); } // in must be by reference because it is really just memory for a vector @@ -213,63 +216,63 @@ void VM::wtov(uint8_t _type, u256 _in, u256& o_out) void VM::xmload (uint8_t _type) { // goes onto stack element by element, LSB first - uint8_t const* p = m_mem.data() + toInt15(m_SP[0]); + uint8_t const* vecData = m_mem.data() + toInt15(m_SP[0]); uint8_t const count = laneCount(_type); uint8_t const width = laneWidth(_type); switch (width) { case Bits8: - for (int j = 1, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 0; - v |= p[--j]; + v |= vecData[--j]; v8x32(m_SPP[0])[i] = v; } break; case Bits16: - for (int j = 2, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 0; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v16x16(m_SPP[0])[i] = v; } break; case Bits32: - for (int j = 4, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 0; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v32x8(m_SPP[0])[i] = v; } break; case Bits64: - for (int j = 8, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 0; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + 
v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v <<= 8; - v |= p[--j]; + v |= vecData[--j]; v64x4(m_SPP[0])[i] = v; } break; @@ -282,64 +285,64 @@ void VM::xmstore(uint8_t _type) { // n bytes of type t elements in stack vector // goes onto memory by element, LSB first - uint8_t *p = m_mem.data() + toInt15(m_SP[0]); + uint8_t* vecData = m_mem.data() + toInt15(m_SP[0]); uint8_t const count = laneCount(_type); uint8_t const width = laneWidth(_type); switch (width) { case Bits8: - for (int j = 1, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 0; v = v8x32(m_SPP[0])[i]; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; } break; case Bits16: - for (int j = 2, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 2; v = v16x16(m_SPP[0])[i]; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; } break; case Bits32: - for (int j = 4, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 4; v = v32x8(m_SPP[0])[i]; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; } break; case Bits64: - for (int j = 8, i = count - 1; 0 <= i; --i) + for (int j = count, i = count - 1; 0 <= i; --i) { int v = 0; v = v64x4(m_SPP[0])[i]; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; v >>= 8; - p[--j] = (uint8_t)v; + vecData[--j] = (uint8_t)v; } break; default:
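
To make the "log base 2 ... one in each nibble" comment introduced above concrete, here is a small standalone decoding of the SIMD type operand, again illustrative only and not part of the patch. pow2N and laneCount mirror the helpers in VMSIMD.cpp, while widthBits and the 0x23 example operand are inventions of this sketch; the low nibble is read as the lane-count exponent and the high nibble as the lane-width exponent (0 through 3 selecting 8-, 16-, 32- or 64-bit lanes), which is how the earlier commits in this series use it.

// Illustrative sketch only, not part of the patch series.
#include <cstdint>
#include <cstdio>

static uint8_t pow2N(uint8_t n)
{
    static uint8_t const exp[6] = {1, 2, 4, 8, 16, 32};
    return exp[n];
}

static uint8_t  laneCount(uint8_t type) { return pow2N(type & 0x0f); }   // low nibble: log2(lane count)
static unsigned widthBits(uint8_t type) { return 8u << (type >> 4); }    // high nibble: 0..3 -> 8..64 bits

int main()
{
    uint8_t type = 0x23;   // hypothetical operand: high nibble 2 (32-bit lanes), low nibble 3 (2^3 = 8 lanes)
    std::printf("%u lanes of %u bits\n", (unsigned)laneCount(type), widthBits(type));
    return 0;
}
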