remove FpM::init

herumi · Jun 28, 2024 · 4e172b0 · 4e172b0
1 parent 7ab194f
commit 4e172b0
Show file tree

Hide file tree

Showing 5 changed files with 91 additions and 72 deletions.
diff --git a/include/mcl/bn.hpp b/include/mcl/bn.hpp
@@ -2283,7 +2283,6 @@ inline void init(bool *pb, const mcl::CurveParam& cp = mcl::BN254, fp::Mode mode
 	mcl::msm::Func func;
 	func.fp = &Fp::getOp();
 	func.fr = &Fr::getOp();
-	func.rw = local::GLV1::rw.getUnit();
 	func.invVecFp = mcl::msm::invVecFpFunc(mcl::invVec<mcl::bn::Fp>);
 	func.normalizeVecG1 = mcl::msm::normalizeVecG1Func(mcl::ec::normalizeVec<mcl::bn::G1>);
 #if defined(__GNUC__) && !defined(__EMSCRIPTEN__) && !defined(__clang__)

diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
@@ -445,7 +445,6 @@ typedef void (*clearG1Func)(G1A& z);
 struct Func {
 	const mcl::fp::Op *fp;
 	const mcl::fp::Op *fr;
-	const Unit *rw;
 	invVecFpFunc invVecFp;
 	normalizeVecG1Func normalizeVecG1;
 	addG1Func addG1;

diff --git a/src/gen_msm_para.py b/src/gen_msm_para.py
@@ -78,21 +78,30 @@ def expandN(name, v):
 def putCode(curve, mont):
   print('// generated by src/gen_msm_para.py')
   print(f'static const uint64_t g_mask = {hex(mont.mask)};')
-  expand("g_vmask_", mont.mask)
-  expand("g_vrp_", mont.rp)
-  expandN('g_vpN_', toArray(curve.p))
-  expandN('g_vR_', toArray(mont.R)) # Fp:M::one()
-  expandN('g_vR2_', toArray(mont.R2)) # Fp:M::R2()
+  expand("g_mask_", mont.mask)
+  expand("g_rp_", mont.rp)
+  expandN('g_ap_', toArray(curve.p)) # array of p
+  expandN('g_R_', toArray(mont.R)) # FpM::one()
+  expandN('g_R2_', toArray(mont.R2)) # FpM::R2()
   expandN('g_m64to52_', toArray(mont.toMont(2**32)))
   expandN('g_m52to64_', toArray(mont.toMont(pow(2**32, -1, curve.p))))
-  expandN('g_vrawOne_', toArray(1)) # Fp:M::rawOne()
-  expand("g_offset_", [0, 1, 2, 3, 4, 5, 6, 7, 8])
+  expandN('g_zero_', toArray(0)) # FpM::zero()
+  expandN('g_rawOne_', toArray(1)) # FpM::rawOne()
+  expand('g_offset_', [0, 1, 2, 3, 4, 5, 6, 7, 8])
+  p = curve.p
+  rw = pow(-3, (p+1)//4, p)
+  rw = p-(rw+1)//2
+  if (rw*rw+rw+1)%p != 0:
+    print(f'ERR rw {rw=}')
+    return
+  print(f'// rw={hex(rw)}')
+  expandN('g_rw_', toArray(mont.toMont(rw)))
 
   print(f'''
 struct G {{
-	static const Vec& vmask() {{ return *(const Vec*)g_vmask_; }}
-	static const Vec& vrp() {{ return *(const Vec*)g_vrp_; }}
-	static const Vec* vpN() {{ return (const Vec*)g_vpN_; }}
+	static const Vec& mask() {{ return *(const Vec*)g_mask_; }}
+	static const Vec& rp() {{ return *(const Vec*)g_rp_; }}
+	static const Vec* ap() {{ return (const Vec*)g_ap_; }}
 	static const Vec& offset() {{ return *(const Vec*)g_offset_; }}
 }};
 ''')

diff --git a/src/msm_avx.cpp b/src/msm_avx.cpp
@@ -137,7 +137,7 @@ inline void vrawAdd(Vec *z, const Vec *x, const Vec *y)
 {
 	Vec t = vpaddq(x[0], y[0]);
 	Vec c = vpsrlq(t, W);
-	z[0] = vpandq(t, G::vmask());
+	z[0] = vpandq(t, G::mask());
 
 	for (size_t i = 1; i < n; i++) {
 		t = vpaddq(x[i], y[i]);
@@ -147,7 +147,7 @@ inline void vrawAdd(Vec *z, const Vec *x, const Vec *y)
 			return;
 		}
 		c = vpsrlq(t, W);
-		z[i] = vpandq(t, G::vmask());
+		z[i] = vpandq(t, G::mask());
 	}
 }
 
@@ -156,12 +156,12 @@ inline Vmask vrawSub(Vec *z, const Vec *x, const Vec *y)
 {
 	Vec t = vpsubq(x[0], y[0]);
 	Vec c = vpsrlq(t, S);
-	z[0] = vpandq(t, G::vmask());
+	z[0] = vpandq(t, G::mask());
 	for (size_t i = 1; i < n; i++) {
 		t = vpsubq(x[i], y[i]);
 		t = vpsubq(t, c);
 		c = vpsrlq(t, S);
-		z[i] = vpandq(t, G::vmask());
+		z[i] = vpandq(t, G::mask());
 	}
 	return vcmpneqq(c, vzero());
 }
@@ -177,16 +177,16 @@ inline void uvadd(Vec *z, const Vec *x, const Vec *y)
 {
 	Vec sN[N], tN[N];
 	vrawAdd(sN, x, y);
-	Vmask c = vrawSub(tN, sN, G::vpN());
+	Vmask c = vrawSub(tN, sN, G::ap());
 	uvselect(z, c, sN, tN);
 }
 
 // Lane-wise modular subtraction: computes x - y into z, choosing per lane
 // between the raw difference and the raw difference plus p.
 // G::ap() is the generated limb table of the modulus p.
 inline void uvsub(Vec *z, const Vec *x, const Vec *y)
 {
 	Vec sN[N], tN[N];
 	// sN = x - y limb by limb; c is the per-lane mask returned by vrawSub
 	// (set where its final borrow word is non-zero).
 	Vmask c = vrawSub(sN, x, y);
 	// tN = sN + p, then the top limb is masked explicitly.
 	// NOTE(review): presumably needed because vrawAdd leaves its last limb
 	// unmasked - confirm against vrawAdd.
-	vrawAdd(tN, sN, G::vpN());
-	tN[N-1] = vpandq(tN[N-1], G::vmask());
+	vrawAdd(tN, sN, G::ap());
+	tN[N-1] = vpandq(tN[N-1], G::mask());
 	// Per lane, pick tN or sN according to c
 	// (presumably tN where c is set, i.e. where x < y - confirm against uvselect).
 	uvselect(z, c, tN, sN);
 }
 
@@ -293,15 +293,15 @@ inline void vset(Vec *t, const Vmask& c, const Vec a[n])
 // Montgomery reduction over vector lanes: reduces the 2N-limb value in xy to
 // the N-limb result z. xy is used as scratch space and is modified.
 inline void uvmont(Vec z[N], Vec xy[N*2])
 {
 	for (size_t i = 0; i < N; i++) {
 		// q is derived from limb i and the generated constant rp; q * p is then
 		// accumulated into xy starting at limb i, and the value returned by
 		// vrawMulUnitAdd is added into limb N+i.
 		// NOTE(review): presumably rp = -p^{-1} mod 2^W so that limb i becomes
 		// 0 mod 2^W - confirm against the generator script.
-		Vec q = vmulL(xy[i], G::vrp());
-		xy[N+i] = vpaddq(xy[N+i], vrawMulUnitAdd(xy+i, G::vpN(), q));
+		Vec q = vmulL(xy[i], G::rp());
+		xy[N+i] = vpaddq(xy[N+i], vrawMulUnitAdd(xy+i, G::ap(), q));
 		// Push the bits above W of limb i into limb i+1.
 		xy[i+1] = vpaddq(xy[i+1], vpsrlq(xy[i], W));
 	}
 	// Normalize the upper half: propagate carries upward and mask each limb
 	// except the last one (index 2N-1), which is left unmasked.
 	for (size_t i = N; i < N*2-1; i++) {
 		xy[i+1] = vpaddq(xy[i+1], vpsrlq(xy[i], W));
-		xy[i] = vpandq(xy[i], G::vmask());
+		xy[i] = vpandq(xy[i], G::mask());
 	}
 	// Final conditional subtraction: z = xy[N..] - p, then per lane choose
 	// between the unsubtracted value and z according to the borrow mask c.
-	Vmask c = vrawSub(z, xy+N, G::vpN());
+	Vmask c = vrawSub(z, xy+N, G::ap());
 	uvselect(z, c, xy+N, z);
 }
 
@@ -314,19 +314,19 @@ inline void uvmul(Vec *z, const Vec *x, const Vec *y)
 #else
 	Vec t[N*2], q;
 	vrawMulUnit(t, x, y[0]);
-	q = vmulL(t[0], G::vrp());
-	t[N] = vpaddq(t[N], vrawMulUnitAdd(t, G::vpN(), q));
+	q = vmulL(t[0], G::rp());
+	t[N] = vpaddq(t[N], vrawMulUnitAdd(t, G::ap(), q));
 	for (size_t i = 1; i < N; i++) {
 		t[N+i] = vrawMulUnitAdd(t+i, x, y[i]);
 		t[i] = vpaddq(t[i], vpsrlq(t[i-1], W));
-		q = vmulL(t[i], G::vrp());
-		t[N+i] = vpaddq(t[N+i], vrawMulUnitAdd(t+i, G::vpN(), q));
+		q = vmulL(t[i], G::rp());
+		t[N+i] = vpaddq(t[N+i], vrawMulUnitAdd(t+i, G::ap(), q));
 	}
 	for (size_t i = N; i < N*2; i++) {
 		t[i] = vpaddq(t[i], vpsrlq(t[i-1], W));
-		t[i-1] = vpandq(t[i-1], G::vmask());
+		t[i-1] = vpandq(t[i-1], G::mask());
 	}
-	Vmask c = vrawSub(z, t+N, G::vpN());
+	Vmask c = vrawSub(z, t+N, G::ap());
 	uvselect(z, c, t+N, z);
 #endif
 }
@@ -427,13 +427,13 @@ class Montgomery {
 // Repacks a value stored as six 64-bit words (x[0] is the least significant)
 // into eight limbs of 52 bits: y[k] receives bits [52*k, 52*k+52) for k < 7
 // and y[7] receives the remaining top 20 bits (364..383). All operations are
 // applied lane-wise to the vectors.
 // x and y must not be the same buffer: y[0] is written before x[0] is read
 // again to build y[1].
 inline void split52bit(Vec y[8], const Vec x[6])
 {
 	// Array parameters decay to pointers, so the aliasing check must compare
 	// the pointers themselves. Comparing &y with &x only compared the addresses
 	// of the two parameter variables and could never fail.
 	assert(y != x);
 	// Each limb is stitched from the tail of one 64-bit word and the head of
 	// the next, then masked to 52 bits.
-	y[0] = vpandq(x[0], G::vmask());
-	y[1] = vpandq(vporq(vpsrlq(x[0], 52), vpsllq(x[1], 12)), G::vmask());
-	y[2] = vpandq(vporq(vpsrlq(x[1], 40), vpsllq(x[2], 24)), G::vmask());
-	y[3] = vpandq(vporq(vpsrlq(x[2], 28), vpsllq(x[3], 36)), G::vmask());
-	y[4] = vpandq(vporq(vpsrlq(x[3], 16), vpsllq(x[4], 48)), G::vmask());
-	y[5] = vpandq(vpsrlq(x[4], 4), G::vmask());
-	y[6] = vpandq(vporq(vpsrlq(x[4], 56), vpsllq(x[5], 8)), G::vmask());
+	y[0] = vpandq(x[0], G::mask());
+	y[1] = vpandq(vporq(vpsrlq(x[0], 52), vpsllq(x[1], 12)), G::mask());
+	y[2] = vpandq(vporq(vpsrlq(x[1], 40), vpsllq(x[2], 24)), G::mask());
+	y[3] = vpandq(vporq(vpsrlq(x[2], 28), vpsllq(x[3], 36)), G::mask());
+	y[4] = vpandq(vporq(vpsrlq(x[3], 16), vpsllq(x[4], 48)), G::mask());
+	y[5] = vpandq(vpsrlq(x[4], 4), G::mask());
+	y[6] = vpandq(vporq(vpsrlq(x[4], 56), vpsllq(x[5], 8)), G::mask());
 	// The last limb needs no mask: only 20 bits remain after the shift.
 	y[7] = vpsrlq(x[5], 44);
 }
 
@@ -527,16 +527,14 @@ inline void cvt6Ux8to8Ux8(Vec y[8], const Unit x[6*8])
 
 struct FpM {
 	Vec v[N];
-	static FpM zero_;
-	static FpM rawOne_;
-	static FpM rw_;
 	static Montgomery g_mont;
-	static const FpM& zero() { return zero_; }
-	static const FpM& one() { return *(const FpM*)g_vR_; }
-	static const FpM& R2() { return *(const FpM*)g_vR2_; }
-	static const FpM& rawOne() { return *(const FpM*)g_vrawOne_; }
+	static const FpM& zero() { return *(const FpM*)g_zero_; }
+	static const FpM& one() { return *(const FpM*)g_R_; }
+	static const FpM& R2() { return *(const FpM*)g_R2_; }
+	static const FpM& rawOne() { return *(const FpM*)g_rawOne_; }
 	static const FpM& m64to52() { return *(const FpM*)g_m64to52_; }
 	static const FpM& m52to64() { return *(const FpM*)g_m52to64_; }
+	static const FpM& rw() { return *(const FpM*)g_rw_; }
 	static void add(FpM& z, const FpM& x, const FpM& y)
 	{
 		uvadd(z.v, x.v, y.v);
@@ -639,12 +637,12 @@ struct FpM {
 		const size_t bitLen = sizeof(Unit)*8;
 		const size_t jn = bitLen / w;
 		z = tbl[0];
-		const Vec vmask4 = vpbroadcastq(getMask(4));
+		const Vec mask4 = vpbroadcastq(getMask(4));
 		for (size_t i = 0; i < yn; i++) {
 			const Vec& v = y[yn-1-i];
 			for (size_t j = 0; j < jn; j++) {
 				for (int k = 0; k < w; k++) FpM::sqr(z, z);
-				Vec idx = vpandq(vpsrlq(v, bitLen-w-j*w), vmask4);
+				Vec idx = vpandq(vpsrlq(v, bitLen-w-j*w), mask4);
 				idx = vpsllq(idx, 6); // 512 B = 64 Unit
 				idx = vpaddq(idx, G::offset());
 				FpM t;
@@ -704,17 +702,11 @@ struct FpM {
 		}
 		return d;
 	}
-	static void init(const mpz_class& mp)
-	{
-		g_mont.init(mp);
-	}
 #ifdef MCL_MSM_TEST
 	void dump(size_t pos, const char *msg = nullptr) const;
 #endif
 };
 
-FpM FpM::zero_;
-FpM FpM::rw_;
 Montgomery FpM::g_mont;
 
 template<class E, size_t n>
@@ -974,7 +966,7 @@ struct EcM {
 	}
 	// Writes into Q the point obtained from P by multiplying its x coordinate
 	// by the constant rw and copying y and z unchanged.
 	// The generator script emits rw in Montgomery form after checking that
 	// rw*rw + rw + 1 == 0 (mod p).
 	// NOTE(review): presumably this is the GLV endomorphism (x, y) -> (rw*x, y) - confirm.
 	static void mulLambda(EcM& Q, const EcM& P)
 	{
-		FpM::mul(Q.x, P.x, FpM::rw_);
+		FpM::mul(Q.x, P.x, FpM::rw());
 		Q.y = P.y;
 		Q.z = P.z;
 	}
@@ -1011,13 +1003,13 @@ struct EcM {
 			pb[i+M*0] = bb[0]; pb[i+M*1] = bb[1];
 		}
 		const size_t bitLen = 128;
-		Vec vmask = vpbroadcastq((1<<w)-1);
+		Vec mask = vpbroadcastq((1<<w)-1);
 		bool first = true;
 		size_t pos = bitLen;
 		for (size_t i = 0; i < (bitLen + w-1)/w; i++) {
 			size_t dblN = w;
 			if (pos < w) {
-				vmask = vpbroadcastq((1<<pos)-1);
+				mask = vpbroadcastq((1<<pos)-1);
 				dblN = pos;
 				pos = 0;
 			} else {
@@ -1026,15 +1018,15 @@ struct EcM {
 			if (!first) for (size_t k = 0; k < dblN; k++) EcM::dbl<isProj>(Q, Q);
 			EcM T;
 			Vec idx;
-			idx = vpandq(getUnitAt(b, 2, pos), vmask);
+			idx = vpandq(getUnitAt(b, 2, pos), mask);
 			if (first) {
 				Q.gather(tbl2, idx);
 				first = false;
 			} else {
 				T.gather(tbl2, idx);
 				add<isProj, mixed>(Q, Q, T);
 			}
-			idx = vpandq(getUnitAt(a, 2, pos), vmask);
+			idx = vpandq(getUnitAt(a, 2, pos), mask);
 			T.gather(tbl1, idx);
 			add<isProj, mixed>(Q, Q, T);
 		}
@@ -1044,7 +1036,7 @@ struct EcM {
 	template<size_t bitLen, size_t w>
 	static void makeNAFtbl(Vec *idxTbl, Vmask *negTbl, const Vec a[2])
 	{
-		const Vec vmask = vpbroadcastq((1<<w)-1);
+		const Vec mask = vpbroadcastq((1<<w)-1);
 #ifdef SIGNED_TABLE
 		(void)negTbl;
 #else
@@ -1057,16 +1049,16 @@ struct EcM {
 		const size_t n = (bitLen+w-1)/w;
 		for (size_t i = 0; i < n; i++) {
 			Vec idx = getUnitAt(a, 2, pos);
-			idx = vpandq(idx, vmask);
+			idx = vpandq(idx, mask);
 			idx = vpaddq(idx, CF);
 #ifdef SIGNED_TABLE
-			Vec masked = vpandq(idx, vmask);
+			Vec masked = vpandq(idx, mask);
 			Vmask v = vcmpgtq(masked, H);
 			idxTbl[i] = masked; //vselect(negTbl[i], vpsubq(F, masked), masked); // idx >= H ? F - idx : idx;
 			CF = vpsrlq(idx, w);
 			CF = vpaddq(v, CF, one);
 #else
-			Vec masked = vpandq(idx, vmask);
+			Vec masked = vpandq(idx, mask);
 			negTbl[i] = vcmpgtq(masked, H);
 			idxTbl[i] = vselect(negTbl[i], vpsubq(F, masked), masked); // idx >= H ? F - idx : idx;
 			CF = vpsrlq(idx, w);
@@ -1325,10 +1317,9 @@ bool initMsm(const mcl::CurveParam& cp, const mcl::msm::Func *func)
 	g_func = *func;
 
 	const mpz_class& mp = g_func.fp->mp;
-	FpM::init(mp);
+	FpM::g_mont.init(mp);
+//	FpM::init(mp);
 	Montgomery& mont = FpM::g_mont;
-	FpM::zero_.clear();
-	FpM::rw_.setFp(g_func.rw);
 	EcM::init(mont);
 	return true;
 }

diff --git a/src/msm_avx_bls12_381.h b/src/msm_avx_bls12_381.h
@@ -9,9 +9,9 @@ rp=0x3fffcfffcfffd
 #endif
 // generated by src/gen_msm_para.py
 static const uint64_t g_mask = 0xfffffffffffff;
-static const CYBOZU_ALIGN(64) uint64_t g_vmask_[] = { 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, };
-static const CYBOZU_ALIGN(64) uint64_t g_vrp_[] = { 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, };
-static const CYBOZU_ALIGN(64) uint64_t g_vpN_[] = {
+static const CYBOZU_ALIGN(64) uint64_t g_mask_[] = { 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, 0xfffffffffffff, };
+static const CYBOZU_ALIGN(64) uint64_t g_rp_[] = { 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, 0x3fffcfffcfffd, };
+static const CYBOZU_ALIGN(64) uint64_t g_ap_[] = {
 0xeffffffffaaab, 0xeffffffffaaab, 0xeffffffffaaab, 0xeffffffffaaab, 0xeffffffffaaab, 0xeffffffffaaab, 0xeffffffffaaab, 0xeffffffffaaab,
 0xfeb153ffffb9f, 0xfeb153ffffb9f, 0xfeb153ffffb9f, 0xfeb153ffffb9f, 0xfeb153ffffb9f, 0xfeb153ffffb9f, 0xfeb153ffffb9f, 0xfeb153ffffb9f,
 0x6b0f6241eabff, 0x6b0f6241eabff, 0x6b0f6241eabff, 0x6b0f6241eabff, 0x6b0f6241eabff, 0x6b0f6241eabff, 0x6b0f6241eabff, 0x6b0f6241eabff,
@@ -21,7 +21,7 @@ static const CYBOZU_ALIGN(64) uint64_t g_vpN_[] = {
 0x1ea397fe69a4b, 0x1ea397fe69a4b, 0x1ea397fe69a4b, 0x1ea397fe69a4b, 0x1ea397fe69a4b, 0x1ea397fe69a4b, 0x1ea397fe69a4b, 0x1ea397fe69a4b,
 0x1a011, 0x1a011, 0x1a011, 0x1a011, 0x1a011, 0x1a011, 0x1a011, 0x1a011,
 };
-static const CYBOZU_ALIGN(64) uint64_t g_vR_[] = {
+static const CYBOZU_ALIGN(64) uint64_t g_R_[] = {
 0x6480ea8e9b9af, 0x6480ea8e9b9af, 0x6480ea8e9b9af, 0x6480ea8e9b9af, 0x6480ea8e9b9af, 0x6480ea8e9b9af, 0x6480ea8e9b9af, 0x6480ea8e9b9af,
 0x65766c8fe444f, 0x65766c8fe444f, 0x65766c8fe444f, 0x65766c8fe444f, 0x65766c8fe444f, 0x65766c8fe444f, 0x65766c8fe444f, 0x65766c8fe444f,
 0x8b540fea96f7d, 0x8b540fea96f7d, 0x8b540fea96f7d, 0x8b540fea96f7d, 0x8b540fea96f7d, 0x8b540fea96f7d, 0x8b540fea96f7d, 0x8b540fea96f7d,
@@ -31,7 +31,7 @@ static const CYBOZU_ALIGN(64) uint64_t g_vR_[] = {
 0xe06ef23c24a25, 0xe06ef23c24a25, 0xe06ef23c24a25, 0xe06ef23c24a25, 0xe06ef23c24a25, 0xe06ef23c24a25, 0xe06ef23c24a25, 0xe06ef23c24a25,
 0x14c8e, 0x14c8e, 0x14c8e, 0x14c8e, 0x14c8e, 0x14c8e, 0x14c8e, 0x14c8e,
 };
-static const CYBOZU_ALIGN(64) uint64_t g_vR2_[] = {
+static const CYBOZU_ALIGN(64) uint64_t g_R2_[] = {
 0xa5bf4cb89af51, 0xa5bf4cb89af51, 0xa5bf4cb89af51, 0xa5bf4cb89af51, 0xa5bf4cb89af51, 0xa5bf4cb89af51, 0xa5bf4cb89af51, 0xa5bf4cb89af51,
 0x3afbba7ca31a2, 0x3afbba7ca31a2, 0x3afbba7ca31a2, 0x3afbba7ca31a2, 0x3afbba7ca31a2, 0x3afbba7ca31a2, 0x3afbba7ca31a2, 0x3afbba7ca31a2,
 0x2646160ec71f1, 0x2646160ec71f1, 0x2646160ec71f1, 0x2646160ec71f1, 0x2646160ec71f1, 0x2646160ec71f1, 0x2646160ec71f1, 0x2646160ec71f1,
@@ -61,7 +61,17 @@ static const CYBOZU_ALIGN(64) uint64_t g_m52to64_[] = {
 0xec3fa80e4935c, 0xec3fa80e4935c, 0xec3fa80e4935c, 0xec3fa80e4935c, 0xec3fa80e4935c, 0xec3fa80e4935c, 0xec3fa80e4935c, 0xec3fa80e4935c,
 0x15f65, 0x15f65, 0x15f65, 0x15f65, 0x15f65, 0x15f65, 0x15f65, 0x15f65,
 };
-static const CYBOZU_ALIGN(64) uint64_t g_vrawOne_[] = {
+static const CYBOZU_ALIGN(64) uint64_t g_zero_[] = {
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+};
+static const CYBOZU_ALIGN(64) uint64_t g_rawOne_[] = {
 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
@@ -72,11 +82,22 @@ static const CYBOZU_ALIGN(64) uint64_t g_vrawOne_[] = {
 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
 };
 static const CYBOZU_ALIGN(64) uint64_t g_offset_[] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8 };
+// rw=0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac
+static const CYBOZU_ALIGN(64) uint64_t g_rw_[] = {
+0xa424657e25648, 0xa424657e25648, 0xa424657e25648, 0xa424657e25648, 0xa424657e25648, 0xa424657e25648, 0xa424657e25648, 0xa424657e25648,
+0xc75706049e739, 0xc75706049e739, 0xc75706049e739, 0xc75706049e739, 0xc75706049e739, 0xc75706049e739, 0xc75706049e739, 0xc75706049e739,
+0xb59085299e0e2, 0xb59085299e0e2, 0xb59085299e0e2, 0xb59085299e0e2, 0xb59085299e0e2, 0xb59085299e0e2, 0xb59085299e0e2, 0xb59085299e0e2,
+0xd9cf17286a964, 0xd9cf17286a964, 0xd9cf17286a964, 0xd9cf17286a964, 0xd9cf17286a964, 0xd9cf17286a964, 0xd9cf17286a964, 0xd9cf17286a964,
+0x69ec7cb33aa8, 0x69ec7cb33aa8, 0x69ec7cb33aa8, 0x69ec7cb33aa8, 0x69ec7cb33aa8, 0x69ec7cb33aa8, 0x69ec7cb33aa8, 0x69ec7cb33aa8,
+0x35e995b239c7e, 0x35e995b239c7e, 0x35e995b239c7e, 0x35e995b239c7e, 0x35e995b239c7e, 0x35e995b239c7e, 0x35e995b239c7e, 0x35e995b239c7e,
+0x82faa0ff3c329, 0x82faa0ff3c329, 0x82faa0ff3c329, 0x82faa0ff3c329, 0x82faa0ff3c329, 0x82faa0ff3c329, 0x82faa0ff3c329, 0x82faa0ff3c329,
+0x17601, 0x17601, 0x17601, 0x17601, 0x17601, 0x17601, 0x17601, 0x17601,
+};
 
 // Read-only accessors that view the generated, CYBOZU_ALIGN(64) uint64_t
 // tables of this header as Vec values, without copying.
 struct G {
 	// Every entry of the table is 0xfffffffffffff (the low 52 bits set).
-	static const Vec& vmask() { return *(const Vec*)g_vmask_; }
+	static const Vec& mask() { return *(const Vec*)g_mask_; }
 	// Every entry of the table is the constant rp (0x3fffcfffcfffd).
-	static const Vec& vrp() { return *(const Vec*)g_vrp_; }
+	static const Vec& rp() { return *(const Vec*)g_rp_; }
 	// Pointer to the first row of the limb table of p; each row repeats one
 	// limb eight times, so callers index it as an array of Vec.
-	static const Vec* vpN() { return (const Vec*)g_vpN_; }
+	static const Vec* ap() { return (const Vec*)g_ap_; }
 	// Table of consecutive indices 0, 1, 2, ...
 	// NOTE(review): g_offset_ has nine entries; if Vec is 8 x 64-bit, only
 	// the first eight are visible through this accessor - confirm.
 	static const Vec& offset() { return *(const Vec*)g_offset_; }
 };