Merge pull request #9176 from hrydgard/always-prescale-uv

Always prescale UV ("texcoord speedhack")
hrydgard · Dec 20, 2016 · e9bea75
2 parents 80af358 + 0fa2f2c
commit e9bea75
Show file tree

Hide file tree

Showing 32 changed files with 118 additions and 581 deletions.
diff --git a/Core/Config.cpp b/Core/Config.cpp
@@ -730,7 +730,6 @@ static ConfigSetting debuggerSettings[] = {
 };
 
 static ConfigSetting speedHackSettings[] = {
-	ReportedConfigSetting("PrescaleUVCoords", &g_Config.bPrescaleUV, true, true, true),
 	ReportedConfigSetting("DisableAlphaTest", &g_Config.bDisableAlphaTest, false, true, true),
 
 	ConfigSetting(false),

diff --git a/Core/Config.h b/Core/Config.h
@@ -360,14 +360,7 @@ struct Config {
 	float fAnalogLimiterDeadzone;
 	// GLES backend-specific hacks. Not saved to the ini file, do not add checkboxes. Will be made into
 	// proper options when good enough.
-	// PrescaleUV:
-	//   * Applies UV scale/offset when decoding verts. Get rid of some work in the vertex shader,
-	//     saves a uniform upload and is a prerequisite for future optimized hybrid
-	//     (SW skinning, HW transform) skinning.
-	//   * Still has major problems so off by default - need to store tex scale/offset per DeferredDrawCall,
-	//     which currently isn't done so if texscale/offset isn't static (like in Tekken 6) things go wrong.
-	bool bPrescaleUV;
-	bool bDisableAlphaTest;  // Helps PowerVR immensely, breaks some graphics
+	bool bDisableAlphaTest;  // Helps PowerVR performance immensely, breaks some graphics
 	// End GLES hacks.
 
 	// Use the hardware scaler to scale up the image to save fillrate. Similar to Windows' window size, really.

diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
@@ -27,7 +27,7 @@
 
 #define QUAD_INDICES_MAX 65536
 
-DrawEngineCommon::DrawEngineCommon() : dec_(nullptr) {
+DrawEngineCommon::DrawEngineCommon() : dec_(nullptr), decOptions_{} {
 	quadIndices_ = new u16[6 * QUAD_INDICES_MAX];
 	decJitCache_ = new VertexDecoderJitCache();
 }

diff --git a/GPU/Common/ShaderId.cpp b/GPU/Common/ShaderId.cpp
@@ -42,7 +42,6 @@ std::string VertexShaderDesc(const ShaderID &id) {
 	}
 	if (id.Bits(VS_BIT_MATERIAL_UPDATE, 3)) desc << "MatUp:" << id.Bits(VS_BIT_MATERIAL_UPDATE, 3) << " ";
 	if (id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2)) desc << "WScale " << id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2) << " ";
-	if (id.Bits(VS_BIT_TEXCOORD_FMTSCALE, 2)) desc << "TCScale " << id.Bits(VS_BIT_TEXCOORD_FMTSCALE, 2) << " ";
 	if (id.Bit(VS_BIT_FLATSHADE)) desc << "Flat ";
 
 	// TODO: More...
@@ -119,11 +118,6 @@ void ComputeVertexShaderID(ShaderID *id_out, u32 vertType, bool useHWTransform)
 
 		id.SetBit(VS_BIT_NORM_REVERSE, gstate.areNormalsReversed());
 		id.SetBit(VS_BIT_HAS_TEXCOORD, hasTexcoord);
-		if (doTextureProjection && gstate.getUVProjMode() == GE_PROJMAP_UV) {
-			id.SetBits(VS_BIT_TEXCOORD_FMTSCALE, 2, (vertType & GE_VTYPE_TC_MASK) >> GE_VTYPE_TC_SHIFT);  // two bits
-		} else {
-			id.SetBits(VS_BIT_TEXCOORD_FMTSCALE, 2, 3);  // float - no scaling
-		}
 	}
 
 	id.SetBit(VS_BIT_FLATSHADE, doFlatShading);

diff --git a/GPU/Common/ShaderId.h b/GPU/Common/ShaderId.h
@@ -37,7 +37,6 @@ enum {
 	VS_BIT_LIGHT3_ENABLE = 55,
 	VS_BIT_LIGHTING_ENABLE = 56,
 	VS_BIT_WEIGHT_FMTSCALE = 57,  // only two bits, 1 free after
-	VS_BIT_TEXCOORD_FMTSCALE = 60,
 	VS_BIT_FLATSHADE = 62,  // 1 free after
 };
 

diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp
@@ -146,12 +146,9 @@ void SoftwareTransform(
 
 	float uscale = 1.0f;
 	float vscale = 1.0f;
-	bool scaleUV = false;
 	if (throughmode) {
 		uscale /= gstate_c.curTextureWidth;
 		vscale /= gstate_c.curTextureHeight;
-	} else {
-		scaleUV = !g_Config.bPrescaleUV;
 	}
 
 	bool skinningEnabled = vertTypeIsSkinningEnabled(vertType);
@@ -317,14 +314,9 @@ void SoftwareTransform(
 			switch (gstate.getUVGenMode()) {
 			case GE_TEXMAP_TEXTURE_COORDS:	// UV mapping
 			case GE_TEXMAP_UNKNOWN: // Seen in Riviera.  Unsure of meaning, but this works.
-				// Texture scale/offset is only performed in this mode.
-				if (scaleUV) {
-					uv[0] = ruv[0]*gstate_c.uv.uScale + gstate_c.uv.uOff;
-					uv[1] = ruv[1]*gstate_c.uv.vScale + gstate_c.uv.vOff;
-				} else {
-					uv[0] = ruv[0];
-					uv[1] = ruv[1];
-				}
+				// We always prescale in the vertex decoder now.
+				uv[0] = ruv[0];
+				uv[1] = ruv[1];
 				uv[2] = 1.0f;
 				break;
 

diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp
@@ -864,21 +864,21 @@ void DrawEngineCommon::SubmitSpline(const void *control_points, const void *indi
 	u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT;
 
 	UVScale prevUVScale;
-	if (g_Config.bPrescaleUV && (origVertType & GE_VTYPE_TC_MASK) != 0) {
+	if ((origVertType & GE_VTYPE_TC_MASK) != 0) {
 		// We scaled during Normalize already so let's turn it off when drawing.
 		prevUVScale = gstate_c.uv;
 		gstate_c.uv.uScale = 1.0f;
 		gstate_c.uv.vScale = 1.0f;
-		gstate_c.uv.uOff = 0;
-		gstate_c.uv.vOff = 0;
+		gstate_c.uv.uOff = 0.0f;
+		gstate_c.uv.vOff = 0.0f;
 	}
 
 	int generatedBytesRead;
 	DispatchSubmitPrim(splineBuffer, quadIndices_, primType[prim_type], count, vertTypeWithIndex16, &generatedBytesRead);
 
 	DispatchFlush();
 
-	if (g_Config.bPrescaleUV && (origVertType & GE_VTYPE_TC_MASK) != 0) {
+	if ((origVertType & GE_VTYPE_TC_MASK) != 0) {
 		gstate_c.uv = prevUVScale;
 	}
 }
@@ -979,7 +979,7 @@ void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indi
 	u32 vertTypeWithIndex16 = (vertType & ~GE_VTYPE_IDX_MASK) | GE_VTYPE_IDX_16BIT;
 
 	UVScale prevUVScale;
-	if (g_Config.bPrescaleUV && (origVertType & GE_VTYPE_TC_MASK) != 0) {
+	if (origVertType & GE_VTYPE_TC_MASK) {
 		// We scaled during Normalize already so let's turn it off when drawing.
 		prevUVScale = gstate_c.uv;
 		gstate_c.uv.uScale = 1.0f;
@@ -993,7 +993,7 @@ void DrawEngineCommon::SubmitBezier(const void *control_points, const void *indi
 
 	DispatchFlush();
 
-	if (g_Config.bPrescaleUV && (origVertType & GE_VTYPE_TC_MASK) != 0) {
+	if (origVertType & GE_VTYPE_TC_MASK) {
 		gstate_c.uv = prevUVScale;
 	}
 }
diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
@@ -119,8 +119,6 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
 
-	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
-	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
 
@@ -563,21 +561,6 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
 	Jit_ApplyWeights();
 }
 
-// Fill last two bytes with zeroes to align to 4 bytes. LDRH does it for us, handy.
-void VertexDecoderJitCache::Jit_TcU8() {
-	LDRB(tempReg1, srcReg, dec_->tcoff);
-	LDRB(tempReg2, srcReg, dec_->tcoff + 1);
-	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
-	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
-void VertexDecoderJitCache::Jit_TcU16() {
-	LDRH(tempReg1, srcReg, dec_->tcoff);
-	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
-	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
-	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
 void VertexDecoderJitCache::Jit_TcFloat() {
 	LDR(tempReg1, srcReg, dec_->tcoff);
 	LDR(tempReg2, srcReg, dec_->tcoff + 4);

diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
@@ -92,8 +92,6 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
 
-	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
-	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
 	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
@@ -578,16 +576,6 @@ void VertexDecoderJitCache::Jit_Color5551() {
 	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
 }
 
-void VertexDecoderJitCache::Jit_TcU8() {
-	LDURH(tempReg1, srcReg, dec_->tcoff);
-	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
-void VertexDecoderJitCache::Jit_TcU16() {
-	LDUR(tempReg1, srcReg, dec_->tcoff);
-	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
-}
-
 void VertexDecoderJitCache::Jit_TcU16Through() {
 	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
 	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);

diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
@@ -264,14 +264,6 @@ void VertexDecoder::Step_WeightsFloatSkin() const
 	}
 }
 
-void VertexDecoder::Step_TcU8() const
-{
-	// u32 to write two bytes of zeroes for free.
-	u32 *uv = (u32*)(decoded_ + decFmt.uvoff);
-	const u16 *uvdata = (const u16*)(ptr_ + tcoff);
-	*uv = *uvdata;
-}
-
 void VertexDecoder::Step_TcU8ToFloat() const
 {
 	// u32 to write two bytes of zeroes for free.
@@ -281,14 +273,6 @@ void VertexDecoder::Step_TcU8ToFloat() const
 	uv[1] = uvdata[1] * (1.0f / 128.0f);
 }
 
-void VertexDecoder::Step_TcU16() const
-{
-	u32 *uv = (u32 *)(decoded_ + decFmt.uvoff);
-	// TODO: Fix big-endian without losing the optimization
-	const u32 *uvdata = (const u32*)(ptr_ + tcoff);
-	*uv = *uvdata;
-}
-
 void VertexDecoder::Step_TcU16ToFloat() const
 {
 	float *uv = (float *)(decoded_ + decFmt.uvoff);
@@ -903,13 +887,6 @@ static const StepFunction wtstep_skin[4] = {
 	&VertexDecoder::Step_WeightsFloatSkin,
 };
 
-static const StepFunction tcstep[4] = {
-	0,
-	&VertexDecoder::Step_TcU8,
-	&VertexDecoder::Step_TcU16,
-	&VertexDecoder::Step_TcFloat,
-};
-
 static const StepFunction tcstepToFloat[4] = {
 	0,
 	&VertexDecoder::Step_TcU8ToFloat,
@@ -973,42 +950,20 @@ static const StepFunction tcstep_morph_remasterToFloat[4] = {
 	&VertexDecoder::Step_TcFloatMorph,
 };
 
-static const StepFunction tcstep_through[4] = {
-	0,
-	&VertexDecoder::Step_TcU8,
-	&VertexDecoder::Step_TcU16Through,
-	&VertexDecoder::Step_TcFloatThrough,
-};
-
 static const StepFunction tcstep_throughToFloat[4] = {
 	0,
 	&VertexDecoder::Step_TcU8ToFloat,
 	&VertexDecoder::Step_TcU16ThroughToFloat,
 	&VertexDecoder::Step_TcFloatThrough,
 };
 
-// Some HD Remaster games double the u16 texture coordinates.
-static const StepFunction tcstep_remaster[4] = {
-	0,
-	&VertexDecoder::Step_TcU8,
-	&VertexDecoder::Step_TcU16Double,
-	&VertexDecoder::Step_TcFloat,
-};
-
 static const StepFunction tcstep_remasterToFloat[4] = {
 	0,
 	&VertexDecoder::Step_TcU8ToFloat,
 	&VertexDecoder::Step_TcU16DoubleToFloat,
 	&VertexDecoder::Step_TcFloat,
 };
 
-static const StepFunction tcstep_through_remaster[4] = {
-	0,
-	&VertexDecoder::Step_TcU8,
-	&VertexDecoder::Step_TcU16ThroughDouble,
-	&VertexDecoder::Step_TcFloatThrough,
-};
-
 static const StepFunction tcstep_through_remasterToFloat[4] = {
 	0,
 	&VertexDecoder::Step_TcU8ToFloat,
@@ -1173,41 +1128,21 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 			biggest = tcalign[tc];
 
 		// NOTE: That we check getUVGenMode here means that we must include it in the decoder ID!
-		if (g_Config.bPrescaleUV && !throughmode && (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_COORDS || gstate.getUVGenMode() == GE_TEXMAP_UNKNOWN)) {
+		if (!throughmode && (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_COORDS || gstate.getUVGenMode() == GE_TEXMAP_UNKNOWN)) {
 			if (g_DoubleTextureCoordinates)
 				steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale_remaster[tc] : tcstep_prescale_morph_remaster[tc];
 			else
 				steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale[tc] : tcstep_prescale_morph[tc];
 			decFmt.uvfmt = DEC_FLOAT_2;
 		} else {
-			if (options.expandAllUVtoFloat) {
-				if (morphcount != 1 && !throughmode)
-					steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remasterToFloat[tc] : tcstep_morphToFloat[tc];
-				else if (g_DoubleTextureCoordinates)
-					steps_[numSteps_++] = throughmode ? tcstep_through_remasterToFloat[tc] : tcstep_remasterToFloat[tc];
-				else
-					steps_[numSteps_++] = throughmode ? tcstep_throughToFloat[tc] : tcstepToFloat[tc];
-				decFmt.uvfmt = DEC_FLOAT_2;
-			} else {
-				if (morphcount != 1 && !throughmode)
-					steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remaster[tc] : tcstep_morph[tc];
-				else if (g_DoubleTextureCoordinates)
-					steps_[numSteps_++] = throughmode ? tcstep_through_remaster[tc] : tcstep_remaster[tc];
-				else
-					steps_[numSteps_++] = throughmode ? tcstep_through[tc] : tcstep[tc];
-
-				switch (tc) {
-				case GE_VTYPE_TC_8BIT >> GE_VTYPE_TC_SHIFT:
-					decFmt.uvfmt = throughmode ? DEC_U8A_2 : DEC_U8_2;
-					break;
-				case GE_VTYPE_TC_16BIT >> GE_VTYPE_TC_SHIFT:
-					decFmt.uvfmt = throughmode ? DEC_U16A_2 : DEC_U16_2;
-					break;
-				case GE_VTYPE_TC_FLOAT >> GE_VTYPE_TC_SHIFT:
-					decFmt.uvfmt = DEC_FLOAT_2;
-					break;
-				}
-			}
+			// We now always expand UV to float.
+			if (morphcount != 1 && !throughmode)
+				steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remasterToFloat[tc] : tcstep_morphToFloat[tc];
+			else if (g_DoubleTextureCoordinates)
+				steps_[numSteps_++] = throughmode ? tcstep_through_remasterToFloat[tc] : tcstep_remasterToFloat[tc];
+			else
+				steps_[numSteps_++] = throughmode ? tcstep_throughToFloat[tc] : tcstepToFloat[tc];
+			decFmt.uvfmt = DEC_FLOAT_2;
 		}
 
 		decFmt.uvoff = decOff;

diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
@@ -443,7 +443,6 @@ int TranslateNumBones(int bones);
 typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
 
 struct VertexDecoderOptions {
-	bool expandAllUVtoFloat;
 	bool expandAllWeightsToFloat;
 	bool expand8BitNormalsToFloat;
 };
@@ -477,8 +476,6 @@ class VertexDecoder {
 	void Step_WeightsU16Skin() const;
 	void Step_WeightsFloatSkin() const;
 
-	void Step_TcU8() const;
-	void Step_TcU16() const;
 	void Step_TcU8ToFloat() const;
 	void Step_TcU16ToFloat() const;
 	void Step_TcFloat() const;
@@ -633,9 +630,7 @@ class VertexDecoderJitCache : public FakeGen::FakeXCodeBlock {
 	void Jit_WeightsU16Skin();
 	void Jit_WeightsFloatSkin();
 
-	void Jit_TcU8();
 	void Jit_TcU8ToFloat();
-	void Jit_TcU16();
 	void Jit_TcU16ToFloat();
 	void Jit_TcFloat();
 

diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
@@ -97,8 +97,6 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
 	{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
 
-	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
-	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
 	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
 	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
@@ -687,17 +685,6 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
 	}
 }
 
-// Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy.
-void VertexDecoderJitCache::Jit_TcU8() {
-	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
-	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
-}
-
-void VertexDecoderJitCache::Jit_TcU16() {
-	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
-	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
-}
-
 void VertexDecoderJitCache::Jit_TcU8ToFloat() {
 	Jit_AnyU8ToFloat(dec_->tcoff, 16);
 	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), XMM3);