From 5315c404c55778b48ab7dbf41295ed3904d0ce54 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 12:13:39 -0700
Subject: [PATCH 01/17] GPU: Cull rectangles outside valid Z.

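Both TL and BR must be outside in the same direction to be culled when
depth clamp is enabled.  Note that this change tests fabsf(z) of each
corner, so it does not yet distinguish which direction each is outside.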
---
 GPU/Common/SoftwareTransformCommon.cpp | 27 +++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp
index ccadf1f18817..296f4dd1975a 100644
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@@ -64,14 +64,8 @@ static void SwapUVs(TransformedVertex &a, TransformedVertex &b) {
 
 // Note: 0 is BR and 2 is TL.
 
-static void RotateUV(TransformedVertex v[4], float flippedMatrix[16], bool flippedY) {
-	// Transform these two coordinates to figure out whether they're flipped or not.
-	Vec4f tl;
-	Vec3ByMatrix44(tl.AsArray(), v[2].pos, flippedMatrix);
-
-	Vec4f br;
-	Vec3ByMatrix44(br.AsArray(), v[0].pos, flippedMatrix);
-
+static void RotateUV(TransformedVertex v[4], Vec4f tl, Vec4f br, bool flippedY) {
+	// We use the transformed tl/br coordinates to figure out whether they're flipped or not.
 	float ySign = flippedY ? -1.0 : 1.0;
 
 	const float invtlw = 1.0f / tl.w;
@@ -629,10 +623,21 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
 			trans[3].u = transVtxTL.u;
 
 			// That's the four corners. Now process UV rotation.
-			if (throughmode)
+			if (throughmode) {
 				RotateUVThrough(trans);
-			else
-				RotateUV(trans, flippedMatrix, flippedY);
+			} else {
+				Vec4f tl;
+				Vec3ByMatrix44(tl.AsArray(), transVtxTL.pos, flippedMatrix);
+				Vec4f br;
+				Vec3ByMatrix44(br.AsArray(), transVtxBR.pos, flippedMatrix);
+
+				// If both transformed verts are outside Z, cull this rectangle entirely.
+				constexpr float outsideValue = 1.000030517578125f;
+				if (fabsf(tl.z) >= outsideValue && fabsf(br.z) >= outsideValue)
+					continue;
+
+				RotateUV(trans, tl, br, flippedY);
+			}
 
 			// Triangle: BR-TR-TL
 			indsOut[0] = i * 2 + 0;

From 4ac36cb8103f240a3c8790da20e54cf4989edc3b Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 12:18:28 -0700
Subject: [PATCH 02/17] GPU: Cull rectangles more when depth clamp off.

If either corner (TL or BR) of a rectangle is outside the valid Z range,
the rectangle is culled when depth clamp (clamping/clipping) is disabled.
---
 GPU/Common/SoftwareTransformCommon.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp
index 296f4dd1975a..882ebc89c6bb 100644
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@@ -633,7 +633,11 @@ void SoftwareTransform::BuildDrawingParams(int prim, int vertexCount, u32 vertTy
 
 				// If both transformed verts are outside Z, cull this rectangle entirely.
 				constexpr float outsideValue = 1.000030517578125f;
-				if (fabsf(tl.z) >= outsideValue && fabsf(br.z) >= outsideValue)
+				bool tlOutside = fabsf(tl.z / tl.w) >= outsideValue;
+				bool brOutside = fabsf(br.z / br.w) >= outsideValue;
+				if (tlOutside && brOutside)
+					continue;
+				if (!gstate.isDepthClampEnabled() && (tlOutside || brOutside))
 					continue;
 
 				RotateUV(trans, tl, br, flippedY);

From 6252241c0ff6b5c490877ef4af259082dfe440a9 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 12:20:34 -0700
Subject: [PATCH 03/17] GPU: Verify throughmode for clears/rects.

---
 GPU/Common/SoftwareTransformCommon.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp
index 882ebc89c6bb..93a72e07da58 100644
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@@ -432,7 +432,7 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
 	// TODO: This bleeds outside the play area in non-buffered mode. Big deal? Probably not.
 	// TODO: Allow creating a depth clear and a color draw.
 	bool reallyAClear = false;
-	if (maxIndex > 1 && prim == GE_PRIM_RECTANGLES && gstate.isModeClear()) {
+	if (maxIndex > 1 && prim == GE_PRIM_RECTANGLES && gstate.isModeClear() && throughmode) {
 		int scissorX2 = gstate.getScissorX2() + 1;
 		int scissorY2 = gstate.getScissorY2() + 1;
 		reallyAClear = IsReallyAClear(transformed, maxIndex, scissorX2, scissorY2);
@@ -459,7 +459,7 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
 	}
 
 	// Detect full screen "clears" that might not be so obvious, to set the safe size if possible.
-	if (!result->setSafeSize && prim == GE_PRIM_RECTANGLES && maxIndex == 2) {
+	if (!result->setSafeSize && prim == GE_PRIM_RECTANGLES && maxIndex == 2 && throughmode) {
 		bool clearingColor = gstate.isModeClear() && (gstate.isClearModeColorMask() || gstate.isClearModeAlphaMask());
 		bool writingColor = gstate.getColorMask() != 0xFFFFFFFF;
 		bool startsZeroX = transformed[0].x <= 0.0f && transformed[1].x > 0.0f && transformed[1].x > transformed[0].x;

From 24011c3754a8c750a13ad4245823afef371d9459 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 16:54:25 -0700
Subject: [PATCH 04/17] GPU: Correct depth handling for guardband.

This culls based on pre-viewport Z and avoids culling based on the clip
range at negative Z.
---
 GPU/Common/ShaderUniforms.cpp        | 40 ++++++----------------------
 GPU/Common/VertexShaderGenerator.cpp | 28 +++++++++----------
 GPU/Directx9/ShaderManagerDX9.cpp    | 11 +++-----
 GPU/GLES/ShaderManagerGLES.cpp       |  9 +------
 4 files changed, 26 insertions(+), 62 deletions(-)

diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp
index 8ca4bacff697..a8a02e78a465 100644
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@@ -43,29 +43,12 @@ void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bo
 		float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
 		return (pspViewport * heightScale) - yOffset;
 	};
-	auto reverseViewportZ = [hasNegZ](float z) {
-		float vpZScale = gstate.getViewportZScale();
-		float vpZCenter = gstate.getViewportZCenter();
-
-		float scale, center;
-		if (gstate_c.Supports(GPU_SUPPORTS_ACCURATE_DEPTH)) {
-			// These are just the reverse of the formulas in GPUStateUtils.
-			float halfActualZRange = vpZScale * (1.0f / gstate_c.vpDepthScale);
-			float minz = -((gstate_c.vpZOffset * halfActualZRange) - vpZCenter) - halfActualZRange;
-
-			// In accurate depth mode, we're comparing against a value scaled to (minz, maxz).
-			// And minz might be very negative, (e.g. if we're clamping in that direction.)
-			scale = halfActualZRange;
-			center = minz + halfActualZRange;
-		} else {
-			// In old-style depth mode, we're comparing against a value scaled to viewport.
-			// (and possibly incorrectly clipped against it.)
-			scale = vpZScale;
-			center = vpZCenter;
+	auto transformZ = [hasNegZ](float z) {
+		// Z culling ignores the viewport, so we just redo the projection matrix adjustments.
+		if (hasNegZ) {
+			return (z * gstate_c.vpDepthScale) + gstate_c.vpZOffset;
 		}
-
-		float realViewport = (z - center) * (1.0f / scale);
-		return hasNegZ ? realViewport : (realViewport * 0.5f + 0.5f);
+		return (z * gstate_c.vpDepthScale * 0.5f) + gstate_c.vpZOffset * 0.5f + 0.5f;
 	};
 	auto sortPair = [](float a, float b) {
 		return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
@@ -75,7 +58,7 @@ void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bo
 	// Any vertex outside this range (unless depth clamp enabled) is discarded.
 	auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
 	auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
-	auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
+	auto z = sortPair(transformZ(-1.000030517578125f), transformZ(1.000030517578125f));
 	// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
 	float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
 
@@ -243,18 +226,11 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
 		float viewZScale = halfActualZRange * 2.0f;
 		// Account for the half pixel offset.
 		float viewZCenter = minz + (DepthSliceFactor() / 256.0f) * 0.5f;
-		float viewZInvScale;
-
-		if (viewZScale != 0.0) {
-			viewZInvScale = 1.0f / viewZScale;
-		} else {
-			viewZInvScale = 0.0;
-		}
 
 		ub->depthRange[0] = viewZScale;
 		ub->depthRange[1] = viewZCenter;
-		ub->depthRange[2] = viewZCenter;
-		ub->depthRange[3] = viewZInvScale;
+		ub->depthRange[2] = gstate_c.vpZOffset * 0.5f + 0.5f;
+		ub->depthRange[3] = 2.0f * (1.0f / gstate_c.vpDepthScale);
 	}
 
 	if (dirtyUniforms & DIRTY_CULLRANGE) {
diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index f2d3152ec59d..19b6a104c0d6 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -332,10 +332,8 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 				}
 			}
 
-			if (!isModeThrough && gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-				WRITE(p, "vec4 u_depthRange : register(c%i);\n", CONST_VS_DEPTHRANGE);
-			}
 			if (!isModeThrough) {
+				WRITE(p, "vec4 u_depthRange : register(c%i);\n", CONST_VS_DEPTHRANGE);
 				WRITE(p, "vec4 u_cullRangeMin : register(c%i);\n", CONST_VS_CULLRANGEMIN);
 				WRITE(p, "vec4 u_cullRangeMax : register(c%i);\n", CONST_VS_CULLRANGEMAX);
 			}
@@ -517,15 +515,11 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			*uniformMask |= DIRTY_FOGCOEF;
 		}
 
-		if (!isModeThrough && gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-			WRITE(p, "uniform highp vec4 u_depthRange;\n");
-			*uniformMask |= DIRTY_DEPTHRANGE;
-		}
-
 		if (!isModeThrough) {
+			WRITE(p, "uniform highp vec4 u_depthRange;\n");
 			WRITE(p, "uniform highp vec4 u_cullRangeMin;\n");
 			WRITE(p, "uniform highp vec4 u_cullRangeMax;\n");
-			*uniformMask |= DIRTY_CULLRANGE;
+			*uniformMask |= DIRTY_DEPTHRANGE | DIRTY_CULLRANGE;
 		}
 
 		WRITE(p, "%s%s lowp vec4 v_color0;\n", shading, compat.varying_vs);
@@ -554,7 +548,7 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		WRITE(p, "  float z = v.z / v.w;\n");
 		WRITE(p, "  z = z * u_depthRange.x + u_depthRange.y;\n");
 		WRITE(p, "  z = floor(z);\n");
-		WRITE(p, "  z = (z - u_depthRange.z) * u_depthRange.w;\n");
+		WRITE(p, "  z = (z - u_depthRange.y) / u_depthRange.x;\n");
 		WRITE(p, "  return vec4(v.x, v.y, z * v.w, v.w);\n");
 		WRITE(p, "}\n\n");
 	}
@@ -1099,14 +1093,20 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 
 	if (vertexRangeCulling) {
 		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
-		// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
-		WRITE(p, "  if (u_cullRangeMin.w <= 0.0 || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
-		const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y || projPos.z < u_cullRangeMin.z";
-		const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y || projPos.z > u_cullRangeMax.z";
+		WRITE(p, "  float projZ = (projPos.z - u_depthRange.z) * u_depthRange.w;\n");
+		// Vertex range culling doesn't happen when Z clips, note sign of w is important.
+		WRITE(p, "  if (u_cullRangeMin.w <= 0.0 || projZ * outPos.w > -outPos.w) {\n");
+		const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y";
+		const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y";
 		WRITE(p, "    if (%s || %s) {\n", outMin, outMax);
 		WRITE(p, "      outPos.xyzw = u_cullRangeMax.wwww;\n");
 		WRITE(p, "    }\n");
 		WRITE(p, "  }\n");
+		WRITE(p, "  if (u_cullRangeMin.w <= 0.0) {\n");
+		WRITE(p, "    if (projPos.z < u_cullRangeMin.z || projPos.z > u_cullRangeMax.z) {\n");
+		WRITE(p, "      outPos.xyzw = u_cullRangeMax.wwww;\n");
+		WRITE(p, "    }\n");
+		WRITE(p, "  }\n");
 	}
 
 	// We've named the output gl_Position in HLSL as well.
diff --git a/GPU/Directx9/ShaderManagerDX9.cpp b/GPU/Directx9/ShaderManagerDX9.cpp
index 90d9b325caed..9927460cd4c5 100644
--- a/GPU/Directx9/ShaderManagerDX9.cpp
+++ b/GPU/Directx9/ShaderManagerDX9.cpp
@@ -439,15 +439,10 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
 		float viewZScale = halfActualZRange * 2.0f;
 		// Account for the half pixel offset.
 		float viewZCenter = minz + (DepthSliceFactor() / 256.0f) * 0.5f;
-		float viewZInvScale;
+		float reverseScale = 2.0f * (1.0f / gstate_c.vpDepthScale);
+		float reverseTranslate = gstate_c.vpZOffset * 0.5f + 0.5f;
 
-		if (viewZScale != 0.0) {
-			viewZInvScale = 1.0f / viewZScale;
-		} else {
-			viewZInvScale = 0.0;
-		}
-
-		float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
+		float data[4] = { viewZScale, viewZCenter, reverseTranslate, reverseScale };
 		VSSetFloatUniform4(CONST_VS_DEPTHRANGE, data);
 	}
 	if (dirtyUniforms & DIRTY_CULLRANGE) {
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index 44029e691ccc..bc086b01df64 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -476,14 +476,7 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid, bool useBu
 			viewZCenter = vpZCenter;
 		}
 
-		float viewZInvScale;
-		if (viewZScale != 0.0) {
-			viewZInvScale = 1.0f / viewZScale;
-		} else {
-			viewZInvScale = 0.0;
-		}
-
-		float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
+		float data[4] = { viewZScale, viewZCenter, gstate_c.vpZOffset, 1.0f / gstate_c.vpDepthScale };
 		SetFloatUniform4(render_, &u_depthRange, data);
 	}
 	if (dirty & DIRTY_CULLRANGE) {

From 273b9a3dc161cf18629dfc8a4077e475803bb7da Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 16:57:52 -0700
Subject: [PATCH 05/17] Vulkan: Add negative Z clipping.

---
 Common/GPU/Vulkan/VulkanContext.cpp  | 2 ++
 GPU/Common/VertexShaderGenerator.cpp | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/Common/GPU/Vulkan/VulkanContext.cpp b/Common/GPU/Vulkan/VulkanContext.cpp
index c9c53b5503e4..9427a3d3baea 100644
--- a/Common/GPU/Vulkan/VulkanContext.cpp
+++ b/Common/GPU/Vulkan/VulkanContext.cpp
@@ -580,6 +580,8 @@ void VulkanContext::ChooseDevice(int physical_device) {
 	deviceFeatures_.enabled.depthClamp = deviceFeatures_.available.depthClamp;
 	deviceFeatures_.enabled.depthBounds = deviceFeatures_.available.depthBounds;
 	deviceFeatures_.enabled.samplerAnisotropy = deviceFeatures_.available.samplerAnisotropy;
+	deviceFeatures_.enabled.shaderClipDistance = deviceFeatures_.available.shaderClipDistance;
+	deviceFeatures_.enabled.shaderCullDistance = deviceFeatures_.available.shaderCullDistance;
 	// For easy wireframe mode, someday.
 	deviceFeatures_.enabled.fillModeNonSolid = deviceFeatures_.available.fillModeNonSolid;
 
diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index 19b6a104c0d6..02c30c2dbefd 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -105,6 +105,8 @@ const char *boneWeightAttrInitHLSL[9] = {
 // to 0 and 65535 if a depth clamping/clipping flag is set (x/y clipping is performed only if depth
 // needs to be clamped.)
 //
+// Additionally, depth is clipped to negative z based on vec.z (before viewport), at -1.
+//
 // All this above is for full transform mode.
 // In through mode, the Z coordinate just goes straight through and there is no perspective division.
 // We simulate this of course with pretty much an identity matrix. Rounding Z becomes very easy.
@@ -1107,6 +1109,10 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		WRITE(p, "      outPos.xyzw = u_cullRangeMax.wwww;\n");
 		WRITE(p, "    }\n");
 		WRITE(p, "  }\n");
+
+		if (compat.shaderLanguage == GLSL_VULKAN) {
+			WRITE(p, "  %sgl_ClipDistance[0] = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix);
+		}
 	}
 
 	// We've named the output gl_Position in HLSL as well.

From 2271b41d077b6390506b1bcd7e5d0dbc3f3718ad Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 17:10:29 -0700
Subject: [PATCH 06/17] Vulkan: Use clip distance only if supported.

---
 GPU/Common/VertexShaderGenerator.cpp |  3 ++-
 GPU/GPUState.h                       |  2 +-
 GPU/Vulkan/GPU_Vulkan.cpp            | 15 ++++++++++-----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index 02c30c2dbefd..d969a719c263 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -1110,7 +1110,8 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		WRITE(p, "    }\n");
 		WRITE(p, "  }\n");
 
-		if (compat.shaderLanguage == GLSL_VULKAN) {
+		if (compat.shaderLanguage == GLSL_VULKAN && gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE)) {
+			// TODO: Not rectangles...
 			WRITE(p, "  %sgl_ClipDistance[0] = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix);
 		}
 	}
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index 5b11b37b7317..5806fdab240a 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -482,7 +482,7 @@ enum {
 	GPU_SUPPORTS_32BIT_INT_FSHADER = FLAG_BIT(15),
 	GPU_SUPPORTS_DEPTH_TEXTURE = FLAG_BIT(16),
 	GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17),
-	// Free bit: 18,
+	GPU_SUPPORTS_CLIP_CULL_DISTANCE = FLAG_BIT(18),
 	GPU_SUPPORTS_COPY_IMAGE = FLAG_BIT(19),
 	GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH = FLAG_BIT(20),
 	GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT = FLAG_BIT(21),
diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp
index 6b4c5e7d03d0..7af0794ae6e9 100644
--- a/GPU/Vulkan/GPU_Vulkan.cpp
+++ b/GPU/Vulkan/GPU_Vulkan.cpp
@@ -238,21 +238,26 @@ void GPU_Vulkan::CheckGPUFeatures() {
 		features |= GPU_SUPPORTS_FRAMEBUFFER_BLIT_TO_DEPTH;
 	}
 
-	if (vulkan_->GetDeviceFeatures().enabled.wideLines) {
+	auto &enabledFeatures = vulkan_->GetDeviceFeatures().enabled;
+	if (enabledFeatures.wideLines) {
 		features |= GPU_SUPPORTS_WIDE_LINES;
 	}
-	if (vulkan_->GetDeviceFeatures().enabled.depthClamp) {
+	if (enabledFeatures.depthClamp) {
 		features |= GPU_SUPPORTS_DEPTH_CLAMP;
 	}
-	if (vulkan_->GetDeviceFeatures().enabled.dualSrcBlend) {
+	if (enabledFeatures.shaderClipDistance && enabledFeatures.shaderCullDistance) {
+		// Must support at least 8 if feature supported, so we're fine.
+		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
+	}
+	if (enabledFeatures.dualSrcBlend) {
 		if (!g_Config.bVendorBugChecksEnabled || !draw_->GetBugs().Has(Draw::Bugs::DUAL_SOURCE_BLENDING_BROKEN)) {
 			features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
 		}
 	}
-	if (vulkan_->GetDeviceFeatures().enabled.logicOp) {
+	if (enabledFeatures.logicOp) {
 		features |= GPU_SUPPORTS_LOGIC_OP;
 	}
-	if (vulkan_->GetDeviceFeatures().enabled.samplerAnisotropy) {
+	if (enabledFeatures.samplerAnisotropy) {
 		features |= GPU_SUPPORTS_ANISOTROPY;
 	}
 

From 046a5c548b11afe31fa098f2f396b62665f2cc13 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 17:17:46 -0700
Subject: [PATCH 07/17] GLES: Check clip/cull distance support.

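Support is pretty limited on GLES3+ (it needs EXT_clip_cull_distance).
Also checks D3D11, where it's assumed available at feature level 10.0+.
Seems like doing it on D3D9 might be a bit tricky.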
---
 Common/GPU/D3D11/thin3d_d3d11.cpp   | 2 ++
 Common/GPU/OpenGL/thin3d_gl.cpp     | 1 +
 Common/GPU/Vulkan/thin3d_vulkan.cpp | 1 +
 Common/GPU/thin3d.h                 | 1 +
 GPU/D3D11/GPU_D3D11.cpp             | 2 ++
 GPU/GLES/GPU_GLES.cpp               | 2 ++
 6 files changed, 9 insertions(+)

diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp
index 114692228cbd..b927382c6288 100644
--- a/Common/GPU/D3D11/thin3d_d3d11.cpp
+++ b/Common/GPU/D3D11/thin3d_d3d11.cpp
@@ -245,6 +245,8 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
 	// Seems like a fair approximation...
 	caps_.dualSourceBlend = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
 	caps_.depthClampSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
+	// SV_ClipDistance# seems to be 10+.
+	caps_.clipCullDistanceSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
 
 	caps_.depthRangeMinusOneToOne = false;
 	caps_.framebufferBlitSupported = false;
diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp
index 1c7a19792d4f..83102c88989d 100644
--- a/Common/GPU/OpenGL/thin3d_gl.cpp
+++ b/Common/GPU/OpenGL/thin3d_gl.cpp
@@ -534,6 +534,7 @@ OpenGLContext::OpenGLContext() {
 	caps_.framebufferBlitSupported = gl_extensions.NV_framebuffer_blit || gl_extensions.ARB_framebuffer_object;
 	caps_.framebufferDepthBlitSupported = caps_.framebufferBlitSupported;
 	caps_.depthClampSupported = gl_extensions.ARB_depth_clamp;
+	caps_.clipCullDistanceSupported = gl_extensions.EXT_clip_cull_distance || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 0));
 
 	// Interesting potential hack for emulating GL_DEPTH_CLAMP (use a separate varying, force depth in fragment shader):
 	// This will induce a performance penalty on many architectures though so a blanket enable of this
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index 2d74240b9a1f..8d996aea0c15 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -780,6 +780,7 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
 	caps_.multiViewport = vulkan->GetDeviceFeatures().enabled.multiViewport != 0;
 	caps_.dualSourceBlend = vulkan->GetDeviceFeatures().enabled.dualSrcBlend != 0;
 	caps_.depthClampSupported = vulkan->GetDeviceFeatures().enabled.depthClamp != 0;
+	caps_.clipCullDistanceSupported = vulkan->GetDeviceFeatures().enabled.shaderClipDistance != 0 && vulkan->GetDeviceFeatures().enabled.shaderCullDistance != 0;
 	caps_.framebufferBlitSupported = true;
 	caps_.framebufferCopySupported = true;
 	caps_.framebufferDepthBlitSupported = false;  // Can be checked for.
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index 023098405cc5..c7a4db531a23 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -520,6 +520,7 @@ struct DeviceCaps {
 	bool dualSourceBlend;
 	bool logicOpSupported;
 	bool depthClampSupported;
+	bool clipCullDistanceSupported;
 	bool framebufferCopySupported;
 	bool framebufferBlitSupported;
 	bool framebufferDepthCopySupported;
diff --git a/GPU/D3D11/GPU_D3D11.cpp b/GPU/D3D11/GPU_D3D11.cpp
index f973d49e5ef0..e01dd30b2351 100644
--- a/GPU/D3D11/GPU_D3D11.cpp
+++ b/GPU/D3D11/GPU_D3D11.cpp
@@ -128,6 +128,8 @@ void GPU_D3D11::CheckGPUFeatures() {
 		features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
 	if (draw_->GetDeviceCaps().depthClampSupported)
 		features |= GPU_SUPPORTS_DEPTH_CLAMP;
+	if (draw_->GetDeviceCaps().clipCullDistanceSupported)
+		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
 	features |= GPU_SUPPORTS_COPY_IMAGE;
 	features |= GPU_SUPPORTS_TEXTURE_FLOAT;
 	features |= GPU_SUPPORTS_INSTANCE_RENDERING;
diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp
index 18b11d57f5f2..fc28f4d9ed8e 100644
--- a/GPU/GLES/GPU_GLES.cpp
+++ b/GPU/GLES/GPU_GLES.cpp
@@ -228,6 +228,8 @@ void GPU_GLES::CheckGPUFeatures() {
 		if (gl_extensions.GLES3)
 			features |= GPU_SUPPORTS_DEPTH_TEXTURE;
 	}
+	if (draw_->GetDeviceCaps().clipCullDistanceSupported)
+		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
 
 	// If we already have a 16-bit depth buffer, we don't need to round.
 	bool prefer24 = draw_->GetDeviceCaps().preferredDepthBufferFormat == Draw::DataFormat::D24_S8;

From 7d00b6ca90f2c1220cce8a6aca44b4ece9bd97fa Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 17:53:50 -0700
Subject: [PATCH 08/17] GLES: Enable/disable clip distance 0.

---
 Common/GPU/OpenGL/GLQueueRunner.cpp  | 15 +++++++++++++++
 Common/GPU/OpenGL/GLRenderManager.h  |  4 +++-
 Common/GPU/OpenGL/thin3d_gl.cpp      |  2 +-
 GPU/Common/VertexShaderGenerator.cpp |  5 ++++-
 GPU/GLES/DepalettizeShaderGLES.cpp   |  2 +-
 GPU/GLES/DepthBufferGLES.cpp         |  2 +-
 GPU/GLES/FramebufferManagerGLES.cpp  |  2 +-
 GPU/GLES/ShaderManagerGLES.cpp       | 19 ++++++++++++-------
 GPU/GLES/ShaderManagerGLES.h         | 11 +++++++++--
 GPU/GLES/StencilBufferGLES.cpp       |  2 +-
 10 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/Common/GPU/OpenGL/GLQueueRunner.cpp b/Common/GPU/OpenGL/GLQueueRunner.cpp
index a2d66b9a8690..f3bd46dd4b03 100644
--- a/Common/GPU/OpenGL/GLQueueRunner.cpp
+++ b/Common/GPU/OpenGL/GLQueueRunner.cpp
@@ -17,6 +17,11 @@
 #include "GLRenderManager.h"
 #include "DataFormatGL.h"
 
+// These are the same value, alias for simplicity.
+#if defined(GL_CLIP_DISTANCE0_EXT) && !defined(GL_CLIP_DISTANCE0)
+#define GL_CLIP_DISTANCE0 GL_CLIP_DISTANCE0_EXT
+#endif
+
 static constexpr int TEXCACHE_NAME_CACHE_SIZE = 16;
 
 #if PPSSPP_PLATFORM(IOS)
@@ -798,6 +803,7 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
 	int logicOp = -1;
 	bool logicEnabled = false;
 #endif
+	bool clipDistance0Enabled = false;
 	GLuint blendEqColor = (GLuint)-1;
 	GLuint blendEqAlpha = (GLuint)-1;
 
@@ -1106,6 +1112,13 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
 		{
 			if (curProgram != c.program.program) {
 				glUseProgram(c.program.program->program);
+				if (c.program.program->use_clip_distance0 != clipDistance0Enabled) {
+					if (c.program.program->use_clip_distance0)
+						glEnable(GL_CLIP_DISTANCE0);
+					else
+						glDisable(GL_CLIP_DISTANCE0);
+					clipDistance0Enabled = c.program.program->use_clip_distance0;
+				}
 				curProgram = c.program.program;
 			}
 			CHECK_GL_ERROR_IF_DEBUG();
@@ -1340,6 +1353,8 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
 		glDisable(GL_COLOR_LOGIC_OP);
 	}
 #endif
+	if (clipDistance0Enabled)
+		glDisable(GL_CLIP_DISTANCE0);
 	if ((colorMask & 15) != 15)
 		glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
 	CHECK_GL_ERROR_IF_DEBUG();
diff --git a/Common/GPU/OpenGL/GLRenderManager.h b/Common/GPU/OpenGL/GLRenderManager.h
index b4c3378739aa..110704bab254 100644
--- a/Common/GPU/OpenGL/GLRenderManager.h
+++ b/Common/GPU/OpenGL/GLRenderManager.h
@@ -119,6 +119,7 @@ class GLRProgram {
 	std::vector<Semantic> semantics_;
 	std::vector<UniformLocQuery> queries_;
 	std::vector<Initializer> initialize_;
+	bool use_clip_distance0 = false;
 
 	struct UniformInfo {
 		int loc_;
@@ -422,13 +423,14 @@ class GLRenderManager {
 	// not be an active render pass.
 	GLRProgram *CreateProgram(
 		std::vector<GLRShader *> shaders, std::vector<GLRProgram::Semantic> semantics, std::vector<GLRProgram::UniformLocQuery> queries,
-		std::vector<GLRProgram::Initializer> initalizers, bool supportDualSource) {
+		std::vector<GLRProgram::Initializer> initalizers, bool supportDualSource, bool useClipDistance0) {
 		GLRInitStep step{ GLRInitStepType::CREATE_PROGRAM };
 		_assert_(shaders.size() <= ARRAY_SIZE(step.create_program.shaders));
 		step.create_program.program = new GLRProgram();
 		step.create_program.program->semantics_ = semantics;
 		step.create_program.program->queries_ = queries;
 		step.create_program.program->initialize_ = initalizers;
+		step.create_program.program->use_clip_distance0 = useClipDistance0;
 		step.create_program.support_dual_source = supportDualSource;
 		_assert_msg_(shaders.size() > 0, "Can't create a program with zero shaders");
 		for (size_t i = 0; i < shaders.size(); i++) {
diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp
index 83102c88989d..ab4bb2416153 100644
--- a/Common/GPU/OpenGL/thin3d_gl.cpp
+++ b/Common/GPU/OpenGL/thin3d_gl.cpp
@@ -1163,7 +1163,7 @@ bool OpenGLPipeline::LinkShaders() {
 	std::vector<GLRProgram::Initializer> initialize;
 	for (int i = 0; i < MAX_TEXTURE_SLOTS; ++i)
 		initialize.push_back({ &samplerLocs_[i], 0, i });
-	program_ = render_->CreateProgram(linkShaders, semantics, queries, initialize, false);
+	program_ = render_->CreateProgram(linkShaders, semantics, queries, initialize, false, false);
 	return true;
 }
 
diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index d969a719c263..4e90be2e537c 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -141,6 +141,9 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		if (gl_extensions.EXT_gpu_shader4) {
 			gl_exts.push_back("#extension GL_EXT_gpu_shader4 : enable");
 		}
+		if (gl_extensions.EXT_clip_cull_distance && id.Bit(VS_BIT_VERTEX_RANGE_CULLING)) {
+			gl_exts.push_back("#extension GL_EXT_clip_cull_distance : enable");
+		}
 	}
 	ShaderWriter p(buffer, compat, ShaderStage::Vertex, gl_exts.data(), gl_exts.size());
 
@@ -1110,7 +1113,7 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		WRITE(p, "    }\n");
 		WRITE(p, "  }\n");
 
-		if (compat.shaderLanguage == GLSL_VULKAN && gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE)) {
+		if (gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE) && (compat.shaderLanguage == GLSL_VULKAN || ShaderLanguageIsOpenGL(compat.shaderLanguage))) {
 			// TODO: Not rectangles...
 			WRITE(p, "  %sgl_ClipDistance[0] = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix);
 		}
diff --git a/GPU/GLES/DepalettizeShaderGLES.cpp b/GPU/GLES/DepalettizeShaderGLES.cpp
index 47ddfa19074a..fbf544f1cae1 100644
--- a/GPU/GLES/DepalettizeShaderGLES.cpp
+++ b/GPU/GLES/DepalettizeShaderGLES.cpp
@@ -183,7 +183,7 @@ DepalShader *DepalShaderCacheGLES::GetDepalettizeShader(uint32_t clutMode, GEBuf
 
 	std::vector<GLRShader *> shaders{ vertexShader_, fragShader };
 
-	GLRProgram *program = render_->CreateProgram(shaders, semantics, queries, initializer, false);
+	GLRProgram *program = render_->CreateProgram(shaders, semantics, queries, initializer, false, false);
 
 	depal->program = program;
 	depal->fragShader = fragShader;
diff --git a/GPU/GLES/DepthBufferGLES.cpp b/GPU/GLES/DepthBufferGLES.cpp
index 7eae61caa690..d26271372120 100644
--- a/GPU/GLES/DepthBufferGLES.cpp
+++ b/GPU/GLES/DepthBufferGLES.cpp
@@ -116,7 +116,7 @@ void FramebufferManagerGLES::PackDepthbuffer(VirtualFramebuffer *vfb, int x, int
 			queries.push_back({ &u_depthDownloadTo8, "u_depthTo8" });
 			std::vector<GLRProgram::Initializer> inits;
 			inits.push_back({ &u_depthDownloadTex, 0, TEX_SLOT_PSP_TEXTURE });
-			depthDownloadProgram_ = render_->CreateProgram(shaders, semantics, queries, inits, false);
+			depthDownloadProgram_ = render_->CreateProgram(shaders, semantics, queries, inits, false, false);
 			for (auto iter : shaders) {
 				render_->DeleteShader(iter);
 			}
diff --git a/GPU/GLES/FramebufferManagerGLES.cpp b/GPU/GLES/FramebufferManagerGLES.cpp
index f28b4f7016f5..56672127d9b6 100644
--- a/GPU/GLES/FramebufferManagerGLES.cpp
+++ b/GPU/GLES/FramebufferManagerGLES.cpp
@@ -87,7 +87,7 @@ void FramebufferManagerGLES::CompileDraw2DProgram() {
 		std::vector<GLRProgram::Semantic> semantics;
 		semantics.push_back({ 0, "a_position" });
 		semantics.push_back({ 1, "a_texcoord0" });
-		draw2dprogram_ = render_->CreateProgram(shaders, semantics, queries, initializers, false);
+		draw2dprogram_ = render_->CreateProgram(shaders, semantics, queries, initializers, false, false);
 		for (auto shader : shaders)
 			render_->DeleteShader(shader);
 	}
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index bc086b01df64..a25231c23256 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -51,10 +51,10 @@
 
 using namespace Lin;
 
-Shader::Shader(GLRenderManager *render, const char *code, const std::string &desc, uint32_t glShaderType, bool useHWTransform, uint32_t attrMask, uint64_t uniformMask)
-	  : render_(render), failed_(false), useHWTransform_(useHWTransform), attrMask_(attrMask), uniformMask_(uniformMask) {
+Shader::Shader(GLRenderManager *render, const char *code, const std::string &desc, const ShaderDescGLES &params)
+	  : render_(render), useHWTransform_(params.useHWTransform), attrMask_(params.attrMask), uniformMask_(params.uniformMask) {
 	PROFILE_THIS_SCOPE("shadercomp");
-	isFragment_ = glShaderType == GL_FRAGMENT_SHADER;
+	isFragment_ = params.glShaderType == GL_FRAGMENT_SHADER;
 	source_ = code;
 #ifdef SHADERLOG
 #ifdef _WIN32
@@ -63,7 +63,7 @@ Shader::Shader(GLRenderManager *render, const char *code, const std::string &des
 	printf("%s\n", code);
 #endif
 #endif
-	shader = render->CreateShader(glShaderType, source_, desc);
+	shader = render->CreateShader(params.glShaderType, source_, desc);
 }
 
 Shader::~Shader() {
@@ -182,7 +182,9 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
 	initialize.push_back({ &u_tess_weights_u, 0, 5 });
 	initialize.push_back({ &u_tess_weights_v, 0, 6 });
 
-	program = render->CreateProgram(shaders, semantics, queries, initialize, gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND);
+	bool useDualSource = (gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND) != 0;
+	bool useClip0 = VSID.Bit(VS_BIT_VERTEX_RANGE_CULLING) && gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE);
+	program = render->CreateProgram(shaders, semantics, queries, initialize, useDualSource, useClip0);
 
 	// The rest, use the "dirty" mechanism.
 	dirtyUniforms = DIRTY_ALL_UNIFORMS;
@@ -633,7 +635,8 @@ Shader *ShaderManagerGLES::CompileFragmentShader(FShaderID FSID) {
 		return nullptr;
 	}
 	std::string desc = FragmentShaderDesc(FSID);
-	return new Shader(render_, codeBuffer_, desc, GL_FRAGMENT_SHADER, false, 0, uniformMask);
+	ShaderDescGLES params{ GL_FRAGMENT_SHADER, 0, uniformMask };
+	return new Shader(render_, codeBuffer_, desc, params);
 }
 
 Shader *ShaderManagerGLES::CompileVertexShader(VShaderID VSID) {
@@ -646,7 +649,9 @@ Shader *ShaderManagerGLES::CompileVertexShader(VShaderID VSID) {
 		return nullptr;
 	}
 	std::string desc = VertexShaderDesc(VSID);
-	return new Shader(render_, codeBuffer_, desc, GL_VERTEX_SHADER, useHWTransform, attrMask, uniformMask);
+	ShaderDescGLES params{ GL_VERTEX_SHADER, attrMask, uniformMask };
+	params.useHWTransform = useHWTransform;
+	return new Shader(render_, codeBuffer_, desc, params);
 }
 
 Shader *ShaderManagerGLES::ApplyVertexShader(bool useHWTransform, bool useHWTessellation, u32 vertType, bool weightsAsFloat, VShaderID *VSID) {
diff --git a/GPU/GLES/ShaderManagerGLES.h b/GPU/GLES/ShaderManagerGLES.h
index c49b6700c02d..24ef3e42f118 100644
--- a/GPU/GLES/ShaderManagerGLES.h
+++ b/GPU/GLES/ShaderManagerGLES.h
@@ -114,9 +114,16 @@ class LinkedShader {
 
 // Real public interface
 
+struct ShaderDescGLES {
+	uint32_t glShaderType;
+	uint32_t attrMask;
+	uint64_t uniformMask;
+	bool useHWTransform;
+};
+
 class Shader {
 public:
-	Shader(GLRenderManager *render, const char *code, const std::string &desc, uint32_t glShaderType, bool useHWTransform, uint32_t attrMask, uint64_t uniformMask);
+	Shader(GLRenderManager *render, const char *code, const std::string &desc, const ShaderDescGLES &params);
 	~Shader();
 	GLRShader *shader;
 
@@ -131,7 +138,7 @@ class Shader {
 private:
 	GLRenderManager *render_;
 	std::string source_;
-	bool failed_;
+	bool failed_ = false;
 	bool useHWTransform_;
 	bool isFragment_;
 	uint32_t attrMask_; // only used in vertex shaders
diff --git a/GPU/GLES/StencilBufferGLES.cpp b/GPU/GLES/StencilBufferGLES.cpp
index 480690b9301c..7ea535925f2e 100644
--- a/GPU/GLES/StencilBufferGLES.cpp
+++ b/GPU/GLES/StencilBufferGLES.cpp
@@ -147,7 +147,7 @@ bool FramebufferManagerGLES::NotifyStencilUpload(u32 addr, int size, StencilUplo
 		queries.push_back({ &u_stencilValue, "u_stencilValue" });
 		std::vector<GLRProgram::Initializer> inits;
 		inits.push_back({ &u_stencilUploadTex, 0, TEX_SLOT_PSP_TEXTURE });
-		stencilUploadProgram_ = render_->CreateProgram(shaders, semantics, queries, inits, false);
+		stencilUploadProgram_ = render_->CreateProgram(shaders, semantics, queries, inits, false, false);
 		for (auto iter : shaders) {
 			render_->DeleteShader(iter);
 		}

From d2ff66a660cd4c7f0f8153f68a21043c138d19d2 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 18:01:36 -0700
Subject: [PATCH 09/17] UI: Clear textures on Begin.

On GLES, saw a texture bound to slot 1 when UI started to draw after an
emu frame, which caused a crash because there was no sampler.  Let's just
explicitly flush.
---
 Common/UI/Context.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Common/UI/Context.cpp b/Common/UI/Context.cpp
index dc3fbd11d89e..ff698bbc9764 100644
--- a/Common/UI/Context.cpp
+++ b/Common/UI/Context.cpp
@@ -56,7 +56,11 @@ void UIContext::BeginNoTex() {
 
 void UIContext::BeginPipeline(Draw::Pipeline *pipeline, Draw::SamplerState *samplerState) {
 	_assert_(pipeline != nullptr);
-	draw_->BindSamplerStates(0, 1, &samplerState);
+	// Also clear out any other textures bound.
+	Draw::SamplerState *samplers[3]{ samplerState };
+	draw_->BindSamplerStates(0, 3, samplers);
+	Draw::Texture *textures[2]{};
+	draw_->BindTextures(1, 2, textures);
 	RebindTexture();
 	UIBegin(pipeline);
 }

From 1a603fedf59c1b146466d0037f4fb66791ed14d5 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 18:03:11 -0700
Subject: [PATCH 10/17] Vulkan: Cull verts fully outside depth.

Following PSP rules of -1 to 1 pre-viewport Z.  This also enables it for
GLES/OpenGL.
---
 GPU/Common/VertexShaderGenerator.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index 4e90be2e537c..ae554ac7c72e 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -1116,6 +1116,11 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		if (gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE) && (compat.shaderLanguage == GLSL_VULKAN || ShaderLanguageIsOpenGL(compat.shaderLanguage))) {
 			// TODO: Not rectangles...
 			WRITE(p, "  %sgl_ClipDistance[0] = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix);
+			// Cull any triangle fully outside in the same direction when depth clamp enabled.
+			WRITE(p, "  if (u_cullRangeMin.w > 0.0) {\n");
+			WRITE(p, "    %sgl_CullDistance[0] = projPos.z - u_cullRangeMin.z;\n", compat.vsOutPrefix);
+			WRITE(p, "    %sgl_CullDistance[1] = u_cullRangeMax.z - projPos.z;\n", compat.vsOutPrefix);
+			WRITE(p, "  }\n");
 		}
 	}
 

From 1e66a66ed7cf87c461454648fc32f2c18f00ac4f Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 18:13:43 -0700
Subject: [PATCH 11/17] D3D11: Correct clearing samplers.

---
 Common/GPU/D3D11/thin3d_d3d11.cpp | 2 +-
 Common/GPU/D3D9/thin3d_d3d9.cpp   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp
index b927382c6288..e239fcc395a2 100644
--- a/Common/GPU/D3D11/thin3d_d3d11.cpp
+++ b/Common/GPU/D3D11/thin3d_d3d11.cpp
@@ -1347,7 +1347,7 @@ void D3D11DrawContext::BindSamplerStates(int start, int count, SamplerState **st
 	_assert_(start + count <= ARRAY_SIZE(samplers));
 	for (int i = 0; i < count; i++) {
 		D3D11SamplerState *samp = (D3D11SamplerState *)states[i];
-		samplers[i] = samp->ss;
+		samplers[i] = samp ? samp->ss : nullptr;
 	}
 	context_->PSSetSamplers(start, count, samplers);
 }
diff --git a/Common/GPU/D3D9/thin3d_d3d9.cpp b/Common/GPU/D3D9/thin3d_d3d9.cpp
index 449b74b05c50..f277d6ae95cb 100644
--- a/Common/GPU/D3D9/thin3d_d3d9.cpp
+++ b/Common/GPU/D3D9/thin3d_d3d9.cpp
@@ -530,7 +530,8 @@ class D3D9Context : public DrawContext {
 		_assert_(start + count <= MAX_BOUND_TEXTURES);
 		for (int i = 0; i < count; ++i) {
 			D3D9SamplerState *s = static_cast<D3D9SamplerState *>(states[i]);
-			s->Apply(device_, start + i);
+			if (s)
+				s->Apply(device_, start + i);
 		}
 	}
 	void BindVertexBuffers(int start, int count, Buffer **buffers, const int *offsets) override {

From c6a52909f989a59cab76242ca48c9d34f357681a Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 18:19:56 -0700
Subject: [PATCH 12/17] D3D11: Support vertex clip/cull planes.

---
 GPU/Common/VertexShaderGenerator.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index ae554ac7c72e..296c734626ba 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -400,6 +400,10 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			WRITE(p, "  vec4 gl_Position   : POSITION;\n");
 		} else {
 			WRITE(p, "  vec4 gl_Position   : SV_Position;\n");
+			if (vertexRangeCulling && gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE)) {
+				WRITE(p, "  float gl_ClipDistance : SV_ClipDistance0;\n");
+				WRITE(p, "  float2 gl_CullDistance : SV_CullDistance0;\n");
+			}
 		}
 		WRITE(p, "};\n");
 	} else {
@@ -1113,13 +1117,16 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		WRITE(p, "    }\n");
 		WRITE(p, "  }\n");
 
-		if (gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE) && (compat.shaderLanguage == GLSL_VULKAN || ShaderLanguageIsOpenGL(compat.shaderLanguage))) {
+		const char *clip0 = compat.shaderLanguage == HLSL_D3D11 ? "" : "[0]";
+		const char *cull0 = compat.shaderLanguage == HLSL_D3D11 ? ".x" : "[0]";
+		const char *cull1 = compat.shaderLanguage == HLSL_D3D11 ? ".y" : "[1]";
+		if (gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE)) {
 			// TODO: Not rectangles...
-			WRITE(p, "  %sgl_ClipDistance[0] = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix);
+			WRITE(p, "  %sgl_ClipDistance%s = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix, clip0);
 			// Cull any triangle fully outside in the same direction when depth clamp enabled.
 			WRITE(p, "  if (u_cullRangeMin.w > 0.0) {\n");
-			WRITE(p, "    %sgl_CullDistance[0] = projPos.z - u_cullRangeMin.z;\n", compat.vsOutPrefix);
-			WRITE(p, "    %sgl_CullDistance[1] = u_cullRangeMax.z - projPos.z;\n", compat.vsOutPrefix);
+			WRITE(p, "    %sgl_CullDistance%s = projPos.z - u_cullRangeMin.z;\n", compat.vsOutPrefix, cull0);
+			WRITE(p, "    %sgl_CullDistance%s = u_cullRangeMax.z - projPos.z;\n", compat.vsOutPrefix, cull1);
 			WRITE(p, "  }\n");
 		}
 	}

From 1c7cd67f6ddb7d8d90962ef52d13d695746c6d90 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sat, 11 Sep 2021 19:08:02 -0700
Subject: [PATCH 13/17] iOS: Buildfix bad GLES headers.

---
 Common/GPU/OpenGL/GLQueueRunner.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Common/GPU/OpenGL/GLQueueRunner.cpp b/Common/GPU/OpenGL/GLQueueRunner.cpp
index f3bd46dd4b03..e229f59c1167 100644
--- a/Common/GPU/OpenGL/GLQueueRunner.cpp
+++ b/Common/GPU/OpenGL/GLQueueRunner.cpp
@@ -20,6 +20,8 @@
 // These are the same value, alias for simplicity.
 #if defined(GL_CLIP_DISTANCE0_EXT) && !defined(GL_CLIP_DISTANCE0)
 #define GL_CLIP_DISTANCE0 GL_CLIP_DISTANCE0_EXT
+#elif !defined(GL_CLIP_DISTANCE0)
+#define GL_CLIP_DISTANCE0 0x3000
 #endif
 
 static constexpr int TEXCACHE_NAME_CACHE_SIZE = 16;

From 5e6f54033eb937a665f98537e505c4e038910b1d Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 19 Sep 2021 07:14:54 -0700
Subject: [PATCH 14/17] GPU: Split clip and cull caps.

Cull distance needs GL_ARB_cull_distance, which is only sometimes
available on older GL, so track it separately from clip distance.
---
 Common/GPU/D3D11/thin3d_d3d11.cpp   | 3 ++-
 Common/GPU/OpenGL/thin3d_gl.cpp     | 3 ++-
 Common/GPU/Vulkan/thin3d_vulkan.cpp | 3 ++-
 Common/GPU/thin3d.h                 | 3 ++-
 GPU/D3D11/GPU_D3D11.cpp             | 2 +-
 GPU/GLES/GPU_GLES.cpp               | 2 +-
 6 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp
index e239fcc395a2..52192fce7471 100644
--- a/Common/GPU/D3D11/thin3d_d3d11.cpp
+++ b/Common/GPU/D3D11/thin3d_d3d11.cpp
@@ -246,7 +246,8 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
 	caps_.dualSourceBlend = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
 	caps_.depthClampSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
 	// SV_ClipDistance# seems to be 10+.
-	caps_.clipCullDistanceSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
+	caps_.clipDistanceSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
+	caps_.cullDistanceSupported = featureLevel_ >= D3D_FEATURE_LEVEL_10_0;
 
 	caps_.depthRangeMinusOneToOne = false;
 	caps_.framebufferBlitSupported = false;
diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp
index ab4bb2416153..554f2cdd7e4a 100644
--- a/Common/GPU/OpenGL/thin3d_gl.cpp
+++ b/Common/GPU/OpenGL/thin3d_gl.cpp
@@ -534,7 +534,8 @@ OpenGLContext::OpenGLContext() {
 	caps_.framebufferBlitSupported = gl_extensions.NV_framebuffer_blit || gl_extensions.ARB_framebuffer_object;
 	caps_.framebufferDepthBlitSupported = caps_.framebufferBlitSupported;
 	caps_.depthClampSupported = gl_extensions.ARB_depth_clamp;
-	caps_.clipCullDistanceSupported = gl_extensions.EXT_clip_cull_distance || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 0));
+	caps_.clipDistanceSupported = gl_extensions.EXT_clip_cull_distance || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 0));
+	caps_.cullDistanceSupported = gl_extensions.EXT_clip_cull_distance || gl_extensions.ARB_cull_distance;
 
 	// Interesting potential hack for emulating GL_DEPTH_CLAMP (use a separate varying, force depth in fragment shader):
 	// This will induce a performance penalty on many architectures though so a blanket enable of this
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index 8d996aea0c15..1dd083df6bc7 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -780,7 +780,8 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
 	caps_.multiViewport = vulkan->GetDeviceFeatures().enabled.multiViewport != 0;
 	caps_.dualSourceBlend = vulkan->GetDeviceFeatures().enabled.dualSrcBlend != 0;
 	caps_.depthClampSupported = vulkan->GetDeviceFeatures().enabled.depthClamp != 0;
-	caps_.clipCullDistanceSupported = vulkan->GetDeviceFeatures().enabled.shaderClipDistance != 0 && vulkan->GetDeviceFeatures().enabled.shaderCullDistance != 0;
+	caps_.clipDistanceSupported = vulkan->GetDeviceFeatures().enabled.shaderClipDistance != 0;
+	caps_.cullDistanceSupported = vulkan->GetDeviceFeatures().enabled.shaderCullDistance != 0;
 	caps_.framebufferBlitSupported = true;
 	caps_.framebufferCopySupported = true;
 	caps_.framebufferDepthBlitSupported = false;  // Can be checked for.
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index c7a4db531a23..a3f05385c40d 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -520,7 +520,8 @@ struct DeviceCaps {
 	bool dualSourceBlend;
 	bool logicOpSupported;
 	bool depthClampSupported;
-	bool clipCullDistanceSupported;
+	bool clipDistanceSupported;
+	bool cullDistanceSupported;
 	bool framebufferCopySupported;
 	bool framebufferBlitSupported;
 	bool framebufferDepthCopySupported;
diff --git a/GPU/D3D11/GPU_D3D11.cpp b/GPU/D3D11/GPU_D3D11.cpp
index e01dd30b2351..5d3f86467341 100644
--- a/GPU/D3D11/GPU_D3D11.cpp
+++ b/GPU/D3D11/GPU_D3D11.cpp
@@ -128,7 +128,7 @@ void GPU_D3D11::CheckGPUFeatures() {
 		features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
 	if (draw_->GetDeviceCaps().depthClampSupported)
 		features |= GPU_SUPPORTS_DEPTH_CLAMP;
-	if (draw_->GetDeviceCaps().clipCullDistanceSupported)
+	if (draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported)
 		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
 	features |= GPU_SUPPORTS_COPY_IMAGE;
 	features |= GPU_SUPPORTS_TEXTURE_FLOAT;
diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp
index fc28f4d9ed8e..7e59e9e4654f 100644
--- a/GPU/GLES/GPU_GLES.cpp
+++ b/GPU/GLES/GPU_GLES.cpp
@@ -228,7 +228,7 @@ void GPU_GLES::CheckGPUFeatures() {
 		if (gl_extensions.GLES3)
 			features |= GPU_SUPPORTS_DEPTH_TEXTURE;
 	}
-	if (draw_->GetDeviceCaps().clipCullDistanceSupported)
+	if (draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported)
 		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
 
 	// If we already have a 16-bit depth buffer, we don't need to round.

From 7b00c4a57206cc5e3d250fd91970b176d94a83ee Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 19 Sep 2021 23:16:21 -0700
Subject: [PATCH 15/17] GPU: Move Z/W equal hack to bugs from supports.

It's really a bug (might even ideally cap the version?), and we already
have other bugs handled the same way.
---
 Common/GPU/Vulkan/thin3d_vulkan.cpp  | 5 +++++
 Common/GPU/thin3d.h                  | 1 +
 GPU/Common/VertexShaderGenerator.cpp | 5 +++--
 GPU/GPUState.h                       | 2 +-
 GPU/Vulkan/GPU_Vulkan.cpp            | 4 ----
 5 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index 1dd083df6bc7..508f36ea8380 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -818,6 +818,11 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit)
 	} else if (caps_.vendor == GPUVendor::VENDOR_INTEL) {
 		// Workaround for Intel driver bug. TODO: Re-enable after some driver version
 		bugs_.Infest(Bugs::DUAL_SOURCE_BLENDING_BROKEN);
+	} else if (caps_.vendor == GPUVendor::VENDOR_ARM) {
+		// These GPUs (up to some certain hardware version?) have a bug where draws with gl_Position.w == gl_Position.z
+		// corrupt the depth buffer. This is easily worked around by simply scaling Z down a tiny bit when this case
+		// is detected. See: https://github.com/hrydgard/ppsspp/issues/11937
+		bugs_.Infest(Bugs::EQUAL_WZ_CORRUPTS_DEPTH);
 	}
 
 	caps_.deviceID = deviceProps.deviceID;
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index a3f05385c40d..9d3264e0906a 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -317,6 +317,7 @@ class Bugs {
 		BROKEN_NAN_IN_CONDITIONAL = 4,
 		COLORWRITEMASK_BROKEN_WITH_DEPTHTEST = 5,
 		BROKEN_FLAT_IN_SHADER = 6,
+		EQUAL_WZ_CORRUPTS_DEPTH = 7,
 	};
 
 protected:
diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index 296c734626ba..d2acccccfeba 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -159,6 +159,7 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 	bool doShadeMapping = uvGenMode == GE_TEXMAP_ENVIRONMENT_MAP;
 
 	bool flatBug = bugs.Has(Draw::Bugs::BROKEN_FLAT_IN_SHADER) && g_Config.bVendorBugChecksEnabled;
+	bool needsZWHack = bugs.Has(Draw::Bugs::EQUAL_WZ_CORRUPTS_DEPTH) && g_Config.bVendorBugChecksEnabled;
 	bool doFlatShading = id.Bit(VS_BIT_FLATSHADE) && !flatBug;
 
 	bool useHWTransform = id.Bit(VS_BIT_USE_HW_TRANSFORM);
@@ -1134,8 +1135,8 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 	// We've named the output gl_Position in HLSL as well.
 	WRITE(p, "  %sgl_Position = outPos;\n", compat.vsOutPrefix);
 
-	if (gstate_c.Supports(GPU_NEEDS_Z_EQUAL_W_HACK)) {
-		// See comment in GPU_Vulkan.cpp.
+	if (needsZWHack) {
+		// See comment in thin3d_vulkan.cpp.
 		WRITE(p, "  if (%sgl_Position.z == %sgl_Position.w) %sgl_Position.z *= 0.999999;\n",
 			compat.vsOutPrefix, compat.vsOutPrefix, compat.vsOutPrefix);
 	}
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index 5806fdab240a..84928b1d392d 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -492,7 +492,7 @@ enum {
 	GPU_SUPPORTS_FRAMEBUFFER_BLIT = FLAG_BIT(26),
 	GPU_SUPPORTS_FRAMEBUFFER_BLIT_TO_DEPTH = FLAG_BIT(27),
 	GPU_SUPPORTS_TEXTURE_NPOT = FLAG_BIT(28),
-	GPU_NEEDS_Z_EQUAL_W_HACK = FLAG_BIT(29),
+	// Free bit: 29
 	// Free bit: 30
 	GPU_PREFER_REVERSE_COLOR_ORDER = FLAG_BIT(31),
 };
diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp
index 7af0794ae6e9..86b0369699e2 100644
--- a/GPU/Vulkan/GPU_Vulkan.cpp
+++ b/GPU/Vulkan/GPU_Vulkan.cpp
@@ -206,10 +206,6 @@ void GPU_Vulkan::CheckGPUFeatures() {
 		if (!PSP_CoreParameter().compat.flags().DisableAccurateDepth || driverTooOld) {
 			features |= GPU_SUPPORTS_ACCURATE_DEPTH;
 		}
-		// These GPUs (up to some certain hardware version?) has a bug where draws where gl_Position.w == .z
-		// corrupt the depth buffer. This is easily worked around by simply scaling Z down a tiny bit when this case
-		// is detected. See: https://github.com/hrydgard/ppsspp/issues/11937
-		features |= GPU_NEEDS_Z_EQUAL_W_HACK;
 		break;
 	}
 	default:

From 33598f2e75294d742e49dc15b98505705374e2cb Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 19 Sep 2021 23:27:30 -0700
Subject: [PATCH 16/17] GPU: Support clip and cull distances separately.

Older GL devices, and it seems Apple devices, may not support cull distance.
---
 GPU/Common/VertexShaderGenerator.cpp | 8 ++++++--
 GPU/D3D11/GPU_D3D11.cpp              | 6 ++++--
 GPU/GLES/GPU_GLES.cpp                | 6 ++++--
 GPU/GLES/ShaderManagerGLES.cpp       | 2 +-
 GPU/GPUState.h                       | 6 +++---
 GPU/Vulkan/GPU_Vulkan.cpp            | 7 +++++--
 6 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index d2acccccfeba..44048163c443 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -401,8 +401,10 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			WRITE(p, "  vec4 gl_Position   : POSITION;\n");
 		} else {
 			WRITE(p, "  vec4 gl_Position   : SV_Position;\n");
-			if (vertexRangeCulling && gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE)) {
+			if (vertexRangeCulling && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE)) {
 				WRITE(p, "  float gl_ClipDistance : SV_ClipDistance0;\n");
+			}
+			if (vertexRangeCulling && gstate_c.Supports(GPU_SUPPORTS_CULL_DISTANCE)) {
 				WRITE(p, "  float2 gl_CullDistance : SV_CullDistance0;\n");
 			}
 		}
@@ -1121,9 +1123,11 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		const char *clip0 = compat.shaderLanguage == HLSL_D3D11 ? "" : "[0]";
 		const char *cull0 = compat.shaderLanguage == HLSL_D3D11 ? ".x" : "[0]";
 		const char *cull1 = compat.shaderLanguage == HLSL_D3D11 ? ".y" : "[1]";
-		if (gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE)) {
+		if (gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE)) {
 			// TODO: Not rectangles...
 			WRITE(p, "  %sgl_ClipDistance%s = projZ * outPos.w + outPos.w;\n", compat.vsOutPrefix, clip0);
+		}
+		if (gstate_c.Supports(GPU_SUPPORTS_CULL_DISTANCE)) {
 			// Cull any triangle fully outside in the same direction when depth clamp enabled.
 			WRITE(p, "  if (u_cullRangeMin.w > 0.0) {\n");
 			WRITE(p, "    %sgl_CullDistance%s = projPos.z - u_cullRangeMin.z;\n", compat.vsOutPrefix, cull0);
diff --git a/GPU/D3D11/GPU_D3D11.cpp b/GPU/D3D11/GPU_D3D11.cpp
index 5d3f86467341..c279315d5d7c 100644
--- a/GPU/D3D11/GPU_D3D11.cpp
+++ b/GPU/D3D11/GPU_D3D11.cpp
@@ -128,8 +128,10 @@ void GPU_D3D11::CheckGPUFeatures() {
 		features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
 	if (draw_->GetDeviceCaps().depthClampSupported)
 		features |= GPU_SUPPORTS_DEPTH_CLAMP;
-	if (draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported)
-		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
+	if (draw_->GetDeviceCaps().clipDistanceSupported)
+		features |= GPU_SUPPORTS_CLIP_DISTANCE;
+	if (draw_->GetDeviceCaps().cullDistanceSupported)
+		features |= GPU_SUPPORTS_CULL_DISTANCE;
 	features |= GPU_SUPPORTS_COPY_IMAGE;
 	features |= GPU_SUPPORTS_TEXTURE_FLOAT;
 	features |= GPU_SUPPORTS_INSTANCE_RENDERING;
diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp
index 7e59e9e4654f..8e055cdfa909 100644
--- a/GPU/GLES/GPU_GLES.cpp
+++ b/GPU/GLES/GPU_GLES.cpp
@@ -228,8 +228,10 @@ void GPU_GLES::CheckGPUFeatures() {
 		if (gl_extensions.GLES3)
 			features |= GPU_SUPPORTS_DEPTH_TEXTURE;
 	}
-	if (draw_->GetDeviceCaps().clipDistanceSupported && draw_->GetDeviceCaps().cullDistanceSupported)
-		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
+	if (draw_->GetDeviceCaps().clipDistanceSupported)
+		features |= GPU_SUPPORTS_CLIP_DISTANCE;
+	if (draw_->GetDeviceCaps().cullDistanceSupported)
+		features |= GPU_SUPPORTS_CULL_DISTANCE;
 
 	// If we already have a 16-bit depth buffer, we don't need to round.
 	bool prefer24 = draw_->GetDeviceCaps().preferredDepthBufferFormat == Draw::DataFormat::D24_S8;
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index a25231c23256..f2c351470222 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -183,7 +183,7 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
 	initialize.push_back({ &u_tess_weights_v, 0, 6 });
 
 	bool useDualSource = (gstate_c.featureFlags & GPU_SUPPORTS_DUALSOURCE_BLEND) != 0;
-	bool useClip0 = VSID.Bit(VS_BIT_VERTEX_RANGE_CULLING) && gstate_c.Supports(GPU_SUPPORTS_CLIP_CULL_DISTANCE);
+	bool useClip0 = VSID.Bit(VS_BIT_VERTEX_RANGE_CULLING) && gstate_c.Supports(GPU_SUPPORTS_CLIP_DISTANCE);
 	program = render->CreateProgram(shaders, semantics, queries, initialize, useDualSource, useClip0);
 
 	// The rest, use the "dirty" mechanism.
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index 84928b1d392d..af80811cb575 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -482,7 +482,7 @@ enum {
 	GPU_SUPPORTS_32BIT_INT_FSHADER = FLAG_BIT(15),
 	GPU_SUPPORTS_DEPTH_TEXTURE = FLAG_BIT(16),
 	GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17),
-	GPU_SUPPORTS_CLIP_CULL_DISTANCE = FLAG_BIT(18),
+	// Free bit: 18
 	GPU_SUPPORTS_COPY_IMAGE = FLAG_BIT(19),
 	GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH = FLAG_BIT(20),
 	GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT = FLAG_BIT(21),
@@ -492,8 +492,8 @@ enum {
 	GPU_SUPPORTS_FRAMEBUFFER_BLIT = FLAG_BIT(26),
 	GPU_SUPPORTS_FRAMEBUFFER_BLIT_TO_DEPTH = FLAG_BIT(27),
 	GPU_SUPPORTS_TEXTURE_NPOT = FLAG_BIT(28),
-	// Free bit: 29
-	// Free bit: 30
+	GPU_SUPPORTS_CLIP_DISTANCE = FLAG_BIT(29),
+	GPU_SUPPORTS_CULL_DISTANCE = FLAG_BIT(30),
 	GPU_PREFER_REVERSE_COLOR_ORDER = FLAG_BIT(31),
 };
 
diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp
index 86b0369699e2..5baf466f3669 100644
--- a/GPU/Vulkan/GPU_Vulkan.cpp
+++ b/GPU/Vulkan/GPU_Vulkan.cpp
@@ -241,9 +241,12 @@ void GPU_Vulkan::CheckGPUFeatures() {
 	if (enabledFeatures.depthClamp) {
 		features |= GPU_SUPPORTS_DEPTH_CLAMP;
 	}
-	if (enabledFeatures.shaderClipDistance && enabledFeatures.shaderCullDistance) {
+	if (enabledFeatures.shaderClipDistance) {
+		features |= GPU_SUPPORTS_CLIP_DISTANCE;
+	}
+	if (enabledFeatures.shaderCullDistance) {
 		// Must support at least 8 if feature supported, so we're fine.
-		features |= GPU_SUPPORTS_CLIP_CULL_DISTANCE;
+		features |= GPU_SUPPORTS_CULL_DISTANCE;
 	}
 	if (enabledFeatures.dualSrcBlend) {
 		if (!g_Config.bVendorBugChecksEnabled || !draw_->GetBugs().Has(Draw::Bugs::DUAL_SOURCE_BLENDING_BROKEN)) {

From 275baccc5bec6f811f0e411c1f29e6e6f939688c Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 19 Sep 2021 23:32:05 -0700
Subject: [PATCH 17/17] GLES: Support GL_APPLE_clip_distance too.

Seems modern Apple mobile chips only support clip.
---
 Common/GPU/OpenGL/GLFeatures.cpp     | 1 +
 Common/GPU/OpenGL/GLFeatures.h       | 3 +++
 Common/GPU/OpenGL/thin3d_gl.cpp      | 9 +++++++--
 GPU/Common/VertexShaderGenerator.cpp | 3 +++
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/Common/GPU/OpenGL/GLFeatures.cpp b/Common/GPU/OpenGL/GLFeatures.cpp
index ef78271525f9..6e15b3aa843a 100644
--- a/Common/GPU/OpenGL/GLFeatures.cpp
+++ b/Common/GPU/OpenGL/GLFeatures.cpp
@@ -385,6 +385,7 @@ void CheckGLExtensions() {
 		gl_extensions.OES_texture_float = g_set_gl_extensions.count("GL_OES_texture_float") != 0;
 		gl_extensions.EXT_buffer_storage = g_set_gl_extensions.count("GL_EXT_buffer_storage") != 0;
 		gl_extensions.EXT_clip_cull_distance = g_set_gl_extensions.count("GL_EXT_clip_cull_distance") != 0;
+		gl_extensions.APPLE_clip_distance = g_set_gl_extensions.count("GL_APPLE_clip_distance") != 0;
 
 #if defined(__ANDROID__)
 		// On Android, incredibly, this is not consistently non-zero! It does seem to have the same value though.
diff --git a/Common/GPU/OpenGL/GLFeatures.h b/Common/GPU/OpenGL/GLFeatures.h
index 6a8f15beb00d..8efd782a1463 100644
--- a/Common/GPU/OpenGL/GLFeatures.h
+++ b/Common/GPU/OpenGL/GLFeatures.h
@@ -94,6 +94,9 @@ struct GLExtensions {
 	// ARM
 	bool ARM_shader_framebuffer_fetch;
 
+	// APPLE
+	bool APPLE_clip_distance;
+
 	// EGL
 	bool EGL_NV_system_time;
 	bool EGL_NV_coverage_sample;
diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp
index 554f2cdd7e4a..a9ab5151d094 100644
--- a/Common/GPU/OpenGL/thin3d_gl.cpp
+++ b/Common/GPU/OpenGL/thin3d_gl.cpp
@@ -534,8 +534,13 @@ OpenGLContext::OpenGLContext() {
 	caps_.framebufferBlitSupported = gl_extensions.NV_framebuffer_blit || gl_extensions.ARB_framebuffer_object;
 	caps_.framebufferDepthBlitSupported = caps_.framebufferBlitSupported;
 	caps_.depthClampSupported = gl_extensions.ARB_depth_clamp;
-	caps_.clipDistanceSupported = gl_extensions.EXT_clip_cull_distance || (!gl_extensions.IsGLES && gl_extensions.VersionGEThan(3, 0));
-	caps_.cullDistanceSupported = gl_extensions.EXT_clip_cull_distance || gl_extensions.ARB_cull_distance;
+	if (gl_extensions.IsGLES) {
+		caps_.clipDistanceSupported = gl_extensions.EXT_clip_cull_distance || gl_extensions.APPLE_clip_distance;
+		caps_.cullDistanceSupported = gl_extensions.EXT_clip_cull_distance;
+	} else {
+		caps_.clipDistanceSupported = gl_extensions.VersionGEThan(3, 0);
+		caps_.cullDistanceSupported = gl_extensions.ARB_cull_distance;
+	}
 
 	// Interesting potential hack for emulating GL_DEPTH_CLAMP (use a separate varying, force depth in fragment shader):
 	// This will induce a performance penalty on many architectures though so a blanket enable of this
diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index 44048163c443..a4d8c68a632c 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -144,6 +144,9 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 		if (gl_extensions.EXT_clip_cull_distance && id.Bit(VS_BIT_VERTEX_RANGE_CULLING)) {
 			gl_exts.push_back("#extension GL_EXT_clip_cull_distance : enable");
 		}
+		if (gl_extensions.APPLE_clip_distance && id.Bit(VS_BIT_VERTEX_RANGE_CULLING)) {
+			gl_exts.push_back("#extension GL_APPLE_clip_distance : enable");
+		}
 	}
 	ShaderWriter p(buffer, compat, ShaderStage::Vertex, gl_exts.data(), gl_exts.size());