
Commit 26bb079

Merge branch 'pytorch:main' into feat/blockwise_fp8_quant_triton_gemm_ker
2 parents 31ed7c1 + f343336 commit 26bb079

33 files changed: +424 −431 lines

.pre-commit-config.yaml

+1 −1
@@ -2,7 +2,7 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer

benchmarks/float8/utils.py

+3
@@ -83,6 +83,9 @@ def profiler_output_to_filtered_time_by_kernel_name(
             continue
         elif e.key == "Activity Buffer Request":
             continue
+        elif e.key == "Unrecognized":
+            # TODO I think these are nvjet related
+            continue

         kernel_name_to_gpu_time_us[e.key] = e.self_device_time_total
     return kernel_name_to_gpu_time_us
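For context on what this helper does, here is a minimal sketch of the filtering, assuming the events come from torch.profiler's key_averages() (whose entries expose .key and .self_device_time_total in recent PyTorch); the function name and surrounding loop here are assumptions based on the hunk, not the full file.

def filtered_gpu_time_by_kernel(events, skip_keys=("Activity Buffer Request", "Unrecognized")):
    # Map kernel name -> self GPU time in microseconds, dropping bookkeeping events.
    kernel_name_to_gpu_time_us = {}
    for e in events:
        if e.key in skip_keys:
            # "Unrecognized" entries are skipped too (the TODO above suspects they are nvjet related).
            continue
        kernel_name_to_gpu_time_us[e.key] = e.self_device_time_total
    return kernel_name_to_gpu_time_us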

test/dtypes/test_affine_quantized.py

+18 −3
@@ -25,6 +25,7 @@
     to_affine_quantized_intx_static,
 )
 from torchao.quantization import (
+    GemliteUIntXWeightOnlyConfig,
     Int4WeightOnlyConfig,
     Int8DynamicActivationInt8WeightConfig,
     float8_weight_only,
@@ -36,7 +37,7 @@
     quantize_,
 )
 from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
-from torchao.testing.utils import skip_if_no_cuda, skip_if_rocm
+from torchao.testing.utils import skip_if_no_cuda, skip_if_no_gemlite, skip_if_rocm
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
     check_cpu_version,
@@ -176,7 +177,7 @@ def _apply(module, config_or_subclass_inserter):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_register_new_dispatch(self):
-        from torchao.dtypes import AffineQuantizedTensor, to_affine_quantized_intx
+        from torchao.dtypes import AffineQuantizedTensor
         from torchao.dtypes.affine_quantized_tensor_ops import (
             deregister_aqt_quantized_linear_dispatch,
             register_aqt_quantized_linear_dispatch,
@@ -344,7 +345,7 @@ def test_alias(self, device, dtype):
     @common_utils.parametrize("device", ["cuda"])
     @common_utils.parametrize("dtype", [torch.bfloat16])
     @skip_if_no_cuda()
-    def test_slice(self, device, dtype):
+    def test_slice_int4wo(self, device, dtype):
         # in_feature not divisible by 1024
         # out_feature not divisible by 8
         # to test slice + padding for int4 weight only quantization
@@ -354,6 +355,20 @@ def test_slice(self, device, dtype):
         _ = dummy.weight.narrow(0, 0, 64)
         _ = dummy.weight.narrow(1, 0, 128)

+    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("dtype", [torch.float16, torch.bfloat16])
+    @skip_if_no_cuda()
+    @skip_if_no_gemlite()
+    def test_slice_gemlite(self, device, dtype):
+        # in_feature not divisible by 1024
+        # out_feature not divisible by 8
+        # to test slice + padding for int4 weight only quantization
+        dummy = nn.Linear(256, 512, dtype=dtype, device=device)
+        quantize_(dummy, GemliteUIntXWeightOnlyConfig())
+        # make sure these run without error
+        _ = dummy.weight.narrow(0, 0, 64)
+        _ = dummy.weight.narrow(1, 0, 128)
+
     @common_utils.parametrize("device", ["cuda"])
     @common_utils.parametrize("dtype", [torch.bfloat16])
     def test_matmul(self, device, dtype):
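The new test_slice_gemlite exercises slicing on a GemLite-quantized weight. As a standalone sketch of the same flow, assuming a CUDA device and the optional gemlite dependency are installed:

import torch
import torch.nn as nn
from torchao.quantization import GemliteUIntXWeightOnlyConfig, quantize_

linear = nn.Linear(256, 512, dtype=torch.float16, device="cuda")
quantize_(linear, GemliteUIntXWeightOnlyConfig())
# slicing the quantized weight should run without error
_ = linear.weight.narrow(0, 0, 64)   # along out_features
_ = linear.weight.narrow(1, 0, 128)  # along in_features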

test/quantization/test_qat.py

+3 −2
@@ -1474,7 +1474,6 @@ def test_fake_quantize_per_token_vs_convert(self, dtype: torch.dtype):
     @unittest.skipIf(
         not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
     )
-    @unittest.skip("Currently failing on sqnr")
     def test_qat_8da4w_prepare_vs_convert(self, dtype: torch.dtype):
         """
         Test that the prepare and convert steps of Int8DynActInt4QATQuantizer produces
@@ -1493,7 +1492,9 @@ def test_qat_8da4w_prepare_vs_convert(self, dtype: torch.dtype):
         torch.manual_seed(seed)
         x = m.example_inputs()

-        quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
+        quantizer = Int8DynActInt4WeightQATQuantizer(
+            groupsize=group_size, precision=dtype, scales_precision=dtype
+        )
         prepared = quantizer.prepare(m)
         prepared_out = prepared(*x)
         converted = quantizer.convert(prepared)
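The unskipped test now constructs the quantizer with explicit precisions. Roughly, the prepare/convert flow it compares looks like the sketch below; the toy model and the torchao.quantization.qat import path are assumptions based on recent torchao, adjust to your setup.

import torch
import torch.nn as nn
from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer

model = nn.Sequential(nn.Linear(256, 256)).to(torch.bfloat16)
example_inputs = (torch.randn(1, 256, dtype=torch.bfloat16),)

quantizer = Int8DynActInt4WeightQATQuantizer(
    groupsize=32, precision=torch.bfloat16, scales_precision=torch.bfloat16
)
prepared = quantizer.prepare(model)        # model with fake-quantized linears
prepared_out = prepared(*example_inputs)
converted = quantizer.convert(prepared)    # model with real quantized linears
converted_out = converted(*example_inputs)
# the test compares prepared_out and converted_out (SQNR check)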

test/quantization/test_quant_api.py

+12
@@ -1005,6 +1005,18 @@ def test_ao_per_module_config_embedding_linear(self):
         assert isinstance(model.emb.weight._layout, QDQLayout)
         assert isinstance(model.linear.weight, LinearActivationQuantizedTensor)

+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    def test_ao_per_module_config_skip(self):
+        config1 = Int4WeightOnlyConfig(group_size=32)
+        config = AOPerModuleConfig({"_default": config1, "linear2": None})
+        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        quantize_(model, config)
+        model(*example_inputs)
+        assert isinstance(model.linear1.weight, AffineQuantizedTensor)
+        assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout)
+        assert not isinstance(model.linear2.weight, AffineQuantizedTensor)
+

 class TestMultiTensorFlow(TestCase):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
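The new test covers skipping a module with AOPerModuleConfig by mapping its name to None while "_default" applies everywhere else. A minimal sketch, under the assumption that AOPerModuleConfig is importable from torchao.quantization alongside the other configs; the two-linear model is a stand-in for the test's ToyLinearModel.

import torch
import torch.nn as nn
from torchao.quantization import AOPerModuleConfig, Int4WeightOnlyConfig, quantize_

class TwoLinear(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(128, 256)
        self.linear2 = nn.Linear(256, 128)

    def forward(self, x):
        return self.linear2(self.linear1(x))

model = TwoLinear().cuda().to(torch.bfloat16)

# "_default" applies to every module unless overridden; "linear2": None skips quantization for it
config = AOPerModuleConfig({"_default": Int4WeightOnlyConfig(group_size=32), "linear2": None})
quantize_(model, config)
print(type(model.linear1.weight))  # AffineQuantizedTensor (int4 weight-only)
print(type(model.linear2.weight))  # plain Parameter, left unquantized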
