use fp32 timesteps

huggingface · Oct 21, 2024 · 7916966 · 7916966
1 parent 8b61c52
commit 7916966
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 8 deletions.
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
@@ -44,6 +44,7 @@
     DummyInputGenerator,
     DummyPastKeyValuesGenerator,
     DummyTextInputGenerator,
+    DummyTimestepInputGenerator,
     DummyVisionInputGenerator,
     FalconDummyPastKeyValuesGenerator,
     MistralDummyPastKeyValuesGenerator,
@@ -1527,7 +1528,7 @@ def patch_model_for_export(
 
 
 class PooledProjectionsDummyInputGenerator(DummyInputGenerator):
-    SUPPORTED_INPUT_NAMES = "pooled_projection"
+    SUPPORTED_INPUT_NAMES = "pooled_projections"
 
     def __init__(
         self,
@@ -1550,10 +1551,20 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
         return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
 
 
+class DummyTransformerTimestpsInputGenerator(DummyTimestepInputGenerator):  # customizes only the "timestep" dummy input
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "timestep":  # produced here as a float tensor of dtype float_dtype ("fp32" by default)
+            shape = [self.batch_size]  # 1-D shape: one value per batch element
+            return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype)
+        return super().generate(input_name, framework, int_dtype, float_dtype)  # any other input name is delegated to the parent generator
+
+
 @register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers")
-class TransformerOpenVINOConfig(UNetOnnxConfig):
-    DUMMY_INPUT_GENERATOR_CLASSES = UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + (
-        PooledProjectionsDummyInputGenerator,
+class SD3TransformerOpenVINOConfig(UNetOnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        (DummyTransformerTimestpsInputGenerator,)
+        + UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+        + (PooledProjectionsDummyInputGenerator,)
     )
     NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
         image_size="sample_size",

diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
@@ -35,6 +35,7 @@
     OVPipelineForInpainting,
     OVPipelineForText2Image,
 )
+from optimum.intel.utils.import_utils import is_transformers_version
 from optimum.utils.testing_utils import require_diffusers
 
 
@@ -72,7 +73,10 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type=
 
 
 class OVPipelineForText2ImageTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency", "stable-diffusion-3"]
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES.append("stable-diffusion-3")
+    CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
 
     OVMODEL_CLASS = OVPipelineForText2Image
     AUTOMODEL_CLASS = AutoPipelineForText2Image
@@ -138,7 +142,7 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str):
 
             np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2)
 
-    @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"])
+    @parameterized.expand(CALLBACK_SUPPORT_ARCHITECTURES)
     @require_diffusers
     def test_callback(self, model_arch: str):
         height, width, batch_size = 64, 128, 1
@@ -353,7 +357,9 @@ def test_textual_inversion(self):
 
 
 class OVPipelineForImage2ImageTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency", "stable-diffusion-3"]
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES.append("stable-diffusion-3")
 
     AUTOMODEL_CLASS = AutoPipelineForImage2Image
     OVMODEL_CLASS = OVPipelineForImage2Image
@@ -576,7 +582,10 @@ def test_textual_inversion(self):
 
 
 class OVPipelineForInpaintingTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "stable-diffusion-3"]
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"]
+
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES.append("stable-diffusion-3")
 
     AUTOMODEL_CLASS = AutoPipelineForInpainting
     OVMODEL_CLASS = OVPipelineForInpainting

diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
@@ -94,6 +94,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("stable-diffusion", 72, 195),
         ("stable-diffusion-xl", 84, 331),
         ("latent-consistency", 50, 135),
+        ("stable-diffusion-3", 84, 331),
     )
 
     TEST_4BIT_CONFIGURATONS = [

diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
@@ -171,6 +171,7 @@
     "stable-diffusion-xl": (366, 34, 42, 66),
     "stable-diffusion-xl-refiner": (366, 34, 42, 66),
     "open-clip": (20, 28),
+    "stable-diffusion-3": (366, 34, 42, 66),
 }