Merge pull request #83 from okotaku/feat/ssd_1b

[Feature] Support SSD-1B
okotaku · Oct 25, 2023 · 48344d4 · 48344d4
2 parents e76ee18 + 5838154
commit 48344d4
Show file tree

Hide file tree

Showing 14 changed files with 825 additions and 1 deletion.
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -13,4 +13,4 @@ jobs:
     steps:
       - uses: readthedocs/actions/preview@v1
         with:
-          project-slug: "diffengine"
+          project-slug: "DiffEngine"
diff --git a/README.md b/README.md
@@ -141,6 +141,7 @@ For detailed user guides and advanced guides, please refer to our [Documentation
 - [Introduction to DiffEngine](https://medium.com/@to78314910/introduction-to-diffengine-cad272e900c4)
 - [Train ControlNet with DiffEngine](https://medium.com/@to78314910/train-controlnet-with-diffengine-727ef42bc38)
 - [On Architectural Compression of Text-to-Image Diffusion Models](https://medium.com/@to78314910/on-architectural-compression-of-text-to-image-diffusion-models-ce8c9cba512a)
+- [SSD-1B: A Leap in Efficient T2I Generation](https://medium.com/@to78314910/ssd-1b-a-leap-in-efficient-t2i-generation-138bb05fdd75)
 
 </details>
 
@@ -187,6 +188,7 @@ For detailed user guides and advanced guides, please refer to our [Documentation
           <li><a href="configs/t2i_adapter/README.md">T2I-Adapter (2023)</a></li>
           <li><a href="configs/ip_adapter/README.md">IP-Adapter (2023)</a></li>
           <li><a href="configs/esd/README.md">Erasing Concepts from Diffusion Models (2023)</a></li>
+          <li><a href="configs/ssd_1b/README.md">SSD-1B (2023)</a></li>
         </ul>
       </td>
       <td>

diff --git a/configs/_base_/models/distill_ssd_1b.py b/configs/_base_/models/distill_ssd_1b.py
@@ -0,0 +1,7 @@
+model = dict(
+    type="SSD1B",
+    model="stabilityai/stable-diffusion-xl-base-1.0",
+    student_model="segmind/SSD-1B",
+    student_model_weight="unet",
+    vae_model="madebyollin/sdxl-vae-fp16-fix",
+    gradient_checkpointing=True)
diff --git a/configs/_base_/models/distill_ssd_1b_from_sdxl.py b/configs/_base_/models/distill_ssd_1b_from_sdxl.py
@@ -0,0 +1,7 @@
+model = dict(
+    type="SSD1B",
+    model="stabilityai/stable-diffusion-xl-base-1.0",
+    student_model="segmind/SSD-1B",
+    student_model_weight="orig_unet",
+    vae_model="madebyollin/sdxl-vae-fp16-fix",
+    gradient_checkpointing=True)
diff --git a/configs/_base_/models/ssd_1b_lora.py b/configs/_base_/models/ssd_1b_lora.py
@@ -0,0 +1,5 @@
+model = dict(
+    type="StableDiffusionXL",
+    model="segmind/SSD-1B",
+    vae_model="madebyollin/sdxl-vae-fp16-fix",
+    lora_config=dict(rank=8))
diff --git a/configs/ssd_1b/README.md b/configs/ssd_1b/README.md
@@ -0,0 +1,60 @@
+# SSD-1B
+
+[SSD-1B](https://blog.segmind.com/introducing-segmind-ssd-1b/)
+
+## Abstract
+
+Today, Segmind is thrilled to announce the open sourcing of our new foundational model, SSD-1B, the fastest diffusion-based text-to-image model in the market, with unprecedented image generation times for a 1024x1024 image. Developed as part of our distillation series, SSD-1B is 50% smaller and 60% faster compared to the SDXL 1.0 model. This reduction in speed and size comes with a minimal impact on image quality when compared to SDXL 1.0. Furthermore, we are excited to reveal that the SSD-1B model has been licensed for commercial use, opening avenues for businesses and developers to integrate this groundbreaking technology into their services and products.
+
+<div align=center>
+<img src="https://github.com/okotaku/diffengine/assets/24734142/5c5a0e65-d06d-43a0-873d-f804e1900428"/>
+</div>
+
+## Citation
+
+```
+```
+
+## Dependencies
+
+Note that you need to install diffusers from source to use SSD-1B.
+
+```
+pip install -U git+https://github.com/huggingface/diffusers.git
+```
+
+## Run Training
+
+Run training with the following commands:
+
+```
+# single gpu
+$ mim train diffengine ${CONFIG_FILE}
+# multi gpus
+$ mim train diffengine ${CONFIG_FILE} --gpus 2 --launcher pytorch
+
+# Example.
+$ mim train diffengine configs/ssd_1b/ssd_1b_distill_pokemon_blip.py
+```
+
+## Inference with diffusers
+
+You can find more details in [`docs/source/run_guides/run_xl.md`](../../docs/source/run_guides/run_xl.md#inference-with-diffusers).
+
+## Results Example
+
+#### ssd_1b_distill_from_sdxl_pokemon_blip
+
+![example](https://github.com/okotaku/diffengine/assets/24734142/057a347f-4baf-443d-ac75-e8a073a43a27)
+
+#### ssd_1b_distill_pokemon_blip
+
+![example2](https://github.com/okotaku/diffengine/assets/24734142/304a2cf8-22a5-4c1e-a6b5-b12c1a245bf4)
+
+## Blog post
+
+[SSD-1B: A Leap in Efficient T2I Generation](https://medium.com/@to78314910/ssd-1b-a-leap-in-efficient-t2i-generation-138bb05fdd75)
+
+## Acknowledgement
+
+These implementations are based on [segmind/SSD-1B](https://github.com/segmind/SSD-1B). Thank you for the great open source project.
diff --git a/configs/ssd_1b/ssd_1b_distill_from_sdxl_pokemon_blip.py b/configs/ssd_1b/ssd_1b_distill_from_sdxl_pokemon_blip.py
@@ -0,0 +1,6 @@
+_base_ = [
+    "../_base_/models/distill_ssd_1b_from_sdxl.py",
+    "../_base_/datasets/pokemon_blip_xl.py",
+    "../_base_/schedules/stable_diffusion_xl_50e.py",
+    "../_base_/default_runtime.py",
+]
diff --git a/configs/ssd_1b/ssd_1b_distill_pokemon_blip.py b/configs/ssd_1b/ssd_1b_distill_pokemon_blip.py
@@ -0,0 +1,6 @@
+_base_ = [
+    "../_base_/models/distill_ssd_1b.py",
+    "../_base_/datasets/pokemon_blip_xl.py",
+    "../_base_/schedules/stable_diffusion_xl_50e.py",
+    "../_base_/default_runtime.py",
+]
diff --git a/configs/ssd_1b_dreambooth/README.md b/configs/ssd_1b_dreambooth/README.md
@@ -0,0 +1,103 @@
+# SSD-1B DreamBooth
+
+[DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://arxiv.org/abs/2208.12242)
+[SSD-1B](https://blog.segmind.com/introducing-segmind-ssd-1b/)
+
+## Abstract
+
+Large text-to-image models achieved a remarkable leap in the evolution of AI, enabling high-quality and diverse synthesis of images from a given text prompt. However, these models lack the ability to mimic the appearance of subjects in a given reference set and synthesize novel renditions of them in different contexts. In this work, we present a new approach for "personalization" of text-to-image diffusion models. Given as input just a few images of a subject, we fine-tune a pretrained text-to-image model such that it learns to bind a unique identifier with that specific subject. Once the subject is embedded in the output domain of the model, the unique identifier can be used to synthesize novel photorealistic images of the subject contextualized in different scenes. By leveraging the semantic prior embedded in the model with a new autogenous class-specific prior preservation loss, our technique enables synthesizing the subject in diverse scenes, poses, views and lighting conditions that do not appear in the reference images. We apply our technique to several previously-unassailable tasks, including subject recontextualization, text-guided view synthesis, and artistic rendering, all while preserving the subject's key features. We also provide a new dataset and evaluation protocol for this new task of subject-driven generation.
+
+<div align=center>
+<img src="https://github.com/okotaku/dethub/assets/24734142/33b1953d-ce42-4f9a-bcbc-87050cfe4f6f"/>
+</div>
+
+## Citation
+
+```
+@inproceedings{ruiz2023dreambooth,
+  title={Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation},
+  author={Ruiz, Nataniel and Li, Yuanzhen and Jampani, Varun and Pritch, Yael and Rubinstein, Michael and Aberman, Kfir},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  year={2023}
+}
+```
+
+## Dependencies
+
+Note that you need to install diffusers from source to use SSD-1B.
+
+```
+pip install -U git+https://github.com/huggingface/diffusers.git
+```
+
+## Run Training
+
+Run training with the following commands:
+
+```
+# single gpu
+$ mim train diffengine ${CONFIG_FILE}
+# multi gpus
+$ mim train diffengine ${CONFIG_FILE} --gpus 2 --launcher pytorch
+
+# Example.
+$ mim train diffengine configs/ssd_1b_dreambooth/ssd_1b_dreambooth_lora_dog.py
+```
+
+## Training Speed
+
+Environment:
+
+- A6000 Single GPU
+- nvcr.io/nvidia/pytorch:23.07-py3
+
+Settings:
+
+- 500 iterations training, (validation 4 images / 100 iterations)
+- LoRA (rank=8) / DreamBooth
+
+| Model  | total time |
+| :----: | :--------: |
+|  SDXL  | 18 m 55 s  |
+| SSD-1B | 12 m 30 s  |
+
+## Inference with diffusers
+
+Once you have trained a model, specify the path where the model is saved and use it for inference with `diffusers`.
+
+```py
+import torch
+from diffusers import DiffusionPipeline, AutoencoderKL
+
+checkpoint = 'work_dirs/ssd_1b_dreambooth_lora_dog/step499'
+prompt = 'A photo of sks dog in a bucket'
+
+vae = AutoencoderKL.from_pretrained(
+    'madebyollin/sdxl-vae-fp16-fix',
+    torch_dtype=torch.float16,
+)
+pipe = DiffusionPipeline.from_pretrained(
+    'segmind/SSD-1B', vae=vae, torch_dtype=torch.float16)
+pipe.to('cuda')
+pipe.load_lora_weights(checkpoint)
+
+image = pipe(
+    prompt,
+    num_inference_steps=50,
+    width=1024,
+    height=1024,
+).images[0]
+image.save('demo.png')
+```
+
+You can find more details in the [Run DreamBooth XL docs](../../docs/source/run_guides/run_dreambooth_xl.md#inference-with-diffusers).
+
+## Results Example
+
+#### ssd_1b_dreambooth_lora_dog
+
+![exampledog](https://github.com/okotaku/diffengine/assets/24734142/70a529fa-af82-4e53-951d-837c8eb88915)
+
+## Blog post
+
+[SSD-1B: A Leap in Efficient T2I Generation](https://medium.com/@to78314910/ssd-1b-a-leap-in-efficient-t2i-generation-138bb05fdd75)
diff --git a/configs/ssd_1b_dreambooth/ssd_1b_dreambooth_lora_dog.py b/configs/ssd_1b_dreambooth/ssd_1b_dreambooth_lora_dog.py
@@ -0,0 +1,9 @@
+_base_ = [
+    "../_base_/models/ssd_1b_lora.py",
+    "../_base_/datasets/dog_dreambooth_xl.py",
+    "../_base_/schedules/stable_diffusion_500.py",
+    "../_base_/default_runtime.py",
+]
+
+train_dataloader = dict(
+    dataset=dict(class_image_config=dict(model={{_base_.model.model}})))
diff --git a/diffengine/models/editors/__init__.py b/diffengine/models/editors/__init__.py
@@ -2,6 +2,7 @@
 from .distill_sd import *  # noqa: F403
 from .esd import *  # noqa: F403
 from .ip_adapter import *  # noqa: F403
+from .ssd_1b import *  # noqa: F403
 from .stable_diffusion import *  # noqa: F403
 from .stable_diffusion_controlnet import *  # noqa: F403
 from .stable_diffusion_xl import *  # noqa: F403

diff --git a/diffengine/models/editors/ssd_1b/__init__.py b/diffengine/models/editors/ssd_1b/__init__.py
@@ -0,0 +1,3 @@
+from .ssd_1b import SSD1B
+
+__all__ = ["SSD1B"]