Commit 69bcf81

Merge branch 'main' into main

flannerybh authored Jan 1, 2024
2 parents 15fe767 + 0c4f20a

Showing 42 changed files with 1,130 additions and 281 deletions.
30 changes: 13 additions & 17 deletions extras/ip_adapter.py
@@ -2,12 +2,13 @@
import ldm_patched.modules.clip_vision
import safetensors.torch as sf
import ldm_patched.modules.model_management as model_management
-import contextlib
import ldm_patched.ldm.modules.attention as attention

from extras.resampler import Resampler
from ldm_patched.modules.model_patcher import ModelPatcher
from modules.core import numpy_to_pytorch
+from modules.ops import use_patched_ops
+from ldm_patched.modules.ops import manual_cast


SD_V12_CHANNELS = [320] * 4 + [640] * 4 + [1280] * 4 + [1280] * 6 + [640] * 6 + [320] * 6 + [1280] * 2
@@ -116,14 +117,16 @@ def load_ip_adapter(clip_vision_path, ip_negative_path, ip_adapter_path):
clip_extra_context_tokens = ip_state_dict["image_proj"]["proj.weight"].shape[0] // cross_attention_dim
clip_embeddings_dim = None

-    ip_adapter = IPAdapterModel(
-        ip_state_dict,
-        plus=plus,
-        cross_attention_dim=cross_attention_dim,
-        clip_embeddings_dim=clip_embeddings_dim,
-        clip_extra_context_tokens=clip_extra_context_tokens,
-        sdxl_plus=sdxl_plus
-    )
+    with use_patched_ops(manual_cast):
+        ip_adapter = IPAdapterModel(
+            ip_state_dict,
+            plus=plus,
+            cross_attention_dim=cross_attention_dim,
+            clip_embeddings_dim=clip_embeddings_dim,
+            clip_extra_context_tokens=clip_extra_context_tokens,
+            sdxl_plus=sdxl_plus
+        )

ip_adapter.sdxl = sdxl
ip_adapter.load_device = load_device
ip_adapter.offload_device = offload_device
@@ -167,14 +170,7 @@ def preprocess(img, ip_adapter_path):

ldm_patched.modules.model_management.load_model_gpu(clip_vision.patcher)
pixel_values = clip_preprocess(numpy_to_pytorch(img).to(clip_vision.load_device))

-    if clip_vision.dtype != torch.float32:
-        precision_scope = torch.autocast
-    else:
-        precision_scope = lambda a, b: contextlib.nullcontext(a)
-
-    with precision_scope(ldm_patched.modules.model_management.get_autocast_device(clip_vision.load_device), torch.float32):
-        outputs = clip_vision.model(pixel_values=pixel_values, output_hidden_states=True)
+    outputs = clip_vision.model(pixel_values=pixel_values, output_hidden_states=True)

ip_adapter = entry['ip_adapter']
ip_layers = entry['ip_layers']
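
Note: IPAdapterModel is now constructed inside use_patched_ops(manual_cast), so its layers are created with cast-on-demand ops rather than eagerly initialized fp32 weights; the manual autocast wrapper in preprocess() becomes redundant and is removed. A minimal sketch of the pattern, assuming (hypothetically) that use_patched_ops temporarily swaps the op classes a model builds with:

    import contextlib

    @contextlib.contextmanager
    def use_patched_ops_sketch(namespace, patched):
        # Hypothetical stand-in for modules.ops.use_patched_ops: install the
        # patched Linear/Conv2d classes, let the caller build the model, restore.
        originals = {name: getattr(namespace, name) for name in ("Linear", "Conv2d")}
        try:
            for name in originals:
                setattr(namespace, name, getattr(patched, name))
            yield
        finally:
            for name, original in originals.items():
                setattr(namespace, name, original)
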
5 changes: 2 additions & 3 deletions extras/resampler.py
@@ -108,8 +108,7 @@ def __init__(
)

def forward(self, x):

-        latents = self.latents.repeat(x.size(0), 1, 1)
+        latents = self.latents.repeat(x.size(0), 1, 1).to(x)

x = self.proj_in(x)

@@ -118,4 +117,4 @@ def forward(self, x):
latents = ff(latents) + latents

latents = self.proj_out(latents)
-        return self.norm_out(latents)
\ No newline at end of file
+        return self.norm_out(latents)
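
Note: the .to(x) added in forward() moves (and, if needed, casts) the learned latent queries to match the input in a single call, since Tensor.to() accepts another tensor as a dtype/device template:

    import torch

    latents = torch.zeros(16, 4)                    # e.g. a buffer left in fp32 on CPU
    x = torch.randn(2, 77, 4, dtype=torch.float16)  # input may arrive as fp16 (and on GPU)

    # Tensor.to(other) matches BOTH the dtype and the device of `other`:
    moved = latents.repeat(x.size(0), 1, 1).to(x)
    print(moved.shape, moved.dtype)                 # torch.Size([2, 16, 4]) torch.float16
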
2 changes: 1 addition & 1 deletion fooocus_version.py
@@ -1 +1 @@
-version = '2.1.850'
+version = '2.1.859'
41 changes: 26 additions & 15 deletions ldm_patched/contrib/external.py
@@ -11,7 +11,7 @@
import time
import random

-from PIL import Image, ImageOps
+from PIL import Image, ImageOps, ImageSequence
from PIL.PngImagePlugin import PngInfo
import numpy as np
import safetensors.torch
@@ -1412,17 +1412,30 @@ def INPUT_TYPES(s):
FUNCTION = "load_image"
def load_image(self, image):
image_path = ldm_patched.utils.path_utils.get_annotated_filepath(image)
-        i = Image.open(image_path)
-        i = ImageOps.exif_transpose(i)
-        image = i.convert("RGB")
-        image = np.array(image).astype(np.float32) / 255.0
-        image = torch.from_numpy(image)[None,]
-        if 'A' in i.getbands():
-            mask = np.array(i.getchannel('A')).astype(np.float32) / 255.0
-            mask = 1. - torch.from_numpy(mask)
+        img = Image.open(image_path)
+        output_images = []
+        output_masks = []
+        for i in ImageSequence.Iterator(img):
+            i = ImageOps.exif_transpose(i)
+            image = i.convert("RGB")
+            image = np.array(image).astype(np.float32) / 255.0
+            image = torch.from_numpy(image)[None,]
+            if 'A' in i.getbands():
+                mask = np.array(i.getchannel('A')).astype(np.float32) / 255.0
+                mask = 1. - torch.from_numpy(mask)
+            else:
+                mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
+            output_images.append(image)
+            output_masks.append(mask.unsqueeze(0))
+
+        if len(output_images) > 1:
+            output_image = torch.cat(output_images, dim=0)
+            output_mask = torch.cat(output_masks, dim=0)
         else:
-            mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
-        return (image, mask.unsqueeze(0))
+            output_image = output_images[0]
+            output_mask = output_masks[0]
+
+        return (output_image, output_mask)

@classmethod
def IS_CHANGED(s, image):
@@ -1480,13 +1493,10 @@ def IS_CHANGED(s, image, channel):
return m.digest().hex()

@classmethod
-    def VALIDATE_INPUTS(s, image, channel):
+    def VALIDATE_INPUTS(s, image):
        if not ldm_patched.utils.path_utils.exists_annotated_filepath(image):
            return "Invalid image file: {}".format(image)

-        if channel not in s._color_channels:
-            return "Invalid color channel: {}".format(channel)
-
        return True

class ImageScale:
@@ -1871,6 +1881,7 @@ def init_custom_nodes():
"nodes_video_model.py",
"nodes_sag.py",
"nodes_perpneg.py",
"nodes_stable3d.py",
]

for node_file in extras_files:
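
Note: LoadImage now walks every frame of the file with PIL.ImageSequence, so animated inputs (GIF, multi-frame TIFF/WebP) come back as a batched tensor instead of only the first frame. The same pattern as a standalone sketch (the path is a placeholder):

    import numpy as np
    import torch
    from PIL import Image, ImageOps, ImageSequence

    img = Image.open("animation.gif")             # placeholder path
    frames = []
    for frame in ImageSequence.Iterator(img):     # yields one frame for still images
        frame = ImageOps.exif_transpose(frame)    # honor EXIF orientation per frame
        arr = np.array(frame.convert("RGB")).astype(np.float32) / 255.0
        frames.append(torch.from_numpy(arr)[None,])   # (1, H, W, C)

    batch = torch.cat(frames, dim=0) if len(frames) > 1 else frames[0]
    print(batch.shape)                            # (num_frames, H, W, C)
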
6 changes: 4 additions & 2 deletions ldm_patched/contrib/external_custom_sampler.py
@@ -89,15 +89,17 @@ def INPUT_TYPES(s):
return {"required":
{"model": ("MODEL",),
"steps": ("INT", {"default": 1, "min": 1, "max": 10}),
"denoise": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
}
}
RETURN_TYPES = ("SIGMAS",)
CATEGORY = "sampling/custom_sampling/schedulers"

FUNCTION = "get_sigmas"

-    def get_sigmas(self, model, steps):
-        timesteps = torch.flip(torch.arange(1, 11) * 100 - 1, (0,))[:steps]
+    def get_sigmas(self, model, steps, denoise):
+        start_step = 10 - int(10 * denoise)
+        timesteps = torch.flip(torch.arange(1, 11) * 100 - 1, (0,))[start_step:start_step + steps]
sigmas = model.model.model_sampling.sigma(timesteps)
sigmas = torch.cat([sigmas, sigmas.new_zeros([1])])
return (sigmas, )
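
Note: this scheduler keeps a fixed ladder of ten timesteps, [999, 899, ..., 99], and the new denoise input selects where sampling starts on that ladder via start_step = 10 - int(10 * denoise). A quick check of the arithmetic:

    import torch

    ladder = torch.flip(torch.arange(1, 11) * 100 - 1, (0,))
    print(ladder)  # tensor([999, 899, 799, 699, 599, 499, 399, 299, 199,  99])

    def pick(steps, denoise):
        start_step = 10 - int(10 * denoise)
        return ladder[start_step:start_step + steps]

    print(pick(steps=1, denoise=1.0))  # tensor([999]) -> start from pure noise
    print(pick(steps=1, denoise=0.5))  # tensor([499]) -> img2img-style partial denoise
    print(pick(steps=2, denoise=0.5))  # tensor([499, 399])
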
4 changes: 2 additions & 2 deletions ldm_patched/contrib/external_images.py
@@ -76,7 +76,7 @@ def INPUT_TYPES(s):

OUTPUT_NODE = True

CATEGORY = "_for_testing"
CATEGORY = "image/animation"

def save_images(self, images, fps, filename_prefix, lossless, quality, method, num_frames=0, prompt=None, extra_pnginfo=None):
method = self.methods.get(method)
@@ -138,7 +138,7 @@ def INPUT_TYPES(s):

OUTPUT_NODE = True

CATEGORY = "_for_testing"
CATEGORY = "image/animation"

def save_images(self, images, fps, compress_level, filename_prefix="ldm_patched", prompt=None, extra_pnginfo=None):
filename_prefix += self.prefix_append
3 changes: 2 additions & 1 deletion ldm_patched/contrib/external_mask.py
@@ -8,6 +8,7 @@
from ldm_patched.contrib.external import MAX_RESOLUTION

def composite(destination, source, x, y, mask = None, multiplier = 8, resize_source = False):
+    source = source.to(destination.device)
if resize_source:
source = torch.nn.functional.interpolate(source, size=(destination.shape[2], destination.shape[3]), mode="bilinear")

@@ -22,7 +23,7 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_source = False):
if mask is None:
mask = torch.ones_like(source)
else:
-        mask = mask.clone()
+        mask = mask.to(destination.device, copy=True)
mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(source.shape[2], source.shape[3]), mode="bilinear")
mask = ldm_patched.modules.utils.repeat_to_batch_size(mask, source.shape[0])

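
Note: both composite() edits pin the incoming tensors to destination's device before any arithmetic. Tensor.to(device, copy=True) replaces the earlier clone(): it moves and copies in one call, and still returns a fresh tensor even when the mask is already on the target device, preserving clone()'s protection against in-place modification:

    import torch

    mask = torch.ones(1, 1, 4, 4)             # e.g. arrives on CPU
    destination = torch.zeros(1, 4, 32, 32)   # may live on "cuda" in practice

    moved = mask.to(destination.device, copy=True)
    assert moved.data_ptr() != mask.data_ptr()  # copy=True always makes a fresh tensor
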
32 changes: 31 additions & 1 deletion ldm_patched/contrib/external_rebatch.py
@@ -101,10 +101,40 @@ def rebatch(self, latents, batch_size):

return (output_list,)

+class ImageRebatch:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": { "images": ("IMAGE",),
+                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
+                              }}
+    RETURN_TYPES = ("IMAGE",)
+    INPUT_IS_LIST = True
+    OUTPUT_IS_LIST = (True, )
+
+    FUNCTION = "rebatch"
+
+    CATEGORY = "image/batch"
+
+    def rebatch(self, images, batch_size):
+        batch_size = batch_size[0]
+
+        output_list = []
+        all_images = []
+        for img in images:
+            for i in range(img.shape[0]):
+                all_images.append(img[i:i+1])
+
+        for i in range(0, len(all_images), batch_size):
+            output_list.append(torch.cat(all_images[i:i+batch_size], dim=0))
+
+        return (output_list,)
+
NODE_CLASS_MAPPINGS = {
    "RebatchLatents": LatentRebatch,
+    "RebatchImages": ImageRebatch,
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "RebatchLatents": "Rebatch Latents",
-}
+    "RebatchImages": "Rebatch Images",
+}
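
Note: ImageRebatch flattens whatever mix of image batches it receives into single frames, then regroups them into batches of at most batch_size. The core logic outside the node wrapper:

    import torch

    # two incoming (N, H, W, C) batches of different sizes: 3 + 2 = 5 frames total
    images = [torch.randn(3, 64, 64, 3), torch.randn(2, 64, 64, 3)]
    batch_size = 4

    all_images = []
    for img in images:
        for i in range(img.shape[0]):
            all_images.append(img[i:i+1])      # split every batch into single frames

    output_list = []
    for i in range(0, len(all_images), batch_size):
        output_list.append(torch.cat(all_images[i:i+batch_size], dim=0))

    print([t.shape[0] for t in output_list])   # [4, 1] -> full batch plus remainder
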
2 changes: 1 addition & 1 deletion ldm_patched/contrib/external_sag.py
@@ -153,7 +153,7 @@ def post_cfg_function(args):
(sag, _) = ldm_patched.modules.samplers.calc_cond_uncond_batch(model, uncond, None, degraded_noised, sigma, model_options)
return cfg_result + (degraded - sag) * sag_scale

-        m.set_model_sampler_post_cfg_function(post_cfg_function)
+        m.set_model_sampler_post_cfg_function(post_cfg_function, disable_cfg1_optimization=True)

# from diffusers:
# unet.mid_block.attentions[0].transformer_blocks[0].attn1.patch
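
Note: Self-Attention Guidance does its work in a post-CFG hook, which needs the unconditional branch even at cfg == 1; disable_cfg1_optimization=True stops the sampler from skipping that branch as a cfg=1 shortcut. Schematically (the args key below is an assumption, not confirmed by this diff):

    def post_cfg_function(args):
        cfg_result = args["denoised"]   # assumed key for the CFG-combined output
        # ...adjust cfg_result using the degraded/uncond prediction here...
        return cfg_result

    m.set_model_sampler_post_cfg_function(post_cfg_function, disable_cfg1_optimization=True)
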
60 changes: 60 additions & 0 deletions ldm_patched/contrib/external_stable3d.py
@@ -0,0 +1,60 @@
# https://github.com/comfyanonymous/ComfyUI/blob/master/nodes.py

import torch
import ldm_patched.contrib.external
import ldm_patched.modules.utils

def camera_embeddings(elevation, azimuth):
elevation = torch.as_tensor([elevation])
azimuth = torch.as_tensor([azimuth])
embeddings = torch.stack(
[
torch.deg2rad(
(90 - elevation) - (90)
), # Zero123 polar is 90-elevation
torch.sin(torch.deg2rad(azimuth)),
torch.cos(torch.deg2rad(azimuth)),
torch.deg2rad(
90 - torch.full_like(elevation, 0)
),
], dim=-1).unsqueeze(1)

return embeddings


class StableZero123_Conditioning:
@classmethod
def INPUT_TYPES(s):
return {"required": { "clip_vision": ("CLIP_VISION",),
"init_image": ("IMAGE",),
"vae": ("VAE",),
"width": ("INT", {"default": 256, "min": 16, "max": ldm_patched.contrib.external.MAX_RESOLUTION, "step": 8}),
"height": ("INT", {"default": 256, "min": 16, "max": ldm_patched.contrib.external.MAX_RESOLUTION, "step": 8}),
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
"elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0}),
"azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0}),
}}
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
RETURN_NAMES = ("positive", "negative", "latent")

FUNCTION = "encode"

CATEGORY = "conditioning/3d_models"

def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth):
output = clip_vision.encode_image(init_image)
pooled = output.image_embeds.unsqueeze(0)
pixels = ldm_patched.modules.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1)
encode_pixels = pixels[:,:,:,:3]
t = vae.encode(encode_pixels)
cam_embeds = camera_embeddings(elevation, azimuth)
cond = torch.cat([pooled, cam_embeds.repeat((pooled.shape[0], 1, 1))], dim=-1)

positive = [[cond, {"concat_latent_image": t}]]
negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]]
latent = torch.zeros([batch_size, 4, height // 8, width // 8])
return (positive, negative, {"samples":latent})

NODE_CLASS_MAPPINGS = {
"StableZero123_Conditioning": StableZero123_Conditioning,
}
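
Note: camera_embeddings packs the Zero123 camera pose into four numbers: the polar offset in radians (Zero123 uses 90 - elevation, so the first entry is -elevation converted to radians), sin and cos of the azimuth, and a constant deg2rad(90). A worked check, assuming the camera_embeddings definition above is in scope:

    import torch

    emb = camera_embeddings(elevation=0.0, azimuth=0.0)
    print(emb.shape)  # torch.Size([1, 1, 4])
    print(emb)        # tensor([[[0.0000, 0.0000, 1.0000, 1.5708]]])
                      # = [-elevation in rad, sin(azimuth), cos(azimuth), pi/2]
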
5 changes: 3 additions & 2 deletions ldm_patched/ldm/models/autoencoder.py
@@ -8,6 +8,7 @@

from ldm_patched.ldm.util import instantiate_from_config
from ldm_patched.ldm.modules.ema import LitEma
+import ldm_patched.modules.ops

class DiagonalGaussianRegularizer(torch.nn.Module):
def __init__(self, sample: bool = True):
@@ -161,12 +162,12 @@ def __init__(self, embed_dim: int, **kwargs):
},
**kwargs,
)
-        self.quant_conv = torch.nn.Conv2d(
+        self.quant_conv = ldm_patched.modules.ops.disable_weight_init.Conv2d(
            (1 + ddconfig["double_z"]) * ddconfig["z_channels"],
            (1 + ddconfig["double_z"]) * embed_dim,
            1,
        )
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.post_quant_conv = ldm_patched.modules.ops.disable_weight_init.Conv2d(embed_dim, ddconfig["z_channels"], 1)
self.embed_dim = embed_dim

def get_autoencoder_params(self) -> list:
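
Note: swapping torch.nn.Conv2d for disable_weight_init.Conv2d skips the default random weight initialization, which is wasted work when a checkpoint is loaded immediately afterwards. A sketch of how such a class is typically built (an assumption about this codebase's internals, shown for illustration):

    import torch

    class Conv2dNoInit(torch.nn.Conv2d):
        def reset_parameters(self):
            # nn.Conv2d.__init__ calls reset_parameters(); making it a no-op
            # skips Kaiming init, since loading the state_dict supplies weights.
            return None
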
2 changes: 1 addition & 1 deletion ldm_patched/ldm/modules/diffusionmodules/model.py
@@ -41,7 +41,7 @@ def nonlinearity(x):


def Normalize(in_channels, num_groups=32):
-    return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+    return ops.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)


class Upsample(nn.Module):
4 changes: 2 additions & 2 deletions ldm_patched/ldm/modules/diffusionmodules/upscaling.py
@@ -43,8 +43,8 @@ def register_schedule(self, beta_schedule="linear", timesteps=1000,

def q_sample(self, x_start, t, noise=None):
noise = default(noise, lambda: torch.randn_like(x_start))
-        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
-                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+        return (extract_into_tensor(self.sqrt_alphas_cumprod.to(x_start.device), t, x_start.shape) * x_start +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod.to(x_start.device), t, x_start.shape) * noise)

def forward(self, x):
return x, None
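
Note: q_sample is the standard DDPM forward-noising step,

    x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise

where self.sqrt_alphas_cumprod holds the precomputed sqrt(alpha_bar_t) table; the edit only moves those schedule buffers onto x_start's device before indexing, so CPU-resident schedules work with GPU inputs.
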
6 changes: 3 additions & 3 deletions ldm_patched/ldm/modules/diffusionmodules/util.py
@@ -51,17 +51,17 @@ def get_alpha(self, image_only_indicator: torch.Tensor) -> torch.Tensor:
if self.merge_strategy == "fixed":
# make shape compatible
# alpha = repeat(self.mix_factor, '1 -> b () t () ()', t=t, b=bs)
-            alpha = self.mix_factor
+            alpha = self.mix_factor.to(image_only_indicator.device)
elif self.merge_strategy == "learned":
-            alpha = torch.sigmoid(self.mix_factor)
+            alpha = torch.sigmoid(self.mix_factor.to(image_only_indicator.device))
# make shape compatible
# alpha = repeat(alpha, '1 -> s () ()', s = t * bs)
elif self.merge_strategy == "learned_with_images":
assert image_only_indicator is not None, "need image_only_indicator ..."
alpha = torch.where(
image_only_indicator.bool(),
torch.ones(1, 1, device=image_only_indicator.device),
-                rearrange(torch.sigmoid(self.mix_factor), "... -> ... 1"),
+                rearrange(torch.sigmoid(self.mix_factor.to(image_only_indicator.device)), "... -> ... 1"),
)
alpha = rearrange(alpha, self.rearrange_pattern)
# make shape compatible
4 changes: 2 additions & 2 deletions ldm_patched/ldm/modules/encoders/noise_aug_modules.py
@@ -15,12 +15,12 @@ def __init__(self, *args, clip_stats_path=None, timestep_dim=256, **kwargs):

def scale(self, x):
# re-normalize to centered mean and unit variance
-        x = (x - self.data_mean) * 1. / self.data_std
+        x = (x - self.data_mean.to(x.device)) * 1. / self.data_std.to(x.device)
return x

def unscale(self, x):
# back to original data stats
-        x = (x * self.data_std) + self.data_mean
+        x = (x * self.data_std.to(x.device)) + self.data_mean.to(x.device)
return x

def forward(self, x, noise_level=None):
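
Note: several files in this commit apply the same fix: buffers registered on CPU (mix_factor, data_mean, data_std, the alpha-cumprod tables) are moved to the input's device at the point of use, rather than assuming the whole module was already transferred. A minimal reproduction of the failure mode and the fix, with generic placeholder names:

    import torch

    class Scaler(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # buffers stay on CPU unless the module itself is moved
            self.register_buffer("data_mean", torch.zeros(4))
            self.register_buffer("data_std", torch.ones(4))

        def scale(self, x):
            # without .to(x.device), a CUDA input against CPU buffers raises
            # "Expected all tensors to be on the same device"
            return (x - self.data_mean.to(x.device)) / self.data_std.to(x.device)
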