Merge pull request #528 from kohya-ss/dev
save_state handling, old LoRA support etc.
kohya-ss authored May 22, 2023
2 parents c924c47 + 99b607c commit b6ba4ca
Showing 4 changed files with 180 additions and 12 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -140,6 +140,19 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser

## Change History

### 22 May 2023, 2023/05/22

- Fixed several bugs.
  - The state was saved even when the `--save_state` option was not specified in `fine_tune.py` and `train_db.py`; it is now saved only when the option is given. [PR #521](https://github.com/kohya-ss/sd-scripts/pull/521) Thanks to akshaal!
  - LoRA files saved without `alpha` could not be loaded; `alpha` now defaults to each module's `dim` (see the sketch after this list). [PR #527](https://github.com/kohya-ss/sd-scripts/pull/527) Thanks to Manjiz!
  - Minor changes to console output during sample generation. [PR #515](https://github.com/kohya-ss/sd-scripts/pull/515) Thanks to yanhuifair!
- The generation script now uses xformers for the VAE as well.
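
For the PR #527 item above: LoRA files created before `alpha` was introduced only store `dim` entries, so the loader now defaults any missing `alpha` to the module's `dim`, which keeps the effective scale (`alpha / dim` in this implementation) at 1.0, matching the old behavior. Below is a minimal sketch of that fallback; the module names and dimensions are made up for illustration and are not read from a real file.

```python
# Illustrative only: these keys and dims are invented, not taken from an actual LoRA file.
modules_dim = {
    "lora_unet_down_blocks_0_attentions_0_proj_in": 4,
    "lora_te_text_model_encoder_layers_0_mlp_fc1": 8,
}
modules_alpha = {}  # an old LoRA file may carry no alpha entries at all

# PR #527 fallback: default alpha to the module's dim when it is missing
for key in modules_dim.keys():
    if key not in modules_alpha:
        modules_alpha[key] = modules_dim[key]

for key in modules_dim:
    print(key, "scale =", modules_alpha[key] / modules_dim[key])  # 1.0 for every old module
```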

### 16 May 2023, 2023/05/16

- Fixed an issue where an error would occur if the encoding of the prompt file was different from the default. [PR #510](https://github.com/kohya-ss/sd-scripts/pull/510) Thanks to sdbds!
107 changes: 103 additions & 4 deletions gen_img_diffusers.py
@@ -311,6 +311,7 @@ def backward(ctx, do):
        return dq, dk, dv, None, None, None, None


# TODO common train_util.py
def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditionModel, mem_eff_attn, xformers):
    if mem_eff_attn:
        replace_unet_cross_attn_to_memory_efficient()
@@ -319,7 +320,7 @@ def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditio


def replace_unet_cross_attn_to_memory_efficient():
print("Replace CrossAttention.forward to use NAI style Hypernetwork and FlashAttention")
print("CrossAttention.forward has been replaced to FlashAttention (not xformers) and NAI style Hypernetwork")
flash_func = FlashAttentionFunction

def forward_flash_attn(self, x, context=None, mask=None):
@@ -359,7 +360,7 @@ def forward_flash_attn(self, x, context=None, mask=None):


def replace_unet_cross_attn_to_xformers():
print("Replace CrossAttention.forward to use NAI style Hypernetwork and xformers")
print("CrossAttention.forward has been replaced to enable xformers and NAI style Hypernetwork")
try:
import xformers.ops
except ImportError:
@@ -401,6 +402,104 @@ def forward_xformers(self, x, context=None, mask=None):
    diffusers.models.attention.CrossAttention.forward = forward_xformers


def replace_vae_modules(vae: diffusers.models.AutoencoderKL, mem_eff_attn, xformers):
    if mem_eff_attn:
        replace_vae_attn_to_memory_efficient()
    elif xformers:
        # for now, use Diffusers' xformers; only the MidBlock has attention
        print("Use Diffusers xformers for VAE")
        vae.set_use_memory_efficient_attention_xformers(True)

    """
    # work around the VAE's large memory consumption in bfloat16
    upsamplers = []
    for block in vae.decoder.up_blocks:
        if block.upsamplers is not None:
            upsamplers.extend(block.upsamplers)

    def forward_upsample(_self, hidden_states, output_size=None):
        assert hidden_states.shape[1] == _self.channels
        if _self.use_conv_transpose:
            return _self.conv(hidden_states)

        dtype = hidden_states.dtype
        if dtype == torch.bfloat16:
            assert output_size is None
            # repeat_interleave is very slow, but it is not called many times, so this is acceptable
            hidden_states = hidden_states.repeat_interleave(2, dim=-1)
            hidden_states = hidden_states.repeat_interleave(2, dim=-2)
        else:
            if hidden_states.shape[0] >= 64:
                hidden_states = hidden_states.contiguous()

            # if `output_size` is passed we force the interpolation output
            # size and do not make use of `scale_factor=2`
            if output_size is None:
                hidden_states = torch.nn.functional.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
            else:
                hidden_states = torch.nn.functional.interpolate(hidden_states, size=output_size, mode="nearest")

        if _self.use_conv:
            if _self.name == "conv":
                hidden_states = _self.conv(hidden_states)
            else:
                hidden_states = _self.Conv2d_0(hidden_states)

        return hidden_states

    # replace upsamplers
    for upsampler in upsamplers:
        # make new scope
        def make_replacer(upsampler):
            def forward(hidden_states, output_size=None):
                return forward_upsample(upsampler, hidden_states, output_size)

            return forward

        upsampler.forward = make_replacer(upsampler)
    """


def replace_vae_attn_to_memory_efficient():
    print("AttentionBlock.forward has been replaced to FlashAttention (not xformers)")
    flash_func = FlashAttentionFunction

    def forward_flash_attn(self, hidden_states):
        print("forward_flash_attn")
        q_bucket_size = 512
        k_bucket_size = 1024

        residual = hidden_states
        batch, channel, height, width = hidden_states.shape

        # norm
        hidden_states = self.group_norm(hidden_states)

        hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)

        # proj to q, k, v
        query_proj = self.query(hidden_states)
        key_proj = self.key(hidden_states)
        value_proj = self.value(hidden_states)

        query_proj, key_proj, value_proj = map(
            lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.num_heads), (query_proj, key_proj, value_proj)
        )

        out = flash_func.apply(query_proj, key_proj, value_proj, None, False, q_bucket_size, k_bucket_size)

        out = rearrange(out, "b h n d -> b n (h d)")

        # compute next hidden_states: project the attention output (the original line projected the
        # pre-attention tensor and discarded `out`, which skipped the attention result entirely)
        hidden_states = self.proj_attn(out)
        hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)

        # res connect and rescale
        hidden_states = (hidden_states + residual) / self.rescale_output_factor
        return hidden_states

    diffusers.models.attention.AttentionBlock.forward = forward_flash_attn
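
# NOTE: replace_vae_modules() above is called from main() together with replace_unet_modules()
# (see the change to main() further down), so the generation script now routes VAE attention
# through xformers or FlashAttention as well, as noted in the 22 May 2023 changelog entry.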


# endregion

# region Image generation main body: copied from lpw_stable_diffusion.py (ASL) and modified
@@ -2142,6 +2241,7 @@ def main(args):
    # xformers / Hypernetwork support
    if not args.diffusers_xformers:
        replace_unet_modules(unet, not args.xformers, args.xformers)
        replace_vae_modules(vae, not args.xformers, args.xformers)

    # load the tokenizer
    print("loading tokenizer")
@@ -3175,8 +3275,7 @@ def setup_parser() -> argparse.ArgumentParser:
"--vae_slices",
type=int,
default=None,
help=
"number of slices to split image into for VAE to reduce VRAM usage, None for no splitting (default), slower if specified. 16 or 32 recommended / VAE処理時にVRAM使用量削減のため画像を分割するスライス数、Noneの場合は分割しない(デフォルト)、指定すると遅くなる。16か32程度を推奨"
help="number of slices to split image into for VAE to reduce VRAM usage, None for no splitting (default), slower if specified. 16 or 32 recommended / VAE処理時にVRAM使用量削減のため画像を分割するスライス数、Noneの場合は分割しない(デフォルト)、指定すると遅くなる。16か32程度を推奨",
)
parser.add_argument("--steps", type=int, default=50, help="number of ddim sampling steps / サンプリングステップ数")
parser.add_argument(
70 changes: 63 additions & 7 deletions library/train_util.py
@@ -1765,14 +1765,15 @@ def backward(ctx, do):


def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditionModel, mem_eff_attn, xformers):
    # unet is not used currently, but it is here for future use
    if mem_eff_attn:
        replace_unet_cross_attn_to_memory_efficient()
    elif xformers:
        replace_unet_cross_attn_to_xformers()


def replace_unet_cross_attn_to_memory_efficient():
    print("Replace CrossAttention.forward to use FlashAttention (not xformers)")
    print("CrossAttention.forward has been replaced to FlashAttention (not xformers)")
    flash_func = FlashAttentionFunction

    def forward_flash_attn(self, x, context=None, mask=None):
@@ -1812,7 +1813,7 @@ def forward_flash_attn(self, x, context=None, mask=None):


def replace_unet_cross_attn_to_xformers():
print("Replace CrossAttention.forward to use xformers")
print("CrossAttention.forward has been replaced to enable xformers.")
try:
import xformers.ops
except ImportError:
@@ -1854,6 +1855,60 @@ def forward_xformers(self, x, context=None, mask=None):
    diffusers.models.attention.CrossAttention.forward = forward_xformers


"""
def replace_vae_modules(vae: diffusers.models.AutoencoderKL, mem_eff_attn, xformers):
# vae is not used currently, but it is here for future use
if mem_eff_attn:
replace_vae_attn_to_memory_efficient()
elif xformers:
# とりあえずDiffusersのxformersを使う。AttentionがあるのはMidBlockのみ
print("Use Diffusers xformers for VAE")
vae.encoder.mid_block.attentions[0].set_use_memory_efficient_attention_xformers(True)
vae.decoder.mid_block.attentions[0].set_use_memory_efficient_attention_xformers(True)
def replace_vae_attn_to_memory_efficient():
print("AttentionBlock.forward has been replaced to FlashAttention (not xformers)")
flash_func = FlashAttentionFunction
def forward_flash_attn(self, hidden_states):
print("forward_flash_attn")
q_bucket_size = 512
k_bucket_size = 1024
residual = hidden_states
batch, channel, height, width = hidden_states.shape
# norm
hidden_states = self.group_norm(hidden_states)
hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)
# proj to q, k, v
query_proj = self.query(hidden_states)
key_proj = self.key(hidden_states)
value_proj = self.value(hidden_states)
query_proj, key_proj, value_proj = map(
lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.num_heads), (query_proj, key_proj, value_proj)
)
out = flash_func.apply(query_proj, key_proj, value_proj, None, False, q_bucket_size, k_bucket_size)
out = rearrange(out, "b h n d -> b n (h d)")
# compute next hidden_states
hidden_states = self.proj_attn(hidden_states)
hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)
# res connect and rescale
hidden_states = (hidden_states + residual) / self.rescale_output_factor
return hidden_states
diffusers.models.attention.AttentionBlock.forward = forward_flash_attn
"""


# endregion


@@ -3167,10 +3222,11 @@ def save_sd_model_on_epoch_end_or_stepwise(
print(f"removing old model: {remove_out_dir}")
shutil.rmtree(remove_out_dir)

if on_epoch_end:
save_and_remove_state_on_epoch_end(args, accelerator, epoch_no)
else:
save_and_remove_state_stepwise(args, accelerator, global_step)
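    # PR #521: save the training state only when --save_state is specified
    # (previously fine_tune.py and train_db.py saved it even without the option)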
    if args.save_state:
        if on_epoch_end:
            save_and_remove_state_on_epoch_end(args, accelerator, epoch_no)
        else:
            save_and_remove_state_stepwise(args, accelerator, global_step)


def save_and_remove_state_on_epoch_end(args: argparse.Namespace, accelerator, epoch_no):
@@ -3294,7 +3350,7 @@ def sample_images(
        if steps % args.sample_every_n_steps != 0 or epoch is not None:  # steps is not divisible or end of epoch
            return

    print(f"generating sample images at step / サンプル画像生成 ステップ: {steps}")
    print(f"\ngenerating sample images at step / サンプル画像生成 ステップ: {steps}")
    if not os.path.isfile(args.sample_prompts):
        print(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}")
        return
2 changes: 1 addition & 1 deletion networks/lora.py
@@ -638,7 +638,7 @@ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weigh
    # support old LoRA without alpha
    for key in modules_dim.keys():
        if key not in modules_alpha:
            modules_alpha = modules_dim[key]
            modules_alpha[key] = modules_dim[key]

    module_class = LoRAInfModule if for_inference else LoRAModule

