From 898615122f224d24cc4c53a30e67c8b0e70579da Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Tue, 19 Nov 2024 15:31:09 -0500
Subject: [PATCH 1/4] Rename add_noise_mask -> noise_mask.

---
 nodes.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nodes.py b/nodes.py
index 300cfcf1..7f151586 100644
--- a/nodes.py
+++ b/nodes.py
@@ -382,7 +382,7 @@ class InpaintModelConditioning:
                              "vae": ("VAE", ),
                              "pixels": ("IMAGE", ),
                              "mask": ("MASK", ),
-                             "add_noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}),
+                             "noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}),
                              }}

     RETURN_TYPES = ("CONDITIONING","CONDITIONING","LATENT")
@@ -391,7 +391,7 @@ class InpaintModelConditioning:

     CATEGORY = "conditioning/inpaint"

-    def encode(self, positive, negative, pixels, vae, mask, add_noise_mask):
+    def encode(self, positive, negative, pixels, vae, mask, noise_mask):
         x = (pixels.shape[1] // 8) * 8
         y = (pixels.shape[2] // 8) * 8
         mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(pixels.shape[1], pixels.shape[2]), mode="bilinear")
@@ -415,7 +415,7 @@ class InpaintModelConditioning:
         out_latent = {}

         out_latent["samples"] = orig_latent
-        if add_noise_mask:
+        if noise_mask:
             out_latent["noise_mask"] = mask

         out = []
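A minimal sketch, not part of the patch, of what the renamed flag controls: when `noise_mask` (formerly `add_noise_mask`) is true, the encoder attaches the mask to the latent dict under the "noise_mask" key, and samplers that honor that key confine denoising to the masked region. The helper name and tensor shapes below are illustrative assumptions.

```python
import torch

def attach_noise_mask(latent_samples, mask, noise_mask=True):
    # Mirrors the tail of InpaintModelConditioning.encode: the boolean only
    # decides whether the mask rides along with the latent samples.
    out_latent = {"samples": latent_samples}
    if noise_mask:  # formerly add_noise_mask
        out_latent["noise_mask"] = mask
    return out_latent

latent = attach_noise_mask(torch.zeros(1, 4, 64, 64), torch.ones(1, 1, 512, 512))
assert "noise_mask" in latent  # sampling would then be restricted to the masked area
```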
From 22535d05896cf78d84924c492c8cfc17b8786c05 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Wed, 20 Nov 2024 07:33:06 -0500
Subject: [PATCH 2/4] Skip layer guidance now works on stable audio model.

---
 comfy/ldm/audio/dit.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py
index 4d2185be..5b3f498f 100644
--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@@ -612,7 +612,9 @@ class ContinuousTransformer(nn.Module):
         return_info = False,
         **kwargs
     ):
+        patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {})
         batch, seq, device = *x.shape[:2], x.device
+        context = kwargs["context"]

         info = {
             "hidden_states": [],
@@ -643,9 +645,19 @@ class ContinuousTransformer(nn.Module):
         if self.use_sinusoidal_emb or self.use_abs_pos_emb:
             x = x + self.pos_emb(x)

+        blocks_replace = patches_replace.get("dit", {})
         # Iterate over the transformer layers
-        for layer in self.layers:
-            x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
+        for i, layer in enumerate(self.layers):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"])
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap})
+                x = out["img"]
+            else:
+                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context)
             # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)

             if return_info:
@@ -874,7 +886,6 @@ class AudioDiffusionTransformer(nn.Module):
         mask=None,
         return_info=False,
         control=None,
-        transformer_options={},
         **kwargs):
         return self._forward(
             x,

From 07f6eeaa134b34ef38ff6d9014061cfeb5d041e5 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Wed, 20 Nov 2024 17:07:46 -0500
Subject: [PATCH 3/4] Fix mask issue with attention_xformers.

---
 comfy/ldm/modules/attention.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index 3f543abd..7b4ee215 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -372,10 +372,10 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
         )

     if mask is not None:
-        pad = 8 - q.shape[1] % 8
-        mask_out = torch.empty([q.shape[0], q.shape[1], q.shape[1] + pad], dtype=q.dtype, device=q.device)
-        mask_out[:, :, :mask.shape[-1]] = mask
-        mask = mask_out[:, :, :mask.shape[-1]]
+        pad = 8 - mask.shape[-1] % 8
+        mask_out = torch.empty([q.shape[0], q.shape[2], q.shape[1], mask.shape[-1] + pad], dtype=q.dtype, device=q.device)
+        mask_out[..., :mask.shape[-1]] = mask
+        mask = mask_out[..., :mask.shape[-1]]

     out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask)
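The fix above changes two things: the padding is computed from the mask's key length rather than the query length, and the bias buffer gains an explicit heads dimension. A hedged re-illustration under assumed shapes: xformers wants the attention-bias storage's last dimension aligned to a multiple of 8, so an oversized buffer is allocated and a view sliced back to the original width is handed to `memory_efficient_attention`. The helper name and the shapes in the example are assumptions for demonstration.

```python
import torch

def pad_attn_bias(mask, batch, heads, q_len):
    # Allocate storage whose last dim is padded up to the next multiple of 8,
    # then return a view sliced back to the real key length: aligned memory,
    # same logical shape.
    pad = 8 - mask.shape[-1] % 8
    mask_out = torch.empty([batch, heads, q_len, mask.shape[-1] + pad],
                           dtype=mask.dtype, device=mask.device)
    mask_out[..., :mask.shape[-1]] = mask
    return mask_out[..., :mask.shape[-1]]

bias = pad_attn_bias(torch.zeros(2, 8, 77, 100), batch=2, heads=8, q_len=77)
print(bias.shape)  # torch.Size([2, 8, 77, 100]), backed by 8-aligned storage
```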
From 772e620e32e2bc50b1a1c13e5afa4f990786c059 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Wed, 20 Nov 2024 20:42:51 -0500
Subject: [PATCH 4/4] Update readme.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 714a1d9b..2ddb925c 100644
--- a/README.md
+++ b/README.md
@@ -140,7 +140,7 @@ Put your VAE in: models/vae

 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2```

 This is the command to install the nightly with ROCm 6.2 which might have some performance improvements:
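Not part of the patch, a quick sanity check after installing the ROCm wheels: ROCm builds of PyTorch expose the HIP version via `torch.version.hip` and report the AMD GPU through the usual CUDA-compatible API. The exact version strings shown in the comments are assumptions.

```python
import torch

# On a ROCm wheel, torch.version.hip is a version string; it is None on CUDA builds.
print(torch.__version__)          # expected to carry a "+rocmX.Y" suffix
print(torch.version.hip)          # e.g. a "6.2.x" string for the wheels above
print(torch.cuda.is_available())  # True when the AMD GPU is visible to the runtime
```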