diff --git a/README.md b/README.md index 03d2978a..1d3e64dd 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ Put your VAE in: models/vae ### AMD GPUs (Linux only) AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version: -```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1``` +```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2``` This is the command to install the nightly with ROCm 6.2 which might have some performance improvements: diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py index 4d2185be..5b3f498f 100644 --- a/comfy/ldm/audio/dit.py +++ b/comfy/ldm/audio/dit.py @@ -612,7 +612,9 @@ class ContinuousTransformer(nn.Module): return_info = False, **kwargs ): + patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {}) batch, seq, device = *x.shape[:2], x.device + context = kwargs["context"] info = { "hidden_states": [], @@ -643,9 +645,19 @@ class ContinuousTransformer(nn.Module): if self.use_sinusoidal_emb or self.use_abs_pos_emb: x = x + self.pos_emb(x) + blocks_replace = patches_replace.get("dit", {}) # Iterate over the transformer layers - for layer in self.layers: - x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs) + for i, layer in enumerate(self.layers): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"]) + return out + + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap}) + x = out["img"] + else: + x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context) # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs) if return_info: @@ -874,7 +886,6 @@ class AudioDiffusionTransformer(nn.Module): mask=None, return_info=False, control=None, - transformer_options={}, **kwargs): return self._forward( x, diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 3f543abd..7b4ee215 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -372,10 +372,10 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh ) if mask is not None: - pad = 8 - q.shape[1] % 8 - mask_out = torch.empty([q.shape[0], q.shape[1], q.shape[1] + pad], dtype=q.dtype, device=q.device) - mask_out[:, :, :mask.shape[-1]] = mask - mask = mask_out[:, :, :mask.shape[-1]] + pad = 8 - mask.shape[-1] % 8 + mask_out = torch.empty([q.shape[0], q.shape[2], q.shape[1], mask.shape[-1] + pad], dtype=q.dtype, device=q.device) + mask_out[..., :mask.shape[-1]] = mask + mask = mask_out[..., :mask.shape[-1]] out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask) diff --git a/nodes.py b/nodes.py index 300cfcf1..7f151586 100644 --- a/nodes.py +++ b/nodes.py @@ -382,7 +382,7 @@ class InpaintModelConditioning: "vae": ("VAE", ), "pixels": ("IMAGE", ), "mask": ("MASK", ), - "add_noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}), + "noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}), }} RETURN_TYPES = ("CONDITIONING","CONDITIONING","LATENT") @@ -391,7 +391,7 @@ class InpaintModelConditioning: CATEGORY = "conditioning/inpaint" - def encode(self, positive, negative, pixels, vae, mask, add_noise_mask): + def encode(self, positive, negative, pixels, vae, mask, noise_mask): x = (pixels.shape[1] // 8) * 8 y = (pixels.shape[2] // 8) * 8 mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(pixels.shape[1], pixels.shape[2]), mode="bilinear") @@ -415,7 +415,7 @@ class InpaintModelConditioning: out_latent = {} out_latent["samples"] = orig_latent - if add_noise_mask: + if noise_mask: out_latent["noise_mask"] = mask out = []