Merge branch 'comfyanonymous:master' into master

commit e440423c28
@@ -141,7 +141,7 @@ Put your VAE in: models/vae

 ### AMD GPUs (Linux only)

 AMD users can install ROCm and PyTorch with pip if you don't have them already installed. This is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2```

 This is the command to install the nightly with ROCm 6.2 which might have some performance improvements:
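As a quick sanity check after installing (a minimal sketch, not part of this diff): the ROCm build of PyTorch exposes the GPU through the regular `torch.cuda` API, so you can confirm the wheel and the device are visible like this:

```python
# Hedged sketch: verify the ROCm PyTorch wheel is installed and the AMD GPU is usable.
# ROCm wheels typically carry a "+rocm6.2"-style suffix in the version string.
import torch

print(torch.__version__)             # e.g. "2.x.y+rocm6.2"
print(torch.cuda.is_available())     # ROCm devices are reported via the CUDA API
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```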
@@ -612,7 +612,9 @@ class ContinuousTransformer(nn.Module):
         return_info = False,
         **kwargs
     ):
+        patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {})
         batch, seq, device = *x.shape[:2], x.device
+        context = kwargs["context"]

         info = {
             "hidden_states": [],
@@ -643,9 +645,19 @@ class ContinuousTransformer(nn.Module):
         if self.use_sinusoidal_emb or self.use_abs_pos_emb:
             x = x + self.pos_emb(x)

+        blocks_replace = patches_replace.get("dit", {})
         # Iterate over the transformer layers
-        for layer in self.layers:
-            x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
+        for i, layer in enumerate(self.layers):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"])
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap})
+                x = out["img"]
+            else:
+                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context)
             # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)

         if return_info:
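For context, the loop above looks up `blocks_replace` by the key `("double_block", i)` and calls the registered patch with the block inputs plus a dict holding the original block. A sketch of how such a patch might look and be registered (the function and its registration are illustrative, not part of this diff):

```python
# Hedged sketch: a block-replace patch receives the block inputs and the
# original block, so it can wrap, alter, or skip the layer's computation.

def my_double_block_patch(args, extra):
    # args carries {"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}
    out = extra["original_block"](args)   # run the unpatched layer via block_wrap
    # ... hypothetical post-processing of out["img"] would go here ...
    return out

# Registered under the same key the loop checks, here for layer 0:
transformer_options = {"patches_replace": {"dit": {("double_block", 0): my_double_block_patch}}}
```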
@@ -874,7 +886,6 @@ class AudioDiffusionTransformer(nn.Module):
         mask=None,
         return_info=False,
         control=None,
+        transformer_options={},
         **kwargs):
         return self._forward(
             x,
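With `transformer_options` now an explicit argument of `forward`, a caller can thread the block-replace table from the previous hunks down to the inner transformer. A minimal sketch, assuming a constructed `model` and prepared `x`, `timestep`, and `context` tensors:

```python
# Hedged sketch: threading transformer_options into the audio DiT forward pass.
# An empty patches_replace table simply falls through to the unpatched layers.
out = model(x, timestep, context=context,
            transformer_options={"patches_replace": {"dit": {}}})
```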
@@ -372,10 +372,10 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
         )

     if mask is not None:
-        pad = 8 - q.shape[1] % 8
-        mask_out = torch.empty([q.shape[0], q.shape[1], q.shape[1] + pad], dtype=q.dtype, device=q.device)
-        mask_out[:, :, :mask.shape[-1]] = mask
-        mask = mask_out[:, :, :mask.shape[-1]]
+        pad = 8 - mask.shape[-1] % 8
+        mask_out = torch.empty([q.shape[0], q.shape[2], q.shape[1], mask.shape[-1] + pad], dtype=q.dtype, device=q.device)
+        mask_out[..., :mask.shape[-1]] = mask
+        mask = mask_out[..., :mask.shape[-1]]

     out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask)
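The reworked padding exists because xformers' `memory_efficient_attention` wants the attention-bias tensor's last dimension allocated in multiples of 8; the new code also gives the mask a full `(batch, heads, q_len, kv_len)` shape and pads against `mask.shape[-1]` instead of `q.shape[1]`. A standalone sketch of the same trick (shapes are hypothetical):

```python
# Hedged sketch of the mask-padding trick: allocate the bias with its last
# dimension rounded up to a multiple of 8, copy the real mask in, then slice
# the view back to the logical size (the underlying storage stays padded).
import torch

batch, heads, q_len, kv_len = 2, 8, 77, 77     # hypothetical shapes
mask = torch.zeros(batch, heads, q_len, kv_len)

pad = 8 - mask.shape[-1] % 8                   # 77 -> pad of 3
mask_out = torch.empty(batch, heads, q_len, kv_len + pad)
mask_out[..., :mask.shape[-1]] = mask
mask = mask_out[..., :mask.shape[-1]]          # logical width 77, storage width 80
```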
nodes.py
@@ -382,7 +382,7 @@ class InpaintModelConditioning:
                              "vae": ("VAE", ),
                              "pixels": ("IMAGE", ),
                              "mask": ("MASK", ),
-                             "add_noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}),
+                             "noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}),
                              }}

     RETURN_TYPES = ("CONDITIONING","CONDITIONING","LATENT")
@@ -391,7 +391,7 @@ class InpaintModelConditioning:

     CATEGORY = "conditioning/inpaint"

-    def encode(self, positive, negative, pixels, vae, mask, add_noise_mask):
+    def encode(self, positive, negative, pixels, vae, mask, noise_mask):
         x = (pixels.shape[1] // 8) * 8
         y = (pixels.shape[2] // 8) * 8
         mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(pixels.shape[1], pixels.shape[2]), mode="bilinear")
@@ -415,7 +415,7 @@ class InpaintModelConditioning:
         out_latent = {}

         out_latent["samples"] = orig_latent
-        if add_noise_mask:
+        if noise_mask:
             out_latent["noise_mask"] = mask

         out = []
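The change across these three hunks is a pure rename (`add_noise_mask` becomes `noise_mask`); the behavior the flag gates is unchanged. A minimal usage sketch, assuming `encode` returns its three outputs as a plain tuple and that the surrounding inputs already exist:

```python
# Hedged sketch: calling the node's encode with the renamed flag. When
# noise_mask=True the returned latent dict carries the mask under
# "noise_mask", which samplers read to confine denoising to the masked region.
positive_out, negative_out, latent = InpaintModelConditioning().encode(
    positive, negative, pixels, vae, mask, noise_mask=True)
print("noise_mask" in latent)   # True only when the flag is enabled
```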