Merge branch 'comfyanonymous:master' into master

Karol Stępień · 2024-11-21 09:57:21 +01:00 · committed by GitHub
commit e440423c28
4 changed files with 22 additions and 11 deletions


@@ -141,7 +141,7 @@ Put your VAE in: models/vae

 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2```

 This is the command to install the nightly with ROCm 6.2 which might have some performance improvements:
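For context on this README change, a quick way to confirm that the ROCm wheel (rather than a CPU-only build) actually got installed is a short probe like the sketch below. It is not part of the diff; `torch.version.hip` is the attribute ROCm builds of PyTorch expose (it is `None` on CUDA/CPU wheels).

```
import torch

# Sanity check, not part of the diff: on a ROCm wheel torch.version.hip is a version
# string (e.g. starting with "6.2" for the rocm6.2 index); on CUDA/CPU wheels it is None.
print("torch:", torch.__version__)
print("HIP runtime:", torch.version.hip)
print("GPU visible:", torch.cuda.is_available())  # ROCm devices are exposed through the torch.cuda API
```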


@@ -612,7 +612,9 @@ class ContinuousTransformer(nn.Module):
         return_info = False,
         **kwargs
         ):
+        patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {})
         batch, seq, device = *x.shape[:2], x.device
+        context = kwargs["context"]

         info = {
             "hidden_states": [],
@@ -643,9 +645,19 @@ class ContinuousTransformer(nn.Module):
         if self.use_sinusoidal_emb or self.use_abs_pos_emb:
             x = x + self.pos_emb(x)

+        blocks_replace = patches_replace.get("dit", {})
         # Iterate over the transformer layers
-        for layer in self.layers:
-            x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
+        for i, layer in enumerate(self.layers):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"])
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap})
+                x = out["img"]
+            else:
+                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context)
             # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)

         if return_info:
@@ -874,7 +886,6 @@ class AudioDiffusionTransformer(nn.Module):
         mask=None,
         return_info=False,
         control=None,
-        transformer_options={},
         **kwargs):
         return self._forward(
             x,
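The hunks above wire the stable-audio `ContinuousTransformer` into the `patches_replace` mechanism: `forward()` now pulls `transformer_options` out of `**kwargs`, and each layer can be swapped for a callable registered under `("double_block", i)` that receives the block inputs as a dict plus the original block as a fallback. A minimal sketch of that contract follows; the names `my_block_patch` and `build_transformer_options` are illustrative, not ComfyUI API.

```
import torch

def my_block_patch(args, extra):
    # args carries {"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb};
    # extra["original_block"] is the wrapped, unmodified layer call.
    out = extra["original_block"](args)
    # out["img"] could be inspected or modified here before it flows to the next layer.
    return out

def build_transformer_options(block_index, patch):
    # Shape of the dict the patched forward() reads via kwargs["transformer_options"].
    return {"patches_replace": {"dit": {("double_block", block_index): patch}}}

# Smoke test with a stand-in for the real transformer layer:
opts = build_transformer_options(0, my_block_patch)
fake_original_block = lambda args: {"img": args["img"] + 1}
patch = opts["patches_replace"]["dit"][("double_block", 0)]
result = patch({"img": torch.zeros(2), "txt": None, "vec": None, "pe": None},
               {"original_block": fake_original_block})
print(result["img"])  # tensor([1., 1.])
```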


@@ -372,10 +372,10 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
         )

     if mask is not None:
-        pad = 8 - q.shape[1] % 8
-        mask_out = torch.empty([q.shape[0], q.shape[1], q.shape[1] + pad], dtype=q.dtype, device=q.device)
-        mask_out[:, :, :mask.shape[-1]] = mask
-        mask = mask_out[:, :, :mask.shape[-1]]
+        pad = 8 - mask.shape[-1] % 8
+        mask_out = torch.empty([q.shape[0], q.shape[2], q.shape[1], mask.shape[-1] + pad], dtype=q.dtype, device=q.device)
+        mask_out[..., :mask.shape[-1]] = mask
+        mask = mask_out[..., :mask.shape[-1]]

     out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask)
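The fix above makes the xformers attention bias track the mask's own key length and gives it an explicit `[batch, heads, q_len, k_len]` shape; the allocate-padded-then-slice pattern exists because xformers' memory-efficient kernels are picky about the alignment of the bias buffer. A standalone sketch of that pattern with toy shapes; the `[batch, q_len, heads, dim_head]` layout for `q` is an assumption matching the `q.shape[1]`/`q.shape[2]` indexing in the patched code.

```
import torch

def pad_attn_bias(mask, q):
    # Allocate the bias with its last dimension rounded up to a multiple of 8,
    # then return a view sliced back to the real key length, as in the patched code.
    batch, q_len, n_heads = q.shape[0], q.shape[1], q.shape[2]
    k_len = mask.shape[-1]
    pad = 8 - k_len % 8
    mask_out = torch.empty([batch, n_heads, q_len, k_len + pad], dtype=q.dtype, device=q.device)
    mask_out[..., :k_len] = mask
    return mask_out[..., :k_len]

# Toy shapes: q as [batch, q_len, heads, dim_head], mask as [batch, heads, q_len, k_len].
q = torch.zeros(1, 5, 2, 8)
mask = torch.zeros(1, 2, 5, 5)
print(pad_attn_bias(mask, q).shape)  # torch.Size([1, 2, 5, 5]), backed by 8-padded storage
```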


@@ -382,7 +382,7 @@ class InpaintModelConditioning:
                      "vae": ("VAE", ),
                      "pixels": ("IMAGE", ),
                      "mask": ("MASK", ),
-                     "add_noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}),
+                     "noise_mask": ("BOOLEAN", {"default": True, "tooltip": "Add a noise mask to the latent so sampling will only happen within the mask. Might improve results or completely break things depending on the model."}),
                      }}

     RETURN_TYPES = ("CONDITIONING","CONDITIONING","LATENT")

@@ -391,7 +391,7 @@ class InpaintModelConditioning:

     CATEGORY = "conditioning/inpaint"

-    def encode(self, positive, negative, pixels, vae, mask, add_noise_mask):
+    def encode(self, positive, negative, pixels, vae, mask, noise_mask):
         x = (pixels.shape[1] // 8) * 8
         y = (pixels.shape[2] // 8) * 8
         mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(pixels.shape[1], pixels.shape[2]), mode="bilinear")

@@ -415,7 +415,7 @@ class InpaintModelConditioning:

         out_latent = {}
         out_latent["samples"] = orig_latent
-        if add_noise_mask:
+        if noise_mask:
             out_latent["noise_mask"] = mask

         out = []
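The three hunks above only rename the boolean input of `InpaintModelConditioning` from `add_noise_mask` to `noise_mask`; its effect is unchanged: when true, the returned latent dict carries a `"noise_mask"` entry so sampling is confined to the masked region. A self-contained sketch of that gating with dummy tensors; the `pack_latent` helper is illustrative, not part of the node.

```
import torch

def pack_latent(orig_latent, mask, noise_mask=True):
    # Mirrors the tail of InpaintModelConditioning.encode after the rename.
    out_latent = {"samples": orig_latent}
    if noise_mask:
        out_latent["noise_mask"] = mask
    return out_latent

latent = torch.zeros(1, 4, 64, 64)   # placeholder latent
mask = torch.ones(1, 1, 512, 512)    # placeholder inpaint mask
print(pack_latent(latent, mask, noise_mask=True).keys())   # dict_keys(['samples', 'noise_mask'])
print(pack_latent(latent, mask, noise_mask=False).keys())  # dict_keys(['samples'])
```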