ComfyUI/comfy/ldm/genmo/joint_model/asymm_models_joint.py

#original code from https://github.com/genmoai/models under apache 2.0 license
#adapted to ComfyUI

from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
# from flash_attn import flash_attn_varlen_qkvpacked_func
from comfy.ldm.modules.attention import optimized_attention

from .layers import (
    FeedForward,
    PatchEmbed,
    RMSNorm,
    TimestepEmbedder,
)

from .rope_mixed import (
    compute_mixed_rotation,
    create_position_matrix,
)
from .temporal_rope import apply_rotary_emb_qk_real
from .utils import (
    AttentionPool,
    modulate,
)

import comfy.ldm.common_dit
import comfy.ops


def modulated_rmsnorm(x, scale, eps=1e-6):
    """RMS-normalize ``x`` and rescale the result by ``1 + scale``.

    ``scale`` gets a new axis inserted at position 1 before the multiply, so
    it broadcasts across dimension 1 of the normalized tensor.
    """
    return comfy.ldm.common_dit.rms_norm(x, eps=eps) * (1 + scale.unsqueeze(1))


def residual_tanh_gated_rmsnorm(x, x_res, gate, eps=1e-6):
    """Add an RMS-normalized, tanh-gated branch output onto ``x``.

    ``x_res`` is RMS-normalized, multiplied by ``tanh(gate)`` (with a new axis
    inserted at position 1 so it broadcasts), and the product is added to the
    residual stream ``x``.
    """
    gated_branch = comfy.ldm.common_dit.rms_norm(x_res, eps=eps) * torch.tanh(gate).unsqueeze(1)
    return x + gated_branch

class AsymmetricAttention(nn.Module):
    """Joint attention over a visual stream ``x`` and a text stream ``y``.

    Each stream has its own fused QKV projection (both project to a width of
    ``3 * dim_x``) and its own per-head query/key RMSNorm. The two token
    sequences are concatenated, attended over in one call, split apart again
    and sent through separate output projections.

    ``attn_drop``, ``attend_to_padding`` and ``softmax_scale`` are stored on
    the module but are not read by ``forward``.
    """

    def __init__(
        self,
        dim_x: int,
        dim_y: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        update_y: bool = True,
        out_bias: bool = True,
        attend_to_padding: bool = False,
        softmax_scale: Optional[float] = None,
        device: Optional[torch.device] = None,
        dtype=None,
        operations=None,
    ):
        super().__init__()
        if dim_x % num_heads != 0:
            raise ValueError(
                f"dim_x={dim_x} should be divisible by num_heads={num_heads}"
            )

        # Plain configuration, kept as attributes.
        self.dim_x = dim_x
        self.dim_y = dim_y
        self.num_heads = num_heads
        self.head_dim = dim_x // num_heads
        self.attn_drop = attn_drop
        self.update_y = update_y
        self.attend_to_padding = attend_to_padding
        self.softmax_scale = softmax_scale
        self.qkv_bias = qkv_bias

        factory = {"device": device, "dtype": dtype}

        # Fused QKV projections; text features are lifted from dim_y to the
        # visual width so both streams share heads.
        self.qkv_x = operations.Linear(dim_x, 3 * dim_x, bias=qkv_bias, **factory)
        self.qkv_y = operations.Linear(dim_y, 3 * dim_x, bias=qkv_bias, **factory)

        # Only the qk-normalized variant is implemented here.
        assert qk_norm
        self.q_norm_x = RMSNorm(self.head_dim, **factory)
        self.k_norm_x = RMSNorm(self.head_dim, **factory)
        self.q_norm_y = RMSNorm(self.head_dim, **factory)
        self.k_norm_y = RMSNorm(self.head_dim, **factory)

        # Output projections; the text stream goes back from dim_x to dim_y
        # unless this block does not update text, in which case it is passed
        # through unchanged.
        self.proj_x = operations.Linear(dim_x, dim_x, bias=out_bias, **factory)
        if update_y:
            self.proj_y = operations.Linear(dim_x, dim_y, bias=out_bias, **factory)
        else:
            self.proj_y = nn.Identity()

    def forward(
        self,
        x: torch.Tensor,  # (B, N, dim_x)
        y: torch.Tensor,  # (B, L, dim_y)
        scale_x: torch.Tensor,  # (B, dim_x), modulation for pre-RMSNorm.
        scale_y: torch.Tensor,  # (B, dim_y), modulation for pre-RMSNorm.
        crop_y,
        **rope_rotation,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run joint attention and return the per-stream outputs.

        Args:
            x: visual tokens.
            y: text tokens.
            scale_x: modulation applied in the visual pre-norm.
            scale_y: modulation applied in the text pre-norm.
            crop_y: only the first ``crop_y`` text tokens take part in the
                attention call.
            **rope_rotation: ``rope_cos`` / ``rope_sin`` are read from here and
                applied to the visual queries and keys only.

        Returns:
            ``(x, y)``: the projected visual output and the projected text
            output. The text output has the same token count as the input
            ``y``; positions beyond ``crop_y`` are zero before ``proj_y``.
        """
        cos = rope_rotation.get("rope_cos")
        sin = rope_rotation.get("rope_sin")
        heads = self.num_heads

        def split_heads(qkv):
            # (B, tokens, 3 * dim_x) -> three tensors of (B, tokens, heads, head_dim)
            return qkv.view(qkv.shape[0], qkv.shape[1], 3, heads, -1).unbind(2)

        def join(visual, text):
            # Visual tokens first, then the cropped text tokens; heads moved to axis 1.
            return torch.cat([visual, text[:, :crop_y]], dim=1).transpose(1, 2)

        # Modulated pre-norm on both streams.
        x = modulated_rmsnorm(x, scale_x)
        y = modulated_rmsnorm(y, scale_y)

        # Text stream: project, split, normalize q/k.
        q_y, k_y, v_y = split_heads(self.qkv_y(y))
        q_y = self.q_norm_y(q_y)
        k_y = self.k_norm_y(k_y)

        # Visual stream: project, split, normalize q/k, then rotary embedding.
        q_x, k_x, v_x = split_heads(self.qkv_x(x))
        q_x = apply_rotary_emb_qk_real(self.q_norm_x(q_x), cos, sin)
        k_x = apply_rotary_emb_qk_real(self.k_norm_x(k_x), cos, sin)

        attended = optimized_attention(
            join(q_x, q_y),
            join(k_x, k_y),
            join(v_x, v_y),
            heads,
            skip_reshape=True,
        )

        # Undo the concatenation: first q_x.shape[1] tokens are visual.
        x_out, y_out = torch.tensor_split(attended, (q_x.shape[1],), dim=1)
        x_out = self.proj_x(x_out)

        # Zero-pad the text output back to the uncropped text length.
        y_padded = torch.zeros(
            y_out.shape[0], q_y.shape[1], y_out.shape[-1], device=y_out.device, dtype=y_out.dtype
        )
        y_padded[:, :y_out.shape[1]] = y_out

        return x_out, self.proj_y(y_padded)


class AsymmetricJointBlock(nn.Module):
    """One transformer block operating jointly on visual and text tokens.

    The conditioning vector is turned into per-stream scale/gate values, which
    modulate a joint attention step and then a per-stream feed-forward step.
    When ``update_y`` is False the text stream is only used as attention
    input and is returned unchanged.
    """

    def __init__(
        self,
        hidden_size_x: int,
        hidden_size_y: int,
        num_heads: int,
        *,
        mlp_ratio_x: float = 8.0,  # Ratio of hidden size to d_model for MLP for visual tokens.
        mlp_ratio_y: float = 4.0,  # Ratio of hidden size to d_model for MLP for text tokens.
        update_y: bool = True,  # Whether to update text tokens in this block.
        device: Optional[torch.device] = None,
        dtype=None,
        operations=None,
        **block_kwargs,
    ):
        super().__init__()
        self.update_y = update_y
        self.hidden_size_x = hidden_size_x
        self.hidden_size_y = hidden_size_y

        # Visual stream always needs four modulation vectors
        # (attention scale/gate, MLP scale/gate).
        self.mod_x = operations.Linear(hidden_size_x, 4 * hidden_size_x, device=device, dtype=dtype)
        # Text stream needs all four only when it is updated; otherwise just
        # the attention pre-norm scale.
        mod_y_width = 4 * hidden_size_y if self.update_y else hidden_size_y
        self.mod_y = operations.Linear(hidden_size_x, mod_y_width, device=device, dtype=dtype)

        # Joint attention over both streams.
        self.attn = AsymmetricAttention(
            hidden_size_x,
            hidden_size_y,
            num_heads=num_heads,
            update_y=update_y,
            device=device,
            dtype=dtype,
            operations=operations,
            **block_kwargs,
        )

        # Options shared by both feed-forward networks.
        ff_options = dict(
            multiple_of=256,
            ffn_dim_multiplier=None,
            device=device,
            dtype=dtype,
            operations=operations,
        )

        # Visual MLP.
        mlp_hidden_dim_x = int(hidden_size_x * mlp_ratio_x)
        # NOTE: hard-coded size check; any hidden_size_x / mlp_ratio_x pair
        # whose product is not 1536 * 8 fails here.
        assert mlp_hidden_dim_x == int(1536 * 8)
        self.mlp_x = FeedForward(
            in_features=hidden_size_x,
            hidden_size=mlp_hidden_dim_x,
            **ff_options,
        )

        # Text MLP exists only in blocks that update the text stream.
        if self.update_y:
            mlp_hidden_dim_y = int(hidden_size_y * mlp_ratio_y)
            self.mlp_y = FeedForward(
                in_features=hidden_size_y,
                hidden_size=mlp_hidden_dim_y,
                **ff_options,
            )

    def forward(
        self,
        x: torch.Tensor,
        c: torch.Tensor,
        y: torch.Tensor,
        **attn_kwargs,
    ):
        """Apply the block to both token streams.

        Args:
            x: (B, N, dim) tensor of visual tokens
            c: (B, dim) tensor of conditioned features
            y: (B, L, dim) tensor of text tokens
            **attn_kwargs: forwarded unchanged to the attention module.

        Returns:
            x: visual tokens after the block
            y: text tokens after the block (the input ``y`` itself when
               ``update_y`` is False)
        """
        num_visual_tokens = x.size(1)

        # Derive all modulation vectors from the activated conditioning.
        c = F.silu(c)
        scale_msa_x, gate_msa_x, scale_mlp_x, gate_mlp_x = self.mod_x(c).chunk(4, dim=1)

        mod_y = self.mod_y(c)
        if self.update_y:
            scale_msa_y, gate_msa_y, scale_mlp_y, gate_mlp_y = mod_y.chunk(4, dim=1)
        else:
            scale_msa_y = mod_y

        # Joint attention.
        x_attn, y_attn = self.attn(
            x,
            y,
            scale_x=scale_msa_x,
            scale_y=scale_msa_y,
            **attn_kwargs,
        )
        assert x_attn.size(1) == num_visual_tokens

        # Gated residual for attention, then the feed-forward step, per stream.
        x = residual_tanh_gated_rmsnorm(x, x_attn, gate_msa_x)
        if self.update_y:
            y = residual_tanh_gated_rmsnorm(y, y_attn, gate_msa_y)

        x = self.ff_block_x(x, scale_mlp_x, gate_mlp_x)
        if self.update_y:
            y = self.ff_block_y(y, scale_mlp_y, gate_mlp_y)

        return x, y

    def ff_block_x(self, x, scale_x, gate_x):
        """Feed-forward step for visual tokens: modulated norm, MLP, gated residual."""
        branch = self.mlp_x(modulated_rmsnorm(x, scale_x))
        # The branch output is normalized again inside the residual helper.
        return residual_tanh_gated_rmsnorm(x, branch, gate_x)

    def ff_block_y(self, y, scale_y, gate_y):
        """Feed-forward step for text tokens: modulated norm, MLP, gated residual."""
        branch = self.mlp_y(modulated_rmsnorm(y, scale_y))
        # The branch output is normalized again inside the residual helper.
        return residual_tanh_gated_rmsnorm(y, branch, gate_y)


class FinalLayer(nn.Module):
    """
    Output head of the DiT: a non-affine LayerNorm, shift/scale modulation
    derived from the conditioning vector, and a linear projection to
    ``patch_size * patch_size * out_channels`` features per token.
    """

    def __init__(
        self,
        hidden_size,
        patch_size,
        out_channels,
        device: Optional[torch.device] = None,
        dtype=None,
        operations=None,
    ):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        patch_features = patch_size * patch_size * out_channels

        # No learned affine here; the modulation below supplies shift and scale.
        self.norm_final = operations.LayerNorm(
            hidden_size, elementwise_affine=False, eps=1e-6, **factory
        )
        # Produces shift and scale stacked along the feature axis.
        self.mod = operations.Linear(hidden_size, 2 * hidden_size, **factory)
        self.linear = operations.Linear(hidden_size, patch_features, **factory)

    def forward(self, x, c):
        """Normalize ``x``, modulate it with shift/scale from ``c``, and project."""
        shift, scale = self.mod(F.silu(c)).chunk(2, dim=1)
        return self.linear(modulate(self.norm_final(x), shift, scale))


class AsymmDiTJoint(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Ingests text embeddings instead of a label. Visual latents are patch
    embedded, then processed together with projected text features by a stack
    of ``AsymmetricJointBlock`` layers; only the last block leaves the text
    features untouched. A ``FinalLayer`` maps visual tokens back to patches.

    NOTE: ``prepare`` uses ``t5_y_embedder`` and ``t5_yproj`` unconditionally,
    but they are only created when ``use_t5`` is True.
    """

    def __init__(
        self,
        *,
        patch_size=2,
        in_channels=4,
        hidden_size_x=1152,
        hidden_size_y=1152,
        depth=48,
        num_heads=16,
        mlp_ratio_x=8.0,
        mlp_ratio_y=4.0,
        use_t5: bool = False,
        t5_feat_dim: int = 4096,
        t5_token_length: int = 256,
        learn_sigma=True,
        patch_embed_bias: bool = True,
        timestep_mlp_bias: bool = True,
        attend_to_padding: bool = False,
        timestep_scale: Optional[float] = None,
        use_extended_posenc: bool = False,
        posenc_preserve_area: bool = False,
        rope_theta: float = 10000.0,
        image_model=None,
        device: Optional[torch.device] = None,
        dtype=None,
        operations=None,
        **block_kwargs,
    ):
        """Build the embedders, the block stack and the output head.

        ``image_model`` is accepted but not used. ``block_kwargs`` are
        forwarded to every ``AsymmetricJointBlock``. ``use_extended_posenc``,
        ``posenc_preserve_area``, ``t5_token_length`` and ``rope_theta`` are
        only stored as attributes here.
        """
        super().__init__()

        self.dtype = dtype
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        # Twice the channels when a sigma prediction is produced alongside the output.
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.hidden_size_x = hidden_size_x
        self.hidden_size_y = hidden_size_y
        self.head_dim = (
            hidden_size_x // num_heads
        )  # Head dimension and count is determined by visual.
        self.attend_to_padding = attend_to_padding
        self.use_extended_posenc = use_extended_posenc
        self.posenc_preserve_area = posenc_preserve_area
        self.use_t5 = use_t5
        self.t5_token_length = t5_token_length
        self.t5_feat_dim = t5_feat_dim
        self.rope_theta = (
            rope_theta  # Scaling factor for frequency computation for temporal RoPE.
        )

        self.x_embedder = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_channels,
            embed_dim=hidden_size_x,
            bias=patch_embed_bias,
            dtype=dtype,
            device=device,
            operations=operations
        )
        # Conditionings
        # Timestep
        self.t_embedder = TimestepEmbedder(
            hidden_size_x, bias=timestep_mlp_bias, timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
        )

        if self.use_t5:
            # Caption Pooling (T5)
            self.t5_y_embedder = AttentionPool(
                t5_feat_dim, num_heads=8, output_dim=hidden_size_x, dtype=dtype, device=device, operations=operations
            )

            # Dense Embedding Projection (T5)
            self.t5_yproj = operations.Linear(
                t5_feat_dim, hidden_size_y, bias=True, dtype=dtype, device=device
            )

        # Uninitialized storage for the rotary frequencies; values are
        # expected to be filled in from outside (e.g. loaded weights).
        self.pos_frequencies = nn.Parameter(
            torch.empty(3, self.num_heads, self.head_dim // 2, dtype=dtype, device=device)
        )

        assert not self.attend_to_padding

        # for depth 48:
        #  b =  0: AsymmetricJointBlock, update_y=True
        #  b =  1: AsymmetricJointBlock, update_y=True
        #  ...
        #  b = 46: AsymmetricJointBlock, update_y=True
        #  b = 47: AsymmetricJointBlock, update_y=False. No need to update text features.
        blocks = []
        for b in range(depth):
            # Joint multi-modal block
            update_y = b < depth - 1
            block = AsymmetricJointBlock(
                hidden_size_x,
                hidden_size_y,
                num_heads,
                mlp_ratio_x=mlp_ratio_x,
                mlp_ratio_y=mlp_ratio_y,
                update_y=update_y,
                attend_to_padding=attend_to_padding,
                device=device,
                dtype=dtype,
                operations=operations,
                **block_kwargs,
            )

            blocks.append(block)
        self.blocks = nn.ModuleList(blocks)

        self.final_layer = FinalLayer(
            hidden_size_x, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations
        )

    def embed_x(self, x: torch.Tensor) -> torch.Tensor:
        """Patch-embed the visual input.

        Args:
            x: (B, C, T, H, W) tensor of visual inputs

        Returns:
            x: (B, N, D) tensor of visual tokens; ``prepare`` asserts that
               N == T * (H // patch_size) * (W // patch_size).
        """
        return self.x_embedder(x)

    def prepare(
        self,
        x: torch.Tensor,
        sigma: torch.Tensor,
        t5_feat: torch.Tensor,
        t5_mask: torch.Tensor,
    ):
        """Prepare input and conditioning embeddings.

        Args:
            x: visual input whose last three dimensions are (T, H, W).
            sigma: noise level; the timestep embedder is fed ``1 - sigma``.
            t5_feat: text features, used both for pooling and for the dense
                per-token projection.
            t5_mask: mask passed to the pooling module together with ``t5_feat``.

        Returns:
            Tuple ``(x, c, y_feat, rope_cos, rope_sin)``: visual tokens, the
            conditioning vector (timestep embedding + pooled text), projected
            text tokens, and the rotary cos/sin tables.
        """
        # Visual patch embeddings.
        T, H, W = x.shape[-3:]
        pH, pW = H // self.patch_size, W // self.patch_size
        x = self.embed_x(x)  # (B, N, D), where N = T * H * W / patch_size ** 2
        assert x.ndim == 3

        N = T * pH * pW
        assert x.size(1) == N

        # Rotary tables from per-token (t, h, w) positions and the frequency parameter.
        pos = create_position_matrix(
            T, pH=pH, pW=pW, device=x.device, dtype=torch.float32
        )  # (N, 3)
        rope_cos, rope_sin = compute_mixed_rotation(
            freqs=comfy.ops.cast_to(self.pos_frequencies, dtype=x.dtype, device=x.device), pos=pos
        )  # Each are (N, num_heads, dim // 2)

        # Global conditioning: timestep embedding plus pooled caption features.
        c_t = self.t_embedder(1 - sigma, out_dtype=x.dtype)  # (B, D)
        t5_y_pool = self.t5_y_embedder(t5_feat, t5_mask)  # (B, D)
        c = c_t + t5_y_pool

        # Dense per-token text features for the joint blocks.
        y_feat = self.t5_yproj(t5_feat)  # (B, L, t5_feat_dim) --> (B, L, D)

        return x, c, y_feat, rope_cos, rope_sin

    def forward(
        self,
        x: torch.Tensor,
        timestep: torch.Tensor,
        context: List[torch.Tensor],
        attention_mask: List[torch.Tensor],
        num_tokens=256,
        packed_indices: Dict[str, torch.Tensor] = None,
        rope_cos: torch.Tensor = None,
        rope_sin: torch.Tensor = None,
        control=None, **kwargs
    ):
        """Forward pass of DiT.

        Args:
            x: (B, C, T, H, W) tensor of spatial inputs (latent video).
            timestep: noise level, passed to ``prepare`` as ``sigma``.
            context: caption token features, passed to ``prepare`` as ``t5_feat``.
            attention_mask: caption mask, passed to ``prepare`` as ``t5_mask``.
            num_tokens: forwarded to every block as ``crop_y``.
            packed_indices: accepted for interface compatibility; not used.
            rope_cos: ignored; recomputed by ``prepare``.
            rope_sin: ignored; recomputed by ``prepare``.
            control: accepted for interface compatibility; not used.
            **kwargs: ignored.

        Returns:
            The negated final-layer output, rearranged from tokens to a
            (B, out_channels, T, H', W') tensor where
            H' = (H // patch_size) * patch_size and likewise for W'.
        """
        # Unpacking also enforces a 5-dimensional input.
        _, _, T, H, W = x.shape

        x, c, y_feat, rope_cos, rope_sin = self.prepare(
            x, timestep, context, attention_mask
        )

        for block in self.blocks:
            x, y_feat = block(
                x,
                c,
                y_feat,
                rope_cos=rope_cos,
                rope_sin=rope_sin,
                crop_y=num_tokens,
            )  # (B, M, D), (B, L, D)
        del y_feat  # Final layers don't use dense text features.

        x = self.final_layer(x, c)  # (B, M, patch_size ** 2 * out_channels)
        x = rearrange(
            x,
            "B (T hp wp) (p1 p2 c) -> B c T (hp p1) (wp p2)",
            T=T,
            hp=H // self.patch_size,
            wp=W // self.patch_size,
            p1=self.patch_size,
            p2=self.patch_size,
            c=self.out_channels,
        )

        return -x
Basic Genmo Mochi video model support. To use: a "Load CLIP" node with t5xxl and type mochi; a "Load Diffusion Model" node with the mochi dit file; a "Load VAE" node with the mochi vae file; an EmptyMochiLatentVideo node for the latent; and euler + linear_quadratic in the KSampler node. 2024-10-26 10:54:00 +00:00			`#original code from https://github.com/genmoai/models under apache 2.0 license`
			`#adapted to ComfyUI`

			`from typing import Dict, List, Optional, Tuple`

			`import torch`
			`import torch.nn as nn`
			`import torch.nn.functional as F`
			`from einops import rearrange`
			`# from flash_attn import flash_attn_varlen_qkvpacked_func`
			`from comfy.ldm.modules.attention import optimized_attention`

			`from .layers import (`
			`FeedForward,`
			`PatchEmbed,`
			`RMSNorm,`
			`TimestepEmbedder,`
			`)`

			`from .rope_mixed import (`
			`compute_mixed_rotation,`
			`create_position_matrix,`
			`)`
			`from .temporal_rope import apply_rotary_emb_qk_real`
			`from .utils import (`
			`AttentionPool,`
			`modulate,`
			`)`

			`import comfy.ldm.common_dit`
			`import comfy.ops`


			`def modulated_rmsnorm(x, scale, eps=1e-6):`
			`# Normalize and modulate`
			`x_normed = comfy.ldm.common_dit.rms_norm(x, eps=eps)`
			`x_modulated = x_normed * (1 + scale.unsqueeze(1))`

			`return x_modulated`


			`def residual_tanh_gated_rmsnorm(x, x_res, gate, eps=1e-6):`
			`# Apply tanh to gate`
			`tanh_gate = torch.tanh(gate).unsqueeze(1)`

			`# Normalize and apply gated scaling`
			`x_normed = comfy.ldm.common_dit.rms_norm(x_res, eps=eps) * tanh_gate`

			`# Apply residual connection`
			`output = x + x_normed`

			`return output`

			`class AsymmetricAttention(nn.Module):`
			`def __init__(`
			`self,`
			`dim_x: int,`
			`dim_y: int,`
			`num_heads: int = 8,`
			`qkv_bias: bool = True,`
			`qk_norm: bool = False,`
			`attn_drop: float = 0.0,`
			`update_y: bool = True,`
			`out_bias: bool = True,`
			`attend_to_padding: bool = False,`
			`softmax_scale: Optional[float] = None,`
			`device: Optional[torch.device] = None,`
			`dtype=None,`
			`operations=None,`
			`):`
			`super().__init__()`
			`self.dim_x = dim_x`
			`self.dim_y = dim_y`
			`self.num_heads = num_heads`
			`self.head_dim = dim_x // num_heads`
			`self.attn_drop = attn_drop`
			`self.update_y = update_y`
			`self.attend_to_padding = attend_to_padding`
			`self.softmax_scale = softmax_scale`
			`if dim_x % num_heads != 0:`
			`raise ValueError(`
			`f"dim_x={dim_x} should be divisible by num_heads={num_heads}"`
			`)`

			`# Input layers.`
			`self.qkv_bias = qkv_bias`
			`self.qkv_x = operations.Linear(dim_x, 3 * dim_x, bias=qkv_bias, device=device, dtype=dtype)`
			`# Project text features to match visual features (dim_y -> dim_x)`
			`self.qkv_y = operations.Linear(dim_y, 3 * dim_x, bias=qkv_bias, device=device, dtype=dtype)`

			`# Query and key normalization for stability.`
			`assert qk_norm`
			`self.q_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)`
			`self.k_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)`
			`self.q_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)`
			`self.k_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)`

			`# Output layers. y features go back down from dim_x -> dim_y.`
			`self.proj_x = operations.Linear(dim_x, dim_x, bias=out_bias, device=device, dtype=dtype)`
			`self.proj_y = (`
			`operations.Linear(dim_x, dim_y, bias=out_bias, device=device, dtype=dtype)`
			`if update_y`
			`else nn.Identity()`
			`)`

			`def forward(`
			`self,`
			`x: torch.Tensor, # (B, N, dim_x)`
			`y: torch.Tensor, # (B, L, dim_y)`
			`scale_x: torch.Tensor, # (B, dim_x), modulation for pre-RMSNorm.`
			`scale_y: torch.Tensor, # (B, dim_y), modulation for pre-RMSNorm.`
			`crop_y,`
			`**rope_rotation,`
			`) -> Tuple[torch.Tensor, torch.Tensor]:`
			`rope_cos = rope_rotation.get("rope_cos")`
			`rope_sin = rope_rotation.get("rope_sin")`
			`# Pre-norm for visual features`
			`x = modulated_rmsnorm(x, scale_x) # (B, M, dim_x) where M = N / cp_group_size`

			`# Process visual features`
			`# qkv_x = self.qkv_x(x) # (B, M, 3 * dim_x)`
			`# assert qkv_x.dtype == torch.bfloat16`
			`# qkv_x = all_to_all_collect_tokens(`
			`# qkv_x, self.num_heads`
			`# ) # (3, B, N, local_h, head_dim)`

			`# Process text features`
			`y = modulated_rmsnorm(y, scale_y) # (B, L, dim_y)`
			`q_y, k_y, v_y = self.qkv_y(y).view(y.shape[0], y.shape[1], 3, self.num_heads, -1).unbind(2) # (B, N, local_h, head_dim)`

			`q_y = self.q_norm_y(q_y)`
			`k_y = self.k_norm_y(k_y)`

			`# Split qkv_x into q, k, v`
			`q_x, k_x, v_x = self.qkv_x(x).view(x.shape[0], x.shape[1], 3, self.num_heads, -1).unbind(2) # (B, N, local_h, head_dim)`
			`q_x = self.q_norm_x(q_x)`
			`q_x = apply_rotary_emb_qk_real(q_x, rope_cos, rope_sin)`
			`k_x = self.k_norm_x(k_x)`
			`k_x = apply_rotary_emb_qk_real(k_x, rope_cos, rope_sin)`

			`q = torch.cat([q_x, q_y[:, :crop_y]], dim=1).transpose(1, 2)`
			`k = torch.cat([k_x, k_y[:, :crop_y]], dim=1).transpose(1, 2)`
			`v = torch.cat([v_x, v_y[:, :crop_y]], dim=1).transpose(1, 2)`

			`xy = optimized_attention(q,`
			`k,`
			`v, self.num_heads, skip_reshape=True)`

			`x, y = torch.tensor_split(xy, (q_x.shape[1],), dim=1)`
			`x = self.proj_x(x)`
			`o = torch.zeros(y.shape[0], q_y.shape[1], y.shape[-1], device=y.device, dtype=y.dtype)`
			`o[:, :y.shape[1]] = y`

			`y = self.proj_y(o)`
			`# print("ox", x)`
			`# print("oy", y)`
			`return x, y`


			`class AsymmetricJointBlock(nn.Module):`
			`def __init__(`
			`self,`
			`hidden_size_x: int,`
			`hidden_size_y: int,`
			`num_heads: int,`
			`*,`
			`mlp_ratio_x: float = 8.0, # Ratio of hidden size to d_model for MLP for visual tokens.`
			`mlp_ratio_y: float = 4.0, # Ratio of hidden size to d_model for MLP for text tokens.`
			`update_y: bool = True, # Whether to update text tokens in this block.`
			`device: Optional[torch.device] = None,`
			`dtype=None,`
			`operations=None,`
			`**block_kwargs,`
			`):`
			`super().__init__()`
			`self.update_y = update_y`
			`self.hidden_size_x = hidden_size_x`
			`self.hidden_size_y = hidden_size_y`
			`self.mod_x = operations.Linear(hidden_size_x, 4 * hidden_size_x, device=device, dtype=dtype)`
			`if self.update_y:`
			`self.mod_y = operations.Linear(hidden_size_x, 4 * hidden_size_y, device=device, dtype=dtype)`
			`else:`
			`self.mod_y = operations.Linear(hidden_size_x, hidden_size_y, device=device, dtype=dtype)`

			`# Self-attention:`
			`self.attn = AsymmetricAttention(`
			`hidden_size_x,`
			`hidden_size_y,`
			`num_heads=num_heads,`
			`update_y=update_y,`
			`device=device,`
			`dtype=dtype,`
			`operations=operations,`
			`**block_kwargs,`
			`)`

			`# MLP.`
			`mlp_hidden_dim_x = int(hidden_size_x * mlp_ratio_x)`
			`assert mlp_hidden_dim_x == int(1536 * 8)`
			`self.mlp_x = FeedForward(`
			`in_features=hidden_size_x,`
			`hidden_size=mlp_hidden_dim_x,`
			`multiple_of=256,`
			`ffn_dim_multiplier=None,`
			`device=device,`
			`dtype=dtype,`
			`operations=operations,`
			`)`

			`# MLP for text not needed in last block.`
			`if self.update_y:`
			`mlp_hidden_dim_y = int(hidden_size_y * mlp_ratio_y)`
			`self.mlp_y = FeedForward(`
			`in_features=hidden_size_y,`
			`hidden_size=mlp_hidden_dim_y,`
			`multiple_of=256,`
			`ffn_dim_multiplier=None,`
			`device=device,`
			`dtype=dtype,`
			`operations=operations,`
			`)`

			`def forward(`
			`self,`
			`x: torch.Tensor,`
			`c: torch.Tensor,`
			`y: torch.Tensor,`
			`**attn_kwargs,`
			`):`
			`"""Forward pass of a block.`

			`Args:`
			`x: (B, N, dim) tensor of visual tokens`
			`c: (B, dim) tensor of conditioned features`
			`y: (B, L, dim) tensor of text tokens`
			`num_frames: Number of frames in the video. N = num_frames * num_spatial_tokens`

			`Returns:`
			`x: (B, N, dim) tensor of visual tokens after block`
			`y: (B, L, dim) tensor of text tokens after block`
			`"""`
			`N = x.size(1)`

			`c = F.silu(c)`
			`mod_x = self.mod_x(c)`
			`scale_msa_x, gate_msa_x, scale_mlp_x, gate_mlp_x = mod_x.chunk(4, dim=1)`

			`mod_y = self.mod_y(c)`
			`if self.update_y:`
			`scale_msa_y, gate_msa_y, scale_mlp_y, gate_mlp_y = mod_y.chunk(4, dim=1)`
			`else:`
			`scale_msa_y = mod_y`

			`# Self-attention block.`
			`x_attn, y_attn = self.attn(`
			`x,`
			`y,`
			`scale_x=scale_msa_x,`
			`scale_y=scale_msa_y,`
			`**attn_kwargs,`
			`)`

			`assert x_attn.size(1) == N`
			`x = residual_tanh_gated_rmsnorm(x, x_attn, gate_msa_x)`
			`if self.update_y:`
			`y = residual_tanh_gated_rmsnorm(y, y_attn, gate_msa_y)`

			`# MLP block.`
			`x = self.ff_block_x(x, scale_mlp_x, gate_mlp_x)`
			`if self.update_y:`
			`y = self.ff_block_y(y, scale_mlp_y, gate_mlp_y)`

			`return x, y`

			`def ff_block_x(self, x, scale_x, gate_x):`
			`x_mod = modulated_rmsnorm(x, scale_x)`
			`x_res = self.mlp_x(x_mod)`
			`x = residual_tanh_gated_rmsnorm(x, x_res, gate_x) # Sandwich norm`
			`return x`

			`def ff_block_y(self, y, scale_y, gate_y):`
			`y_mod = modulated_rmsnorm(y, scale_y)`
			`y_res = self.mlp_y(y_mod)`
			`y = residual_tanh_gated_rmsnorm(y, y_res, gate_y) # Sandwich norm`
			`return y`


			`class FinalLayer(nn.Module):`
			`"""`
			`The final layer of DiT.`
			`"""`

			`def __init__(`
			`self,`
			`hidden_size,`
			`patch_size,`
			`out_channels,`
			`device: Optional[torch.device] = None,`
			`dtype=None,`
			`operations=None,`
			`):`
			`super().__init__()`
			`self.norm_final = operations.LayerNorm(`
			`hidden_size, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype`
			`)`
			`self.mod = operations.Linear(hidden_size, 2 * hidden_size, device=device, dtype=dtype)`
			`self.linear = operations.Linear(`
			`hidden_size, patch_size * patch_size * out_channels, device=device, dtype=dtype`
			`)`

			`def forward(self, x, c):`
			`c = F.silu(c)`
			`shift, scale = self.mod(c).chunk(2, dim=1)`
			`x = modulate(self.norm_final(x), shift, scale)`
			`x = self.linear(x)`
			`return x`


			`class AsymmDiTJoint(nn.Module):`
			`"""`
			`Diffusion model with a Transformer backbone.`

			`Ingests text embeddings instead of a label.`
			`"""`

			`def __init__(`
			`self,`
			`*,`
			`patch_size=2,`
			`in_channels=4,`
			`hidden_size_x=1152,`
			`hidden_size_y=1152,`
			`depth=48,`
			`num_heads=16,`
			`mlp_ratio_x=8.0,`
			`mlp_ratio_y=4.0,`
			`use_t5: bool = False,`
			`t5_feat_dim: int = 4096,`
			`t5_token_length: int = 256,`
			`learn_sigma=True,`
			`patch_embed_bias: bool = True,`
			`timestep_mlp_bias: bool = True,`
			`attend_to_padding: bool = False,`
			`timestep_scale: Optional[float] = None,`
			`use_extended_posenc: bool = False,`
			`posenc_preserve_area: bool = False,`
			`rope_theta: float = 10000.0,`
			`image_model=None,`
			`device: Optional[torch.device] = None,`
			`dtype=None,`
			`operations=None,`
			`**block_kwargs,`
			`):`
			`super().__init__()`

			`self.dtype = dtype`
			`self.learn_sigma = learn_sigma`
			`self.in_channels = in_channels`
			`self.out_channels = in_channels * 2 if learn_sigma else in_channels`
			`self.patch_size = patch_size`
			`self.num_heads = num_heads`
			`self.hidden_size_x = hidden_size_x`
			`self.hidden_size_y = hidden_size_y`
			`self.head_dim = (`
			`hidden_size_x // num_heads`
			`) # Head dimension and count is determined by visual.`
			`self.attend_to_padding = attend_to_padding`
			`self.use_extended_posenc = use_extended_posenc`
			`self.posenc_preserve_area = posenc_preserve_area`
			`self.use_t5 = use_t5`
			`self.t5_token_length = t5_token_length`
			`self.t5_feat_dim = t5_feat_dim`
			`self.rope_theta = (`
			`rope_theta # Scaling factor for frequency computation for temporal RoPE.`
			`)`

			`self.x_embedder = PatchEmbed(`
			`patch_size=patch_size,`
			`in_chans=in_channels,`
			`embed_dim=hidden_size_x,`
			`bias=patch_embed_bias,`
			`dtype=dtype,`
			`device=device,`
			`operations=operations`
			`)`
			`# Conditionings`
			`# Timestep`
			`self.t_embedder = TimestepEmbedder(`
			`hidden_size_x, bias=timestep_mlp_bias, timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations`
			`)`

			`if self.use_t5:`
			`# Caption Pooling (T5)`
			`self.t5_y_embedder = AttentionPool(`
			`t5_feat_dim, num_heads=8, output_dim=hidden_size_x, dtype=dtype, device=device, operations=operations`
			`)`

			`# Dense Embedding Projection (T5)`
			`self.t5_yproj = operations.Linear(`
			`t5_feat_dim, hidden_size_y, bias=True, dtype=dtype, device=device`
			`)`

			`# Initialize pos_frequencies as an empty parameter.`
			`self.pos_frequencies = nn.Parameter(`
			`torch.empty(3, self.num_heads, self.head_dim // 2, dtype=dtype, device=device)`
			`)`

			`assert not self.attend_to_padding`

			`# for depth 48:`
			`# b = 0: AsymmetricJointBlock, update_y=True`
			`# b = 1: AsymmetricJointBlock, update_y=True`
			`# ...`
			`# b = 46: AsymmetricJointBlock, update_y=True`
			`# b = 47: AsymmetricJointBlock, update_y=False. No need to update text features.`
			`blocks = []`
			`for b in range(depth):`
			`# Joint multi-modal block`
			`update_y = b < depth - 1`
			`block = AsymmetricJointBlock(`
			`hidden_size_x,`
			`hidden_size_y,`
			`num_heads,`
			`mlp_ratio_x=mlp_ratio_x,`
			`mlp_ratio_y=mlp_ratio_y,`
			`update_y=update_y,`
			`attend_to_padding=attend_to_padding,`
			`device=device,`
			`dtype=dtype,`
			`operations=operations,`
			`**block_kwargs,`
			`)`

			`blocks.append(block)`
			`self.blocks = nn.ModuleList(blocks)`

			`self.final_layer = FinalLayer(`
			`hidden_size_x, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations`
			`)`

			`def embed_x(self, x: torch.Tensor) -> torch.Tensor:`
			`"""`
			`Args:`
			`x: (B, C=12, T, H, W) tensor of visual tokens`

			`Returns:`
			`x: (B, C=3072, N) tensor of visual tokens with positional embedding.`
			`"""`
			`return self.x_embedder(x) # Convert BcTHW to BCN`

			`def prepare(`
			`self,`
			`x: torch.Tensor,`
			`sigma: torch.Tensor,`
			`t5_feat: torch.Tensor,`
			`t5_mask: torch.Tensor,`
			`):`
			`"""Prepare input and conditioning embeddings."""`
			`# Visual patch embeddings with positional encoding.`
			`T, H, W = x.shape[-3:]`
			`pH, pW = H // self.patch_size, W // self.patch_size`
			`x = self.embed_x(x) # (B, N, D), where N = T * H * W / patch_size ** 2`
			`assert x.ndim == 3`
			`B = x.size(0)`


			`pH, pW = H // self.patch_size, W // self.patch_size`
			`N = T * pH * pW`
			`assert x.size(1) == N`
			`pos = create_position_matrix(`
			`T, pH=pH, pW=pW, device=x.device, dtype=torch.float32`
			`) # (N, 3)`
			`rope_cos, rope_sin = compute_mixed_rotation(`
			`freqs=comfy.ops.cast_to(self.pos_frequencies, dtype=x.dtype, device=x.device), pos=pos`
			`) # Each are (N, num_heads, dim // 2)`

			`c_t = self.t_embedder(1 - sigma, out_dtype=x.dtype) # (B, D)`

			`t5_y_pool = self.t5_y_embedder(t5_feat, t5_mask) # (B, D)`

			`c = c_t + t5_y_pool`

			`y_feat = self.t5_yproj(t5_feat) # (B, L, t5_feat_dim) --> (B, L, D)`

			`return x, c, y_feat, rope_cos, rope_sin`

			`def forward(`
			`self,`
			`x: torch.Tensor,`
			`timestep: torch.Tensor,`
			`context: List[torch.Tensor],`
			`attention_mask: List[torch.Tensor],`
			`num_tokens=256,`
			`packed_indices: Dict[str, torch.Tensor] = None,`
			`rope_cos: torch.Tensor = None,`
			`rope_sin: torch.Tensor = None,`
			`control=None, **kwargs`
			`):`
			`y_feat = context`
			`y_mask = attention_mask`
			`sigma = timestep`
			`"""Forward pass of DiT.`

			`Args:`
			`x: (B, C, T, H, W) tensor of spatial inputs (images or latent representations of images)`
			`sigma: (B,) tensor of noise standard deviations`
			`y_feat: List((B, L, y_feat_dim) tensor of caption token features. For SDXL text encoders: L=77, y_feat_dim=2048)`
			`y_mask: List((B, L) boolean tensor indicating which tokens are not padding)`
			`packed_indices: Dict with keys for Flash Attention. Result of compute_packed_indices.`
			`"""`
			`B, _, T, H, W = x.shape`

			`x, c, y_feat, rope_cos, rope_sin = self.prepare(`
			`x, sigma, y_feat, y_mask`
			`)`
			`del y_mask`

			`for i, block in enumerate(self.blocks):`
			`x, y_feat = block(`
			`x,`
			`c,`
			`y_feat,`
			`rope_cos=rope_cos,`
			`rope_sin=rope_sin,`
			`crop_y=num_tokens,`
			`) # (B, M, D), (B, L, D)`
			`del y_feat # Final layers don't use dense text features.`

			`x = self.final_layer(x, c) # (B, M, patch_size ** 2 * out_channels)`
			`x = rearrange(`
			`x,`
			`"B (T hp wp) (p1 p2 c) -> B c T (hp p1) (wp p2)",`
			`T=T,`
			`hp=H // self.patch_size,`
			`wp=W // self.patch_size,`
			`p1=self.patch_size,`
			`p2=self.patch_size,`
			`c=self.out_channels,`
			`)`

			`return -x`