# original code from https://github.com/genmoai/models under apache 2.0 license

# import functools
import math

import torch
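
# Helpers for rotary position embeddings (RoPE): build a 3D (t, h, w) position
# grid for video tokens, then project it through learned per-head frequencies
# into the cosine/sine rotation components returned by compute_mixed_rotation.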


def centers(start: float, stop, num, dtype=None, device=None):
    """linspace through bin centers.

    Args:
        start (float): Start of the range.
        stop (float): End of the range.
        num (int): Number of points.
        dtype (torch.dtype): Data type of the points.
        device (torch.device): Device of the points.

    Returns:
        centers (Tensor): Centers of the bins. Shape: (num,).
    """
    edges = torch.linspace(start, stop, num + 1, dtype=dtype, device=device)
    return (edges[:-1] + edges[1:]) / 2
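
# Example: `centers` averages consecutive torch.linspace edges, so
#   centers(0.0, 1.0, 4) -> tensor([0.1250, 0.3750, 0.6250, 0.8750]),
# i.e. the midpoints of four equal-width bins over [0, 1].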


# @functools.lru_cache(maxsize=1)
def create_position_matrix(
    T: int,
    pH: int,
    pW: int,
    device: torch.device,
    dtype: torch.dtype,
    *,
    target_area: float = 36864,
):
    """Build a (t, h, w) position for each of the T * pH * pW tokens.

    Args:
        T: int - Temporal dimension
        pH: int - Height dimension after patchify
        pW: int - Width dimension after patchify
        device: torch.device - Device of the returned position matrix
        dtype: torch.dtype - Data type of the returned position matrix
        target_area: float - Spatial area the (pH, pW) grid is rescaled to cover
            (defaults to 36864 = 192 * 192)

    Returns:
        pos: [T * pH * pW, 3] - position matrix
    """
    # Create 1D tensors for each dimension
    t = torch.arange(T, dtype=dtype)

    # Positionally interpolate to area 36864.
    # (3072x3072 frame with 16x16 patches = 192x192 latents).
    # This automatically scales rope positions when the resolution changes.
    # We use a large target area so the model is more sensitive
    # to changes in the learned pos_frequencies matrix.
    scale = math.sqrt(target_area / (pW * pH))
    w = centers(-pW * scale / 2, pW * scale / 2, pW)
    h = centers(-pH * scale / 2, pH * scale / 2, pH)
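
    # For example, with the default target_area = 192 * 192, a 192x192 latent
    # grid gets scale = 1 and a 96x96 grid gets scale = 2, so h and w span
    # roughly [-96, 96] either way and the learned frequencies see the same
    # coordinate range regardless of resolution.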

    # Use meshgrid to create 3D grids
    grid_t, grid_h, grid_w = torch.meshgrid(t, h, w, indexing="ij")

    # Stack and reshape the grids.
    pos = torch.stack([grid_t, grid_h, grid_w], dim=-1)  # [T, pH, pW, 3]
    pos = pos.view(-1, 3)  # [T * pH * pW, 3]
    pos = pos.to(dtype=dtype, device=device)

    return pos
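
# Example (sketch): a tiny grid of T=2 frames with pH=3 x pW=4 patches gives
# one (t, h, w) row per token:
#   pos = create_position_matrix(2, 3, 4, device=torch.device("cpu"), dtype=torch.float32)
#   pos.shape  # torch.Size([24, 3])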


def compute_mixed_rotation(
    freqs: torch.Tensor,
    pos: torch.Tensor,
):
    """
    Project each 3-dim position into per-head, per-head-dim 1D frequencies.

    Args:
        freqs: [3, num_heads, num_freqs] - learned rotation frequency (for t, row, col) for each head position
        pos: [N, 3] - position of each token

    Returns:
        freqs_cos: [N, num_heads, num_freqs] - cosine components
        freqs_sin: [N, num_heads, num_freqs] - sine components
    """
    assert freqs.ndim == 3
    freqs_sum = torch.einsum("Nd,dhf->Nhf", pos.to(freqs), freqs)
    freqs_cos = torch.cos(freqs_sum)
    freqs_sin = torch.sin(freqs_sum)
    return freqs_cos, freqs_sin
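
# Example (sketch, with hypothetical sizes num_heads=8 and num_freqs=16; the
# real model learns `freqs`, random values are used here only to show shapes):
#   pos = create_position_matrix(2, 3, 4, device=torch.device("cpu"), dtype=torch.float32)
#   freqs = torch.randn(3, 8, 16)
#   cos, sin = compute_mixed_rotation(freqs, pos)
#   cos.shape, sin.shape  # (torch.Size([24, 8, 16]), torch.Size([24, 8, 16]))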