ComfyUI/comfy/ldm/hydit/posemb_layers.py

import torch
import numpy as np
from typing import Union


def _to_tuple(x):
    """Expand a bare int into the pair ``(x, x)``; return any other value unchanged."""
    return (x, x) if isinstance(x, int) else x


def get_fill_resize_and_crop(src, tgt):
    """
    Fit ``src`` inside ``tgt`` while keeping the aspect ratio of ``src``, centred.

    src: int or (h, w), the resolution to be fitted
    tgt: int or (th, tw), the base resolution to fit into
    return:
    ((top, left), (bottom, right)) corners of the fitted region, in ``tgt`` coordinates
    """
    base_h, base_w = _to_tuple(tgt)
    src_h, src_w = _to_tuple(src)

    base_ratio = base_h / base_w
    src_ratio = src_h / src_w

    if src_ratio > base_ratio:
        # src is relatively taller than the base: height fills it, width is scaled to match
        fit_h = base_h
        fit_w = int(round(base_h / src_h * src_w))
    else:
        # otherwise width fills the base and height is scaled to match
        fit_w = base_w
        fit_h = int(round(base_w / src_w * src_h))

    # split the leftover space evenly on both sides
    top = int(round((base_h - fit_h) / 2.0))
    left = int(round((base_w - fit_w) / 2.0))

    return (top, left), (top + fit_h, left + fit_w)


def get_meshgrid(start, *args):
    """
    Build a 2D coordinate grid from one, two or three size arguments.

    get_meshgrid(num):              coordinates go from 0 to num, num samples per axis
    get_meshgrid(start, stop):      coordinates go from start to stop, stop - start samples (step is 1)
    get_meshgrid(start, stop, num): coordinates go from start to stop, num samples per axis
    Every argument is an int (used for both axes) or an (h, w) pair; stop itself is never sampled.
    return:
    grid: float32 array [2, H, W]; grid[0] holds the w coordinates, grid[1] the h coordinates
    """
    n_extra = len(args)
    if n_extra > 2:
        raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")

    if n_extra == 0:
        # a lone argument is the grid size; coordinates start at the origin
        num = _to_tuple(start)
        start, stop = (0, 0), num
    else:
        start = _to_tuple(start)
        stop = _to_tuple(args[0])
        if n_extra == 1:
            # no explicit sample count: one sample per unit between start and stop
            num = (stop[0] - start[0], stop[1] - start[1])
        else:
            num = _to_tuple(args[1])

    rows = np.linspace(start[0], stop[0], num[0], endpoint=False, dtype=np.float32)
    cols = np.linspace(start[1], stop[1], num[1], endpoint=False, dtype=np.float32)
    # w goes first; with the default 'xy' indexing both outputs are shaped [H, W]
    return np.stack(np.meshgrid(cols, rows), axis=0)

#################################################################################
#                   Sine/Cosine Positional Embedding Functions                  #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py

def get_2d_sincos_pos_embed(embed_dim, start, *args, cls_token=False, extra_tokens=0):
    """
    embed_dim: output dimension for each grid position
    start, *args: grid specification, passed through unchanged to get_meshgrid
    cls_token, extra_tokens: when cls_token is set and extra_tokens > 0,
        extra_tokens all-zero rows are prepended to the table
    return:
    pos_embed: [H*W, embed_dim] or [extra_tokens+H*W, embed_dim] (w/o or w/ the prepended rows)
    """
    coords = get_meshgrid(start, *args)
    coords = coords.reshape([2, 1, *coords.shape[1:]])    # insert a singleton axis after the coordinate axis
    table = get_2d_sincos_pos_embed_from_grid(embed_dim, coords)

    if not (cls_token and extra_tokens > 0):
        return table

    padding = np.zeros([extra_tokens, embed_dim])
    return np.concatenate([padding, table], axis=0)


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    embed_dim: output dimension for each position, must be even
    grid: coordinate arrays; grid[0] and grid[1] are each flattened to M positions
    out: (M, D), the first D/2 columns encode grid[0] and the last D/2 columns encode grid[1]
    """
    assert embed_dim % 2 == 0

    half_dim = embed_dim // 2
    per_axis = [get_1d_sincos_pos_embed_from_grid(half_dim, grid[axis]) for axis in (0, 1)]
    return np.concatenate(per_axis, axis=1)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position, must be even
    pos: array of positions to be encoded, any shape; it is flattened to M values
    out: (M, D), sines in the first D/2 columns and cosines in the last D/2 columns
    """
    assert embed_dim % 2 == 0

    # frequencies 1 / 10000^(i / (D/2)) for i = 0 .. D/2 - 1
    exponents = np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.)
    omega = 1. / 10000**exponents   # (D/2,)

    flat = pos.reshape(-1)  # (M,)
    angles = flat[:, None] * omega[None, :]     # (M, D/2), outer product

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)     # (M, D)


#################################################################################
#                   Rotary Positional Embedding Functions                       #
#################################################################################
# https://github.com/facebookresearch/llama/blob/main/llama/model.py#L443

def get_2d_rotary_pos_embed(embed_dim, start, *args, use_real=True):
    """
    2D counterpart of precompute_freqs_cis: RoPE for image tokens laid out on a 2D grid.

    Parameters
    ----------
    embed_dim: int
        embedding dimension size D, must be divisible by 4
    start: int or tuple of int
        If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, step is 1;
        If len(args) == 2, start is start, args[0] is stop, args[1] is num.
    use_real: bool
        If True, return the cos and sin parts separately. Otherwise, return complex numbers.

    Returns
    -------
    pos_embed: tuple of torch.Tensor, or torch.Tensor
        use_real=True: (cos, sin), each [HW, D]. use_real=False: one complex tensor [HW, D/2].
    """
    coords = get_meshgrid(start, *args)
    # One coordinate per grid cell; the singleton axis is flattened away again downstream.
    return get_2d_rotary_pos_embed_from_grid(
        embed_dim,
        coords.reshape([2, 1, *coords.shape[1:]]),
        use_real=use_real,
    )


def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
    """
    embed_dim: embedding dimension size D, must be divisible by 4
    grid: coordinate arrays; grid[0] and grid[1] are each flattened to M positions
    use_real: if True return (cos, sin), otherwise a complex tensor
    out: use_real=True: (cos, sin), each (M, D); use_real=False: complex (M, D/2).
         In every output the first half of the columns comes from grid[0], the second half from grid[1].
    """
    assert embed_dim % 4 == 0

    # each grid axis gets half of the embedding dimensions
    axis_dim = embed_dim // 2
    first = get_1d_rotary_pos_embed(axis_dim, grid[0].reshape(-1), use_real=use_real)
    second = get_1d_rotary_pos_embed(axis_dim, grid[1].reshape(-1), use_real=use_real)

    if not use_real:
        return torch.cat([first, second], dim=1)    # complex, (M, D/2)

    cos = torch.cat([first[0], second[0]], dim=1)   # (M, D)
    sin = torch.cat([first[1], second[1]], dim=1)   # (M, D)
    return cos, sin


def get_1d_rotary_pos_embed(dim: int, pos: Union[np.ndarray, int], theta: float = 10000.0, use_real=False):
    """
    Precompute the rotary-embedding rotation table for a 1D sequence of positions.

    Each position p is paired with the frequencies 1 / theta^(2i / dim), i = 0 .. dim/2 - 1,
    giving the rotation angles p * freq.

    Args:
        dim (int): Dimension D of the frequency tensor.
        pos (np.ndarray, int): Position indices [S]; an int n stands for the positions 0 .. n-1.
        theta (float, optional): Base of the frequency progression. Defaults to 10000.0.
        use_real (bool, optional): If True, return the cos and sin parts separately.
                                   Otherwise, return complex numbers.

    Returns:
        use_real=True: tuple (cos, sin) of float tensors, each [S, D], with every angle
            repeated twice along the last axis.
        use_real=False: complex64 tensor [S, D/2] of unit-magnitude values with those angles.

    """
    positions = np.arange(pos) if isinstance(pos, int) else pos

    exponent = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta ** exponent)  # [D/2]
    t = torch.from_numpy(positions).to(inv_freq.device)  # type: ignore  # [S]
    angles = torch.outer(t, inv_freq).float()  # type: ignore   # [S, D/2]

    if not use_real:
        return torch.polar(torch.ones_like(angles), angles)  # complex64     # [S, D/2]

    cos, sin = angles.cos(), angles.sin()
    return cos.repeat_interleave(2, dim=1), sin.repeat_interleave(2, dim=1)  # each [S, D]


def calc_sizes(rope_img, patch_size, th, tw):
    """
    Build the grid arguments that get_2d_rotary_pos_embed takes after embed_dim.

    rope_img: 'extend', or 'base<N>' with an integer N (for example 'base512')
    patch_size: patch size; the base size N is floor-divided by 8 and then by patch_size
    th, tw: height and width of the target grid
    return:
    'extend':  [(th, tw)], so positions simply run over the target grid
    'base<N>': [start, stop, (th, tw)], so th x tw positions are sampled between the
               start and stop corners returned by get_fill_resize_and_crop
    raises:
    ValueError if rope_img is neither form, or if the text after 'base' is not an integer
    """
    if rope_img == 'extend':
        # Expansion mode
        return [(th, tw)]

    if not rope_img.startswith('base'):
        raise ValueError(f"Unknown rope_img: {rope_img}")

    # Based on the specified dimensions, other dimensions are obtained through interpolation.
    try:
        base_pixels = int(rope_img[4:])
    except ValueError as err:
        # Name the offending setting instead of surfacing a bare int() parse error.
        raise ValueError(
            f"Invalid rope_img: {rope_img} (expected 'extend' or 'base<N>' with an integer N)"
        ) from err

    base_size = base_pixels // 8 // patch_size
    start, stop = get_fill_resize_and_crop((th, tw), base_size)
    return [start, stop, (th, tw)]


def init_image_posemb(rope_img,
                      resolutions,
                      patch_size,
                      hidden_size,
                      num_heads,
                      log_fn,
                      rope_real=True,
                      ):
    """
    Precompute the image RoPE table for each entry of ``resolutions``.

    rope_img: RoPE sizing mode, passed to calc_sizes
    resolutions: iterable of objects exposing ``height`` and ``width``
    patch_size: patch size; height and width are floor-divided by 8 and then by patch_size
    hidden_size, num_heads: the embedding dimension used is hidden_size // num_heads
    log_fn: callable that receives one message string per resolution
    rope_real: passed to get_2d_rotary_pos_embed as use_real
    return:
    dict mapping str(reso) to the value returned by get_2d_rotary_pos_embed
    """
    tables = {}
    for reso in resolutions:
        # size of the token grid for this resolution
        th = reso.height // 8 // patch_size
        tw = reso.width // 8 // patch_size
        sub_args = calc_sizes(rope_img, patch_size, th, tw)

        table = get_2d_rotary_pos_embed(hidden_size // num_heads, *sub_args, use_real=rope_real)
        tables[str(reso)] = table

        # in real mode the table is a (cos, sin) pair, so the shape of the first part is reported
        mode = 'real' if rope_real else 'complex'
        shape = table[0].shape if rope_real else table.shape
        log_fn(f"    Using image RoPE ({rope_img}) ({mode}): {sub_args} | ({reso}) "
               f"{shape}")
    return tables
Basic hunyuan dit implementation. (#4102) * Let tokenizers return weights to be stored in the saved checkpoint. * Basic hunyuan dit implementation. * Fix some resolutions not working. * Support hydit checkpoint save. * Init with right dtype. * Switch to optimized attention in pooler. * Fix black images on hunyuan dit. 2024-07-25 22:21:08 +00:00			`import torch`
			`import numpy as np`
			`from typing import Union`


			`def _to_tuple(x):`
			`if isinstance(x, int):`
			`return x, x`
			`else:`
			`return x`


			`def get_fill_resize_and_crop(src, tgt):`
			`th, tw = _to_tuple(tgt)`
			`h, w = _to_tuple(src)`

			`tr = th / tw # base resolution`
			`r = h / w # target resolution`

			`# resize`
			`if r > tr:`
			`resize_height = th`
			`resize_width = int(round(th / h * w))`
			`else:`
			`resize_width = tw`
			`resize_height = int(round(tw / w * h)) # resize the target resolution down based on the base resolution`

			`crop_top = int(round((th - resize_height) / 2.0))`
			`crop_left = int(round((tw - resize_width) / 2.0))`

			`return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)`


			`def get_meshgrid(start, *args):`
			`if len(args) == 0:`
			`# start is grid_size`
			`num = _to_tuple(start)`
			`start = (0, 0)`
			`stop = num`
			`elif len(args) == 1:`
			`# start is start, args[0] is stop, step is 1`
			`start = _to_tuple(start)`
			`stop = _to_tuple(args[0])`
			`num = (stop[0] - start[0], stop[1] - start[1])`
			`elif len(args) == 2:`
			`# start is start, args[0] is stop, args[1] is num`
			`start = _to_tuple(start)`
			`stop = _to_tuple(args[0])`
			`num = _to_tuple(args[1])`
			`else:`
			`raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")`

			`grid_h = np.linspace(start[0], stop[0], num[0], endpoint=False, dtype=np.float32)`
			`grid_w = np.linspace(start[1], stop[1], num[1], endpoint=False, dtype=np.float32)`
			`grid = np.meshgrid(grid_w, grid_h) # here w goes first`
			`grid = np.stack(grid, axis=0) # [2, W, H]`
			`return grid`

			`#################################################################################`
			`# Sine/Cosine Positional Embedding Functions #`
			`#################################################################################`
			`# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py`

			`def get_2d_sincos_pos_embed(embed_dim, start, *args, cls_token=False, extra_tokens=0):`
			`"""`
			`grid_size: int of the grid height and width`
			`return:`
			`pos_embed: [grid_sizegrid_size, embed_dim] or [1+grid_sizegrid_size, embed_dim] (w/ or w/o cls_token)`
			`"""`
			`grid = get_meshgrid(start, *args) # [2, H, w]`
			`# grid_h = np.arange(grid_size, dtype=np.float32)`
			`# grid_w = np.arange(grid_size, dtype=np.float32)`
			`# grid = np.meshgrid(grid_w, grid_h) # here w goes first`
			`# grid = np.stack(grid, axis=0) # [2, W, H]`

			`grid = grid.reshape([2, 1, *grid.shape[1:]])`
			`pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)`
			`if cls_token and extra_tokens > 0:`
			`pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)`
			`return pos_embed`


			`def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):`
			`assert embed_dim % 2 == 0`

			`# use half of dimensions to encode grid_h`
			`emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)`
			`emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)`

			`emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)`
			`return emb`


			`def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):`
			`"""`
			`embed_dim: output dimension for each position`
			`pos: a list of positions to be encoded: size (W,H)`
			`out: (M, D)`
			`"""`
			`assert embed_dim % 2 == 0`
			`omega = np.arange(embed_dim // 2, dtype=np.float64)`
			`omega /= embed_dim / 2.`
			`omega = 1. / 10000**omega # (D/2,)`

			`pos = pos.reshape(-1) # (M,)`
			`out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product`

			`emb_sin = np.sin(out) # (M, D/2)`
			`emb_cos = np.cos(out) # (M, D/2)`

			`emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)`
			`return emb`


			`#################################################################################`
			`# Rotary Positional Embedding Functions #`
			`#################################################################################`
			`# https://github.com/facebookresearch/llama/blob/main/llama/model.py#L443`

			`def get_2d_rotary_pos_embed(embed_dim, start, *args, use_real=True):`
			`"""`
			`This is a 2d version of precompute_freqs_cis, which is a RoPE for image tokens with 2d structure.`

			`Parameters`
			`----------`
			`embed_dim: int`
			`embedding dimension size`
			`start: int or tuple of int`
			`If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, step is 1;`
			`If len(args) == 2, start is start, args[0] is stop, args[1] is num.`
			`use_real: bool`
			`If True, return real part and imaginary part separately. Otherwise, return complex numbers.`

			`Returns`
			`-------`
			`pos_embed: torch.Tensor`
			`[HW, D/2]`
			`"""`
			`grid = get_meshgrid(start, *args) # [2, H, w]`
			`grid = grid.reshape([2, 1, *grid.shape[1:]]) # Returns a sampling matrix with the same resolution as the target resolution`
			`pos_embed = get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=use_real)`
			`return pos_embed`


			`def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):`
			`assert embed_dim % 4 == 0`

			`# use half of dimensions to encode grid_h`
			`emb_h = get_1d_rotary_pos_embed(embed_dim // 2, grid[0].reshape(-1), use_real=use_real) # (H*W, D/4)`
			`emb_w = get_1d_rotary_pos_embed(embed_dim // 2, grid[1].reshape(-1), use_real=use_real) # (H*W, D/4)`

			`if use_real:`
			`cos = torch.cat([emb_h[0], emb_w[0]], dim=1) # (H*W, D/2)`
			`sin = torch.cat([emb_h[1], emb_w[1]], dim=1) # (H*W, D/2)`
			`return cos, sin`
			`else:`
			`emb = torch.cat([emb_h, emb_w], dim=1) # (H*W, D/2)`
			`return emb`


			`def get_1d_rotary_pos_embed(dim: int, pos: Union[np.ndarray, int], theta: float = 10000.0, use_real=False):`
			`"""`
			`Precompute the frequency tensor for complex exponentials (cis) with given dimensions.`

			`This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'`
			`and the end index 'end'. The 'theta' parameter scales the frequencies.`
			`The returned tensor contains complex values in complex64 data type.`

			`Args:`
			`dim (int): Dimension of the frequency tensor.`
			`pos (np.ndarray, int): Position indices for the frequency tensor. [S] or scalar`
			`theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.`
			`use_real (bool, optional): If True, return real part and imaginary part separately.`
			`Otherwise, return complex numbers.`

			`Returns:`
			`torch.Tensor: Precomputed frequency tensor with complex exponentials. [S, D/2]`

			`"""`
			`if isinstance(pos, int):`
			`pos = np.arange(pos)`
			`freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) # [D/2]`
			`t = torch.from_numpy(pos).to(freqs.device) # type: ignore # [S]`
			`freqs = torch.outer(t, freqs).float() # type: ignore # [S, D/2]`
			`if use_real:`
			`freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]`
			`freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]`
			`return freqs_cos, freqs_sin`
			`else:`
			`freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]`
			`return freqs_cis`



			`def calc_sizes(rope_img, patch_size, th, tw):`
			`if rope_img == 'extend':`
			`# Expansion mode`
			`sub_args = [(th, tw)]`
			`elif rope_img.startswith('base'):`
			`# Based on the specified dimensions, other dimensions are obtained through interpolation.`
			`base_size = int(rope_img[4:]) // 8 // patch_size`
			`start, stop = get_fill_resize_and_crop((th, tw), base_size)`
			`sub_args = [start, stop, (th, tw)]`
			`else:`
			`raise ValueError(f"Unknown rope_img: {rope_img}")`
			`return sub_args`


			`def init_image_posemb(rope_img,`
			`resolutions,`
			`patch_size,`
			`hidden_size,`
			`num_heads,`
			`log_fn,`
			`rope_real=True,`
			`):`
			`freqs_cis_img = {}`
			`for reso in resolutions:`
			`th, tw = reso.height // 8 // patch_size, reso.width // 8 // patch_size`
			`sub_args = calc_sizes(rope_img, patch_size, th, tw)`
			`freqs_cis_img[str(reso)] = get_2d_rotary_pos_embed(hidden_size // num_heads, *sub_args, use_real=rope_real)`
			`log_fn(f" Using image RoPE ({rope_img}) ({'real' if rope_real else 'complex'}): {sub_args} \| ({reso}) "`
			`f"{freqs_cis_img[str(reso)][0].shape if rope_real else freqs_cis_img[str(reso)].shape}")`
			`return freqs_cis_img`