import torch
from torch import nn
from functools import partial
import math
from einops import rearrange
from typing import Any, Mapping, Optional, Tuple, Union, List

from .conv_nd_factory import make_conv_nd, make_linear_nd
from .pixel_norm import PixelNorm


class Encoder(nn.Module):
    r"""
    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.

    Args:
        dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
            The number of dimensions to use in convolutions.
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
            The blocks to use. Each block is a tuple of the block name and the number of layers.
        base_channels (`int`, *optional*, defaults to 128):
            The number of output channels for the first convolutional layer.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        patch_size (`int`, *optional*, defaults to 1):
            The patch size to use. Should be a power of 2.
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer to use. Can be `group_norm`, `pixel_norm`, or `layer_norm`.
        latent_log_var (`str`, *optional*, defaults to `per_channel`):
            How the log variance is represented. Can be `per_channel`, `uniform`, or `none`.
    """

    def __init__(
        self,
        dims: Union[int, Tuple[int, int]] = 3,
        in_channels: int = 3,
        out_channels: int = 3,
        blocks=[("res_x", 1)],
        base_channels: int = 128,
        norm_num_groups: int = 32,
        patch_size: Union[int, Tuple[int]] = 1,
        norm_layer: str = "group_norm",  # group_norm, pixel_norm
        latent_log_var: str = "per_channel",
    ):
        super().__init__()
        self.patch_size = patch_size
        self.norm_layer = norm_layer
        self.latent_channels = out_channels
        self.latent_log_var = latent_log_var
        self.blocks_desc = blocks

        in_channels = in_channels * patch_size**2
        output_channel = base_channels

        self.conv_in = make_conv_nd(
            dims=dims,
            in_channels=in_channels,
            out_channels=output_channel,
            kernel_size=3,
            stride=1,
            padding=1,
            causal=True,
        )

        self.down_blocks = nn.ModuleList([])

        for block_name, block_params in blocks:
            input_channel = output_channel
            if isinstance(block_params, int):
                block_params = {"num_layers": block_params}

            if block_name == "res_x":
                block = UNetMidBlock3D(
                    dims=dims,
                    in_channels=input_channel,
                    num_layers=block_params["num_layers"],
                    resnet_eps=1e-6,
                    resnet_groups=norm_num_groups,
                    norm_layer=norm_layer,
                )
            elif block_name == "res_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
                block = ResnetBlock3D(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    eps=1e-6,
                    groups=norm_num_groups,
                    norm_layer=norm_layer,
                )
            elif block_name == "compress_time":
                block = make_conv_nd(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    kernel_size=3,
                    stride=(2, 1, 1),
                    causal=True,
                )
            elif block_name == "compress_space":
                block = make_conv_nd(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    kernel_size=3,
                    stride=(1, 2, 2),
                    causal=True,
                )
            elif block_name == "compress_all":
                block = make_conv_nd(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
                )
            elif block_name == "compress_all_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
                block = make_conv_nd(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
                )
            else:
                raise ValueError(f"unknown block: {block_name}")

            self.down_blocks.append(block)

        # out
        if norm_layer == "group_norm":
            self.conv_norm_out = nn.GroupNorm(
                num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
            )
        elif norm_layer == "pixel_norm":
            self.conv_norm_out = PixelNorm()
        elif norm_layer == "layer_norm":
            self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)

        self.conv_act = nn.SiLU()

        conv_out_channels = out_channels
        if latent_log_var == "per_channel":
            conv_out_channels *= 2
        elif latent_log_var == "uniform":
            conv_out_channels += 1
        elif latent_log_var != "none":
            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
        self.conv_out = make_conv_nd(
            dims, output_channel, conv_out_channels, 3, padding=1, causal=True
        )

        self.gradient_checkpointing = False

    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        r"""The forward method of the `Encoder` class."""

        sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
        sample = self.conv_in(sample)

        checkpoint_fn = (
            partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
            if self.gradient_checkpointing and self.training
            else lambda x: x
        )

        for down_block in self.down_blocks:
            sample = checkpoint_fn(down_block)(sample)

        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        if self.latent_log_var == "uniform":
            last_channel = sample[:, -1:, ...]
            num_dims = sample.dim()

            if num_dims == 4:
                # For shape (B, C, H, W)
                repeated_last_channel = last_channel.repeat(
                    1, sample.shape[1] - 2, 1, 1
                )
                sample = torch.cat([sample, repeated_last_channel], dim=1)
            elif num_dims == 5:
                # For shape (B, C, F, H, W)
                repeated_last_channel = last_channel.repeat(
                    1, sample.shape[1] - 2, 1, 1, 1
                )
                sample = torch.cat([sample, repeated_last_channel], dim=1)
            else:
                raise ValueError(f"Invalid input shape: {sample.shape}")

        return sample
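
# Illustrative usage sketch of the `Encoder` above (not part of the library API).
# The block list and channel counts are arbitrary assumptions chosen to keep the
# example small, and the exact output shape depends on the causal convolution
# factory in `conv_nd_factory`:
#
#   encoder = Encoder(
#       dims=3,
#       in_channels=3,
#       out_channels=4,
#       blocks=[("res_x", 1)],
#       base_channels=32,
#       norm_num_groups=32,
#       patch_size=1,
#       latent_log_var="per_channel",
#   )
#   video = torch.randn(1, 3, 9, 32, 32)  # (B, C, F, H, W)
#   latent = encoder(video)  # roughly (1, 2 * 4, 9, 32, 32): per-channel means and log-variances
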
class Decoder(nn.Module):
    r"""
    The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.

    Args:
        dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
            The number of dimensions to use in convolutions.
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
            The blocks to use. Each block is a tuple of the block name and the number of layers.
        base_channels (`int`, *optional*, defaults to 128):
            The number of output channels for the first convolutional layer.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        patch_size (`int`, *optional*, defaults to 1):
            The patch size to use. Should be a power of 2.
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer to use. Can be `group_norm`, `pixel_norm`, or `layer_norm`.
        causal (`bool`, *optional*, defaults to `True`):
            Whether to use causal convolutions or not.
    """

    def __init__(
        self,
        dims,
        in_channels: int = 3,
        out_channels: int = 3,
        blocks=[("res_x", 1)],
        base_channels: int = 128,
        layers_per_block: int = 2,
        norm_num_groups: int = 32,
        patch_size: int = 1,
        norm_layer: str = "group_norm",
        causal: bool = True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.layers_per_block = layers_per_block
        out_channels = out_channels * patch_size**2
        self.causal = causal
        self.blocks_desc = blocks

        # Compute output channel to be product of all channel-multiplier blocks
        output_channel = base_channels
        for block_name, block_params in list(reversed(blocks)):
            block_params = block_params if isinstance(block_params, dict) else {}
            if block_name == "res_x_y":
                output_channel = output_channel * block_params.get("multiplier", 2)

        self.conv_in = make_conv_nd(
            dims,
            in_channels,
            output_channel,
            kernel_size=3,
            stride=1,
            padding=1,
            causal=True,
        )

        self.up_blocks = nn.ModuleList([])

        for block_name, block_params in list(reversed(blocks)):
            input_channel = output_channel
            if isinstance(block_params, int):
                block_params = {"num_layers": block_params}

            if block_name == "res_x":
                block = UNetMidBlock3D(
                    dims=dims,
                    in_channels=input_channel,
                    num_layers=block_params["num_layers"],
                    resnet_eps=1e-6,
                    resnet_groups=norm_num_groups,
                    norm_layer=norm_layer,
                )
            elif block_name == "res_x_y":
                output_channel = output_channel // block_params.get("multiplier", 2)
                block = ResnetBlock3D(
                    dims=dims,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    eps=1e-6,
                    groups=norm_num_groups,
                    norm_layer=norm_layer,
                )
            elif block_name == "compress_time":
                block = DepthToSpaceUpsample(
                    dims=dims, in_channels=input_channel, stride=(2, 1, 1)
                )
            elif block_name == "compress_space":
                block = DepthToSpaceUpsample(
                    dims=dims, in_channels=input_channel, stride=(1, 2, 2)
                )
            elif block_name == "compress_all":
                block = DepthToSpaceUpsample(
                    dims=dims,
                    in_channels=input_channel,
                    stride=(2, 2, 2),
                    residual=block_params.get("residual", False),
                )
            else:
                raise ValueError(f"unknown layer: {block_name}")

            self.up_blocks.append(block)

        if norm_layer == "group_norm":
            self.conv_norm_out = nn.GroupNorm(
                num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
            )
        elif norm_layer == "pixel_norm":
            self.conv_norm_out = PixelNorm()
        elif norm_layer == "layer_norm":
            self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)

        self.conv_act = nn.SiLU()
        self.conv_out = make_conv_nd(
            dims, output_channel, out_channels, 3, padding=1, causal=True
        )

        self.gradient_checkpointing = False

    # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        r"""The forward method of the `Decoder` class."""
        # assert target_shape is not None, "target_shape must be provided"

        sample = self.conv_in(sample, causal=self.causal)

        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype

        checkpoint_fn = (
            partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
            if self.gradient_checkpointing and self.training
            else lambda x: x
        )

        sample = sample.to(upscale_dtype)

        for up_block in self.up_blocks:
            sample = checkpoint_fn(up_block)(sample, causal=self.causal)

        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample, causal=self.causal)

        sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)

        return sample


class UNetMidBlock3D(nn.Module):
    """
    A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.

    Args:
        in_channels (`int`): The number of input channels.
        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
        resnet_eps (`float`, *optional*, defaults to 1e-6): The epsilon value for the resnet blocks.
        resnet_groups (`int`, *optional*, defaults to 32):
            The number of groups to use in the group normalization layers of the resnet blocks.

    Returns:
        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape
        `(batch_size, in_channels, frames, height, width)`.
    """

    def __init__(
        self,
        dims: Union[int, Tuple[int, int]],
        in_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_groups: int = 32,
        norm_layer: str = "group_norm",
    ):
        super().__init__()
        resnet_groups = (
            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
        )

        self.res_blocks = nn.ModuleList(
            [
                ResnetBlock3D(
                    dims=dims,
                    in_channels=in_channels,
                    out_channels=in_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    norm_layer=norm_layer,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(
        self, hidden_states: torch.FloatTensor, causal: bool = True
    ) -> torch.FloatTensor:
        for resnet in self.res_blocks:
            hidden_states = resnet(hidden_states, causal=causal)

        return hidden_states


class DepthToSpaceUpsample(nn.Module):
    def __init__(self, dims, in_channels, stride, residual=False):
        super().__init__()
        self.stride = stride
        self.out_channels = math.prod(stride) * in_channels
        self.conv = make_conv_nd(
            dims=dims,
            in_channels=in_channels,
            out_channels=self.out_channels,
            kernel_size=3,
            stride=1,
            causal=True,
        )
        self.residual = residual

    def forward(self, x, causal: bool = True):
        if self.residual:
            # Reshape and duplicate the input to match the output shape
            x_in = rearrange(
                x,
                "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
                p1=self.stride[0],
                p2=self.stride[1],
                p3=self.stride[2],
            )
            x_in = x_in.repeat(1, math.prod(self.stride), 1, 1, 1)
            if self.stride[0] == 2:
                x_in = x_in[:, :, 1:, :, :]
        x = self.conv(x, causal=causal)
        x = rearrange(
            x,
            "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
            p1=self.stride[0],
            p2=self.stride[1],
            p3=self.stride[2],
        )
        if self.stride[0] == 2:
            x = x[:, :, 1:, :, :]
        if self.residual:
            x = x + x_in
        return x
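
# Minimal sketch of the depth-to-space rearrangement used above (illustrative
# only; the tensor size is an arbitrary assumption). A tensor whose channel axis
# packs a (p1, p2, p3) neighbourhood is unfolded into the frame/height/width axes:
#
#   x = torch.arange(8.0).view(1, 8, 1, 1, 1)  # (B, c * p1 * p2 * p3, F, H, W)
#   y = rearrange(
#       x,
#       "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
#       p1=2, p2=2, p3=2,
#   )
#   # y.shape == (1, 1, 2, 2, 2): each output voxel comes from one input channel.
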
class LayerNorm(nn.Module):
    def __init__(self, dim, eps, elementwise_affine=True) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)

    def forward(self, x):
        x = rearrange(x, "b c d h w -> b d h w c")
        x = self.norm(x)
        x = rearrange(x, "b d h w c -> b c d h w")
        return x


class ResnetBlock3D(nn.Module):
    r"""
    A Resnet block.

    Parameters:
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `None`):
            The number of output channels for the first conv layer. If None, same as `in_channels`.
        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        groups (`int`, *optional*, defaults to `32`): The number of groups to use for the first normalization layer.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    """

    def __init__(
        self,
        dims: Union[int, Tuple[int, int]],
        in_channels: int,
        out_channels: Optional[int] = None,
        dropout: float = 0.0,
        groups: int = 32,
        eps: float = 1e-6,
        norm_layer: str = "group_norm",
    ):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels

        if norm_layer == "group_norm":
            self.norm1 = nn.GroupNorm(
                num_groups=groups, num_channels=in_channels, eps=eps, affine=True
            )
        elif norm_layer == "pixel_norm":
            self.norm1 = PixelNorm()
        elif norm_layer == "layer_norm":
            self.norm1 = LayerNorm(in_channels, eps=eps, elementwise_affine=True)

        self.non_linearity = nn.SiLU()

        self.conv1 = make_conv_nd(
            dims,
            in_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            causal=True,
        )

        if norm_layer == "group_norm":
            self.norm2 = nn.GroupNorm(
                num_groups=groups, num_channels=out_channels, eps=eps, affine=True
            )
        elif norm_layer == "pixel_norm":
            self.norm2 = PixelNorm()
        elif norm_layer == "layer_norm":
            self.norm2 = LayerNorm(out_channels, eps=eps, elementwise_affine=True)

        self.dropout = torch.nn.Dropout(dropout)

        self.conv2 = make_conv_nd(
            dims,
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            causal=True,
        )

        self.conv_shortcut = (
            make_linear_nd(
                dims=dims, in_channels=in_channels, out_channels=out_channels
            )
            if in_channels != out_channels
            else nn.Identity()
        )

        self.norm3 = (
            LayerNorm(in_channels, eps=eps, elementwise_affine=True)
            if in_channels != out_channels
            else nn.Identity()
        )

    def forward(
        self,
        input_tensor: torch.FloatTensor,
        causal: bool = True,
    ) -> torch.FloatTensor:
        hidden_states = input_tensor

        hidden_states = self.norm1(hidden_states)
        hidden_states = self.non_linearity(hidden_states)
        hidden_states = self.conv1(hidden_states, causal=causal)

        hidden_states = self.norm2(hidden_states)
        hidden_states = self.non_linearity(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.conv2(hidden_states, causal=causal)

        input_tensor = self.norm3(input_tensor)
        input_tensor = self.conv_shortcut(input_tensor)

        output_tensor = input_tensor + hidden_states

        return output_tensor
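
# Rough usage sketch of the pre-norm residual block above (illustrative only;
# the channel counts are arbitrary assumptions, and the exact temporal length of
# the output depends on the causal convolution factory):
#
#   block = ResnetBlock3D(dims=3, in_channels=32, out_channels=64, groups=32)
#   y = block(torch.randn(1, 32, 4, 8, 8), causal=True)
#   # y has 64 channels; the shortcut path is norm3 plus a pointwise projection
#   # because in_channels != out_channels, otherwise both are nn.Identity().
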
def patchify(x, patch_size_hw, patch_size_t=1):
    if patch_size_hw == 1 and patch_size_t == 1:
        return x
    if x.dim() == 4:
        x = rearrange(
            x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
        )
    elif x.dim() == 5:
        x = rearrange(
            x,
            "b c (f p) (h q) (w r) -> b (c p r q) f h w",
            p=patch_size_t,
            q=patch_size_hw,
            r=patch_size_hw,
        )
    else:
        raise ValueError(f"Invalid input shape: {x.shape}")

    return x


def unpatchify(x, patch_size_hw, patch_size_t=1):
    if patch_size_hw == 1 and patch_size_t == 1:
        return x

    if x.dim() == 4:
        x = rearrange(
            x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
        )
    elif x.dim() == 5:
        x = rearrange(
            x,
            "b (c p r q) f h w -> b c (f p) (h q) (w r)",
            p=patch_size_t,
            q=patch_size_hw,
            r=patch_size_hw,
        )

    return x
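
# Shape sanity check for the patchify/unpatchify pair (illustrative only; the
# tensor size is an arbitrary assumption). Patchify folds non-overlapping
# spatial patches into the channel axis, and unpatchify inverts it exactly:
#
#   x = torch.randn(1, 3, 8, 32, 32)     # (B, C, F, H, W)
#   p = patchify(x, patch_size_hw=4)     # (1, 3 * 4 * 4, 8, 8, 8) == (1, 48, 8, 8, 8)
#   assert torch.equal(unpatchify(p, patch_size_hw=4), x)
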
class processor(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("std-of-means", torch.empty(128))
        self.register_buffer("mean-of-means", torch.empty(128))
        self.register_buffer("mean-of-stds", torch.empty(128))
        self.register_buffer("mean-of-stds_over_std-of-means", torch.empty(128))
        self.register_buffer("channel", torch.empty(128))

    def un_normalize(self, x):
        return (x * self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)) + self.get_buffer(
            "mean-of-means"
        ).view(1, -1, 1, 1, 1).to(x)

    def normalize(self, x):
        return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer(
            "std-of-means"
        ).view(1, -1, 1, 1, 1).to(x)
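
# How the per-channel statistics are applied (illustrative only; the buffers are
# allocated with torch.empty above and only hold meaningful values after a
# checkpoint has been loaded into them):
#
#   stats = processor()
#   latent = torch.randn(1, 128, 4, 8, 8)     # (B, C, F, H, W)
#   z = stats.normalize(latent)               # (latent - mean) / std, broadcast per channel
#   restored = stats.un_normalize(z)          # z * std + mean
#   # view(1, -1, 1, 1, 1) reshapes each 128-element buffer so it broadcasts
#   # over the batch, frame, height and width axes.
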
class VideoVAE(nn.Module):
    def __init__(self):
        super().__init__()
        config = {
            "_class_name": "CausalVideoAutoencoder",
            "dims": 3,
            "in_channels": 3,
            "out_channels": 3,
            "latent_channels": 128,
            "blocks": [
                ["res_x", 4],
                ["compress_all", 1],
                ["res_x_y", 1],
                ["res_x", 3],
                ["compress_all", 1],
                ["res_x_y", 1],
                ["res_x", 3],
                ["compress_all", 1],
                ["res_x", 3],
                ["res_x", 4],
            ],
            "scaling_factor": 1.0,
            "norm_layer": "pixel_norm",
            "patch_size": 4,
            "latent_log_var": "uniform",
            "use_quant_conv": False,
            "causal_decoder": False,
        }

        double_z = config.get("double_z", True)
        latent_log_var = config.get(
            "latent_log_var", "per_channel" if double_z else "none"
        )

        self.encoder = Encoder(
            dims=config["dims"],
            in_channels=config.get("in_channels", 3),
            out_channels=config["latent_channels"],
            blocks=config.get("encoder_blocks", config.get("blocks")),
            patch_size=config.get("patch_size", 1),
            latent_log_var=latent_log_var,
            norm_layer=config.get("norm_layer", "group_norm"),
        )

        self.decoder = Decoder(
            dims=config["dims"],
            in_channels=config["latent_channels"],
            out_channels=config.get("out_channels", 3),
            blocks=config.get("decoder_blocks", config.get("blocks")),
            patch_size=config.get("patch_size", 1),
            norm_layer=config.get("norm_layer", "group_norm"),
            causal=config.get("causal_decoder", False),
        )

        self.per_channel_statistics = processor()

    def encode(self, x):
        means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
        return self.per_channel_statistics.normalize(means)

    def decode(self, x):
        return self.decoder(self.per_channel_statistics.un_normalize(x))
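
# End-to-end sketch (illustrative only): encode() returns only the normalized
# means of the latent distribution, and decode() maps a normalized latent back
# to pixel space. The weights and the per-channel statistics buffers must be
# loaded from a pretrained checkpoint first, otherwise the outputs are
# meaningless. Frame count and resolution below are arbitrary assumptions.
#
#   vae = VideoVAE()
#   # ... load a pretrained state dict into vae ...
#   video = torch.randn(1, 3, 9, 256, 256)   # (B, C, F, H, W)
#   latent = vae.encode(video)               # 128 latent channels, spatially downsampled
#                                            # by about 32x (patch_size 4 x three stride-2 stages)
#   recon = vae.decode(latent)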