# ComfyUI/comfy/ldm/lightricks/vae/dual_conv3d.py

import math
from typing import Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange


class DualConv3d(nn.Module):
    """3D convolution factorized into an in-plane pass and a depth pass.

    The first convolution uses a ``(1, kH, kW)`` kernel, so it only mixes
    values inside each ``h``/``w`` plane. The second uses a ``(kD, 1, 1)``
    kernel, so it only mixes values along the ``d`` axis (the axis that the
    ``skip_time_conv`` flag refers to as time). Input is laid out as
    ``(b, c, d, h, w)``.

    The same parameters can be applied either with two ``F.conv3d`` calls or
    with ``F.conv2d`` followed by ``F.conv1d`` on reshaped views of the input;
    see :meth:`forward`.

    Args:
        in_channels: Number of channels of the input.
        out_channels: Number of channels produced by the second convolution.
        kernel_size: Int or length-3 sequence ``(kD, kH, kW)``.
        stride: Int or length-3 sequence ``(sD, sH, sW)``.
        padding: Int or length-3 sequence ``(pD, pH, pW)``.
        dilation: Int or length-3 sequence ``(dD, dH, dW)``.
        groups: Group count passed to both convolutions.
        bias: Whether each of the two convolutions has a learnable bias.

    Raises:
        ValueError: If ``kernel_size`` is ``(1, 1, 1)``, if any size argument
            is a sequence whose length is not 3, or if ``groups`` is not a
            positive divisor of both ``in_channels`` and ``out_channels``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride: Union[int, Tuple[int, int, int]] = 1,
        padding: Union[int, Tuple[int, int, int]] = 0,
        dilation: Union[int, Tuple[int, int, int]] = 1,
        groups=1,
        bias=True,
    ):
        super().__init__()

        # Normalize first so that list inputs such as [1, 1, 1] are compared
        # and indexed exactly like their tuple equivalents.
        kernel_size = self._as_triple(kernel_size, "kernel_size")
        if kernel_size == (1, 1, 1):
            raise ValueError(
                "kernel_size must be greater than 1. Use make_linear_nd instead."
            )
        stride = self._as_triple(stride, "stride")
        padding = self._as_triple(padding, "padding")
        dilation = self._as_triple(dilation, "dilation")

        # Both weight shapes below divide a channel count by `groups`; reject
        # configurations where that division would silently truncate.
        if groups < 1 or in_channels % groups or out_channels % groups:
            raise ValueError(
                "groups must be a positive divisor of in_channels and out_channels."
            )

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.groups = groups
        self.bias = bias

        # Channel count between the two convolutions: the larger of the two ends.
        intermediate_channels = max(in_channels, out_channels)

        # First convolution: kernel of size 1 along d, full size along h and w.
        self.weight1 = nn.Parameter(
            torch.empty(
                intermediate_channels,
                in_channels // groups,
                1,
                kernel_size[1],
                kernel_size[2],
            )
        )
        self.stride1 = (1, stride[1], stride[2])
        self.padding1 = (0, padding[1], padding[2])
        self.dilation1 = (1, dilation[1], dilation[2])
        if bias:
            self.bias1 = nn.Parameter(torch.empty(intermediate_channels))
        else:
            self.register_parameter("bias1", None)

        # Second convolution: full size along d, kernel of size 1 along h and w.
        self.weight2 = nn.Parameter(
            torch.empty(
                out_channels, intermediate_channels // groups, kernel_size[0], 1, 1
            )
        )
        self.stride2 = (stride[0], 1, 1)
        self.padding2 = (padding[0], 0, 0)
        self.dilation2 = (dilation[0], 1, 1)
        if bias:
            self.bias2 = nn.Parameter(torch.empty(out_channels))
        else:
            self.register_parameter("bias2", None)

        # The parameters above are uninitialized memory until this call.
        self.reset_parameters()

    @staticmethod
    def _as_triple(value, name):
        """Return ``value`` as a 3-tuple, repeating it when it is a single int.

        Raises:
            ValueError: If ``value`` is a sequence whose length is not 3.
        """
        if isinstance(value, int):
            return (value, value, value)
        triple = tuple(value)
        if len(triple) != 3:
            raise ValueError(
                f"{name} must be an int or a sequence of 3 ints, got {value!r}."
            )
        return triple

    def reset_parameters(self):
        """Re-initialize both weights (Kaiming uniform) and, if present, both biases."""
        nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))
        if self.bias:
            # Each bias is drawn uniformly from +/- 1/sqrt(fan_in) of its own weight.
            fan_in1, _ = nn.init._calculate_fan_in_and_fan_out(self.weight1)
            bound1 = 1 / math.sqrt(fan_in1)
            nn.init.uniform_(self.bias1, -bound1, bound1)
            fan_in2, _ = nn.init._calculate_fan_in_and_fan_out(self.weight2)
            bound2 = 1 / math.sqrt(fan_in2)
            nn.init.uniform_(self.bias2, -bound2, bound2)

    def forward(self, x, use_conv3d=False, skip_time_conv=False):
        """Apply the factorized convolution to ``x`` of shape ``(b, c, d, h, w)``.

        Args:
            x: 5-D input tensor.
            use_conv3d: If true, run both stages with ``F.conv3d``; otherwise
                run them as ``F.conv2d`` followed by ``F.conv1d``.
            skip_time_conv: If true, return the result of the first
                convolution only.

        Returns:
            A 5-D tensor laid out as ``(b, c, d, h, w)``.
        """
        if use_conv3d:
            return self.forward_with_3d(x=x, skip_time_conv=skip_time_conv)
        return self.forward_with_2d(x=x, skip_time_conv=skip_time_conv)

    def forward_with_3d(self, x, skip_time_conv):
        """Run both stages as ``F.conv3d`` calls; see :meth:`forward`."""
        # First convolution (kernel size 1 along d).
        x = F.conv3d(
            x,
            self.weight1,
            self.bias1,
            self.stride1,
            self.padding1,
            self.dilation1,
            self.groups,
        )

        if skip_time_conv:
            return x

        # Second convolution (kernel size 1 along h and w).
        return F.conv3d(
            x,
            self.weight2,
            self.bias2,
            self.stride2,
            self.padding2,
            self.dilation2,
            self.groups,
        )

    def forward_with_2d(self, x, skip_time_conv):
        """Run the stages as ``F.conv2d`` then ``F.conv1d``; see :meth:`forward`."""
        # Unpacking all five sizes also rejects inputs that are not 5-D.
        b, _, _, _, _ = x.shape

        # First stage: fold d into the batch axis and convolve each plane in 2D.
        x = rearrange(x, "b c d h w -> (b d) c h w")
        # weight1 has size 1 along d, so dropping that axis yields a 2D kernel.
        weight1 = self.weight1.squeeze(2)
        x = F.conv2d(
            x,
            weight1,
            self.bias1,
            self.stride1[1:],
            self.padding1[1:],
            self.dilation1[1:],
            self.groups,
        )

        # The 2D convolution may have changed the spatial size; remember the
        # new one so the final reshape can unfold (b h w) again.
        _, _, h, w = x.shape

        if skip_time_conv:
            return rearrange(x, "(b d) c h w -> b c d h w", b=b)

        # Second stage: fold h and w into the batch axis and convolve along d in 1D.
        x = rearrange(x, "(b d) c h w -> (b h w) c d", b=b)
        # weight2 has size 1 along h and w; drop exactly those two trailing axes.
        weight2 = self.weight2.squeeze(-1).squeeze(-1)
        x = F.conv1d(
            x,
            weight2,
            self.bias2,
            self.stride2[0],
            self.padding2[0],
            self.dilation2[0],
            self.groups,
        )
        return rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)

    @property
    def weight(self):
        """The weight of the second convolution (``weight2``)."""
        return self.weight2


def test_dual_conv3d_consistency():
    """Check that both execution paths of ``DualConv3d`` give the same output.

    A single layer is run on one random input twice: once through the
    ``F.conv3d`` path and once through the ``F.conv2d`` + ``F.conv1d`` path.
    The two results must agree within an absolute tolerance of ``1e-6``.
    """
    # 3 -> 5 channels, 3x3x3 kernel, stride 2 and padding 1 on every axis.
    layer = DualConv3d(
        3,
        5,
        (3, 3, 3),
        stride=(2, 2, 2),
        padding=(1, 1, 1),
        bias=True,
    )

    # One sample with 3 channels and a 10x10x10 volume.
    sample = torch.randn(1, 3, 10, 10, 10)

    # Same weights, same input, two different execution paths.
    via_conv3d = layer(sample, use_conv3d=True)
    via_conv2d = layer(sample, use_conv3d=False)

    outputs_match = torch.allclose(via_conv3d, via_conv2d, atol=1e-6)
    assert outputs_match, "Outputs are not consistent between 3D and 2D convolutions."
Support Lightricks LTX-Video model. 2024-11-22 13:44:42 +00:00			`import math`
			`from typing import Tuple, Union`

			`import torch`
			`import torch.nn as nn`
			`import torch.nn.functional as F`
			`from einops import rearrange`


			`class DualConv3d(nn.Module):`
			`def __init__(`
			`self,`
			`in_channels,`
			`out_channels,`
			`kernel_size,`
			`stride: Union[int, Tuple[int, int, int]] = 1,`
			`padding: Union[int, Tuple[int, int, int]] = 0,`
			`dilation: Union[int, Tuple[int, int, int]] = 1,`
			`groups=1,`
			`bias=True,`
			`):`
			`super(DualConv3d, self).__init__()`

			`self.in_channels = in_channels`
			`self.out_channels = out_channels`
			`# Ensure kernel_size, stride, padding, and dilation are tuples of length 3`
			`if isinstance(kernel_size, int):`
			`kernel_size = (kernel_size, kernel_size, kernel_size)`
			`if kernel_size == (1, 1, 1):`
			`raise ValueError(`
			`"kernel_size must be greater than 1. Use make_linear_nd instead."`
			`)`
			`if isinstance(stride, int):`
			`stride = (stride, stride, stride)`
			`if isinstance(padding, int):`
			`padding = (padding, padding, padding)`
			`if isinstance(dilation, int):`
			`dilation = (dilation, dilation, dilation)`

			`# Set parameters for convolutions`
			`self.groups = groups`
			`self.bias = bias`

			`# Define the size of the channels after the first convolution`
			`intermediate_channels = (`
			`out_channels if in_channels < out_channels else in_channels`
			`)`

			`# Define parameters for the first convolution`
			`self.weight1 = nn.Parameter(`
			`torch.Tensor(`
			`intermediate_channels,`
			`in_channels // groups,`
			`1,`
			`kernel_size[1],`
			`kernel_size[2],`
			`)`
			`)`
			`self.stride1 = (1, stride[1], stride[2])`
			`self.padding1 = (0, padding[1], padding[2])`
			`self.dilation1 = (1, dilation[1], dilation[2])`
			`if bias:`
			`self.bias1 = nn.Parameter(torch.Tensor(intermediate_channels))`
			`else:`
			`self.register_parameter("bias1", None)`

			`# Define parameters for the second convolution`
			`self.weight2 = nn.Parameter(`
			`torch.Tensor(`
			`out_channels, intermediate_channels // groups, kernel_size[0], 1, 1`
			`)`
			`)`
			`self.stride2 = (stride[0], 1, 1)`
			`self.padding2 = (padding[0], 0, 0)`
			`self.dilation2 = (dilation[0], 1, 1)`
			`if bias:`
			`self.bias2 = nn.Parameter(torch.Tensor(out_channels))`
			`else:`
			`self.register_parameter("bias2", None)`

			`# Initialize weights and biases`
			`self.reset_parameters()`

			`def reset_parameters(self):`
			`nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))`
			`nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))`
			`if self.bias:`
			`fan_in1, _ = nn.init._calculate_fan_in_and_fan_out(self.weight1)`
			`bound1 = 1 / math.sqrt(fan_in1)`
			`nn.init.uniform_(self.bias1, -bound1, bound1)`
			`fan_in2, _ = nn.init._calculate_fan_in_and_fan_out(self.weight2)`
			`bound2 = 1 / math.sqrt(fan_in2)`
			`nn.init.uniform_(self.bias2, -bound2, bound2)`

			`def forward(self, x, use_conv3d=False, skip_time_conv=False):`
			`if use_conv3d:`
			`return self.forward_with_3d(x=x, skip_time_conv=skip_time_conv)`
			`else:`
			`return self.forward_with_2d(x=x, skip_time_conv=skip_time_conv)`

			`def forward_with_3d(self, x, skip_time_conv):`
			`# First convolution`
			`x = F.conv3d(`
			`x,`
			`self.weight1,`
			`self.bias1,`
			`self.stride1,`
			`self.padding1,`
			`self.dilation1,`
			`self.groups,`
			`)`

			`if skip_time_conv:`
			`return x`

			`# Second convolution`
			`x = F.conv3d(`
			`x,`
			`self.weight2,`
			`self.bias2,`
			`self.stride2,`
			`self.padding2,`
			`self.dilation2,`
			`self.groups,`
			`)`

			`return x`

			`def forward_with_2d(self, x, skip_time_conv):`
			`b, c, d, h, w = x.shape`

			`# First 2D convolution`
			`x = rearrange(x, "b c d h w -> (b d) c h w")`
			`# Squeeze the depth dimension out of weight1 since it's 1`
			`weight1 = self.weight1.squeeze(2)`
			`# Select stride, padding, and dilation for the 2D convolution`
			`stride1 = (self.stride1[1], self.stride1[2])`
			`padding1 = (self.padding1[1], self.padding1[2])`
			`dilation1 = (self.dilation1[1], self.dilation1[2])`
			`x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)`

			`_, _, h, w = x.shape`

			`if skip_time_conv:`
			`x = rearrange(x, "(b d) c h w -> b c d h w", b=b)`
			`return x`

			`# Second convolution which is essentially treated as a 1D convolution across the 'd' dimension`
			`x = rearrange(x, "(b d) c h w -> (b h w) c d", b=b)`

			`# Reshape weight2 to match the expected dimensions for conv1d`
			`weight2 = self.weight2.squeeze(-1).squeeze(-1)`
			`# Use only the relevant dimension for stride, padding, and dilation for the 1D convolution`
			`stride2 = self.stride2[0]`
			`padding2 = self.padding2[0]`
			`dilation2 = self.dilation2[0]`
			`x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)`
			`x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)`

			`return x`

			`@property`
			`def weight(self):`
			`return self.weight2`


			`def test_dual_conv3d_consistency():`
			`# Initialize parameters`
			`in_channels = 3`
			`out_channels = 5`
			`kernel_size = (3, 3, 3)`
			`stride = (2, 2, 2)`
			`padding = (1, 1, 1)`

			`# Create an instance of the DualConv3d class`
			`dual_conv3d = DualConv3d(`
			`in_channels=in_channels,`
			`out_channels=out_channels,`
			`kernel_size=kernel_size,`
			`stride=stride,`
			`padding=padding,`
			`bias=True,`
			`)`

			`# Example input tensor`
			`test_input = torch.randn(1, 3, 10, 10, 10)`

			`# Perform forward passes with both 3D and 2D settings`
			`output_conv3d = dual_conv3d(test_input, use_conv3d=True)`
			`output_2d = dual_conv3d(test_input, use_conv3d=False)`

			`# Assert that the outputs from both methods are sufficiently close`
			`assert torch.allclose(`
			`output_conv3d, output_2d, atol=1e-6`
			`), "Outputs are not consistent between 3D and 2D convolutions."`