ComfyUI/comfy/t5.py

import torch
import math
from comfy.ldm.modules.attention import optimized_attention_for_device

class T5LayerNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None, operations=None):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, dtype=dtype, device=device))
        self.variance_epsilon = eps

    def forward(self, x):
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight.to(device=x.device, dtype=x.dtype) * x

class T5DenseActDense(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, dtype, device, operations):
        super().__init__()
        self.wi = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        x = torch.nn.functional.relu(self.wi(x))
        # x = self.dropout(x)
        x = self.wo(x)
        return x

class T5DenseGatedActDense(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, dtype, device, operations):
        super().__init__()
        self.wi_0 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wi_1 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh")
        hidden_linear = self.wi_1(x)
        x = hidden_gelu * hidden_linear
        # x = self.dropout(x)
        x = self.wo(x)
        return x

class T5LayerFF(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations):
        super().__init__()
        if ff_activation == "gelu_pytorch_tanh":
            self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device, operations)
        elif ff_activation == "relu":
            self.DenseReluDense = T5DenseActDense(model_dim, ff_dim, dtype, device, operations)

        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        forwarded_states = self.layer_norm(x)
        forwarded_states = self.DenseReluDense(forwarded_states)
        # x = x + self.dropout(forwarded_states)
        x += forwarded_states
        return x

class T5Attention(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.k = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.v = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.o = operations.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device)
        self.num_heads = num_heads

        self.relative_attention_bias = None
        if relative_attention_bias:
            self.relative_attention_num_buckets = 32
            self.relative_attention_max_distance = 128
            self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device):
        """Compute binned relative position bias"""
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=True,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        if self.relative_attention_bias is not None:
            past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device)

        if past_bias is not None:
            if mask is not None:
                mask = mask + past_bias
            else:
                mask = past_bias

        out = optimized_attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask)
        return self.o(out), past_bias

class T5LayerSelfAttention(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations)
        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        normed_hidden_states = self.layer_norm(x)
        output, past_bias = self.SelfAttention(self.layer_norm(x), mask=mask, past_bias=past_bias, optimized_attention=optimized_attention)
        # x = x + self.dropout(attention_output)
        x += output
        return x, past_bias

class T5Block(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, ff_dim, ff_activation, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        self.layer = torch.nn.ModuleList()
        self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations))
        self.layer.append(T5LayerFF(model_dim, ff_dim, ff_activation, dtype, device, operations))

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        x, past_bias = self.layer[0](x, mask, past_bias, optimized_attention)
        x = self.layer[-1](x)
        return x, past_bias

class T5Stack(torch.nn.Module):
    def __init__(self, num_layers, model_dim, inner_dim, ff_dim, ff_activation, num_heads, dtype, device, operations):
        super().__init__()

        self.block = torch.nn.ModuleList(
            [T5Block(model_dim, inner_dim, ff_dim, ff_activation, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device, operations=operations) for i in range(num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

        intermediate = None
        optimized_attention = optimized_attention_for_device(x.device, mask=attention_mask is not None, small_input=True)
        past_bias = None
        for i, l in enumerate(self.block):
            x, past_bias = l(x, mask, past_bias, optimized_attention)
            if i == intermediate_output:
                intermediate = x.clone()
        x = self.final_layer_norm(x)
        if intermediate is not None and final_layer_norm_intermediate:
            intermediate = self.final_layer_norm(intermediate)
        return x, intermediate

class T5(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.num_layers = config_dict["num_layers"]
        model_dim = config_dict["d_model"]

        self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["num_heads"], dtype, device, operations)
        self.dtype = dtype
        self.shared = torch.nn.Embedding(config_dict["vocab_size"], model_dim, device=device)

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, embeddings):
        self.shared = embeddings

    def forward(self, input_ids, *args, **kwargs):
        x = self.shared(input_ids)
        return self.encoder(x, *args, **kwargs)
SD3 Support. 2024-06-10 17:26:25 +00:00			`import torch`
			`import math`
			`from comfy.ldm.modules.attention import optimized_attention_for_device`

			`class T5LayerNorm(torch.nn.Module):`
			`def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None, operations=None):`
			`super().__init__()`
			`self.weight = torch.nn.Parameter(torch.empty(hidden_size, dtype=dtype, device=device))`
			`self.variance_epsilon = eps`

			`def forward(self, x):`
			`variance = x.pow(2).mean(-1, keepdim=True)`
			`x = x * torch.rsqrt(variance + self.variance_epsilon)`
			`return self.weight.to(device=x.device, dtype=x.dtype) * x`

			`class T5DenseActDense(torch.nn.Module):`
			`def __init__(self, model_dim, ff_dim, dtype, device, operations):`
			`super().__init__()`
			`self.wi = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)`
			`self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)`
			`# self.dropout = nn.Dropout(config.dropout_rate)`

			`def forward(self, x):`
			`x = torch.nn.functional.relu(self.wi(x))`
			`# x = self.dropout(x)`
			`x = self.wo(x)`
			`return x`

			`class T5DenseGatedActDense(torch.nn.Module):`
			`def __init__(self, model_dim, ff_dim, dtype, device, operations):`
			`super().__init__()`
			`self.wi_0 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)`
			`self.wi_1 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)`
			`self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)`
			`# self.dropout = nn.Dropout(config.dropout_rate)`

			`def forward(self, x):`
			`hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh")`
			`hidden_linear = self.wi_1(x)`
			`x = hidden_gelu * hidden_linear`
			`# x = self.dropout(x)`
			`x = self.wo(x)`
			`return x`

			`class T5LayerFF(torch.nn.Module):`
			`def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations):`
			`super().__init__()`
			`if ff_activation == "gelu_pytorch_tanh":`
			`self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device, operations)`
			`elif ff_activation == "relu":`
			`self.DenseReluDense = T5DenseActDense(model_dim, ff_dim, dtype, device, operations)`

			`self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)`
			`# self.dropout = nn.Dropout(config.dropout_rate)`

			`def forward(self, x):`
			`forwarded_states = self.layer_norm(x)`
			`forwarded_states = self.DenseReluDense(forwarded_states)`
			`# x = x + self.dropout(forwarded_states)`
			`x += forwarded_states`
			`return x`

			`class T5Attention(torch.nn.Module):`
			`def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations):`
			`super().__init__()`

			`# Mesh TensorFlow initialization to avoid scaling before softmax`
			`self.q = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)`
			`self.k = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)`
			`self.v = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)`
			`self.o = operations.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device)`
			`self.num_heads = num_heads`

			`self.relative_attention_bias = None`
			`if relative_attention_bias:`
			`self.relative_attention_num_buckets = 32`
			`self.relative_attention_max_distance = 128`
			`self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device)`

			`@staticmethod`
			`def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):`
			`"""`
			`Adapted from Mesh Tensorflow:`
			`https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593`

			`Translate relative position to a bucket number for relative attention. The relative position is defined as`
			`memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to`
			`position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for`
			`small absolute relative_position and larger buckets for larger absolute relative_positions. All relative`
			`positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.`
			`This should allow for more graceful generalization to longer sequences than the model has been trained on`

			`Args:`
			`relative_position: an int32 Tensor`
			`bidirectional: a boolean - whether the attention is bidirectional`
			`num_buckets: an integer`
			`max_distance: an integer`

			`Returns:`
			`a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)`
			`"""`
			`relative_buckets = 0`
			`if bidirectional:`
			`num_buckets //= 2`
			`relative_buckets += (relative_position > 0).to(torch.long) * num_buckets`
			`relative_position = torch.abs(relative_position)`
			`else:`
			`relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))`
			`# now relative_position is in the range [0, inf)`

			`# half of the buckets are for exact increments in positions`
			`max_exact = num_buckets // 2`
			`is_small = relative_position < max_exact`

			`# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance`
			`relative_position_if_large = max_exact + (`
			`torch.log(relative_position.float() / max_exact)`
			`/ math.log(max_distance / max_exact)`
			`* (num_buckets - max_exact)`
			`).to(torch.long)`
			`relative_position_if_large = torch.min(`
			`relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)`
			`)`

			`relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)`
			`return relative_buckets`

			`def compute_bias(self, query_length, key_length, device):`
			`"""Compute binned relative position bias"""`
			`context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]`
			`memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]`
			`relative_position = memory_position - context_position # shape (query_length, key_length)`
			`relative_position_bucket = self._relative_position_bucket(`
			`relative_position, # shape (query_length, key_length)`
			`bidirectional=True,`
			`num_buckets=self.relative_attention_num_buckets,`
			`max_distance=self.relative_attention_max_distance,`
			`)`
			`values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads)`
			`values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length)`
			`return values`

			`def forward(self, x, mask=None, past_bias=None, optimized_attention=None):`
			`q = self.q(x)`
			`k = self.k(x)`
			`v = self.v(x)`
			`if self.relative_attention_bias is not None:`
			`past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device)`

			`if past_bias is not None:`
			`if mask is not None:`
			`mask = mask + past_bias`
			`else:`
			`mask = past_bias`

			`out = optimized_attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask)`
			`return self.o(out), past_bias`

			`class T5LayerSelfAttention(torch.nn.Module):`
			`def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations):`
			`super().__init__()`
			`self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations)`
			`self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)`
			`# self.dropout = nn.Dropout(config.dropout_rate)`

			`def forward(self, x, mask=None, past_bias=None, optimized_attention=None):`
			`normed_hidden_states = self.layer_norm(x)`
			`output, past_bias = self.SelfAttention(self.layer_norm(x), mask=mask, past_bias=past_bias, optimized_attention=optimized_attention)`
			`# x = x + self.dropout(attention_output)`
			`x += output`
			`return x, past_bias`

			`class T5Block(torch.nn.Module):`
			`def __init__(self, model_dim, inner_dim, ff_dim, ff_activation, num_heads, relative_attention_bias, dtype, device, operations):`
			`super().__init__()`
			`self.layer = torch.nn.ModuleList()`
			`self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations))`
			`self.layer.append(T5LayerFF(model_dim, ff_dim, ff_activation, dtype, device, operations))`

			`def forward(self, x, mask=None, past_bias=None, optimized_attention=None):`
			`x, past_bias = self.layer[0](x, mask, past_bias, optimized_attention)`
			`x = self.layer[-1](x)`
			`return x, past_bias`

			`class T5Stack(torch.nn.Module):`
			`def __init__(self, num_layers, model_dim, inner_dim, ff_dim, ff_activation, num_heads, dtype, device, operations):`
			`super().__init__()`

			`self.block = torch.nn.ModuleList(`
			`[T5Block(model_dim, inner_dim, ff_dim, ff_activation, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device, operations=operations) for i in range(num_layers)]`
			`)`
			`self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)`
			`# self.dropout = nn.Dropout(config.dropout_rate)`

			`def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):`
			`mask = None`
			`if attention_mask is not None:`
			`mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])`
			`mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))`

			`intermediate = None`
			`optimized_attention = optimized_attention_for_device(x.device, mask=attention_mask is not None, small_input=True)`
			`past_bias = None`
			`for i, l in enumerate(self.block):`
			`x, past_bias = l(x, mask, past_bias, optimized_attention)`
			`if i == intermediate_output:`
			`intermediate = x.clone()`
			`x = self.final_layer_norm(x)`
			`if intermediate is not None and final_layer_norm_intermediate:`
			`intermediate = self.final_layer_norm(intermediate)`
			`return x, intermediate`

			`class T5(torch.nn.Module):`
			`def __init__(self, config_dict, dtype, device, operations):`
			`super().__init__()`
			`self.num_layers = config_dict["num_layers"]`
			`model_dim = config_dict["d_model"]`

			`self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["num_heads"], dtype, device, operations)`
			`self.dtype = dtype`
			`self.shared = torch.nn.Embedding(config_dict["vocab_size"], model_dim, device=device)`

			`def get_input_embeddings(self):`
			`return self.shared`

			`def set_input_embeddings(self, embeddings):`
			`self.shared = embeddings`

			`def forward(self, input_ids, args, *kwargs):`
			`x = self.shared(input_ids)`
			`return self.encoder(x, args, *kwargs)`