ComfyUI/comfy/sdxl_clip.py

from comfy import sd1_clip
import torch
import os

class SDXLClipG(sd1_clip.SDClipModel):
    """``sd1_clip.SDClipModel`` configured from ``clip_config_bigg.json``.

    The JSON config is resolved relative to the directory of this module.
    """

    def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None):
        """Set up the base model with this class's fixed options.

        ``layer="penultimate"`` is an alias for ``layer="hidden"`` combined
        with ``layer_idx=-2``; any other ``layer`` value is forwarded as given
        together with the caller's ``layer_idx``.

        NOTE(review): ``max_length`` is accepted but is not forwarded to the
        base class here.
        """
        if layer == "penultimate":
            layer, layer_idx = "hidden", -2

        module_dir = os.path.dirname(os.path.realpath(__file__))
        config_path = os.path.join(module_dir, "clip_config_bigg.json")
        token_ids = {"start": 49406, "end": 49407, "pad": 0}
        super().__init__(
            device=device,
            freeze=freeze,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config=config_path,
            dtype=dtype,
            special_tokens=token_ids,
            layer_norm_hidden_state=False,
        )

    def load_sd(self, sd):
        """Load the state dict ``sd`` by delegating to the base class."""
        return super().load_sd(sd)

class SDXLClipGTokenizer(sd1_clip.SDTokenizer):
    """``sd1_clip.SDTokenizer`` preset using the ``clip_g`` embedding key.

    Fixes ``pad_with_end=False`` and ``embedding_size=1280``.
    """

    def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
        # NOTE(review): tokenizer_data is accepted but not passed to the base class.
        preset = dict(
            pad_with_end=False,
            embedding_directory=embedding_directory,
            embedding_size=1280,
            embedding_key='clip_g',
        )
        super().__init__(tokenizer_path, **preset)


class SDXLTokenizer:
    """Hold a ``clip_l`` and a ``clip_g`` tokenizer and run both on the same text."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        # NOTE(review): tokenizer_data is accepted but not used in this constructor.
        shared_kwargs = {"embedding_directory": embedding_directory}
        self.clip_l = sd1_clip.SDTokenizer(**shared_kwargs)
        self.clip_g = SDXLClipGTokenizer(**shared_kwargs)

    def tokenize_with_weights(self, text: str, return_word_ids=False):
        """Tokenize ``text`` with both tokenizers.

        Returns a dict with the ``clip_g`` result under ``"g"`` and the
        ``clip_l`` result under ``"l"``.
        """
        # Dict displays are evaluated in source order, so "g" runs before "l".
        return {
            "g": self.clip_g.tokenize_with_weights(text, return_word_ids),
            "l": self.clip_l.tokenize_with_weights(text, return_word_ids),
        }

    def untokenize(self, token_weight_pair):
        """Delegate to the ``clip_g`` tokenizer's ``untokenize``."""
        g_tokenizer = self.clip_g
        return g_tokenizer.untokenize(token_weight_pair)

    def state_dict(self):
        """Return a new empty dict."""
        return dict()

class SDXLClipModel(torch.nn.Module):
    """Module holding two text encoders, ``clip_l`` and ``clip_g``."""

    def __init__(self, device="cpu", dtype=None):
        super().__init__()
        placement = {"device": device, "dtype": dtype}
        self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, layer_norm_hidden_state=False, **placement)
        self.clip_g = SDXLClipG(**placement)
        self.dtypes = {dtype}

    def set_clip_options(self, options):
        """Forward ``options`` to ``clip_l`` and then to ``clip_g``."""
        for encoder in (self.clip_l, self.clip_g):
            encoder.set_clip_options(options)

    def reset_clip_options(self):
        """Reset the options of ``clip_g`` and then of ``clip_l``."""
        for encoder in (self.clip_g, self.clip_l):
            encoder.reset_clip_options()

    def encode_token_weights(self, token_weight_pairs):
        """Encode the ``"g"`` and ``"l"`` entries of ``token_weight_pairs``.

        Returns a tuple of (``clip_l`` output concatenated with ``clip_g``
        output along the last dimension, pooled output of ``clip_g``).
        The pooled output of ``clip_l`` is discarded.
        """
        pairs_g = token_weight_pairs["g"]
        pairs_l = token_weight_pairs["l"]
        hidden_g, pooled_g = self.clip_g.encode_token_weights(pairs_g)
        hidden_l, _ = self.clip_l.encode_token_weights(pairs_l)
        joined = torch.cat([hidden_l, hidden_g], dim=-1)
        return joined, pooled_g

    def load_sd(self, sd):
        """Load ``sd`` into ``clip_g`` if it has the layer-30 key, else into ``clip_l``."""
        has_layer_30 = "text_model.encoder.layers.30.mlp.fc1.weight" in sd
        target = self.clip_g if has_layer_30 else self.clip_l
        return target.load_sd(sd)

class SDXLRefinerClipModel(sd1_clip.SD1ClipModel):
    """``sd1_clip.SD1ClipModel`` set up with ``SDXLClipG`` under the clip name ``"g"``."""

    def __init__(self, device="cpu", dtype=None):
        super().__init__(
            clip_model=SDXLClipG,
            clip_name="g",
            device=device,
            dtype=dtype,
        )


class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer):
    """``sd1_clip.SDTokenizer`` preset using the ``clip_g`` embedding key.

    Fixes ``pad_with_end=True`` and ``embedding_size=1280``.
    """

    def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
        # NOTE(review): tokenizer_data is accepted but not passed to the base class.
        preset = dict(
            pad_with_end=True,
            embedding_directory=embedding_directory,
            embedding_size=1280,
            embedding_key='clip_g',
        )
        super().__init__(tokenizer_path, **preset)

class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
    """``sd1_clip.SD1Tokenizer`` using ``StableCascadeClipGTokenizer`` under the clip name ``"g"``."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            tokenizer=StableCascadeClipGTokenizer,
            clip_name="g",
            tokenizer_data=tokenizer_data,
            embedding_directory=embedding_directory,
        )

class StableCascadeClipG(sd1_clip.SDClipModel):
    """``sd1_clip.SDClipModel`` configured from ``clip_config_bigg.json`` with attention masks enabled.

    The JSON config is resolved relative to the directory of this module.
    """

    def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None):
        """Set up the base model with this class's fixed options.

        NOTE(review): ``max_length`` is accepted but is not forwarded to the
        base class here.
        """
        module_dir = os.path.dirname(os.path.realpath(__file__))
        config_path = os.path.join(module_dir, "clip_config_bigg.json")
        token_ids = {"start": 49406, "end": 49407, "pad": 49407}
        super().__init__(
            device=device,
            freeze=freeze,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config=config_path,
            dtype=dtype,
            special_tokens=token_ids,
            layer_norm_hidden_state=False,
            enable_attention_masks=True,
        )

    def load_sd(self, sd):
        """Load the state dict ``sd`` by delegating to the base class."""
        return super().load_sd(sd)

class StableCascadeClipModel(sd1_clip.SD1ClipModel):
    """``sd1_clip.SD1ClipModel`` set up with ``StableCascadeClipG`` under the clip name ``"g"``."""

    def __init__(self, device="cpu", dtype=None):
        super().__init__(
            clip_model=StableCascadeClipG,
            clip_name="g",
            device=device,
            dtype=dtype,
        )
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00			`from comfy import sd1_clip`
			`import torch`
			`import os`

SD1 and SD2 clip and tokenizer code is now more similar to the SDXL one. 2023-10-27 19:54:04 +00:00			`class SDXLClipG(sd1_clip.SDClipModel):`
Cleaner CLIP text encoder implementation. Use a simple CLIP model implementation instead of the one from transformers. This will allow some interesting things that would be too hackish to implement using the transformers implementation. 2023-12-06 20:55:09 +00:00			`def __init__(self, device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None):`
Fix CLIPSetLastLayer not reverting when removed. 2023-07-15 05:10:33 +00:00			`if layer == "penultimate":`
			`layer="hidden"`
			`layer_idx=-2`

Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00			`textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")`
Cleaner CLIP text encoder implementation. Use a simple CLIP model implementation instead of the one from transformers. This will allow some interesting things that would be too hackish to implement using the transformers implementation. 2023-12-06 20:55:09 +00:00			`super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,`
CLIP code refactor and improvements. More generic clip model class that can be used on more types of text encoders. Don't apply weighting algorithm when weight is 1.0 Don't compute an empty token output when it's not needed. 2023-11-06 18:43:50 +00:00			`special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False)`
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00
Add DualClipLoader to load clip models for SDXL. Update LoadClip to load clip models for SDXL refiner. 2023-06-25 05:40:38 +00:00			`def load_sd(self, sd):`
			`return super().load_sd(sd)`

SD1 and SD2 clip and tokenizer code is now more similar to the SDXL one. 2023-10-27 19:54:04 +00:00			`class SDXLClipGTokenizer(sd1_clip.SDTokenizer):`
Make it possible to load tokenizer data from checkpoints. 2024-07-24 20:43:53 +00:00			`def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):`
Support SDXL embedding format with 2 CLIP. 2023-07-10 14:28:38 +00:00			`super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')`
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00

SD1 and SD2 clip and tokenizer code is now more similar to the SDXL one. 2023-10-27 19:54:04 +00:00			`class SDXLTokenizer:`
Make it possible to load tokenizer data from checkpoints. 2024-07-24 20:43:53 +00:00			`def __init__(self, embedding_directory=None, tokenizer_data={}):`
SD1 and SD2 clip and tokenizer code is now more similar to the SDXL one. 2023-10-27 19:54:04 +00:00			`self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory)`
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00			`self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)`

			`def tokenize_with_weights(self, text:str, return_word_ids=False):`
			`out = {}`
			`out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)`
			`out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)`
			`return out`

			`def untokenize(self, token_weight_pair):`
			`return self.clip_g.untokenize(token_weight_pair)`

Let tokenizers return weights to be stored in the saved checkpoint. 2024-07-25 14:52:09 +00:00			`def state_dict(self):`
			`return {}`

Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00			`class SDXLClipModel(torch.nn.Module):`
Initialize text encoder to target dtype. 2023-08-24 01:01:15 +00:00			`def __init__(self, device="cpu", dtype=None):`
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00			`super().__init__()`
Cleaner CLIP text encoder implementation. Use a simple CLIP model implementation instead of the one from transformers. This will allow some interesting things that would be too hackish to implement using the transformers implementation. 2023-12-06 20:55:09 +00:00			`self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False)`
Initialize text encoder to target dtype. 2023-08-24 01:01:15 +00:00			`self.clip_g = SDXLClipG(device=device, dtype=dtype)`
Load the SD3 T5xxl model in the same dtype stored in the checkpoint. 2024-06-11 21:03:26 +00:00			`self.dtypes = set([dtype])`
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00
Always return unprojected pooled output for gligen. 2024-02-25 12:20:31 +00:00			`def set_clip_options(self, options):`
			`self.clip_l.set_clip_options(options)`
			`self.clip_g.set_clip_options(options)`
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00
Always return unprojected pooled output for gligen. 2024-02-25 12:20:31 +00:00			`def reset_clip_options(self):`
			`self.clip_g.reset_clip_options()`
			`self.clip_l.reset_clip_options()`
Fix CLIPSetLastLayer not reverting when removed. 2023-07-15 05:10:33 +00:00
Support base SDXL and SDXL refiner models. Large refactor of the model detection and loading code. 2023-06-22 17:03:50 +00:00			`def encode_token_weights(self, token_weight_pairs):`
			`token_weight_pairs_g = token_weight_pairs["g"]`
			`token_weight_pairs_l = token_weight_pairs["l"]`
			`g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)`
			`l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)`
			`return torch.cat([l_out, g_out], dim=-1), g_pooled`

Add DualClipLoader to load clip models for SDXL. Update LoadClip to load clip models for SDXL refiner. 2023-06-25 05:40:38 +00:00			`def load_sd(self, sd):`
			`if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:`
			`return self.clip_g.load_sd(sd)`
			`else:`
			`return self.clip_l.load_sd(sd)`

SD1 and SD2 clip and tokenizer code is now more similar to the SDXL one. 2023-10-27 19:54:04 +00:00			`class SDXLRefinerClipModel(sd1_clip.SD1ClipModel):`
Initialize text encoder to target dtype. 2023-08-24 01:01:15 +00:00			`def __init__(self, device="cpu", dtype=None):`
SD1 and SD2 clip and tokenizer code is now more similar to the SDXL one. 2023-10-27 19:54:04 +00:00			`super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=SDXLClipG)`
StableCascade CLIP model support. 2024-02-16 18:29:04 +00:00

			`class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer):`
Make it possible to load tokenizer data from checkpoints. 2024-07-24 20:43:53 +00:00			`def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):`
StableCascade CLIP model support. 2024-02-16 18:29:04 +00:00			`super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')`

			`class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):`
Make it possible to load tokenizer data from checkpoints. 2024-07-24 20:43:53 +00:00			`def __init__(self, embedding_directory=None, tokenizer_data={}):`
			`super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="g", tokenizer=StableCascadeClipGTokenizer)`
StableCascade CLIP model support. 2024-02-16 18:29:04 +00:00
			`class StableCascadeClipG(sd1_clip.SDClipModel):`
			`def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None):`
			`textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")`
			`super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,`
			`special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True)`

			`def load_sd(self, sd):`
			`return super().load_sd(sd)`

			`class StableCascadeClipModel(sd1_clip.SD1ClipModel):`
			`def __init__(self, device="cpu", dtype=None):`
			`super().__init__(device=device, dtype=dtype, clip_name="g", clip_model=StableCascadeClipG)`