diff --git a/comfy/clip_model.py b/comfy/clip_model.py
index bdf1e2b1..42cdc4f6 100644
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@@ -88,10 +88,11 @@ class CLIPTextModel_(torch.nn.Module):
         heads = config_dict["num_attention_heads"]
         intermediate_size = config_dict["intermediate_size"]
         intermediate_activation = config_dict["hidden_act"]
+        num_positions = config_dict["max_position_embeddings"]
         self.eos_token_id = config_dict["eos_token_id"]
 
         super().__init__()
-        self.embeddings = CLIPEmbeddings(embed_dim, dtype=dtype, device=device, operations=operations)
+        self.embeddings = CLIPEmbeddings(embed_dim, num_positions=num_positions, dtype=dtype, device=device, operations=operations)
         self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
         self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
 
diff --git a/comfy/sd.py b/comfy/sd.py
index 86ebba2b..1e1a6594 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -24,6 +24,7 @@ import comfy.text_encoders.sa_t5
 import comfy.text_encoders.aura_t5
 import comfy.text_encoders.hydit
 import comfy.text_encoders.flux
+import comfy.text_encoders.long_clipl
 
 import comfy.model_patcher
 import comfy.lora
@@ -443,8 +444,13 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
             clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
         else:
-            clip_target.clip = sd1_clip.SD1ClipModel
-            clip_target.tokenizer = sd1_clip.SD1Tokenizer
+            w = clip_data[0].get("text_model.embeddings.position_embedding.weight", None)
+            if w is not None and w.shape[0] == 248:
+                clip_target.clip = comfy.text_encoders.long_clipl.LongClipModel
+                clip_target.tokenizer = comfy.text_encoders.long_clipl.LongClipTokenizer
+            else:
+                clip_target.clip = sd1_clip.SD1ClipModel
+                clip_target.tokenizer = sd1_clip.SD1Tokenizer
     elif len(clip_data) == 2:
         if clip_type == CLIPType.SD3:
             clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=True, t5=False)
diff --git a/comfy/text_encoders/long_clipl.json b/comfy/text_encoders/long_clipl.json
new file mode 100644
index 00000000..5e2056ff
--- /dev/null
+++ b/comfy/text_encoders/long_clipl.json
@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "openai/clip-vit-large-patch14",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 49407,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 768,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 248,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "projection_dim": 768,
+  "torch_dtype": "float32",
+  "transformers_version": "4.24.0",
+  "vocab_size": 49408
+}
diff --git a/comfy/text_encoders/long_clipl.py b/comfy/text_encoders/long_clipl.py
new file mode 100644
index 00000000..4677fb3b
--- /dev/null
+++ b/comfy/text_encoders/long_clipl.py
@@ -0,0 +1,19 @@
+from comfy import sd1_clip
+import os
+
+class LongClipTokenizer_(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(max_length=248, embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+
+class LongClipModel_(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "long_clipl.json")
+        super().__init__(device=device, textmodel_json_config=textmodel_json_config, return_projected_pooled=False, dtype=dtype, model_options=model_options)
+
+class LongClipTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, tokenizer=LongClipTokenizer_)
+
+class LongClipModel(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
+        super().__init__(device=device, dtype=dtype, model_options=model_options, clip_model=LongClipModel_, **kwargs)
diff --git a/comfy/text_encoders/sd3_clip.py b/comfy/text_encoders/sd3_clip.py
index 83e8fa1f..e3832ac2 100644
--- a/comfy/text_encoders/sd3_clip.py
+++ b/comfy/text_encoders/sd3_clip.py
@@ -15,7 +15,7 @@ class T5XXLModel(sd1_clip.SDClipModel):
 class T5XXLTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
-        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
+        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
 
 
 class SD3Tokenizer:
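
Note (illustration, not part of the patch): load_text_encoder_state_dicts() above
distinguishes a LongCLIP-L checkpoint from a standard CLIP-L one purely by the
length of its position-embedding table. A minimal sketch of the same check,
assuming safetensors is installed and "long_clipl.safetensors" stands in for a
real checkpoint path:

    import safetensors.torch

    # Load the raw text-encoder state dict (placeholder filename).
    sd = safetensors.torch.load_file("long_clipl.safetensors")

    # Same key and shape test as in load_text_encoder_state_dicts():
    # LongCLIP-L carries 248 position embeddings, standard CLIP-L carries 77.
    w = sd.get("text_model.embeddings.position_embedding.weight", None)
    if w is not None and w.shape[0] == 248:
        print("LongCLIP-L checkpoint detected")
    else:
        print("standard CLIP-L (or key missing)")

Correspondingly, LongClipTokenizer pads/truncates prompts to 248 tokens rather
than 77; inside a ComfyUI checkout (with comfy importable) this can be confirmed
with:

    from comfy.text_encoders import long_clipl

    tok = long_clipl.LongClipTokenizer()
    print(tok.clip_l.max_length)  # expected: 248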