From 855789403b346c7d7772e2c7ed0a6a327f56047f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=96=B5=E5=93=A9=E4=B8=AA=E5=92=AA?= Date: Thu, 18 Jul 2024 01:12:50 +0800 Subject: [PATCH] support clip-vit-large-patch14-336 (#4042) * support clip-vit-large-patch14-336 * support clip-vit-large-patch14-336 --- comfy/clip_vision.py | 5 ++++- comfy/clip_vision_config_vitl_336.json | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 comfy/clip_vision_config_vitl_336.json diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index ac39c622..20dc3345 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -94,7 +94,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False): elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json") elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd: - json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") + if sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577: + json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json") + else: + json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") else: return None diff --git a/comfy/clip_vision_config_vitl_336.json b/comfy/clip_vision_config_vitl_336.json new file mode 100644 index 00000000..f2694527 --- /dev/null +++ b/comfy/clip_vision_config_vitl_336.json @@ -0,0 +1,18 @@ +{ + "attention_dropout": 0.0, + "dropout": 0.0, + "hidden_act": "quick_gelu", + "hidden_size": 1024, + "image_size": 336, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-5, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "torch_dtype": "float32" +}