diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
index e9b0ec53..e2bc3209 100644
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -52,8 +52,9 @@ def convert_to_transformers(sd, prefix):
     sd = transformers_convert(sd, prefix, "vision_model.", 32)
     return sd

-def load_clipvision_from_sd(sd, prefix):
-    sd = convert_to_transformers(sd, prefix)
+def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
+    if convert_keys:
+        sd = convert_to_transformers(sd, prefix)
     if "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
     else:
diff --git a/comfy/sd.py b/comfy/sd.py
index 64c95531..f5ed23b0 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1049,7 +1049,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o

     if model_config.clip_vision_prefix is not None:
         if output_clipvision:
-            clipvision = clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix)
+            clipvision = clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix, True)

     model = model_config.get_model(sd)
     model.load_model_weights(sd, "model.diffusion_model.")