2023-06-22 17:03:50 +00:00
|
|
|
import torch
|
|
|
|
from . import model_base
|
|
|
|
from . import utils
|
|
|
|
|
|
|
|
from . import sd1_clip
|
|
|
|
from . import sd2_clip
|
|
|
|
from . import sdxl_clip
|
|
|
|
|
|
|
|
from . import supported_models_base
|
2023-06-23 06:14:12 +00:00
|
|
|
from . import latent_formats
|
2023-06-22 17:03:50 +00:00
|
|
|
|
2023-06-26 16:21:07 +00:00
|
|
|
from . import diffusers_convert
|
|
|
|
|
2023-06-22 17:03:50 +00:00
|
|
|
class SD15(supported_models_base.BASE):
    """Supported-model definition for the SD15 family.

    Declares the UNet configuration values, the latent format and the
    text-encoder (CLIP) state-dict key handling for this model class.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "context_dim": 768,
        "model_channels": 320,
        "use_linear_in_transformer": False,
        "adm_in_channels": None,
        "use_temporal_attention": False,
    }

    # Additional UNet settings kept separate from ``unet_config``.
    unet_extra_config = {
        "num_heads": 8,
        "num_head_channels": -1,
    }

    latent_format = latent_formats.SD15

    def process_clip_state_dict(self, state_dict):
        """Rewrite checkpoint text-encoder keys into the ``clip_l`` layout.

        Steps, in order:
          1. Keys under ``cond_stage_model.transformer.`` that are not already
             under ``...transformer.text_model.`` are moved under it.
          2. A float32 ``position_ids`` tensor, if present, is rounded.
          3. The ``cond_stage_model.`` prefix becomes
             ``cond_stage_model.clip_l.`` via ``utils.state_dict_prefix_replace``.

        ``state_dict`` is modified in place by steps 1 and 2; the value
        returned is whatever ``utils.state_dict_prefix_replace`` returns.
        """
        transformer_root = "cond_stage_model.transformer."
        text_model_root = "cond_stage_model.transformer.text_model."

        # Snapshot the keys up front because entries are popped/inserted below.
        for name in list(state_dict.keys()):
            if not name.startswith(transformer_root):
                continue
            if name.startswith(text_model_root):
                continue
            renamed = name.replace(transformer_root, text_model_root)
            state_dict[renamed] = state_dict.pop(name)

        position_key = 'cond_stage_model.transformer.text_model.embeddings.position_ids'
        if position_key in state_dict:
            ids = state_dict[position_key]
            if ids.dtype == torch.float32:
                state_dict[position_key] = ids.round()

        prefix_map = {"cond_stage_model.": "cond_stage_model.clip_l."}
        return utils.state_dict_prefix_replace(state_dict, prefix_map)

    def process_clip_state_dict_for_saving(self, state_dict):
        """Map the ``clip_l.`` prefix back to ``cond_stage_model.`` for saving."""
        return utils.state_dict_prefix_replace(state_dict, {"clip_l.": "cond_stage_model."})

    def clip_target(self):
        """Return a ``ClipTarget`` built from ``SD1Tokenizer`` and ``SD1ClipModel``."""
        tokenizer_cls = sd1_clip.SD1Tokenizer
        encoder_cls = sd1_clip.SD1ClipModel
        return supported_models_base.ClipTarget(tokenizer_cls, encoder_cls)
|
|
|
|
|
|
|
|
class SD20(supported_models_base.BASE):
    """Supported-model definition for the SD20 family.

    Declares the UNet configuration values, the latent format, a heuristic
    that selects the prediction type, and the text-encoder (CLIP) state-dict
    key handling for this model class.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "context_dim": 1024,
        "model_channels": 320,
        "use_linear_in_transformer": True,
        "adm_in_channels": None,
        "use_temporal_attention": False,
    }

    latent_format = latent_formats.SD15

    def model_type(self, state_dict, prefix=""):
        """Choose between V_PREDICTION and EPS for this checkpoint.

        Args:
            state_dict: Mapping of weight names to tensors.
            prefix: String prepended to the probed weight name.

        Returns:
            ``model_base.ModelType.V_PREDICTION`` when the instance's
            ``unet_config["in_channels"]`` is 4 and the population standard
            deviation of the probed norm bias exceeds 0.09; otherwise
            ``model_base.ModelType.EPS``. A missing probe tensor also yields
            EPS instead of raising ``KeyError``.
        """
        if self.unet_config["in_channels"] == 4:  # SD2.0 inpainting models are not v prediction
            probe_key = "{}output_blocks.11.1.transformer_blocks.0.norm1.bias".format(prefix)
            # Use .get so a state dict lacking the probe weight falls through to EPS.
            probe = state_dict.get(probe_key, None)
            # not sure how well this will actually work. I guess we will find out.
            if probe is not None and torch.std(probe, unbiased=False) > 0.09:
                return model_base.ModelType.V_PREDICTION
        return model_base.ModelType.EPS

    def process_clip_state_dict(self, state_dict):
        """Rewrite checkpoint text-encoder keys into the ``clip_h`` layout.

        First renames the ``conditioner.embedders.0.model.`` prefix to
        ``cond_stage_model.model.``, then hands the result to
        ``utils.transformers_convert`` with source prefix
        ``cond_stage_model.model.``, target prefix
        ``cond_stage_model.clip_h.transformer.text_model.`` and the value 24
        (presumably the layer count; confirm against ``utils``).
        """
        replace_prefix = {}
        replace_prefix["conditioner.embedders.0.model."] = "cond_stage_model.model."  # SD2 in sgm format
        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix)

        state_dict = utils.transformers_convert(state_dict, "cond_stage_model.model.", "cond_stage_model.clip_h.transformer.text_model.", 24)
        return state_dict

    def process_clip_state_dict_for_saving(self, state_dict):
        """Convert ``clip_h`` keys back to the checkpoint layout for saving.

        Renames the ``clip_h`` prefix to ``cond_stage_model.model`` and then
        passes the result through
        ``diffusers_convert.convert_text_enc_state_dict_v20``.
        """
        replace_prefix = {"clip_h": "cond_stage_model.model"}
        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix)
        state_dict = diffusers_convert.convert_text_enc_state_dict_v20(state_dict)
        return state_dict

    def clip_target(self):
        """Return a ``ClipTarget`` built from ``SD2Tokenizer`` and ``SD2ClipModel``."""
        return supported_models_base.ClipTarget(sd2_clip.SD2Tokenizer, sd2_clip.SD2ClipModel)
|
|
|
|
|
|
|
|
class SD21UnclipL(SD20):
    """SD20 subclass with ``adm_in_channels`` 1536 and a noise-augmentation config.

    Only class-level configuration is overridden; all methods come from
    ``SD20``.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "context_dim": 1024,
        "model_channels": 320,
        "use_linear_in_transformer": True,
        "adm_in_channels": 1536,
        "use_temporal_attention": False,
    }

    # Key prefix of the CLIP vision weights.
    clip_vision_prefix = "embedder.model.visual."

    # Noise-augmentation settings declared for this class.
    noise_aug_config = {
        "noise_schedule_config": {
            "timesteps": 1000,
            "beta_schedule": "squaredcos_cap_v2",
        },
        "timestep_dim": 768,
    }
|
|
|
|
|
|
|
|
|
|
|
|
class SD21UnclipH(SD20):
    """SD20 subclass with ``adm_in_channels`` 2048 and a noise-augmentation config.

    Only class-level configuration is overridden; all methods come from
    ``SD20``.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "context_dim": 1024,
        "model_channels": 320,
        "use_linear_in_transformer": True,
        "adm_in_channels": 2048,
        "use_temporal_attention": False,
    }

    # Key prefix of the CLIP vision weights.
    clip_vision_prefix = "embedder.model.visual."

    # Noise-augmentation settings declared for this class.
    noise_aug_config = {
        "noise_schedule_config": {
            "timesteps": 1000,
            "beta_schedule": "squaredcos_cap_v2",
        },
        "timestep_dim": 1024,
    }
|
|
|
|
|
|
|
|
class SDXLRefiner(supported_models_base.BASE):
    """Supported-model definition for the SDXL refiner.

    Declares the UNet configuration values, the latent format, the model
    factory and the ``clip_g`` text-encoder state-dict key handling.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "model_channels": 384,
        "use_linear_in_transformer": True,
        "context_dim": 1280,
        "adm_in_channels": 2560,
        "transformer_depth": [0, 0, 4, 4, 4, 4, 0, 0],
        "use_temporal_attention": False,
    }

    latent_format = latent_formats.SDXL

    def get_model(self, state_dict, prefix="", device=None):
        """Build a ``model_base.SDXLRefiner`` for this config.

        ``state_dict`` and ``prefix`` are accepted but not used here.
        """
        return model_base.SDXLRefiner(self, device=device)

    def process_clip_state_dict(self, state_dict):
        """Rewrite checkpoint text-encoder keys into the ``clip_g`` layout.

        Passes the state dict through ``utils.transformers_convert`` (source
        prefix ``conditioner.embedders.0.model.``, target prefix
        ``cond_stage_model.clip_g.transformer.text_model.``, value 32), then
        renames the ``text_projection`` and ``logit_scale`` keys with
        ``utils.state_dict_key_replace``.
        """
        state_dict = utils.transformers_convert(state_dict, "conditioner.embedders.0.model.", "cond_stage_model.clip_g.transformer.text_model.", 32)

        keys_to_replace = {
            "conditioner.embedders.0.model.text_projection": "cond_stage_model.clip_g.text_projection",
            "conditioner.embedders.0.model.logit_scale": "cond_stage_model.clip_g.logit_scale",
        }
        return utils.state_dict_key_replace(state_dict, keys_to_replace)

    def process_clip_state_dict_for_saving(self, state_dict):
        """Convert ``clip_g`` keys back to the checkpoint layout for saving.

        Runs ``diffusers_convert.convert_text_enc_state_dict_v20`` for
        ``clip_g``, drops the ``position_ids`` entry when present, and renames
        the ``clip_g`` prefix to ``conditioner.embedders.0.model``.
        """
        state_dict_g = diffusers_convert.convert_text_enc_state_dict_v20(state_dict, "clip_g")

        position_key = "clip_g.transformer.text_model.embeddings.position_ids"
        if position_key in state_dict_g:
            state_dict_g.pop(position_key)

        replace_prefix = {"clip_g": "conditioner.embedders.0.model"}
        return utils.state_dict_prefix_replace(state_dict_g, replace_prefix)

    def clip_target(self):
        """Return a ``ClipTarget`` built from ``SDXLTokenizer`` and ``SDXLRefinerClipModel``."""
        return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLRefinerClipModel)
|
|
|
|
|
|
|
|
class SDXL(supported_models_base.BASE):
    """Supported-model definition for SDXL.

    Declares the UNet configuration values, the latent format, prediction-type
    selection, the model factory and the ``clip_l``/``clip_g`` text-encoder
    state-dict key handling.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "model_channels": 320,
        "use_linear_in_transformer": True,
        "transformer_depth": [0, 0, 2, 2, 10, 10],
        "context_dim": 2048,
        "adm_in_channels": 2816,
        "use_temporal_attention": False,
    }

    latent_format = latent_formats.SDXL

    def model_type(self, state_dict, prefix=""):
        """Return V_PREDICTION when ``state_dict`` has a ``v_pred`` key, else EPS.

        ``prefix`` is accepted but not used here.
        """
        if "v_pred" in state_dict:
            return model_base.ModelType.V_PREDICTION
        return model_base.ModelType.EPS

    def get_model(self, state_dict, prefix="", device=None):
        """Build a ``model_base.SDXL`` with the prediction type from ``model_type``.

        Calls ``set_inpaint()`` on the new model when ``self.inpaint_model()``
        is truthy.
        """
        out = model_base.SDXL(self, model_type=self.model_type(state_dict, prefix), device=device)
        if self.inpaint_model():
            out.set_inpaint()
        return out

    def process_clip_state_dict(self, state_dict):
        """Rewrite checkpoint text-encoder keys into the ``clip_l``/``clip_g`` layout.

        Order of operations:
          1. ``utils.transformers_convert`` on ``conditioner.embedders.1.model.``
             (target ``cond_stage_model.clip_g.transformer.text_model.``, value 32).
          2. Prefix rename of ``conditioner.embedders.0.transformer.text_model``
             to ``cond_stage_model.clip_l.transformer.text_model``.
          3. Key rename of the embedder-1 ``text_projection`` (with or without
             a ``.weight`` suffix) and ``logit_scale`` entries.
        """
        replace_prefix = {
            "conditioner.embedders.0.transformer.text_model": "cond_stage_model.clip_l.transformer.text_model",
        }
        state_dict = utils.transformers_convert(state_dict, "conditioner.embedders.1.model.", "cond_stage_model.clip_g.transformer.text_model.", 32)

        # Both spellings of the projection key map onto the same target name.
        keys_to_replace = {
            "conditioner.embedders.1.model.text_projection": "cond_stage_model.clip_g.text_projection",
            "conditioner.embedders.1.model.text_projection.weight": "cond_stage_model.clip_g.text_projection",
            "conditioner.embedders.1.model.logit_scale": "cond_stage_model.clip_g.logit_scale",
        }

        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix)
        state_dict = utils.state_dict_key_replace(state_dict, keys_to_replace)
        return state_dict

    def process_clip_state_dict_for_saving(self, state_dict):
        """Convert ``clip_l``/``clip_g`` keys back to the checkpoint layout for saving.

        Runs ``diffusers_convert.convert_text_enc_state_dict_v20`` for
        ``clip_g``, drops its ``position_ids`` entry when present, copies every
        key starting with ``clip_l`` from the input unchanged, and finally
        renames the ``clip_g`` and ``clip_l`` prefixes.
        """
        state_dict_g = diffusers_convert.convert_text_enc_state_dict_v20(state_dict, "clip_g")

        position_key = "clip_g.transformer.text_model.embeddings.position_ids"
        if position_key in state_dict_g:
            state_dict_g.pop(position_key)

        # clip_l entries are carried over as-is alongside the converted clip_g ones.
        for k in state_dict:
            if k.startswith("clip_l"):
                state_dict_g[k] = state_dict[k]

        replace_prefix = {
            "clip_g": "conditioner.embedders.1.model",
            "clip_l": "conditioner.embedders.0",
        }
        return utils.state_dict_prefix_replace(state_dict_g, replace_prefix)

    def clip_target(self):
        """Return a ``ClipTarget`` built from ``SDXLTokenizer`` and ``SDXLClipModel``."""
        return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLClipModel)
|
|
|
|
|
2023-10-27 18:15:45 +00:00
|
|
|
class SSD1B(SDXL):
    """SDXL subclass that overrides only ``unet_config``.

    Compared with the ``SDXL`` config in this file, only
    ``transformer_depth`` differs.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "model_channels": 320,
        "use_linear_in_transformer": True,
        "transformer_depth": [0, 0, 2, 2, 4, 4],
        "context_dim": 2048,
        "adm_in_channels": 2816,
        "use_temporal_attention": False,
    }
|
|
|
|
|
2023-12-13 00:09:53 +00:00
|
|
|
class Segmind_Vega(SDXL):
    """SDXL subclass that overrides only ``unet_config``.

    Compared with the ``SDXL`` config in this file, only
    ``transformer_depth`` differs.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "model_channels": 320,
        "use_linear_in_transformer": True,
        "transformer_depth": [0, 0, 1, 1, 2, 2],
        "context_dim": 2048,
        "adm_in_channels": 2816,
        "use_temporal_attention": False,
    }
|
|
|
|
|
2023-11-24 00:41:33 +00:00
|
|
|
class SVD_img2vid(supported_models_base.BASE):
    """Supported-model definition for SVD_img2vid.

    Declares the UNet configuration values (with temporal attention and
    temporal resblocks enabled), the CLIP vision key prefix, the latent
    format and sampling settings. No text-encoder target is provided.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "model_channels": 320,
        "in_channels": 8,
        "use_linear_in_transformer": True,
        "transformer_depth": [1, 1, 1, 1, 1, 1, 0, 0],
        "context_dim": 1024,
        "adm_in_channels": 768,
        "use_temporal_attention": True,
        "use_temporal_resblock": True,
    }

    # Key prefix of the CLIP vision weights.
    clip_vision_prefix = "conditioner.embedders.0.open_clip.model.visual."

    latent_format = latent_formats.SD15

    # Sigma bounds declared for this model class.
    sampling_settings = {
        "sigma_max": 700.0,
        "sigma_min": 0.002,
    }

    def get_model(self, state_dict, prefix="", device=None):
        """Build a ``model_base.SVD_img2vid`` for this config.

        ``state_dict`` and ``prefix`` are accepted but not used here.
        """
        return model_base.SVD_img2vid(self, device=device)

    def clip_target(self):
        """Return ``None``: this class declares no CLIP tokenizer/model pair."""
        return None
|
2023-06-22 17:03:50 +00:00
|
|
|
|
2023-12-18 08:18:40 +00:00
|
|
|
class Stable_Zero123(supported_models_base.BASE):
    """Supported-model definition for Stable_Zero123.

    Declares the UNet configuration values, the CLIP vision key prefix and the
    latent format. No text-encoder target is provided.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "context_dim": 768,
        "model_channels": 320,
        "use_linear_in_transformer": False,
        "adm_in_channels": None,
        "use_temporal_attention": False,
        "in_channels": 8,
    }

    # Additional UNet settings kept separate from ``unet_config``.
    unet_extra_config = {
        "num_heads": 8,
        "num_head_channels": -1,
    }

    # Key prefix of the CLIP vision weights.
    clip_vision_prefix = "cond_stage_model.model.visual."

    latent_format = latent_formats.SD15

    def get_model(self, state_dict, prefix="", device=None):
        """Build a ``model_base.Stable_Zero123`` for this config.

        The ``cc_projection.weight`` and ``cc_projection.bias`` entries of
        ``state_dict`` are passed to the model constructor; a missing entry
        raises ``KeyError``. ``prefix`` is accepted but not used here.
        """
        projection_weight = state_dict["cc_projection.weight"]
        projection_bias = state_dict["cc_projection.bias"]
        return model_base.Stable_Zero123(
            self,
            device=device,
            cc_projection_weight=projection_weight,
            cc_projection_bias=projection_bias,
        )

    def clip_target(self):
        """Return ``None``: this class declares no CLIP tokenizer/model pair."""
        return None
|
|
|
|
|
2024-01-03 08:30:39 +00:00
|
|
|
class SD_X4Upscaler(SD20):
    """SD20 subclass for SD_X4Upscaler.

    Overrides the UNet configuration values, the latent format, the sampling
    settings and the model factory; text-encoder handling comes from ``SD20``.
    """

    # UNet settings declared for this model class.
    unet_config = {
        "context_dim": 1024,
        "model_channels": 256,
        "in_channels": 7,
        "use_linear_in_transformer": True,
        "adm_in_channels": None,
        "use_temporal_attention": False,
    }

    # Additional UNet settings kept separate from ``unet_config``.
    unet_extra_config = {
        "disable_self_attentions": [True, True, True, False],
        "num_classes": 1000,
        "num_heads": 8,
        "num_head_channels": -1,
    }

    latent_format = latent_formats.SD_X4

    # Linear schedule endpoints declared for this model class.
    sampling_settings = {
        "linear_start": 0.0001,
        "linear_end": 0.02,
    }

    def get_model(self, state_dict, prefix="", device=None):
        """Build a ``model_base.SD_X4Upscaler`` for this config.

        ``state_dict`` and ``prefix`` are accepted but not used here.
        """
        return model_base.SD_X4Upscaler(self, device=device)
|
2023-12-18 08:18:40 +00:00
|
|
|
|
2024-01-03 08:30:39 +00:00
|
|
|
# Module-level list of the supported model classes defined above.
# NOTE(review): consumers presumably iterate this in order, so the sequence is
# kept exactly as it was (SVD_img2vid last) - confirm against the caller.
models = [
    Stable_Zero123,
    SD15,
    SD20,
    SD21UnclipL,
    SD21UnclipH,
    SDXLRefiner,
    SDXL,
    SSD1B,
    Segmind_Vega,
    SD_X4Upscaler,
]
models.append(SVD_img2vid)
|