From 2258f851593fcb4bf34d22dddd3b7cb711db91ec Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 18 Dec 2023 03:18:40 -0500 Subject: [PATCH] Support stable zero 123 model. To use it use the ImageOnlyCheckpointLoader to load the checkpoint and the new Stable_Zero123 node. --- comfy/model_base.py | 30 ++++++++++++++++++ comfy/sample.py | 3 +- comfy/supported_models.py | 29 ++++++++++++++++- comfy_extras/nodes_stable3d.py | 58 ++++++++++++++++++++++++++++++++++ nodes.py | 1 + 5 files changed, 119 insertions(+), 2 deletions(-) create mode 100644 comfy_extras/nodes_stable3d.py diff --git a/comfy/model_base.py b/comfy/model_base.py index a7582b33..c80848b2 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -328,3 +328,33 @@ class SVD_img2vid(BaseModel): out['image_only_indicator'] = comfy.conds.CONDConstant(torch.zeros((1,), device=device)) out['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0]) return out + +class Stable_Zero123(BaseModel): + def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None): + super().__init__(model_config, model_type, device=device) + self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device) + self.cc_projection.weight.copy_(cc_projection_weight) + self.cc_projection.bias.copy_(cc_projection_bias) + + def extra_conds(self, **kwargs): + out = {} + + latent_image = kwargs.get("concat_latent_image", None) + noise = kwargs.get("noise", None) + + if latent_image is None: + latent_image = torch.zeros_like(noise) + + if latent_image.shape[1:] != noise.shape[1:]: + latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center") + + latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0]) + + out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image) + + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + if cross_attn.shape[-1] != 768: + cross_attn = self.cc_projection(cross_attn) + out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn) + return out diff --git a/comfy/sample.py b/comfy/sample.py index eadd6dcc..4b0d15c4 100644 --- a/comfy/sample.py +++ b/comfy/sample.py @@ -47,7 +47,8 @@ def convert_cond(cond): temp = c[1].copy() model_conds = temp.get("model_conds", {}) if c[0] is not None: - model_conds["c_crossattn"] = comfy.conds.CONDCrossAttn(c[0]) + model_conds["c_crossattn"] = comfy.conds.CONDCrossAttn(c[0]) #TODO: remove + temp["cross_attn"] = c[0] temp["model_conds"] = model_conds out.append(temp) return out diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 2f2dee87..251bf6ac 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -252,5 +252,32 @@ class SVD_img2vid(supported_models_base.BASE): def clip_target(self): return None -models = [SD15, SD20, SD21UnclipL, SD21UnclipH, SDXLRefiner, SDXL, SSD1B, Segmind_Vega] +class Stable_Zero123(supported_models_base.BASE): + unet_config = { + "context_dim": 768, + "model_channels": 320, + "use_linear_in_transformer": False, + "adm_in_channels": None, + "use_temporal_attention": False, + "in_channels": 8, + } + + unet_extra_config = { + "num_heads": 8, + "num_head_channels": -1, + } + + clip_vision_prefix = "cond_stage_model.model.visual." + + latent_format = latent_formats.SD15 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Stable_Zero123(self, device=device, cc_projection_weight=state_dict["cc_projection.weight"], cc_projection_bias=state_dict["cc_projection.bias"]) + return out + + def clip_target(self): + return None + + +models = [Stable_Zero123, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXLRefiner, SDXL, SSD1B, Segmind_Vega] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_stable3d.py b/comfy_extras/nodes_stable3d.py new file mode 100644 index 00000000..fa64d924 --- /dev/null +++ b/comfy_extras/nodes_stable3d.py @@ -0,0 +1,58 @@ +import torch +import nodes +import comfy.utils + +def camera_embeddings(elevation, azimuth): + elevation = torch.as_tensor([elevation]) + azimuth = torch.as_tensor([azimuth]) + embeddings = torch.stack( + [ + torch.deg2rad( + (90 - elevation) - (90) + ), # Zero123 polar is 90-elevation + torch.sin(torch.deg2rad(azimuth)), + torch.cos(torch.deg2rad(azimuth)), + torch.deg2rad( + 90 - torch.full_like(elevation, 0) + ), + ], dim=-1).unsqueeze(1) + + return embeddings + + +class Zero123_Conditioning: + @classmethod + def INPUT_TYPES(s): + return {"required": { "clip_vision": ("CLIP_VISION",), + "init_image": ("IMAGE",), + "vae": ("VAE",), + "width": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), + "height": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + "elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0}), + "azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0}), + }} + RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") + RETURN_NAMES = ("positive", "negative", "latent") + + FUNCTION = "encode" + + CATEGORY = "conditioning/3d_models" + + def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth): + output = clip_vision.encode_image(init_image) + pooled = output.image_embeds.unsqueeze(0) + pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) + encode_pixels = pixels[:,:,:,:3] + t = vae.encode(encode_pixels) + cam_embeds = camera_embeddings(elevation, azimuth) + cond = torch.cat([pooled, cam_embeds.repeat((pooled.shape[0], 1, 1))], dim=-1) + + positive = [[cond, {"concat_latent_image": t}]] + negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]] + latent = torch.zeros([batch_size, 4, height // 8, width // 8]) + return (positive, negative, {"samples":latent}) + +NODE_CLASS_MAPPINGS = { + "Zero123_Conditioning": Zero123_Conditioning, +} diff --git a/nodes.py b/nodes.py index 3031b10a..7ed7a8e4 100644 --- a/nodes.py +++ b/nodes.py @@ -1869,6 +1869,7 @@ def init_custom_nodes(): "nodes_video_model.py", "nodes_sag.py", "nodes_perpneg.py", + "nodes_stable3d.py", ] for node_file in extras_files: