2023-01-03 06:53:32 +00:00
import torch
2024-02-16 18:29:04 +00:00
from enum import Enum
2024-03-10 15:37:08 +00:00
import logging
2023-01-03 06:53:32 +00:00
2023-04-15 22:55:17 +00:00
from comfy import model_management
2023-10-17 18:51:51 +00:00
from . ldm . models . autoencoder import AutoencoderKL , AutoencodingEngine
2024-02-16 11:30:39 +00:00
from . ldm . cascade . stage_a import StageA
2024-02-19 09:06:49 +00:00
from . ldm . cascade . stage_c_coder import StageC_coder
2024-06-15 16:14:56 +00:00
from . ldm . audio . autoencoder import AudioOobleckVAE
2023-03-13 18:49:18 +00:00
import yaml
2023-02-16 15:38:08 +00:00
2023-08-25 21:25:39 +00:00
import comfy . utils
2023-04-02 03:19:15 +00:00
from . import clip_vision
2023-04-19 13:36:19 +00:00
from . import gligen
2023-05-28 06:02:09 +00:00
from . import diffusers_convert
2023-06-22 17:03:50 +00:00
from . import model_detection
2023-02-03 07:06:34 +00:00
2023-06-22 17:03:50 +00:00
from . import sd1_clip
from . import sd2_clip
2023-06-25 05:40:38 +00:00
from . import sdxl_clip
2024-06-10 17:26:25 +00:00
from . import sd3_clip
2024-06-15 16:14:56 +00:00
from . import sa_t5
2024-07-11 20:51:06 +00:00
import comfy . text_encoders . aura_t5
2023-06-09 16:24:24 +00:00
2023-08-28 18:49:18 +00:00
import comfy . model_patcher
2023-08-25 21:11:51 +00:00
import comfy . lora
2023-08-25 21:25:39 +00:00
import comfy . t2i_adapter . adapter
2023-08-30 03:58:32 +00:00
import comfy . supported_models_base
2023-11-21 17:54:19 +00:00
import comfy . taesd . taesd
2023-08-25 21:11:51 +00:00
2023-06-30 03:40:02 +00:00
def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
    """Apply a LoRA to clones of a model patcher and/or a CLIP wrapper.

    Either ``model`` or ``clip`` may be ``None``; the matching entry of the
    returned tuple is then ``None`` too. Patches are added to clones, so the
    objects passed in are not patched by this function.

    Args:
        model: object exposing ``.model``, ``.clone()`` and ``.add_patches()``,
            or ``None``.
        clip: object exposing ``.cond_stage_model``, ``.clone()`` and
            ``.add_patches()``, or ``None``.
        lora: LoRA data handed to ``comfy.lora.load_lora``.
        strength_model: patch strength used for the model clone.
        strength_clip: patch strength used for the CLIP clone.

    Returns:
        ``(new_modelpatcher, new_clip)``.
    """
    # Build the key map from whichever targets are present.
    key_map = {}
    if model is not None:
        key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
    if clip is not None:
        key_map = comfy.lora.model_lora_keys_clip(clip.cond_stage_model, key_map)

    loaded = comfy.lora.load_lora(lora, key_map)

    if model is not None:
        new_modelpatcher = model.clone()
        k = new_modelpatcher.add_patches(loaded, strength_model)
    else:
        k = ()
        new_modelpatcher = None

    if clip is not None:
        new_clip = clip.clone()
        k1 = new_clip.add_patches(loaded, strength_clip)
    else:
        k1 = ()
        new_clip = None

    # Report every loaded entry that neither add_patches() call returned.
    k = set(k)
    k1 = set(k1)
    for x in loaded:
        if (x not in k) and (x not in k1):
            logging.warning("NOT LOADED {}".format(x))

    return (new_modelpatcher, new_clip)
2023-01-03 06:53:32 +00:00
class CLIP:
    """Bundles a text-encoder model, its tokenizer and a ModelPatcher."""

    def __init__(self, target=None, embedding_directory=None, no_init=False):
        """Build the text encoder and tokenizer described by ``target``.

        Args:
            target: object with ``params`` (dict), ``clip`` (model factory)
                and ``tokenizer`` (tokenizer factory) attributes.
            embedding_directory: forwarded to the tokenizer factory.
            no_init: when true, return immediately and set no attributes
                (used by ``clone``).
        """
        if no_init:
            return
        params = target.params.copy()
        clip = target.clip
        tokenizer = target.tokenizer

        load_device = model_management.text_encoder_device()
        offload_device = model_management.text_encoder_offload_device()
        # The model is constructed on the offload device.
        params['device'] = offload_device
        dtype = model_management.text_encoder_dtype(load_device)
        params['dtype'] = dtype

        self.cond_stage_model = clip(**(params))

        # If any model dtype cannot be cast on the load device, use the
        # offload device as the load device instead.
        for dt in self.cond_stage_model.dtypes:
            if not model_management.supports_cast(load_device, dt):
                load_device = offload_device

        self.tokenizer = tokenizer(embedding_directory=embedding_directory)
        self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
        self.layer_idx = None
        logging.debug("CLIP model load device: {}, offload device: {}".format(load_device, offload_device))

    def clone(self):
        """Return a new CLIP with a cloned patcher; model and tokenizer are shared."""
        n = CLIP(no_init=True)
        n.patcher = self.patcher.clone()
        n.cond_stage_model = self.cond_stage_model
        n.tokenizer = self.tokenizer
        n.layer_idx = self.layer_idx
        return n

    def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
        """Forward to ``self.patcher.add_patches`` and return its result."""
        return self.patcher.add_patches(patches, strength_patch, strength_model)

    def clip_layer(self, layer_idx):
        """Store the layer index passed as the ``"layer"`` option when encoding."""
        self.layer_idx = layer_idx

    def tokenize(self, text, return_word_ids=False):
        """Tokenize ``text`` via ``self.tokenizer.tokenize_with_weights``."""
        return self.tokenizer.tokenize_with_weights(text, return_word_ids)

    def encode_from_tokens(self, tokens, return_pooled=False, return_dict=False):
        """Encode tokens with the text encoder.

        Args:
            tokens: value passed to ``cond_stage_model.encode_token_weights``.
            return_pooled: if truthy, return ``(cond, pooled)``. The special
                value ``"unprojected"`` additionally sets the
                ``projected_pooled`` option to ``False``.
            return_dict: if true, return a dict with ``"cond"``,
                ``"pooled_output"`` and any extra entries from the third
                element of the encoder output. Takes precedence over
                ``return_pooled``.

        Returns:
            ``cond``, ``(cond, pooled)`` or a dict, as selected above.
        """
        self.cond_stage_model.reset_clip_options()

        if self.layer_idx is not None:
            self.cond_stage_model.set_clip_options({"layer": self.layer_idx})

        if return_pooled == "unprojected":
            self.cond_stage_model.set_clip_options({"projected_pooled": False})

        self.load_model()
        o = self.cond_stage_model.encode_token_weights(tokens)
        cond, pooled = o[:2]
        if return_dict:
            out = {"cond": cond, "pooled_output": pooled}
            if len(o) > 2:
                for k in o[2]:
                    out[k] = o[2][k]
            return out

        if return_pooled:
            return cond, pooled
        return cond

    def encode(self, text):
        """Tokenize ``text`` and return the result of ``encode_from_tokens``."""
        tokens = self.tokenize(text)
        return self.encode_from_tokens(tokens)

    def load_sd(self, sd, full_model=False):
        """Load a state dict into the text encoder.

        With ``full_model`` the dict goes through ``load_state_dict`` with
        ``strict=False``; otherwise through the model's own ``load_sd``.
        """
        if full_model:
            return self.cond_stage_model.load_state_dict(sd, strict=False)
        else:
            return self.cond_stage_model.load_sd(sd)

    def get_sd(self):
        """Return the text encoder's ``state_dict()``."""
        return self.cond_stage_model.state_dict()

    def load_model(self):
        """Load the patcher through ``model_management.load_model_gpu`` and return it."""
        model_management.load_model_gpu(self.patcher)
        return self.patcher

    def get_key_patches(self):
        """Forward to ``self.patcher.get_key_patches``."""
        return self.patcher.get_key_patches()
2023-01-03 06:53:32 +00:00
class VAE:
    """Wraps an autoencoder picked from the keys of a state dict, with batched and tiled encode/decode."""

    def __init__(self, sd=None, device=None, config=None, dtype=None):
        """Select and build the autoencoder for ``sd`` and load its weights.

        Args:
            sd: state dict. NOTE(review): the default is ``None`` but
                ``sd.keys()`` is called unconditionally, so a dict is
                effectively required.
            device: load device; defaults to ``model_management.vae_device()``.
            config: optional dict; when given, ``config['params']`` is passed
                to ``AutoencoderKL`` and no key-based detection is done.
            dtype: weight dtype; defaults to
                ``model_management.vae_dtype(device, working_dtypes)``.

        If no known key is found (and ``config`` is ``None``), a warning is
        logged, ``first_stage_model`` is set to ``None`` and construction
        stops early: ``device``, ``vae_dtype``, ``output_device`` and
        ``patcher`` are then not set.
        """
        if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys():  # diffusers format
            sd = diffusers_convert.convert_vae_state_dict(sd)

        # Defaults; several branches below override them.
        # These are for AutoencoderKL and need tweaking (should be lower)
        self.memory_used_encode = lambda shape, dtype: (1767 * shape[2] * shape[3]) * model_management.dtype_size(dtype)
        self.memory_used_decode = lambda shape, dtype: (2178 * shape[2] * shape[3] * 64) * model_management.dtype_size(dtype)
        self.downscale_ratio = 8
        self.upscale_ratio = 8
        self.latent_channels = 4
        self.output_channels = 3
        self.process_input = lambda image: image * 2.0 - 1.0
        self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
        self.working_dtypes = [torch.bfloat16, torch.float32]

        if config is None:
            if "decoder.mid.block_1.mix_factor" in sd:
                encoder_config = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                decoder_config = encoder_config.copy()
                decoder_config["video_kernel_size"] = [3, 1, 1]
                decoder_config["alpha"] = 0.0
                self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
                                                            encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': encoder_config},
                                                            decoder_config={'target': "comfy.ldm.modules.temporal_ae.VideoDecoder", 'params': decoder_config})
            elif "taesd_decoder.1.weight" in sd:
                self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
                self.first_stage_model = comfy.taesd.taesd.TAESD(latent_channels=self.latent_channels)
            elif "vquantizer.codebook.weight" in sd:  # VQGan: stage a of stable cascade
                self.first_stage_model = StageA()
                self.downscale_ratio = 4
                self.upscale_ratio = 4
                # TODO
                # self.memory_used_encode
                # self.memory_used_decode
                self.process_input = lambda image: image
                self.process_output = lambda image: image
            elif "backbone.1.0.block.0.1.num_batches_tracked" in sd:  # effnet: encoder for stage c latent of stable cascade
                self.first_stage_model = StageC_coder()
                self.downscale_ratio = 32
                self.latent_channels = 16
                # Re-key the weights under the "encoder." prefix.
                new_sd = {}
                for k in sd:
                    new_sd["encoder.{}".format(k)] = sd[k]
                sd = new_sd
            elif "blocks.11.num_batches_tracked" in sd:  # previewer: decoder for stage c latent of stable cascade
                self.first_stage_model = StageC_coder()
                self.latent_channels = 16
                # Re-key the weights under the "previewer." prefix.
                new_sd = {}
                for k in sd:
                    new_sd["previewer.{}".format(k)] = sd[k]
                sd = new_sd
            elif "encoder.backbone.1.0.block.0.1.num_batches_tracked" in sd:  # combined effnet and previewer for stable cascade
                self.first_stage_model = StageC_coder()
                self.downscale_ratio = 32
                self.latent_channels = 16
            elif "decoder.conv_in.weight" in sd:
                # default SD1.x/SD2.x VAE parameters
                ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}

                if 'encoder.down.2.downsample.conv.weight' not in sd and 'decoder.up.3.upsample.conv.weight' not in sd:  # Stable diffusion x4 upscaler VAE
                    ddconfig['ch_mult'] = [1, 2, 4]
                    self.downscale_ratio = 4
                    self.upscale_ratio = 4

                # The latent channel count is read from the checkpoint itself.
                self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
                if 'quant_conv.weight' in sd:
                    self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=4)
                else:
                    self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
                                                                encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig},
                                                                decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig})
            elif "decoder.layers.1.layers.0.beta" in sd:
                self.first_stage_model = AudioOobleckVAE()
                self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype)
                self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * 2048) * model_management.dtype_size(dtype)
                self.latent_channels = 64
                self.output_channels = 2
                self.upscale_ratio = 2048
                self.downscale_ratio = 2048
                self.process_output = lambda audio: audio
                self.process_input = lambda audio: audio
                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
            else:
                logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
                self.first_stage_model = None
                return
        else:
            self.first_stage_model = AutoencoderKL(**(config['params']))
        self.first_stage_model = self.first_stage_model.eval()

        # Non-strict load: report missing keys loudly, leftover keys at debug level.
        m, u = self.first_stage_model.load_state_dict(sd, strict=False)
        if len(m) > 0:
            logging.warning("Missing VAE keys {}".format(m))

        if len(u) > 0:
            logging.debug("Leftover VAE keys {}".format(u))

        if device is None:
            device = model_management.vae_device()
        self.device = device
        offload_device = model_management.vae_offload_device()
        if dtype is None:
            dtype = model_management.vae_dtype(self.device, self.working_dtypes)
        self.vae_dtype = dtype
        self.first_stage_model.to(self.vae_dtype)
        self.output_device = model_management.intermediate_device()

        self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
        logging.debug("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype))

    def vae_encode_crop_pixels(self, pixels):
        """Center-crop every dimension except the first and last to a multiple of ``downscale_ratio``.

        Returns ``pixels`` itself when no dimension needs cropping.
        """
        dims = pixels.shape[1:-1]
        for d in range(len(dims)):
            # Largest multiple of the ratio that fits, and the offset that
            # splits the remainder between both ends.
            x = (dims[d] // self.downscale_ratio) * self.downscale_ratio
            x_offset = (dims[d] % self.downscale_ratio) // 2
            if x != dims[d]:
                pixels = pixels.narrow(d + 1, x_offset, x)
        return pixels

    def decode_tiled_(self, samples, tile_x=64, tile_y=64, overlap=16):
        """Tiled decode: average three passes with different tile shapes, then apply ``process_output``."""
        # The progress bar covers all three tiling passes.
        steps = samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x, tile_y, overlap)
        steps += samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x // 2, tile_y * 2, overlap)
        steps += samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x * 2, tile_y // 2, overlap)
        pbar = comfy.utils.ProgressBar(steps)

        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        output = self.process_output(
            (comfy.utils.tiled_scale(samples, decode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount=self.upscale_ratio, output_device=self.output_device, pbar=pbar) +
             comfy.utils.tiled_scale(samples, decode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount=self.upscale_ratio, output_device=self.output_device, pbar=pbar) +
             comfy.utils.tiled_scale(samples, decode_fn, tile_x, tile_y, overlap, upscale_amount=self.upscale_ratio, output_device=self.output_device, pbar=pbar))
            / 3.0)
        return output

    def decode_tiled_1d(self, samples, tile_x=128, overlap=32):
        """Tiled decode along a single dimension via ``tiled_scale_multidim``."""
        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        return comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device)

    def encode_tiled_(self, pixel_samples, tile_x=512, tile_y=512, overlap=64):
        """Tiled encode: average three passes with different tile shapes."""
        # The progress bar covers all three tiling passes.
        steps = pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x, tile_y, overlap)
        steps += pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x // 2, tile_y * 2, overlap)
        steps += pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x * 2, tile_y // 2, overlap)
        pbar = comfy.utils.ProgressBar(steps)

        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
        samples = comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x, tile_y, overlap, upscale_amount=(1 / self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
        samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount=(1 / self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
        samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount=(1 / self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
        samples /= 3.0
        return samples

    def encode_tiled_1d(self, samples, tile_x=128 * 2048, overlap=32 * 2048):
        """Tiled encode along a single dimension via ``tiled_scale_multidim``."""
        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=(1 / self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device)

    def decode(self, samples_in):
        """Decode latents in batches sized from free memory; fall back to tiled decoding on OOM.

        The result is moved to ``output_device`` and dimension 1 is moved to
        the last position before returning.
        """
        try:
            memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
            free_memory = model_management.get_free_memory(self.device)
            # Process at least one sample per step even if the estimate exceeds free memory.
            batch_number = int(free_memory / memory_used)
            batch_number = max(1, batch_number)

            pixel_samples = torch.empty((samples_in.shape[0], self.output_channels) + tuple(map(lambda a: a * self.upscale_ratio, samples_in.shape[2:])), device=self.output_device)
            for x in range(0, samples_in.shape[0], batch_number):
                samples = samples_in[x:x + batch_number].to(self.vae_dtype).to(self.device)
                pixel_samples[x:x + batch_number] = self.process_output(self.first_stage_model.decode(samples).to(self.output_device).float())
        except model_management.OOM_EXCEPTION:
            logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
            # 3-dimensional input takes the 1d tiling path.
            if len(samples_in.shape) == 3:
                pixel_samples = self.decode_tiled_1d(samples_in)
            else:
                pixel_samples = self.decode_tiled_(samples_in)

        pixel_samples = pixel_samples.to(self.output_device).movedim(1, -1)
        return pixel_samples

    def decode_tiled(self, samples, tile_x=64, tile_y=64, overlap=16):
        """Load the model, run ``decode_tiled_`` and move dimension 1 to the last position."""
        model_management.load_model_gpu(self.patcher)
        output = self.decode_tiled_(samples, tile_x, tile_y, overlap)
        return output.movedim(1, -1)

    def encode(self, pixel_samples):
        """Crop, then encode in batches sized from free memory; fall back to tiled encoding on OOM.

        The last dimension of the input is moved to position 1 before encoding.
        """
        pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
        pixel_samples = pixel_samples.movedim(-1, 1)
        try:
            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
            free_memory = model_management.get_free_memory(self.device)
            # Process at least one sample per step even if the estimate exceeds free memory.
            batch_number = int(free_memory / memory_used)
            batch_number = max(1, batch_number)
            samples = torch.empty((pixel_samples.shape[0], self.latent_channels) + tuple(map(lambda a: a // self.downscale_ratio, pixel_samples.shape[2:])), device=self.output_device)
            for x in range(0, pixel_samples.shape[0], batch_number):
                pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype).to(self.device)
                samples[x:x + batch_number] = self.first_stage_model.encode(pixels_in).to(self.output_device).float()

        except model_management.OOM_EXCEPTION:
            logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
            # 3-dimensional input takes the 1d tiling path.
            if len(pixel_samples.shape) == 3:
                samples = self.encode_tiled_1d(pixel_samples)
            else:
                samples = self.encode_tiled_(pixel_samples)

        return samples

    def encode_tiled(self, pixel_samples, tile_x=512, tile_y=512, overlap=64):
        """Crop, load the model, move the last dimension to position 1 and run ``encode_tiled_``."""
        pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
        model_management.load_model_gpu(self.patcher)
        pixel_samples = pixel_samples.movedim(-1, 1)
        samples = self.encode_tiled_(pixel_samples, tile_x=tile_x, tile_y=tile_y, overlap=overlap)
        return samples

    def get_sd(self):
        """Return the autoencoder's ``state_dict()``."""
        return self.first_stage_model.state_dict()
2023-03-05 23:39:25 +00:00
class StyleModel:
    """Thin wrapper around a callable style model."""

    def __init__(self, model, device="cpu"):
        """Store ``model``. ``device`` is accepted but not used here."""
        self.model = model

    def get_cond(self, input):
        """Call the model on ``input.last_hidden_state`` and return its result."""
        return self.model(input.last_hidden_state)
def load_style_model(ckpt_path):
    """Load a style model checkpoint and wrap it in ``StyleModel``.

    Args:
        ckpt_path: path given to ``comfy.utils.load_torch_file`` (safe load).

    Raises:
        Exception: if the checkpoint has no ``"style_embedding"`` key.
    """
    model_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
    keys = model_data.keys()
    if "style_embedding" in keys:
        # `n_layes` is spelled as the original call spells it; it is the
        # callee's keyword, so it is kept as-is.
        model = comfy.t2i_adapter.adapter.StyleAdapter(width=1024, context_dim=768, num_head=8, n_layes=3, num_token=8)
    else:
        raise Exception("invalid style model {}".format(ckpt_path))
    model.load_state_dict(model_data)
    return StyleModel(model)
2024-02-16 18:29:04 +00:00
class CLIPType(Enum):
    """Selects which text-encoder family ``load_clip`` should build."""

    STABLE_DIFFUSION = 1
    STABLE_CASCADE = 2
    SD3 = 3
    STABLE_AUDIO = 4
2023-03-05 23:39:25 +00:00
2024-02-16 18:29:04 +00:00
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION):
    """Load one to three text-encoder checkpoints into a single ``CLIP``.

    The model and tokenizer classes are chosen from the number of
    checkpoints, marker keys in the state dict, and ``clip_type``.

    Args:
        ckpt_paths: iterable of checkpoint paths.
        embedding_directory: forwarded to the ``CLIP`` constructor.
        clip_type: a ``CLIPType``; consulted for the single-checkpoint
            30-layer case and for the two-checkpoint case.

    Returns:
        The constructed ``CLIP`` with every state dict loaded into it.

    NOTE(review): when no branch matches (zero or more than three
    checkpoints, or a T5 weight whose last dimension is neither 4096 nor
    2048), ``clip_target.clip`` is never assigned and ``CLIP(...)`` fails
    with ``AttributeError``.
    """
    clip_data = []
    for p in ckpt_paths:
        clip_data.append(comfy.utils.load_torch_file(p, safe_load=True))

    class EmptyClass:
        pass

    # Normalize each state dict before detection.
    for i in range(len(clip_data)):
        if "transformer.resblocks.0.ln_1.weight" in clip_data[i]:
            clip_data[i] = comfy.utils.clip_text_transformers_convert(clip_data[i], "", "")
        else:
            if "text_projection" in clip_data[i]:
                clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1)  # old models saved with the CLIPSave node

    clip_target = EmptyClass()
    clip_target.params = {}
    if len(clip_data) == 1:
        if "text_model.encoder.layers.30.mlp.fc1.weight" in clip_data[0]:
            if clip_type == CLIPType.STABLE_CASCADE:
                clip_target.clip = sdxl_clip.StableCascadeClipModel
                clip_target.tokenizer = sdxl_clip.StableCascadeTokenizer
            else:
                clip_target.clip = sdxl_clip.SDXLRefinerClipModel
                clip_target.tokenizer = sdxl_clip.SDXLTokenizer
        elif "text_model.encoder.layers.22.mlp.fc1.weight" in clip_data[0]:
            clip_target.clip = sd2_clip.SD2ClipModel
            clip_target.tokenizer = sd2_clip.SD2Tokenizer
        elif "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in clip_data[0]:
            # The last dimension of this weight picks between the two T5 variants.
            weight = clip_data[0]["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"]
            dtype_t5 = weight.dtype
            if weight.shape[-1] == 4096:
                clip_target.clip = sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, dtype_t5=dtype_t5)
                clip_target.tokenizer = sd3_clip.SD3Tokenizer
            elif weight.shape[-1] == 2048:
                clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model
                clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer
        elif "encoder.block.0.layer.0.SelfAttention.k.weight" in clip_data[0]:
            clip_target.clip = sa_t5.SAT5Model
            clip_target.tokenizer = sa_t5.SAT5Tokenizer
        else:
            clip_target.clip = sd1_clip.SD1ClipModel
            clip_target.tokenizer = sd1_clip.SD1Tokenizer
    elif len(clip_data) == 2:
        if clip_type == CLIPType.SD3:
            clip_target.clip = sd3_clip.sd3_clip(clip_l=True, clip_g=True, t5=False)
            clip_target.tokenizer = sd3_clip.SD3Tokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
    elif len(clip_data) == 3:
        clip_target.clip = sd3_clip.SD3ClipModel
        clip_target.tokenizer = sd3_clip.SD3Tokenizer

    clip = CLIP(clip_target, embedding_directory=embedding_directory)
    # Load every checkpoint into the same model; missing keys are warned
    # about, unexpected keys only logged at debug level.
    for c in clip_data:
        m, u = clip.load_sd(c)
        if len(m) > 0:
            logging.warning("clip missing: {}".format(m))

        if len(u) > 0:
            logging.debug("clip unexpected: {}".format(u))
    return clip
2023-01-03 06:53:32 +00:00
2023-04-19 13:36:19 +00:00
def load_gligen(ckpt_path):
    """Load a GLIGEN checkpoint and return it wrapped in a ``ModelPatcher``.

    The model is converted with ``.half()`` when
    ``model_management.should_use_fp16()`` is true.
    """
    data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
    model = gligen.load_gligen(data)
    if model_management.should_use_fp16():
        model = model.half()
    return comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device())
2023-04-19 13:36:19 +00:00
2023-06-09 16:24:24 +00:00
def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_clip=True, embedding_directory=None, state_dict=None, config=None):
    """Deprecated: load a checkpoint, then apply a few settings from a YAML config.

    The checkpoint is loaded through ``load_checkpoint_guess_config``. The
    config is then used only to switch to v-prediction sampling and to set
    the CLIP layer index.

    Args:
        config_path: YAML file to read when ``config`` is ``None``.
        ckpt_path: checkpoint path.
        output_vae, output_clip, embedding_directory: forwarded to
            ``load_checkpoint_guess_config``.
        state_dict: accepted but not used by this function.
        config: already-parsed config dict; takes precedence over
            ``config_path``.

    Returns:
        ``(model, clip, vae)``.
    """
    logging.warning("Warning: The load checkpoint with config function is deprecated and will eventually be removed, please use the other one.")
    model, clip, vae, _ = load_checkpoint_guess_config(ckpt_path, output_vae=output_vae, output_clip=output_clip, output_clipvision=False, embedding_directory=embedding_directory, output_model=True)

    # TODO: this function is a mess and should be removed eventually
    if config is None:
        with open(config_path, 'r') as stream:
            config = yaml.safe_load(stream)
    model_config_params = config['model']['params']
    clip_config = model_config_params['cond_stage_config']
    # Value is unused, but the lookup is kept so a config without
    # 'scale_factor' still raises KeyError as before.
    scale_factor = model_config_params['scale_factor']

    if "parameterization" in model_config_params:
        if model_config_params["parameterization"] == "v":
            # Patch a clone so the originally loaded patcher is left untouched.
            m = model.clone()

            # NOTE(review): comfy.model_sampling is not among the imports
            # visible at the top of this file; presumably it is imported
            # elsewhere before this runs. Confirm.
            class ModelSamplingAdvanced(comfy.model_sampling.ModelSamplingDiscrete, comfy.model_sampling.V_PREDICTION):
                pass
            m.add_object_patch("model_sampling", ModelSamplingAdvanced(model.model.model_config))
            model = m

    layer_idx = clip_config.get("params", {}).get("layer_idx", None)
    if layer_idx is not None:
        clip.clip_layer(layer_idx)

    return (model, clip, vae)
2023-03-03 08:37:35 +00:00
2023-10-06 17:48:18 +00:00
def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True):
    """Load a checkpoint file, detecting the model type from its state dict.

    Args:
        ckpt_path: Path of the checkpoint file.
        output_vae: Build a VAE from the checkpoint's VAE weights.
        output_clip: Build the CLIP/text encoder if the detected model config
            provides a clip target and the checkpoint contains its weights.
        output_clipvision: Build the clip vision model if the detected model
            config declares a clip vision prefix.
        embedding_directory: Passed through to the CLIP constructor.
        output_model: Build the diffusion model and wrap it in a ModelPatcher.

    Returns:
        Tuple ``(model_patcher, clip, vae, clipvision)``. Each entry is None
        when it was not requested or could not be built.

    Raises:
        RuntimeError: If no model config could be detected for the checkpoint.
    """
    sd = comfy.utils.load_torch_file(ckpt_path)
    clip = None
    clipvision = None
    vae = None
    model_patcher = None

    diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
    parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
    load_device = model_management.get_torch_device()

    model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
    if model_config is None:
        raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))

    unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)

    if model_config.clip_vision_prefix is not None and output_clipvision:
        clipvision = clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix, True)

    if output_model:
        inital_load_device = model_management.unet_inital_load_device(parameters, unet_dtype)
        offload_device = model_management.unet_offload_device()
        model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
        model.load_model_weights(sd, diffusion_model_prefix)

    if output_vae:
        # Keep only the keys under the VAE prefixes, with the prefix stripped.
        vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
        vae_sd = model_config.process_vae_state_dict(vae_sd)
        vae = VAE(sd=vae_sd)

    if output_clip:
        clip_target = model_config.clip_target(state_dict=sd)
        if clip_target is not None:
            clip_sd = model_config.process_clip_state_dict(sd)
            if len(clip_sd) > 0:
                clip = CLIP(clip_target, embedding_directory=embedding_directory)
                m, u = clip.load_sd(clip_sd, full_model=True)
                if len(m) > 0:
                    # Missing logit_scale / text_projection keys alone are only
                    # reported at debug level; anything else is a warning.
                    m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m))
                    if len(m_filter) > 0:
                        logging.warning("clip missing: {}".format(m))
                    else:
                        logging.debug("clip missing: {}".format(m))

                if len(u) > 0:
                    logging.debug("clip unexpected: {}".format(u))
            else:
                logging.warning("no CLIP/text encoder weights in checkpoint, the text encoder model will not be loaded.")

    left_over = sd.keys()
    if len(left_over) > 0:
        logging.debug("left over keys: {}".format(left_over))

    if output_model:
        model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device, current_device=inital_load_device)
        if inital_load_device != torch.device("cpu"):
            logging.info("loaded straight to GPU")
            model_management.load_model_gpu(model_patcher)

    return (model_patcher, clip, vae, clipvision)
2023-06-26 16:21:07 +00:00
2023-07-05 21:34:45 +00:00
2024-07-03 15:34:32 +00:00
def load_unet_state_dict(sd): #load unet in diffusers or regular format
    """Build a diffusion model from a state dict and wrap it in a ModelPatcher.

    Three layouts are tried in order: a state dict the regular detector
    recognises directly, a diffusers MMDiT state dict (identified by the
    'transformer_blocks.0.attn.add_q_proj.weight' key), and a diffusers UNet
    state dict whose keys are remapped via comfy.utils.unet_to_diffusers().

    Args:
        sd: State dict of the model. If keys under the detected diffusion
            model prefix exist, only those (with the prefix stripped) are used.
            In the diffusers UNet path, matched keys are popped from the
            working dict.

    Returns:
        A comfy.model_patcher.ModelPatcher, or None when no model config could
        be detected for the state dict.
    """
    #Allow loading unets from checkpoint files
    diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
    temp_sd = comfy.utils.state_dict_prefix_replace(sd, {diffusion_model_prefix: ""}, filter_keys=True)
    if len(temp_sd) > 0:
        sd = temp_sd

    parameters = comfy.utils.calculate_parameters(sd)
    load_device = model_management.get_torch_device()
    model_config = model_detection.model_config_from_unet(sd, "")

    if model_config is not None:
        new_sd = sd
    elif 'transformer_blocks.0.attn.add_q_proj.weight' in sd: #MMDIT SD3
        new_sd = model_detection.convert_diffusers_mmdit(sd, "")
        if new_sd is None:
            return None
        model_config = model_detection.model_config_from_unet(new_sd, "")
        if model_config is None:
            return None
    else: #diffusers
        model_config = model_detection.model_config_from_diffusers_unet(sd)
        if model_config is None:
            return None

        diffusers_keys = comfy.utils.unet_to_diffusers(model_config.unet_config)

        new_sd = {}
        for diffusers_key, target_key in diffusers_keys.items():
            if diffusers_key in sd:
                new_sd[target_key] = sd.pop(diffusers_key)
            else:
                # Expected key absent from the input state dict.
                logging.warning("{} {}".format(target_key, diffusers_key))

    # The dtype is chosen only once the model config (and therefore its
    # supported inference dtypes) is known.
    unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)

    offload_device = model_management.unet_offload_device()
    model = model_config.get_model(new_sd, "")
    model = model.to(offload_device)
    model.load_model_weights(new_sd, "")

    left_over = sd.keys()
    if len(left_over) > 0:
        logging.info("left over keys in unet: {}".format(left_over))
    return comfy.model_patcher.ModelPatcher(model, load_device=load_device, offload_device=offload_device)
2023-07-05 21:34:45 +00:00
2023-11-27 22:32:07 +00:00
def load_unet(unet_path):
    """Load a diffusion model from a file and return it wrapped for use.

    Args:
        unet_path: Path of the file to read with comfy.utils.load_torch_file().

    Returns:
        The object produced by load_unet_state_dict() for the file's contents.

    Raises:
        RuntimeError: If load_unet_state_dict() returns None, i.e. the model
            type could not be detected.
    """
    unet_sd = comfy.utils.load_torch_file(unet_path)
    patcher = load_unet_state_dict(unet_sd)
    if patcher is not None:
        return patcher

    # Detection failed: log it, then surface the failure to the caller.
    logging.error("ERROR UNSUPPORTED UNET {}".format(unet_path))
    raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path))
2024-04-08 04:36:22 +00:00
def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, metadata=None, extra_keys=None):
    """Assemble a combined state dict from the given models and save it.

    Args:
        output_path: Destination passed to comfy.utils.save_torch_file().
        model: Model wrapper; it is loaded via model_management.load_models_gpu()
            and ``model.model.state_dict_for_saving()`` builds the state dict.
        clip: Optional text encoder; its model is loaded alongside ``model``
            and its state dict is included.
        vae: Optional VAE whose state dict is included.
        clip_vision: Optional clip vision model whose state dict is included.
            Note this parameter shadows the module-level ``clip_vision`` import
            inside this function.
        metadata: Passed through to comfy.utils.save_torch_file().
        extra_keys: Optional mapping of additional entries written into the
            state dict, overriding entries with the same key.
    """
    clip_sd = None
    load_models = [model]
    if clip is not None:
        load_models.append(clip.load_model())
        clip_sd = clip.get_sd()

    model_management.load_models_gpu(load_models, force_patch_weights=True)
    clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None
    # vae defaults to None, so it gets the same None handling as clip and
    # clip_vision instead of being dereferenced unconditionally.
    # NOTE(review): assumes state_dict_for_saving() accepts None for the VAE
    # state dict as it does for the other two - confirm.
    vae_sd = vae.get_sd() if vae is not None else None
    sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd)

    if extra_keys is not None:
        for k, v in extra_keys.items():
            sd[k] = v

    # Replace any non-contiguous tensor with a contiguous copy before saving.
    for k in sd:
        t = sd[k]
        if not t.is_contiguous():
            sd[k] = t.contiguous()

    comfy.utils.save_torch_file(sd, output_path, metadata=metadata)