2023-01-03 06:53:32 +00:00
import os
2023-12-06 20:55:09 +00:00
from transformers import CLIPTokenizer
2023-06-15 00:13:08 +00:00
import comfy . ops
2023-01-03 06:53:32 +00:00
import torch
2023-04-14 17:54:00 +00:00
import traceback
2023-04-14 19:33:43 +00:00
import zipfile
2023-07-01 16:37:23 +00:00
from . import model_management
2023-12-06 20:55:09 +00:00
import comfy . clip_model
import json
2024-03-10 15:37:08 +00:00
import logging
2024-06-19 20:42:41 +00:00
import numbers
2023-01-03 06:53:32 +00:00
2023-11-06 18:43:50 +00:00
def gen_empty_tokens(special_tokens, length):
    """Build the token sequence that represents an empty prompt.

    The sequence starts with the "start" and "end" special tokens (each only
    if present in ``special_tokens``) and is then filled with the "pad" token
    until it holds ``length`` entries. If ``length`` is smaller than the number
    of start/end tokens, no padding is added and the sequence is not truncated.

    Args:
        special_tokens: mapping that may contain "start", "end" and "pad".
        length: desired total number of tokens.

    Returns:
        A list of token ids.
    """
    markers = (special_tokens.get("start", None), special_tokens.get("end", None))
    sequence = [token for token in markers if token is not None]
    # A negative repeat count yields an empty list, so no explicit clamp is needed.
    padding = [special_tokens.get("pad")] * (length - len(sequence))
    return sequence + padding
2023-01-03 06:53:32 +00:00
class ClipTokenWeightEncoder:
    """Mixin that turns weighted token sections into a conditioning tensor.

    The host class must provide ``self.encode(list_of_token_lists)`` and
    ``self.special_tokens``.
    """

    def encode_token_weights(self, token_weight_pairs):
        """Encode every section and apply the per-token prompt weights.

        Args:
            token_weight_pairs: a list of sections; each section is a sequence
                of items whose first element is the token and whose second
                element is its weight.

        Returns:
            ``(cond, first_pooled)`` or, when ``self.encode`` returns a third
            element, ``(cond, first_pooled, extra)``. ``cond`` is the
            concatenation of all encoded sections along ``dim=-2``.
        """
        token_batches = []
        longest = 0
        weighted = False

        for section in token_weight_pairs:
            ids = [pair[0] for pair in section]
            longest = max(len(ids), longest)
            weighted = weighted or not all(pair[1] == 1.0 for pair in section)
            token_batches.append(ids)

        sections = len(token_batches)
        if weighted or sections == 0:
            # Extra "empty prompt" entry: the baseline weights are interpolated
            # from, and the only thing encoded when there are no sections.
            token_batches.append(gen_empty_tokens(self.special_tokens, longest))

        encoded = self.encode(token_batches)
        out, pooled = encoded[:2]

        first_pooled = pooled
        if pooled is not None:
            first_pooled = pooled[0:1].to(model_management.intermediate_device())

        pieces = []
        for k in range(sections):
            z = out[k:k + 1]
            if weighted:
                baseline = out[-1]
                for i in range(len(z)):
                    for j in range(len(z[i])):
                        weight = token_weight_pairs[k][j][1]
                        if weight != 1.0:
                            # Move the token away from / towards the empty-prompt encoding.
                            z[i][j] = (z[i][j] - baseline[j]) * weight + baseline[j]
            pieces.append(z)

        if len(pieces) == 0:
            cond = out[-1:]
        else:
            cond = torch.cat(pieces, dim=-2)
        result = (cond.to(model_management.intermediate_device()), first_pooled)

        if len(encoded) > 2:
            extra = {}
            for key in encoded[2]:
                value = encoded[2][key]
                if key == "attention_mask":
                    # Drop the mask of the appended empty entry and flatten the
                    # remaining sections into a single row.
                    value = value[:sections].flatten().unsqueeze(dim=0).to(model_management.intermediate_device())
                extra[key] = value
            result = result + (extra,)

        return result
2023-01-03 06:53:32 +00:00
2023-10-27 19:54:04 +00:00
class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
    """Uses the CLIP transformer encoder for text (from huggingface)"""
    LAYERS = [
        "last",
        "pooled",
        "hidden"
    ]

    def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77,
                 freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel,
                 special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False,
                 return_projected_pooled=True, return_attention_masks=False, model_options={}):  # clip-vit-base-patch32
        """Build the text encoder.

        Args:
            version: not used by this constructor.
            device, dtype: forwarded to ``model_class``.
            max_length: stored on the instance.
            freeze: when true, ``freeze()`` is called right after the
                transformer is created.
            layer: one of ``LAYERS``.
            layer_idx: required when ``layer == "hidden"``.
            textmodel_json_config: path of the JSON model config; defaults to
                ``sd1_clip_config.json`` next to this file.
            model_class: called as ``model_class(config, dtype, device, operations)``.
            special_tokens: mapping with the "start" / "end" / "pad" token ids.
                It is stored as-is and never modified by this class.
            layer_norm_hidden_state: forwarded to the transformer as
                ``final_layer_norm_intermediate``.
            enable_attention_masks: pass the attention mask to the transformer.
            zero_out_masked: multiply the output by the attention mask.
            return_projected_pooled: selects which pooled output ``forward`` returns.
            return_attention_masks: make ``forward`` also return the mask.
            model_options: only the "custom_operations" key is read here.
        """
        super().__init__()
        assert layer in self.LAYERS

        if textmodel_json_config is None:
            textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json")

        with open(textmodel_json_config) as f:
            config = json.load(f)

        operations = model_options.get("custom_operations", None)
        if operations is None:
            operations = comfy.ops.manual_cast

        self.operations = operations
        self.transformer = model_class(config, dtype, device, self.operations)
        self.num_layers = self.transformer.num_layers

        self.max_length = max_length
        # NOTE: freeze() runs before logit_scale is created below, so that
        # parameter is not touched by it. Statement order kept on purpose.
        if freeze:
            self.freeze()
        self.layer = layer
        self.layer_idx = None
        self.special_tokens = special_tokens

        self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055))
        self.enable_attention_masks = enable_attention_masks
        self.zero_out_masked = zero_out_masked

        self.layer_norm_hidden_state = layer_norm_hidden_state
        self.return_projected_pooled = return_projected_pooled
        self.return_attention_masks = return_attention_masks

        if layer == "hidden":
            assert layer_idx is not None
            assert abs(layer_idx) < self.num_layers
            self.set_clip_options({"layer": layer_idx})
        # Snapshot restored by reset_clip_options().
        self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled)

    def freeze(self):
        """Put the transformer in eval mode and disable gradients on all current parameters."""
        self.transformer = self.transformer.eval()
        #self.train = disabled_train
        for param in self.parameters():
            param.requires_grad = False

    def set_clip_options(self, options):
        """Apply the "layer" and "projected_pooled" entries of ``options``.

        A missing "layer" keeps the current ``layer_idx``. If the resulting
        index is ``None`` or its magnitude exceeds ``num_layers`` the model
        switches to the "last" layer (``layer_idx`` is left unchanged);
        otherwise it switches to "hidden" with that index.
        """
        layer_idx = options.get("layer", self.layer_idx)
        self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
        if layer_idx is None or abs(layer_idx) > self.num_layers:
            self.layer = "last"
        else:
            self.layer = "hidden"
            self.layer_idx = layer_idx

    def reset_clip_options(self):
        """Restore layer, layer_idx and return_projected_pooled captured at construction."""
        self.layer = self.options_default[0]
        self.layer_idx = self.options_default[1]
        self.return_projected_pooled = self.options_default[2]

    def set_up_textual_embeddings(self, tokens, current_embeds):
        """Replace embedding vectors found in ``tokens`` by temporary token ids.

        Integer entries are kept. Any other entry is treated as an embedding
        vector: if its first dimension matches the embedding width it gets a
        fresh id appended after the existing table, otherwise it is dropped
        with a warning and the row is padded back to its original length with
        the "pad" token. When at least one vector was accepted, an enlarged
        embedding table is installed on the transformer (the caller is
        responsible for restoring the previous one).

        Returns:
            The rows of ``tokens`` as lists of ints, with ``-1`` mapped to the
            index just past the last added embedding.
        """
        out_tokens = []
        next_new_token = token_dict_size = current_embeds.weight.shape[0]
        embedding_weights = []

        for x in tokens:
            tokens_temp = []
            for y in x:
                if isinstance(y, numbers.Integral):
                    tokens_temp += [int(y)]
                else:
                    if y.shape[0] == current_embeds.weight.shape[1]:
                        embedding_weights += [y]
                        tokens_temp += [next_new_token]
                        next_new_token += 1
                    else:
                        logging.warning("WARNING: shape mismatch when trying to apply embedding, embedding will be ignored {} != {}".format(y.shape[0], current_embeds.weight.shape[1]))
            while len(tokens_temp) < len(x):
                tokens_temp += [self.special_tokens["pad"]]
            out_tokens += [tokens_temp]

        n = token_dict_size
        if len(embedding_weights) > 0:
            # One extra row beyond the added embeddings is reserved (see the -1 mapping below).
            new_embedding = self.operations.Embedding(next_new_token + 1, current_embeds.weight.shape[1], device=current_embeds.weight.device, dtype=current_embeds.weight.dtype)
            new_embedding.weight[:token_dict_size] = current_embeds.weight
            for x in embedding_weights:
                new_embedding.weight[n] = x
                n += 1
            self.transformer.set_input_embeddings(new_embedding)

        processed_tokens = []
        for x in out_tokens:
            processed_tokens += [list(map(lambda a: n if a == -1 else a, x))]  #The EOS token should always be the largest one

        return processed_tokens

    def forward(self, tokens):
        """Encode a batch of token rows.

        Args:
            tokens: list of rows; entries are token ids or embedding vectors
                (see ``set_up_textual_embeddings``).

        Returns:
            ``(z, pooled_output)``, or ``(z, pooled_output, extra)`` with
            ``extra["attention_mask"]`` when ``return_attention_masks`` is set.
            ``z`` is ``outputs[0]`` of the transformer when ``self.layer`` is
            "last" and ``outputs[1]`` otherwise, converted to float.
        """
        backup_embeds = self.transformer.get_input_embeddings()
        device = backup_embeds.weight.device
        try:
            tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
            tokens = torch.LongTensor(tokens).to(device)

            attention_mask = None
            if self.enable_attention_masks or self.zero_out_masked or self.return_attention_masks:
                # 1 for every position up to and including the first end token
                # of each row, 0 afterwards (all ones if the row has no end
                # token). Counting the end tokens strictly before each position
                # gives the same mask as walking the row element by element.
                end_token = self.special_tokens.get("end", -1)
                is_end = (tokens == end_token).to(tokens.dtype)
                ends_before = torch.cumsum(is_end, dim=-1) - is_end
                attention_mask = (ends_before == 0).to(tokens.dtype)

            attention_mask_model = None
            if self.enable_attention_masks:
                attention_mask_model = attention_mask

            outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
        finally:
            # Always put the original embedding table back, even when
            # tokenisation or the transformer raised; otherwise the temporary
            # enlarged table would leak into later calls.
            self.transformer.set_input_embeddings(backup_embeds)

        if self.layer == "last":
            z = outputs[0].float()
        else:
            z = outputs[1].float()

        if self.zero_out_masked:
            z *= attention_mask.unsqueeze(-1).float()

        pooled_output = None
        if len(outputs) >= 3:
            # outputs[3] is preferred when the projected pooled output is not
            # wanted and it is available; otherwise fall back to outputs[2].
            if not self.return_projected_pooled and len(outputs) >= 4 and outputs[3] is not None:
                pooled_output = outputs[3].float()
            elif outputs[2] is not None:
                pooled_output = outputs[2].float()

        extra = {}
        if self.return_attention_masks:
            extra["attention_mask"] = attention_mask

        if len(extra) > 0:
            return z, pooled_output, extra

        return z, pooled_output

    def encode(self, tokens):
        """Alias for calling the module; used by ``ClipTokenWeightEncoder``."""
        return self(tokens)

    def load_sd(self, sd):
        """Load ``sd`` into the transformer non-strictly and return the load result."""
        return self.transformer.load_state_dict(sd, strict=False)
2023-01-03 06:53:32 +00:00
def parse_parentheses(string):
    """Split ``string`` into top-level parenthesised groups and the text between them.

    Each returned piece is either a run of text outside any parentheses or a
    complete ``( ... )`` group (nested parentheses stay inside their outermost
    group). An unclosed group is returned as-is at the end. Empty pieces are
    not emitted.
    """
    pieces = []
    buffer = ""
    depth = 0
    for ch in string:
        if ch == "(":
            if depth == 0:
                # A new top-level group starts: flush the text collected so far.
                if buffer:
                    pieces.append(buffer)
                buffer = "("
            else:
                buffer += ch
            depth += 1
            continue
        if ch == ")":
            depth -= 1
            if depth == 0:
                pieces.append(buffer + ")")
                buffer = ""
                continue
        buffer += ch
    if buffer:
        pieces.append(buffer)
    return pieces
def token_weights(string, current_weight):
    """Split a prompt into ``(text, weight)`` segments.

    Text outside parentheses keeps ``current_weight``. A top-level
    ``( ... )`` group multiplies the weight by 1.1, unless the group ends in
    ``:<number>``, in which case that number replaces the weight. Groups are
    processed recursively, so nested groups compound.

    Args:
        string: prompt text (parentheses meant literally must already be escaped).
        current_weight: weight applied to text outside any group.

    Returns:
        A list of ``(text, weight)`` tuples in prompt order.
    """
    out = []
    for x in parse_parentheses(string):
        weight = current_weight
        if len(x) >= 2 and x[-1] == ')' and x[0] == '(':
            x = x[1:-1]
            xx = x.rfind(":")
            weight *= 1.1
            if xx > 0:
                try:
                    weight = float(x[xx + 1:])
                    x = x[:xx]
                except ValueError:
                    # The text after the last ':' is not a number: keep the
                    # implicit 1.1x emphasis and treat the ':' as prompt text.
                    pass
            out += token_weights(x, weight)
        else:
            out += [(x, current_weight)]
    return out
def escape_important(text):
    """Replace backslash-escaped parentheses with placeholder sequences.

    ``\\)`` becomes NUL + ``\\x01`` and ``\\(`` becomes NUL + ``\\x02`` so that the
    weight parser does not treat them as grouping parentheses.
    """
    return text.replace("\\)", "\0\1").replace("\\(", "\0\2")
def unescape_important(text):
    """Turn the placeholder sequences NUL + ``\\x01`` / NUL + ``\\x02`` into ``)`` / ``(``."""
    return text.replace("\0\1", ")").replace("\0\2", "(")
2023-04-14 19:33:43 +00:00
def safe_load_embed_zip(embed_path):
    """Read an embedding straight from the raw data members of a zip archive.

    Members whose name contains ``data/`` are tried from last to first. The
    first one holding at least 768 four-byte values is interpreted as float32
    rows of width 768 (when the value count is a multiple of 768) or 1024
    otherwise, and returned as a cloned 2-D tensor.

    Returns:
        The tensor, or ``None`` when no member is large enough.
    """
    with zipfile.ZipFile(embed_path) as archive:
        candidates = [name for name in archive.namelist() if "data/" in name]
        for member in reversed(candidates):
            with archive.open(member) as handle:
                raw = handle.read()
            value_count = len(raw) // 4
            if value_count < 768:
                continue
            # 768 wide -> sd1.x, otherwise assume 1024 wide -> sd2.x
            width = 768 if value_count % 768 == 0 else 1024
            rows = value_count // width
            flat = torch.frombuffer(raw, dtype=torch.float)
            return flat.reshape((rows, width)).clone()
2023-05-05 05:28:48 +00:00
def expand_directory_list(directories):
    """Return the given directories plus every directory found beneath them.

    Symlinked directories are followed. Duplicates are removed; the order of
    the returned list is unspecified.
    """
    found = set()
    for base in directories:
        found.add(base)
        found.update(root for root, _subdirs, _files in os.walk(base, followlinks=True))
    return list(found)
2023-04-14 19:33:43 +00:00
2024-08-07 17:30:45 +00:00
def bundled_embed(embed, prefix, suffix):  #bundled embedding in lora format
    """Concatenate every entry of ``embed`` whose key starts with ``prefix`` and ends with ``suffix``.

    Returns:
        The matching tensors joined along dim 0 (in iteration order of
        ``embed``), or ``None`` when no key matches.
    """
    matches = [embed[key] for key in embed if key.startswith(prefix) and key.endswith(suffix)]
    if len(matches) == 0:
        return None
    return torch.cat(matches, dim=0)
2023-07-10 14:28:38 +00:00
def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=None):
    """Locate and load a textual-inversion embedding.

    The name is looked up in every given directory and all their
    sub-directories. Paths that resolve outside the directory being searched
    are skipped. If the name is not an existing file, the extensions
    ``.safetensors``, ``.pt`` and ``.bin`` are tried in that order.

    Args:
        embedding_name: file name (with or without extension), relative to a
            search directory.
        embedding_directory: a directory or a list of directories.
        embedding_size: expected width of the embedding vectors (only used to
            filter tensors in list-format files).
        embed_key: key to look up in dictionary-format files.

    Returns:
        The embedding tensor, or ``None`` when no file is found, the file
        cannot be loaded, or it holds no usable tensor.
    """
    if isinstance(embedding_directory, str):
        embedding_directory = [embedding_directory]

    embedding_directory = expand_directory_list(embedding_directory)

    valid_file = None
    for embed_dir in embedding_directory:
        embed_path = os.path.abspath(os.path.join(embed_dir, embedding_name))
        embed_dir = os.path.abspath(embed_dir)
        try:
            # Reject names that escape the directory being searched (e.g. "../x").
            if os.path.commonpath((embed_dir, embed_path)) != embed_dir:
                continue
        except ValueError:
            # commonpath() cannot compare the two paths (e.g. different drives).
            continue
        if not os.path.isfile(embed_path):
            extensions = ['.safetensors', '.pt', '.bin']
            for x in extensions:
                t = embed_path + x
                if os.path.isfile(t):
                    valid_file = t
                    break
        else:
            valid_file = embed_path
        if valid_file is not None:
            break

    if valid_file is None:
        return None

    embed_path = valid_file

    embed = None
    embed_out = None

    try:
        if embed_path.lower().endswith(".safetensors"):
            import safetensors.torch
            embed = safetensors.torch.load_file(embed_path, device="cpu")
        else:
            if 'weights_only' in torch.load.__code__.co_varnames:
                try:
                    embed = torch.load(embed_path, weights_only=True, map_location="cpu")
                except Exception:
                    # Restricted unpickling failed: fall back to reading the
                    # raw tensor data out of the zip container.
                    embed_out = safe_load_embed_zip(embed_path)
            else:
                embed = torch.load(embed_path, map_location="cpu")
    except Exception:
        logging.warning("{}\n\nerror loading embedding, skipping loading: {}".format(traceback.format_exc(), embedding_name))
        return None

    if embed_out is not None:
        return embed_out

    if embed is None:
        # torch.load failed and the zip fallback found nothing usable.
        logging.warning("error loading embedding, skipping loading: {}".format(embedding_name))
        return None

    if 'string_to_param' in embed:
        values = embed['string_to_param'].values()
        embed_out = next(iter(values), None)
    elif isinstance(embed, list):
        out_list = []
        for x in range(len(embed)):
            for k in embed[x]:
                t = embed[x][k]
                if t.shape[-1] != embedding_size:
                    continue
                out_list.append(t.reshape(-1, t.shape[-1]))
        if out_list:
            embed_out = torch.cat(out_list, dim=0)
        else:
            # torch.cat() rejects an empty list; report instead of crashing.
            logging.warning("no tensor of width {} found in embedding, skipping loading: {}".format(embedding_size, embedding_name))
    elif embed_key is not None and embed_key in embed:
        embed_out = embed[embed_key]
    else:
        embed_out = bundled_embed(embed, 'bundle_emb.', '.string_to_param.*')
        if embed_out is None:
            embed_out = bundled_embed(embed, 'bundle_emb.', '.{}'.format(embed_key))
        if embed_out is None:
            # Last resort: take the first value stored in the file.
            values = embed.values()
            embed_out = next(iter(values), None)
    return embed_out
2023-01-29 23:46:44 +00:00
2023-10-27 19:54:04 +00:00
class SDTokenizer:
    """Prompt tokenizer that supports weights and ``embedding:<name>`` references."""

    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True, min_length=None, pad_token=None, tokenizer_data={}):
        """Load the tokenizer and derive the special tokens from an empty prompt.

        Args:
            tokenizer_path: directory given to ``tokenizer_class.from_pretrained``;
                defaults to ``sd1_tokenizer`` next to this file.
            max_length: number of tokens per batch, including start/end tokens.
            pad_with_end: pad with the end token (otherwise with 0) when
                ``pad_token`` is not given.
            embedding_directory: directory or directories searched for embeddings;
                ``None`` disables ``embedding:`` handling.
            embedding_size, embedding_key: forwarded to ``load_embed``.
            tokenizer_class: class providing ``from_pretrained``.
            has_start_token: whether the tokenizer emits a start token.
            pad_to_max_length: pad every batch up to ``max_length``.
            min_length: minimum length of the last batch, or ``None``.
            pad_token: explicit padding token id.
            tokenizer_data: not used by this class.
        """
        if tokenizer_path is None:
            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path)
        self.max_length = max_length
        self.min_length = min_length

        # Tokenizing the empty string yields only the special tokens.
        empty = self.tokenizer('')["input_ids"]
        if has_start_token:
            self.tokens_start = 1
            self.start_token = empty[0]
            self.end_token = empty[1]
        else:
            self.tokens_start = 0
            self.start_token = None
            self.end_token = empty[0]

        if pad_token is not None:
            self.pad_token = pad_token
        elif pad_with_end:
            self.pad_token = self.end_token
        else:
            self.pad_token = 0

        self.pad_with_end = pad_with_end
        self.pad_to_max_length = pad_to_max_length

        vocab = self.tokenizer.get_vocab()
        self.inv_vocab = {v: k for k, v in vocab.items()}
        self.embedding_directory = embedding_directory
        # Token groups at least this long may be split across batches.
        self.max_word_length = 8
        self.embedding_identifier = "embedding:"
        self.embedding_size = embedding_size
        self.embedding_key = embedding_key

    def _try_get_embedding(self, embedding_name: str):
        '''
        Takes a potential embedding name and tries to retrieve it.
        Returns a Tuple consisting of the embedding and any leftover string, embedding can be None.
        '''
        embed = load_embed(embedding_name, self.embedding_directory, self.embedding_size, self.embedding_key)
        if embed is None:
            stripped = embedding_name.strip(',')
            if len(stripped) < len(embedding_name):
                embed = load_embed(stripped, self.embedding_directory, self.embedding_size, self.embedding_key)
                # The leftover is the run of trailing commas only. Slicing at
                # len(stripped) would be wrong when leading commas were
                # stripped as well (",foo" would leave "o").
                leftover = embedding_name[len(embedding_name.rstrip(',')):]
                return (embed, leftover)
        return (embed, "")

    def tokenize_with_weights(self, text: str, return_word_ids=False):
        '''
        Takes a prompt and converts it to a list of (token, weight, word id) elements.
        Tokens can both be integer tokens and pre computed CLIP tensors.
        Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens.
        Returned list has the dimensions NxM where M is the input size of CLIP
        '''
        text = escape_important(text)
        parsed_weights = token_weights(text, 1.0)

        #tokenize words
        tokens = []
        for weighted_segment, weight in parsed_weights:
            to_tokenize = unescape_important(weighted_segment).replace("\n", " ").split(' ')
            to_tokenize = [x for x in to_tokenize if x != ""]
            for word in to_tokenize:
                #if we find an embedding, deal with the embedding
                if word.startswith(self.embedding_identifier) and self.embedding_directory is not None:
                    embedding_name = word[len(self.embedding_identifier):].strip('\n')
                    embed, leftover = self._try_get_embedding(embedding_name)
                    if embed is None:
                        logging.warning(f"warning, embedding:{embedding_name} does not exist, ignoring")
                    else:
                        if len(embed.shape) == 1:
                            tokens.append([(embed, weight)])
                        else:
                            tokens.append([(embed[x], weight) for x in range(embed.shape[0])])
                    #if we accidentally have leftover text, continue parsing using leftover, else move on to next word
                    if leftover != "":
                        word = leftover
                    else:
                        continue
                #parse word
                tokens.append([(t, weight) for t in self.tokenizer(word)["input_ids"][self.tokens_start:-1]])

        #reshape token array to CLIP input size
        batched_tokens = []
        batch = []
        if self.start_token is not None:
            batch.append((self.start_token, 1.0, 0))
        batched_tokens.append(batch)
        for i, t_group in enumerate(tokens):
            #determine if we're going to try and keep the tokens in a single batch
            is_large = len(t_group) >= self.max_word_length

            while len(t_group) > 0:
                if len(t_group) + len(batch) > self.max_length - 1:
                    remaining_length = self.max_length - len(batch) - 1
                    #break word in two and add end token
                    if is_large:
                        batch.extend([(t, w, i + 1) for t, w in t_group[:remaining_length]])
                        batch.append((self.end_token, 1.0, 0))
                        t_group = t_group[remaining_length:]
                    #add end token and pad
                    else:
                        batch.append((self.end_token, 1.0, 0))
                        if self.pad_to_max_length:
                            batch.extend([(self.pad_token, 1.0, 0)] * (remaining_length))
                    #start new batch
                    batch = []
                    if self.start_token is not None:
                        batch.append((self.start_token, 1.0, 0))
                    batched_tokens.append(batch)
                else:
                    batch.extend([(t, w, i + 1) for t, w in t_group])
                    t_group = []

        #fill last batch
        batch.append((self.end_token, 1.0, 0))
        if self.pad_to_max_length:
            batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch)))
        if self.min_length is not None and len(batch) < self.min_length:
            batch.extend([(self.pad_token, 1.0, 0)] * (self.min_length - len(batch)))

        if not return_word_ids:
            batched_tokens = [[(t, w) for t, w, _ in x] for x in batched_tokens]

        return batched_tokens

    def untokenize(self, token_weight_pair):
        """Pair every element with the vocabulary string of its token id (its first item)."""
        return list(map(lambda a: (a, self.inv_vocab[a[0]]), token_weight_pair))

    def state_dict(self):
        """This tokenizer has no state to save; always returns an empty dict."""
        return {}
2023-10-27 19:54:04 +00:00
class SD1Tokenizer:
    """Wraps a single tokenizer and stores it under the attribute ``clip_<clip_name>``."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, clip_name="l", tokenizer=SDTokenizer):
        """Create the wrapped tokenizer.

        ``tokenizer`` is called with ``embedding_directory`` and
        ``tokenizer_data`` as keyword arguments.
        """
        self.clip_name = clip_name
        self.clip = "clip_{}".format(self.clip_name)
        wrapped = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        setattr(self, self.clip, wrapped)

    def tokenize_with_weights(self, text: str, return_word_ids=False):
        """Tokenize ``text`` and return the result in a dict keyed by ``clip_name``."""
        wrapped = getattr(self, self.clip)
        return {self.clip_name: wrapped.tokenize_with_weights(text, return_word_ids)}

    def untokenize(self, token_weight_pair):
        """Delegate to the wrapped tokenizer."""
        wrapped = getattr(self, self.clip)
        return wrapped.untokenize(token_weight_pair)

    def state_dict(self):
        """Always returns an empty dict."""
        return {}
2023-10-27 19:54:04 +00:00
class SD1ClipModel(torch.nn.Module):
    """Module holding one text encoder under a configurable attribute name."""

    def __init__(self, device="cpu", dtype=None, model_options={}, clip_name="l", clip_model=SDClipModel, name=None, **kwargs):
        """Create the wrapped encoder.

        The encoder is stored as attribute ``name`` when ``name`` is given,
        otherwise as ``clip_<clip_name>``. ``clip_model`` is called with
        ``device``, ``dtype``, ``model_options`` and any extra keyword
        arguments.
        """
        super().__init__()

        if name is None:
            self.clip_name = clip_name
            self.clip = "clip_{}".format(self.clip_name)
        else:
            self.clip_name = name
            self.clip = "{}".format(self.clip_name)

        encoder = clip_model(device=device, dtype=dtype, model_options=model_options, **kwargs)
        setattr(self, self.clip, encoder)

        # Dtypes used by this model; empty when no dtype was requested.
        self.dtypes = set() if dtype is None else {dtype}

    def set_clip_options(self, options):
        """Forward ``options`` to the wrapped encoder."""
        encoder = getattr(self, self.clip)
        encoder.set_clip_options(options)

    def reset_clip_options(self):
        """Reset the wrapped encoder's options."""
        encoder = getattr(self, self.clip)
        encoder.reset_clip_options()

    def encode_token_weights(self, token_weight_pairs):
        """Encode the entry of ``token_weight_pairs`` stored under ``clip_name``."""
        own_pairs = token_weight_pairs[self.clip_name]
        encoder = getattr(self, self.clip)
        return encoder.encode_token_weights(own_pairs)

    def load_sd(self, sd):
        """Load a state dict into the wrapped encoder and return its result."""
        encoder = getattr(self, self.clip)
        return encoder.load_sd(sd)