From 8489cba1405f222f4675c120aee4a3722affb3f8 Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Thu, 13 Apr 2023 22:01:01 +0200 Subject: [PATCH 1/5] add unique ID per word/embedding for tokenizer --- comfy/sd1_clip.py | 117 ++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 4f51657c..3dd8262a 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -224,60 +224,85 @@ class SD1Tokenizer: self.inv_vocab = {v: k for k, v in vocab.items()} self.embedding_directory = embedding_directory self.max_word_length = 8 + self.embedding_identifier = "embedding:" - def tokenize_with_weights(self, text): + def _try_get_embedding(self, name:str): + ''' + Takes a potential embedding name and tries to retrieve it. + Returns a Tuple consisting of the embedding and any leftover string, embedding can be None. + ''' + embedding_name = name[len(self.embedding_identifier):].strip('\n') + embed = load_embed(embedding_name, self.embedding_directory) + if embed is None: + stripped = embedding_name.strip(',') + if len(stripped) < len(embedding_name): + embed = load_embed(stripped, self.embedding_directory) + return (embed, embedding_name[len(stripped):]) + return (embed, "") + + + def tokenize_with_weights(self, text:str): + ''' + Takes a prompt and converts it to a list of (token, weight, word id) elements. + Tokens can both be integer tokens and pre computed CLIP tensors. + Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens. + Returned list has the dimensions NxM where M is the input size of CLIP + ''' text = escape_important(text) parsed_weights = token_weights(text, 1.0) + #tokenize words tokens = [] - for t in parsed_weights: - to_tokenize = unescape_important(t[0]).replace("\n", " ").split(' ') - while len(to_tokenize) > 0: - word = to_tokenize.pop(0) - temp_tokens = [] - embedding_identifier = "embedding:" - if word.startswith(embedding_identifier) and self.embedding_directory is not None: - embedding_name = word[len(embedding_identifier):].strip('\n') - embed = load_embed(embedding_name, self.embedding_directory) + for weighted_segment, weight in parsed_weights: + to_tokenize = unescape_important(weighted_segment).replace("\n", " ").split(' ') + to_tokenize = [x for x in to_tokenize if x != ""] + for word in to_tokenize: + #if we find an embedding, deal with the embedding + if word.startswith(self.embedding_identifier) and self.embedding_directory is not None: + embed, leftover = self._try_get_embedding(word) if embed is None: - stripped = embedding_name.strip(',') - if len(stripped) < len(embedding_name): - embed = load_embed(stripped, self.embedding_directory) - if embed is not None: - to_tokenize.insert(0, embedding_name[len(stripped):]) - - if embed is not None: - if len(embed.shape) == 1: - temp_tokens += [(embed, t[1])] - else: - for x in range(embed.shape[0]): - temp_tokens += [(embed[x], t[1])] + print(f"warning, embedding:{word} does not exist, ignoring") else: - print("warning, embedding:{} does not exist, ignoring".format(embedding_name)) - elif len(word) > 0: - tt = self.tokenizer(word)["input_ids"][1:-1] - for x in tt: - temp_tokens += [(x, t[1])] - tokens_left = self.max_tokens_per_section - (len(tokens) % self.max_tokens_per_section) + if len(embed.shape) == 1: + tokens.append([(embed, weight)]) + else: + tokens.append([(embed[x], weight) for x in range(embed.shape[0])]) + #if we accidentally have 
leftover text, continue parsing using leftover, else move on to next word + if leftover != "": + word = leftover + else: + continue + #parse word + tokens.append([(t, weight) for t in self.tokenizer(word)["input_ids"][1:-1]]) + + #reshape token array to CLIP input size + batched_tokens = [] + batch = [] + batched_tokens.append(batch) + for i, t_group in enumerate(tokens): + #start a new batch if there is not enough room + if len(t_group) + len(batch) > self.max_tokens_per_section: + remaining_length = self.max_tokens_per_section - len(batch) + #fill remaining space depending on length of tokens + if len(t_group) > self.max_word_length: + #put part of group of tokens in the batch + batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) + t_group = t_group[remaining_length:] + else: + #filler tokens + batch.extend([(self.end_token, 1.0, 0)] * remaining_length) + batch = [] + batched_tokens.append(batch) + #put current group of tokens in the batch + batch.extend([(t,w,i+1) for t,w in t_group]) + + #fill last batch + batch.extend([(self.end_token, 1.0, 0)] * (self.max_tokens_per_section - len(batch))) + + #add start and end tokens + batched_tokens = [[(self.start_token, 1.0, 0)] + x + [(self.end_token, 1.0, 0)] for x in batched_tokens] + return batched_tokens - #try not to split words in different sections - if tokens_left < len(temp_tokens) and len(temp_tokens) < (self.max_word_length): - for x in range(tokens_left): - tokens += [(self.end_token, 1.0)] - tokens += temp_tokens - - out_tokens = [] - for x in range(0, len(tokens), self.max_tokens_per_section): - o_token = [(self.start_token, 1.0)] + tokens[x:min(self.max_tokens_per_section + x, len(tokens))] - o_token += [(self.end_token, 1.0)] - if self.pad_with_end: - o_token +=[(self.end_token, 1.0)] * (self.max_length - len(o_token)) - else: - o_token +=[(0, 1.0)] * (self.max_length - len(o_token)) - - out_tokens += [o_token] - - return out_tokens def untokenize(self, token_weight_pair): return list(map(lambda a: (a, self.inv_vocab[a[0]]), token_weight_pair)) From 73175cf58c0371903bf9bec107f0e82c5c4363d0 Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Thu, 13 Apr 2023 22:06:50 +0200 Subject: [PATCH 2/5] split tokenizer from encoder --- comfy/sd.py | 6 ++++-- nodes.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 2d7ff5ab..6bd30daf 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -372,10 +372,12 @@ class CLIP: def clip_layer(self, layer_idx): self.layer_idx = layer_idx - def encode(self, text): + def tokenize(self, text): + return self.tokenizer.tokenize_with_weights(text) + + def encode(self, tokens): if self.layer_idx is not None: self.cond_stage_model.clip_layer(self.layer_idx) - tokens = self.tokenizer.tokenize_with_weights(text) try: self.patcher.patch_model() cond = self.cond_stage_model.encode_token_weights(tokens) diff --git a/nodes.py b/nodes.py index 946c6685..b81d1601 100644 --- a/nodes.py +++ b/nodes.py @@ -44,7 +44,8 @@ class CLIPTextEncode: CATEGORY = "conditioning" def encode(self, clip, text): - return ([[clip.encode(text), {}]], ) + tokens = clip.tokenize(text) + return ([[clip.encode(tokens), {}]], ) class ConditioningCombine: @classmethod From 752f7a162ba728b3ab7b9ce53be73c271da25dd5 Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:02:45 +0200 Subject: [PATCH 3/5] align behavior with old tokenize function --- comfy/sd1_clip.py | 36 
++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 3dd8262a..45bc9526 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -226,12 +226,11 @@ class SD1Tokenizer: self.max_word_length = 8 self.embedding_identifier = "embedding:" - def _try_get_embedding(self, name:str): + def _try_get_embedding(self, embedding_name:str): ''' Takes a potential embedding name and tries to retrieve it. Returns a Tuple consisting of the embedding and any leftover string, embedding can be None. ''' - embedding_name = name[len(self.embedding_identifier):].strip('\n') embed = load_embed(embedding_name, self.embedding_directory) if embed is None: stripped = embedding_name.strip(',') @@ -259,9 +258,10 @@ class SD1Tokenizer: for word in to_tokenize: #if we find an embedding, deal with the embedding if word.startswith(self.embedding_identifier) and self.embedding_directory is not None: - embed, leftover = self._try_get_embedding(word) + embedding_name = word[len(self.embedding_identifier):].strip('\n') + embed, leftover = self._try_get_embedding(embedding_name) if embed is None: - print(f"warning, embedding:{word} does not exist, ignoring") + print(f"warning, embedding:{embedding_name} does not exist, ignoring") else: if len(embed.shape) == 1: tokens.append([(embed, weight)]) @@ -280,21 +280,21 @@ class SD1Tokenizer: batch = [] batched_tokens.append(batch) for i, t_group in enumerate(tokens): - #start a new batch if there is not enough room - if len(t_group) + len(batch) > self.max_tokens_per_section: - remaining_length = self.max_tokens_per_section - len(batch) - #fill remaining space depending on length of tokens - if len(t_group) > self.max_word_length: - #put part of group of tokens in the batch - batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) - t_group = t_group[remaining_length:] + #determine if we're going to try and keep the tokens in a single batch + is_large = len(t_group) >= self.max_word_length + while len(t_group) > 0: + if len(t_group) + len(batch) > self.max_tokens_per_section: + remaining_length = self.max_tokens_per_section - len(batch) + if is_large: + batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) + t_group = t_group[remaining_length:] + else: + batch.extend([(self.end_token, 1.0, 0)] * remaining_length) + batch = [] + batched_tokens.append(batch) else: - #filler tokens - batch.extend([(self.end_token, 1.0, 0)] * remaining_length) - batch = [] - batched_tokens.append(batch) - #put current group of tokens in the batch - batch.extend([(t,w,i+1) for t,w in t_group]) + batch.extend([(t,w,i+1) for t,w in t_group]) + t_group = [] #fill last batch batch.extend([(self.end_token, 1.0, 0)] * (self.max_tokens_per_section - len(batch))) From da115bd78d7c4571dc0747dcb17e280b5c8ff4ea Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:16:55 +0200 Subject: [PATCH 4/5] ensure backwards compat with optional args --- comfy/sd.py | 10 +++++++--- comfy/sd1_clip.py | 6 +++++- nodes.py | 3 +-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 6bd30daf..6e54bc60 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -372,12 +372,16 @@ class CLIP: def clip_layer(self, layer_idx): self.layer_idx = layer_idx - def tokenize(self, text): - return self.tokenizer.tokenize_with_weights(text) + def tokenize(self, text, return_word_ids=False): + return self.tokenizer.tokenize_with_weights(text, 
return_word_ids) - def encode(self, tokens): + def encode(self, text, from_tokens=False): if self.layer_idx is not None: self.cond_stage_model.clip_layer(self.layer_idx) + if from_tokens: + tokens = text + else: + tokens = self.tokenizer.tokenize_with_weights(text) try: self.patcher.patch_model() cond = self.cond_stage_model.encode_token_weights(tokens) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 45bc9526..02e925c8 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -240,7 +240,7 @@ class SD1Tokenizer: return (embed, "") - def tokenize_with_weights(self, text:str): + def tokenize_with_weights(self, text:str, return_word_ids=False): ''' Takes a prompt and converts it to a list of (token, weight, word id) elements. Tokens can both be integer tokens and pre computed CLIP tensors. @@ -301,6 +301,10 @@ class SD1Tokenizer: #add start and end tokens batched_tokens = [[(self.start_token, 1.0, 0)] + x + [(self.end_token, 1.0, 0)] for x in batched_tokens] + + if not return_word_ids: + batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens] + return batched_tokens diff --git a/nodes.py b/nodes.py index b68c8ef4..6468ac6b 100644 --- a/nodes.py +++ b/nodes.py @@ -44,8 +44,7 @@ class CLIPTextEncode: CATEGORY = "conditioning" def encode(self, clip, text): - tokens = clip.tokenize(text) - return ([[clip.encode(tokens), {}]], ) + return ([[clip.encode(text), {}]], ) class ConditioningCombine: @classmethod From d0b1b6c6bf60a6f85e742a3340e9fcd9b06d0bde Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Sat, 15 Apr 2023 19:38:21 +0200 Subject: [PATCH 5/5] fixed improper padding --- comfy/sd1_clip.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 02e925c8..32612cf3 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -247,6 +247,11 @@ class SD1Tokenizer: Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens. 
Returned list has the dimensions NxM where M is the input size of CLIP ''' + if self.pad_with_end: + pad_token = self.end_token + else: + pad_token = 0 + text = escape_important(text) parsed_weights = token_weights(text, 1.0) @@ -277,30 +282,33 @@ class SD1Tokenizer: #reshape token array to CLIP input size batched_tokens = [] - batch = [] + batch = [(self.start_token, 1.0, 0)] batched_tokens.append(batch) for i, t_group in enumerate(tokens): #determine if we're going to try and keep the tokens in a single batch is_large = len(t_group) >= self.max_word_length + while len(t_group) > 0: - if len(t_group) + len(batch) > self.max_tokens_per_section: - remaining_length = self.max_tokens_per_section - len(batch) + if len(t_group) + len(batch) > self.max_length - 1: + remaining_length = self.max_length - len(batch) - 1 + #break word in two and add end token if is_large: batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) + batch.append((self.end_token, 1.0, 0)) t_group = t_group[remaining_length:] + #add end token and pad else: - batch.extend([(self.end_token, 1.0, 0)] * remaining_length) - batch = [] + batch.append((self.end_token, 1.0, 0)) + batch.extend([(pad_token, 1.0, 0)] * (remaining_length)) + #start new batch + batch = [(self.start_token, 1.0, 0)] batched_tokens.append(batch) else: batch.extend([(t,w,i+1) for t,w in t_group]) t_group = [] #fill last batch - batch.extend([(self.end_token, 1.0, 0)] * (self.max_tokens_per_section - len(batch))) - - #add start and end tokens - batched_tokens = [[(self.start_token, 1.0, 0)] + x + [(self.end_token, 1.0, 0)] for x in batched_tokens] + batch.extend([(self.end_token, 1.0, 0)] + [(pad_token, 1.0, 0)] * (self.max_length - len(batch) - 1)) if not return_word_ids: batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens]
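
The heart of patches 1, 3 and 5 is the loop that packs per-word token groups into fixed-size CLIP sections while tagging each token with a word id. As a reading aid, here is a minimal, self-contained Python sketch of the behaviour the series converges on after patch 5; the function name and constants are illustrative stand-ins for the real SD1Tokenizer attributes, and per-token weights are fixed at 1.0 for brevity.

    # Stand-in constants; the real tokenizer reads these from its vocab/config.
    START, END = 49406, 49407   # the usual SD1 CLIP start/end-of-text token ids
    PAD = END                   # pad_with_end=True case; otherwise the patch pads with 0
    MAX_LENGTH = 77             # section size, including START and END
    MAX_WORD_LENGTH = 8         # words at least this long may be split across sections

    def pack_token_groups(groups, return_word_ids=False):
        """groups: one token list per word, e.g. [[320], [1125, 539], ...].
        Returns MAX_LENGTH-long sections of (token, weight, word_id) tuples;
        word_id 0 is reserved for START/END/PAD, words count from 1."""
        batched = []
        batch = [(START, 1.0, 0)]
        batched.append(batch)
        for i, group in enumerate(groups):
            t_group = [(t, 1.0) for t in group]          # weight fixed at 1.0 in this sketch
            is_large = len(t_group) >= MAX_WORD_LENGTH   # long words may be split
            while len(t_group) > 0:
                if len(t_group) + len(batch) > MAX_LENGTH - 1:
                    remaining = MAX_LENGTH - len(batch) - 1
                    if is_large:
                        # break the word in two and close this section
                        batch.extend([(t, w, i + 1) for t, w in t_group[:remaining]])
                        batch.append((END, 1.0, 0))
                        t_group = t_group[remaining:]
                    else:
                        # keep the word whole: close and pad this section instead
                        batch.append((END, 1.0, 0))
                        batch.extend([(PAD, 1.0, 0)] * remaining)
                    batch = [(START, 1.0, 0)]            # start the next section
                    batched.append(batch)
                else:
                    batch.extend([(t, w, i + 1) for t, w in t_group])
                    t_group = []
        # close and pad the last section
        batch.append((END, 1.0, 0))
        batch.extend([(PAD, 1.0, 0)] * (MAX_LENGTH - len(batch)))
        if not return_word_ids:
            batched = [[(t, w) for t, w, _ in section] for section in batched]
        return batched

For example, pack_token_groups([[1, 2], [3, 4, 5]]) yields a single 77-entry section in which the first word's tokens carry id 1 and the second word's carry id 2. Every section comes out exactly MAX_LENGTH entries long, and the padding token once again honours pad_with_end (end token or 0), which the patch 1 revision had dropped.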
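
Patch 1 also introduces _try_get_embedding (simplified in patch 3 so the caller strips the "embedding:" prefix): if the lookup fails, it retries with trailing commas stripped and hands the stripped suffix back as leftover text, which the caller then tokenizes normally. A small sketch of that control flow, with a stub dictionary standing in for load_embed and a made-up embedding name:

    # Stub registry standing in for load_embed(); the name is hypothetical.
    KNOWN_EMBEDDINGS = {"my_style": "<tensor for my_style>"}

    def try_get_embedding(embedding_name):
        embed = KNOWN_EMBEDDINGS.get(embedding_name)
        if embed is None:
            stripped = embedding_name.strip(',')
            if len(stripped) < len(embedding_name):
                # "my_style," failed: retry as "my_style", return the comma as leftover
                embed = KNOWN_EMBEDDINGS.get(stripped)
                return (embed, embedding_name[len(stripped):])
        return (embed, "")

    # try_get_embedding("my_style,") -> ("<tensor for my_style>", ",")
    # the "," leftover is fed back through the normal tokenizer path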
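
On the API side, patches 2 and 4 split tokenization out of CLIP.encode while keeping the old one-step call working for existing nodes. A hypothetical usage sketch, assuming clip is a comfy.sd.CLIP instance obtained elsewhere (e.g. from a checkpoint loader), so it is not runnable on its own:

    text = "a photo of a cat"

    # one-step path, unchanged for existing callers (patch 4 restores encode(text))
    cond = clip.encode(text)

    # two-step path: tokenize first, optionally inspect or modify, then encode
    tokens = clip.tokenize(text)                            # [[(token, weight), ...], ...]
    tokens_ids = clip.tokenize(text, return_word_ids=True)  # [[(token, weight, word_id), ...], ...]
    cond_from_tokens = clip.encode(tokens, from_tokens=True)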