From 8489cba1405f222f4675c120aee4a3722affb3f8 Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Thu, 13 Apr 2023 22:01:01 +0200 Subject: [PATCH 1/5] add unique ID per word/embedding for tokenizer --- comfy/sd1_clip.py | 117 ++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 4f51657c..3dd8262a 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -224,60 +224,85 @@ class SD1Tokenizer: self.inv_vocab = {v: k for k, v in vocab.items()} self.embedding_directory = embedding_directory self.max_word_length = 8 + self.embedding_identifier = "embedding:" - def tokenize_with_weights(self, text): + def _try_get_embedding(self, name:str): + ''' + Takes a potential embedding name and tries to retrieve it. + Returns a Tuple consisting of the embedding and any leftover string, embedding can be None. + ''' + embedding_name = name[len(self.embedding_identifier):].strip('\n') + embed = load_embed(embedding_name, self.embedding_directory) + if embed is None: + stripped = embedding_name.strip(',') + if len(stripped) < len(embedding_name): + embed = load_embed(stripped, self.embedding_directory) + return (embed, embedding_name[len(stripped):]) + return (embed, "") + + + def tokenize_with_weights(self, text:str): + ''' + Takes a prompt and converts it to a list of (token, weight, word id) elements. + Tokens can both be integer tokens and pre computed CLIP tensors. + Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens. + Returned list has the dimensions NxM where M is the input size of CLIP + ''' text = escape_important(text) parsed_weights = token_weights(text, 1.0) + #tokenize words tokens = [] - for t in parsed_weights: - to_tokenize = unescape_important(t[0]).replace("\n", " ").split(' ') - while len(to_tokenize) > 0: - word = to_tokenize.pop(0) - temp_tokens = [] - embedding_identifier = "embedding:" - if word.startswith(embedding_identifier) and self.embedding_directory is not None: - embedding_name = word[len(embedding_identifier):].strip('\n') - embed = load_embed(embedding_name, self.embedding_directory) + for weighted_segment, weight in parsed_weights: + to_tokenize = unescape_important(weighted_segment).replace("\n", " ").split(' ') + to_tokenize = [x for x in to_tokenize if x != ""] + for word in to_tokenize: + #if we find an embedding, deal with the embedding + if word.startswith(self.embedding_identifier) and self.embedding_directory is not None: + embed, leftover = self._try_get_embedding(word) if embed is None: - stripped = embedding_name.strip(',') - if len(stripped) < len(embedding_name): - embed = load_embed(stripped, self.embedding_directory) - if embed is not None: - to_tokenize.insert(0, embedding_name[len(stripped):]) - - if embed is not None: - if len(embed.shape) == 1: - temp_tokens += [(embed, t[1])] - else: - for x in range(embed.shape[0]): - temp_tokens += [(embed[x], t[1])] + print(f"warning, embedding:{word} does not exist, ignoring") else: - print("warning, embedding:{} does not exist, ignoring".format(embedding_name)) - elif len(word) > 0: - tt = self.tokenizer(word)["input_ids"][1:-1] - for x in tt: - temp_tokens += [(x, t[1])] - tokens_left = self.max_tokens_per_section - (len(tokens) % self.max_tokens_per_section) + if len(embed.shape) == 1: + tokens.append([(embed, weight)]) + else: + tokens.append([(embed[x], weight) for x in range(embed.shape[0])]) + #if we accidentally have 
leftover text, continue parsing using leftover, else move on to next word + if leftover != "": + word = leftover + else: + continue + #parse word + tokens.append([(t, weight) for t in self.tokenizer(word)["input_ids"][1:-1]]) + + #reshape token array to CLIP input size + batched_tokens = [] + batch = [] + batched_tokens.append(batch) + for i, t_group in enumerate(tokens): + #start a new batch if there is not enough room + if len(t_group) + len(batch) > self.max_tokens_per_section: + remaining_length = self.max_tokens_per_section - len(batch) + #fill remaining space depending on length of tokens + if len(t_group) > self.max_word_length: + #put part of group of tokens in the batch + batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) + t_group = t_group[remaining_length:] + else: + #filler tokens + batch.extend([(self.end_token, 1.0, 0)] * remaining_length) + batch = [] + batched_tokens.append(batch) + #put current group of tokens in the batch + batch.extend([(t,w,i+1) for t,w in t_group]) + + #fill last batch + batch.extend([(self.end_token, 1.0, 0)] * (self.max_tokens_per_section - len(batch))) + + #add start and end tokens + batched_tokens = [[(self.start_token, 1.0, 0)] + x + [(self.end_token, 1.0, 0)] for x in batched_tokens] + return batched_tokens - #try not to split words in different sections - if tokens_left < len(temp_tokens) and len(temp_tokens) < (self.max_word_length): - for x in range(tokens_left): - tokens += [(self.end_token, 1.0)] - tokens += temp_tokens - - out_tokens = [] - for x in range(0, len(tokens), self.max_tokens_per_section): - o_token = [(self.start_token, 1.0)] + tokens[x:min(self.max_tokens_per_section + x, len(tokens))] - o_token += [(self.end_token, 1.0)] - if self.pad_with_end: - o_token +=[(self.end_token, 1.0)] * (self.max_length - len(o_token)) - else: - o_token +=[(0, 1.0)] * (self.max_length - len(o_token)) - - out_tokens += [o_token] - - return out_tokens def untokenize(self, token_weight_pair): return list(map(lambda a: (a, self.inv_vocab[a[0]]), token_weight_pair)) From 73175cf58c0371903bf9bec107f0e82c5c4363d0 Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Thu, 13 Apr 2023 22:06:50 +0200 Subject: [PATCH 2/5] split tokenizer from encoder --- comfy/sd.py | 6 ++++-- nodes.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 2d7ff5ab..6bd30daf 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -372,10 +372,12 @@ class CLIP: def clip_layer(self, layer_idx): self.layer_idx = layer_idx - def encode(self, text): + def tokenize(self, text): + return self.tokenizer.tokenize_with_weights(text) + + def encode(self, tokens): if self.layer_idx is not None: self.cond_stage_model.clip_layer(self.layer_idx) - tokens = self.tokenizer.tokenize_with_weights(text) try: self.patcher.patch_model() cond = self.cond_stage_model.encode_token_weights(tokens) diff --git a/nodes.py b/nodes.py index 946c6685..b81d1601 100644 --- a/nodes.py +++ b/nodes.py @@ -44,7 +44,8 @@ class CLIPTextEncode: CATEGORY = "conditioning" def encode(self, clip, text): - return ([[clip.encode(text), {}]], ) + tokens = clip.tokenize(text) + return ([[clip.encode(tokens), {}]], ) class ConditioningCombine: @classmethod From 752f7a162ba728b3ab7b9ce53be73c271da25dd5 Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:02:45 +0200 Subject: [PATCH 3/5] align behavior with old tokenize function --- comfy/sd1_clip.py | 36 
++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 3dd8262a..45bc9526 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -226,12 +226,11 @@ class SD1Tokenizer: self.max_word_length = 8 self.embedding_identifier = "embedding:" - def _try_get_embedding(self, name:str): + def _try_get_embedding(self, embedding_name:str): ''' Takes a potential embedding name and tries to retrieve it. Returns a Tuple consisting of the embedding and any leftover string, embedding can be None. ''' - embedding_name = name[len(self.embedding_identifier):].strip('\n') embed = load_embed(embedding_name, self.embedding_directory) if embed is None: stripped = embedding_name.strip(',') @@ -259,9 +258,10 @@ class SD1Tokenizer: for word in to_tokenize: #if we find an embedding, deal with the embedding if word.startswith(self.embedding_identifier) and self.embedding_directory is not None: - embed, leftover = self._try_get_embedding(word) + embedding_name = word[len(self.embedding_identifier):].strip('\n') + embed, leftover = self._try_get_embedding(embedding_name) if embed is None: - print(f"warning, embedding:{word} does not exist, ignoring") + print(f"warning, embedding:{embedding_name} does not exist, ignoring") else: if len(embed.shape) == 1: tokens.append([(embed, weight)]) @@ -280,21 +280,21 @@ class SD1Tokenizer: batch = [] batched_tokens.append(batch) for i, t_group in enumerate(tokens): - #start a new batch if there is not enough room - if len(t_group) + len(batch) > self.max_tokens_per_section: - remaining_length = self.max_tokens_per_section - len(batch) - #fill remaining space depending on length of tokens - if len(t_group) > self.max_word_length: - #put part of group of tokens in the batch - batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) - t_group = t_group[remaining_length:] + #determine if we're going to try and keep the tokens in a single batch + is_large = len(t_group) >= self.max_word_length + while len(t_group) > 0: + if len(t_group) + len(batch) > self.max_tokens_per_section: + remaining_length = self.max_tokens_per_section - len(batch) + if is_large: + batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) + t_group = t_group[remaining_length:] + else: + batch.extend([(self.end_token, 1.0, 0)] * remaining_length) + batch = [] + batched_tokens.append(batch) else: - #filler tokens - batch.extend([(self.end_token, 1.0, 0)] * remaining_length) - batch = [] - batched_tokens.append(batch) - #put current group of tokens in the batch - batch.extend([(t,w,i+1) for t,w in t_group]) + batch.extend([(t,w,i+1) for t,w in t_group]) + t_group = [] #fill last batch batch.extend([(self.end_token, 1.0, 0)] * (self.max_tokens_per_section - len(batch))) From da115bd78d7c4571dc0747dcb17e280b5c8ff4ea Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:16:55 +0200 Subject: [PATCH 4/5] ensure backwards compat with optional args --- comfy/sd.py | 10 +++++++--- comfy/sd1_clip.py | 6 +++++- nodes.py | 3 +-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 6bd30daf..6e54bc60 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -372,12 +372,16 @@ class CLIP: def clip_layer(self, layer_idx): self.layer_idx = layer_idx - def tokenize(self, text): - return self.tokenizer.tokenize_with_weights(text) + def tokenize(self, text, return_word_ids=False): + return self.tokenizer.tokenize_with_weights(text, 
return_word_ids) - def encode(self, tokens): + def encode(self, text, from_tokens=False): if self.layer_idx is not None: self.cond_stage_model.clip_layer(self.layer_idx) + if from_tokens: + tokens = text + else: + tokens = self.tokenizer.tokenize_with_weights(text) try: self.patcher.patch_model() cond = self.cond_stage_model.encode_token_weights(tokens) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 45bc9526..02e925c8 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -240,7 +240,7 @@ class SD1Tokenizer: return (embed, "") - def tokenize_with_weights(self, text:str): + def tokenize_with_weights(self, text:str, return_word_ids=False): ''' Takes a prompt and converts it to a list of (token, weight, word id) elements. Tokens can both be integer tokens and pre computed CLIP tensors. @@ -301,6 +301,10 @@ class SD1Tokenizer: #add start and end tokens batched_tokens = [[(self.start_token, 1.0, 0)] + x + [(self.end_token, 1.0, 0)] for x in batched_tokens] + + if not return_word_ids: + batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens] + return batched_tokens diff --git a/nodes.py b/nodes.py index b68c8ef4..6468ac6b 100644 --- a/nodes.py +++ b/nodes.py @@ -44,8 +44,7 @@ class CLIPTextEncode: CATEGORY = "conditioning" def encode(self, clip, text): - tokens = clip.tokenize(text) - return ([[clip.encode(tokens), {}]], ) + return ([[clip.encode(text), {}]], ) class ConditioningCombine: @classmethod From d0b1b6c6bf60a6f85e742a3340e9fcd9b06d0bde Mon Sep 17 00:00:00 2001 From: BlenderNeko <126974546+BlenderNeko@users.noreply.github.com> Date: Sat, 15 Apr 2023 19:38:21 +0200 Subject: [PATCH 5/5] fixed improper padding --- comfy/sd1_clip.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 02e925c8..32612cf3 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -247,6 +247,11 @@ class SD1Tokenizer: Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens. 
Returned list has the dimensions NxM where M is the input size of CLIP ''' + if self.pad_with_end: + pad_token = self.end_token + else: + pad_token = 0 + text = escape_important(text) parsed_weights = token_weights(text, 1.0) @@ -277,30 +282,33 @@ class SD1Tokenizer: #reshape token array to CLIP input size batched_tokens = [] - batch = [] + batch = [(self.start_token, 1.0, 0)] batched_tokens.append(batch) for i, t_group in enumerate(tokens): #determine if we're going to try and keep the tokens in a single batch is_large = len(t_group) >= self.max_word_length + while len(t_group) > 0: - if len(t_group) + len(batch) > self.max_tokens_per_section: - remaining_length = self.max_tokens_per_section - len(batch) + if len(t_group) + len(batch) > self.max_length - 1: + remaining_length = self.max_length - len(batch) - 1 + #break word in two and add end token if is_large: batch.extend([(t,w,i+1) for t,w in t_group[:remaining_length]]) + batch.append((self.end_token, 1.0, 0)) t_group = t_group[remaining_length:] + #add end token and pad else: - batch.extend([(self.end_token, 1.0, 0)] * remaining_length) - batch = [] + batch.append((self.end_token, 1.0, 0)) + batch.extend([(pad_token, 1.0, 0)] * (remaining_length)) + #start new batch + batch = [(self.start_token, 1.0, 0)] batched_tokens.append(batch) else: batch.extend([(t,w,i+1) for t,w in t_group]) t_group = [] #fill last batch - batch.extend([(self.end_token, 1.0, 0)] * (self.max_tokens_per_section - len(batch))) - - #add start and end tokens - batched_tokens = [[(self.start_token, 1.0, 0)] + x + [(self.end_token, 1.0, 0)] for x in batched_tokens] + batch.extend([(self.end_token, 1.0, 0)] + [(pad_token, 1.0, 0)] * (self.max_length - len(batch) - 1)) if not return_word_ids: batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens]
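
The heart of patches 1, 3 and 5 is the loop that packs per-word token groups into fixed-size CLIP sections while tagging each token with a word id. As a reading aid, here is a minimal, self-contained Python sketch of the behaviour the series converges on after patch 5; the function name and constants are illustrative stand-ins for the real SD1Tokenizer attributes, and per-token weights are fixed at 1.0 for brevity.

    # Stand-in constants; the real tokenizer reads these from its vocab/config.
    START, END = 49406, 49407   # the usual SD1 CLIP start/end-of-text token ids
    PAD = END                   # pad_with_end=True case; otherwise the patch pads with 0
    MAX_LENGTH = 77             # section size, including START and END
    MAX_WORD_LENGTH = 8         # words at least this long may be split across sections

    def pack_token_groups(groups, return_word_ids=False):
        """groups: one token list per word, e.g. [[320], [1125, 539], ...].
        Returns MAX_LENGTH-long sections of (token, weight, word_id) tuples;
        word_id 0 is reserved for START/END/PAD, words count from 1."""
        batched = []
        batch = [(START, 1.0, 0)]
        batched.append(batch)
        for i, group in enumerate(groups):
            t_group = [(t, 1.0) for t in group]          # weight fixed at 1.0 in this sketch
            is_large = len(t_group) >= MAX_WORD_LENGTH   # long words may be split
            while len(t_group) > 0:
                if len(t_group) + len(batch) > MAX_LENGTH - 1:
                    remaining = MAX_LENGTH - len(batch) - 1
                    if is_large:
                        # break the word in two and close this section
                        batch.extend([(t, w, i + 1) for t, w in t_group[:remaining]])
                        batch.append((END, 1.0, 0))
                        t_group = t_group[remaining:]
                    else:
                        # keep the word whole: close and pad this section instead
                        batch.append((END, 1.0, 0))
                        batch.extend([(PAD, 1.0, 0)] * remaining)
                    batch = [(START, 1.0, 0)]            # start the next section
                    batched.append(batch)
                else:
                    batch.extend([(t, w, i + 1) for t, w in t_group])
                    t_group = []
        # close and pad the last section
        batch.append((END, 1.0, 0))
        batch.extend([(PAD, 1.0, 0)] * (MAX_LENGTH - len(batch)))
        if not return_word_ids:
            batched = [[(t, w) for t, w, _ in section] for section in batched]
        return batched

For example, pack_token_groups([[1, 2], [3, 4, 5]]) yields a single 77-entry section in which the first word's tokens carry id 1 and the second word's carry id 2. Every section comes out exactly MAX_LENGTH entries long, and the padding token once again honours pad_with_end (end token or 0), which the patch 1 revision had dropped.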
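
Patch 1 also introduces _try_get_embedding (simplified in patch 3 so the caller strips the "embedding:" prefix): if the lookup fails, it retries with trailing commas stripped and hands the stripped suffix back as leftover text, which the caller then tokenizes normally. A small sketch of that control flow, with a stub dictionary standing in for load_embed and a made-up embedding name:

    # Stub registry standing in for load_embed(); the name is hypothetical.
    KNOWN_EMBEDDINGS = {"my_style": "<tensor for my_style>"}

    def try_get_embedding(embedding_name):
        embed = KNOWN_EMBEDDINGS.get(embedding_name)
        if embed is None:
            stripped = embedding_name.strip(',')
            if len(stripped) < len(embedding_name):
                # "my_style," failed: retry as "my_style", return the comma as leftover
                embed = KNOWN_EMBEDDINGS.get(stripped)
                return (embed, embedding_name[len(stripped):])
        return (embed, "")

    # try_get_embedding("my_style,") -> ("<tensor for my_style>", ",")
    # the "," leftover is fed back through the normal tokenizer path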
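
On the API side, patches 2 and 4 split tokenization out of CLIP.encode while keeping the old one-step call working for existing nodes. A hypothetical usage sketch, assuming clip is a comfy.sd.CLIP instance obtained elsewhere (e.g. from a checkpoint loader), so it is not runnable on its own:

    text = "a photo of a cat"

    # one-step path, unchanged for existing callers (patch 4 restores encode(text))
    cond = clip.encode(text)

    # two-step path: tokenize first, optionally inspect or modify, then encode
    tokens = clip.tokenize(text)                            # [[(token, weight), ...], ...]
    tokens_ids = clip.tokenize(text, return_word_ids=True)  # [[(token, weight, word_id), ...], ...]
    cond_from_tokens = clip.encode(tokens, from_tokens=True)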