2024-07-12 05:08:45 +00:00
|
|
|
import os
|
2024-07-24 20:43:53 +00:00
|
|
|
import torch
|
2024-07-12 05:08:45 +00:00
|
|
|
|
2024-07-22 16:21:45 +00:00
|
|
|
class SPieceTokenizer:
    """Thin wrapper around a ``sentencepiece.SentencePieceProcessor``.

    The wrapped processor is stored on ``self.tokenizer``. Calling an
    instance encodes a string and returns a dict with a single
    ``"input_ids"`` entry.
    """

    # Forwarded to SentencePieceProcessor as ``add_eos``; subclasses may
    # override this class attribute to change it.
    add_eos = True

    @classmethod
    def from_pretrained(cls, path):
        """Build a tokenizer from ``path``.

        Implemented as a classmethod so that calling it on a subclass returns
        an instance of that subclass (and therefore honours an overridden
        ``add_eos``) instead of always building the base class.

        Args:
            path: Anything accepted by ``__init__`` as ``tokenizer_path``.

        Returns:
            A new instance of ``cls``.
        """
        return cls(path)

    def __init__(self, tokenizer_path):
        """Load the SentencePiece model.

        Args:
            tokenizer_path: One of
                * a torch tensor whose raw bytes are a serialized model,
                * a ``bytes`` object holding a serialized model, or
                * any other value, which is passed to SentencePiece as
                  ``model_file``.
        """
        # Imported inside the constructor, so sentencepiece is only required
        # once a tokenizer is actually instantiated.
        import sentencepiece

        if torch.is_tensor(tokenizer_path):
            # Tensor.numpy() only works for CPU tensors that do not require
            # grad; detach and move to CPU first so any tensor is accepted.
            tokenizer_path = tokenizer_path.detach().cpu().numpy().tobytes()

        if isinstance(tokenizer_path, bytes):
            self.tokenizer = sentencepiece.SentencePieceProcessor(
                model_proto=tokenizer_path, add_eos=self.add_eos
            )
        else:
            self.tokenizer = sentencepiece.SentencePieceProcessor(
                model_file=tokenizer_path, add_eos=self.add_eos
            )

    def get_vocab(self):
        """Return a dict mapping every piece string to its integer id."""
        processor = self.tokenizer
        return {
            processor.id_to_piece(piece_id): piece_id
            for piece_id in range(processor.get_piece_size())
        }

    def __call__(self, string):
        """Encode ``string`` and return ``{"input_ids": <encoded result>}``."""
        return {"input_ids": self.tokenizer.encode(string)}

    def serialize_model(self):
        """Return the serialized model proto as a ``torch.ByteTensor``.

        The result holds one element per byte of the serialized proto and can
        be passed back to ``__init__`` as ``tokenizer_path``.
        """
        return torch.ByteTensor(list(self.tokenizer.serialized_model_proto()))
|