-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokenizer.py
52 lines (48 loc) · 2.39 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from transformers import PreTrainedTokenizerFast
import sentencepiece as spm
import tiktoken
from helper import format_chat_history, format_user_chat_history
# Shared SentencePiece processor reused across calls; each tokenize call
# re-invokes .load() with a model-specific file, overwriting prior state.
# NOTE(review): this makes the module not thread-safe — confirm callers are single-threaded.
sp = spm.SentencePieceProcessor()
def history_token_length(chat_history, model_type):
    """Return the token count of a formatted chat history for a given model.

    Args:
        chat_history: list of {'role': ..., 'content': ...} message dicts,
            as consumed by ``format_chat_history``.
        model_type: one of "llama3", "mistral", "nai", "trin", "claude";
            anything else falls back to OpenAI's cl100k_base encoding.

    Returns:
        int: number of tokens in the formatted history.
    """
    formatted_history = format_chat_history(chat_history=chat_history)
    # All per-model tokenization logic lives in text_token_length; this
    # function only adds the history-formatting step on top of it.
    return text_token_length(formatted_history, model_type)
def text_token_length(text, model_type):
    """Return the token count of ``text`` under the tokenizer for ``model_type``.

    Args:
        text: the string to tokenize.
        model_type: one of "llama3", "mistral", "nai", "trin", "claude";
            anything else falls back to OpenAI's cl100k_base encoding.

    Returns:
        int: number of tokens produced by the model's tokenizer.
    """
    # SentencePiece-backed models differ only in the model file on disk,
    # so a lookup table replaces four identical if/elif branches.
    spm_model_paths = {
        "llama3": "./tokenizer/llama/llama.model",
        "mistral": "./tokenizer/mistral/tokenizer.model",
        "nai": "./tokenizer/nai/tokenizer.model",
        "trin": "./tokenizer/trin/spiece.model",
    }
    if model_type in spm_model_paths:
        # Reuses the module-level processor; load() swaps in this model's vocab.
        sp.load(spm_model_paths[model_type])
        return len(sp.encode_as_pieces(text))
    if model_type == "claude":
        tokenizer = PreTrainedTokenizerFast(tokenizer_file="./tokenizer/claude/claude.json")
        return len(tokenizer.tokenize(text))
    # defaults to cl100k_base, chatgpt's tokenizer
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
# Sample conversation used for manual testing of the token-length helpers.
chat_history = [
    {'role': "system", 'content': "You are a very helpful assistant"},
    {'role': "user", 'content': "dududu max verstappen goes vroom"},
    {'role': "assistant", 'content': "erm what the sigma"},
    {'role': "user", 'content': "L to you Chatgpt"},
]
# TODO: add multimodal tokenizing support, but idk how. HELP