diff --git a/.gitignore b/.gitignore index 66ce54fc7..66dfb8612 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ __pycache__ docs/build .coverage .idea/ +*.gguf diff --git a/examples/llamacpp_example.py b/examples/llamacpp_example.py index 56dacca5d..d3c420243 100644 --- a/examples/llamacpp_example.py +++ b/examples/llamacpp_example.py @@ -31,7 +31,7 @@ class Character(BaseModel): if __name__ == "__main__": # Download model from https://huggingface.co/TheBloke/phi-2-GGUF - model = outlines.models.llamacpp("./phi-2.Q3_K_M.gguf", device="cpu") + model = outlines.models.llamacpp("./phi-2.Q4_K_M.gguf", device="cpu") # Construct guided sequence generator generator = outlines.generate.json(model, Character, max_tokens=512) diff --git a/examples/llamacpp_processor.py b/examples/llamacpp_processor.py new file mode 100644 index 000000000..01f8a3581 --- /dev/null +++ b/examples/llamacpp_processor.py @@ -0,0 +1,49 @@ +from enum import Enum + +from llama_cpp import Llama, LogitsProcessorList +from pydantic import BaseModel, constr + +from outlines.models.llamacpp import LlamaCppTokenizer, LlamaJSONLogitsProcessor + + +class Weapon(str, Enum): + sword = "sword" + axe = "axe" + mace = "mace" + spear = "spear" + bow = "bow" + crossbow = "crossbow" + + +class Armor(str, Enum): + leather = "leather" + chainmail = "chainmail" + plate = "plate" + + +class Character(BaseModel): + name: constr(max_length=10) + age: int + armor: Armor + weapon: Weapon + strength: int + + +if __name__ == "__main__": + llama = Llama("./phi-2.Q4_K_M.gguf") + tokenizer = LlamaCppTokenizer(llama) + + prompt = "Instruct: You are a leading role play gamer. You have seen thousands of different characters and their attributes.\nPlease return a JSON object with common attributes of an RPG character. Give me a character description\nOutput:" + + logits_processor = LlamaJSONLogitsProcessor(Character, tokenizer) + + json_str = llama.create_completion( + prompt, + top_k=40, + top_p=0.95, + temperature=0.7, + max_tokens=100, + logits_processor=LogitsProcessorList([logits_processor]), + )["choices"][0]["text"] + + print(json_str) diff --git a/outlines/generate/cfg.py b/outlines/generate/cfg.py index ddedcef31..8bace59e0 100644 --- a/outlines/generate/cfg.py +++ b/outlines/generate/cfg.py @@ -3,8 +3,10 @@ from outlines.fsm.fsm import CFGFSM from outlines.generate.api import SequenceGenerator +from outlines.generate.processors import CFGLogitsProcessor from outlines.generate.samplers import Sampler, multinomial from outlines.models import OpenAI +from outlines.models.llamacpp import LlamaCpp @singledispatch @@ -25,6 +27,26 @@ def cfg( return generator +@cfg.register(LlamaCpp) +def cfg_llamacpp( + model: LlamaCpp, + cfg_str: str, + max_tokens: Optional[int] = None, + stop_at: Optional[Union[str, List[str]]] = None, + sampler: Sampler = multinomial, +): + if not sampler == multinomial: + raise NotImplementedError( + r"The llama.cpp integration does not currently support any other sampling algorithm " + + "that the multinomial sampler." 
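# A minimal sketch of driving Outlines' new CFGLogitsProcessor through
# llama-cpp-python directly, in the same spirit as examples/llamacpp_processor.py.
# The model path and the Lark-style grammar are illustrative assumptions, not
# values taken from the diff.
from llama_cpp import Llama, LogitsProcessorList

from outlines.generate.processors import CFGLogitsProcessor
from outlines.models.llamacpp import LlamaCppTokenizer

# Hypothetical grammar for simple integer sums such as "1+2+3".
SUM_GRAMMAR = r"""
start: expr
expr: NUMBER
    | expr "+" NUMBER
%import common.NUMBER
"""

llama = Llama("./phi-2.Q4_K_M.gguf")  # assumes the GGUF file is available locally
tokenizer = LlamaCppTokenizer(llama)
processor = CFGLogitsProcessor(SUM_GRAMMAR, tokenizer)

output = llama.create_completion(
    "Write a sum of integers:\n",
    max_tokens=20,
    logits_processor=LogitsProcessorList([processor]),
)["choices"][0]["text"]
print(output)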
+ ) + + logits_processor = CFGLogitsProcessor(cfg_str, model.tokenizer) + model.logits_processor = logits_processor + + return model + + @cfg.register(OpenAI) def cfg_openai( model, diff --git a/outlines/generate/json.py b/outlines/generate/json.py index 365200853..e231a1204 100644 --- a/outlines/generate/json.py +++ b/outlines/generate/json.py @@ -7,6 +7,7 @@ from outlines.fsm.json_schema import build_regex_from_object, get_schema_from_signature from outlines.generate.samplers import Sampler, multinomial from outlines.models import OpenAI +from outlines.models.llamacpp import LlamaCpp from .regex import regex @@ -43,6 +44,36 @@ def json( return generator +@regex.register(LlamaCpp) +def json_llamacpp( + model, + schema_object: Union[str, object, Callable], + max_tokens: Optional[int] = None, + sampler: Sampler = multinomial, +): + if isinstance(schema_object, type(BaseModel)): + schema = pyjson.dumps(schema_object.model_json_schema()) + regex_str = build_regex_from_object(schema) + elif callable(schema_object): + schema = pyjson.dumps(get_schema_from_signature(schema_object)) + regex_str = build_regex_from_object(schema) + elif isinstance(schema_object, str): + schema = schema_object + regex_str = build_regex_from_object(schema) + else: + raise ValueError( + f"Cannot parse schema {schema_object}. The schema must be either " + + "a Pydantic object, a function or a string that contains the JSON " + + "Schema specification" + ) + + # TODO: format the output + # We should be able to use the same interface as transformers and make this + # function redundant by adding a `format_sequence` method to `LlamaCpp` + + return regex(model, regex_str, max_tokens, sampler) + + @json.register(OpenAI) def json_openai( model, diff --git a/outlines/generate/processors.py b/outlines/generate/processors.py new file mode 100644 index 000000000..7c92aeb9c --- /dev/null +++ b/outlines/generate/processors.py @@ -0,0 +1,100 @@ +import json +import math +from typing import Union + +import numpy as np +import torch +from numpy.typing import NDArray + +from outlines.fsm.fsm import CFGFSM, FSM, FSMState, RegexFSM +from outlines.fsm.json_schema import build_regex_from_object +from outlines.models.tokenizer import Tokenizer + + +class LogitsProcessor: + def __init__(self, tokenizer: Tokenizer, fsm: FSM): + """Super class for logit processors. + + Parameters + ---------- + tokenizer + An instance of `Tokenizer` + + """ + self.tokenizer = tokenizer + self.fsm_state: FSMState = None # type: ignore + self.fsm: FSM = fsm + + def __call__( + self, input_ids: NDArray[np.int64], scores: NDArray[np.float32] + ) -> NDArray[np.float32]: + """Use the FSM to bias the logits before sampling the next token.""" + if self.fsm is None: + raise NotImplementedError() + + if self.fsm_state is None: + self.fsm_state = FSMState(0) + else: + last_token = input_ids[-1] + self.fsm_state = self.fsm.next_state(self.fsm_state, last_token) + + allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state) + + mask = torch.full((scores.shape[-1],), -math.inf, device="cpu").numpy() + mask[allowed_tokens] = 0 + biased_scores = scores + mask + + biased_scores[self.tokenizer.eos_token_id] = 0 + + return biased_scores + + +class RegexLogitsProcessor(LogitsProcessor): + def __init__(self, regex_string: str, tokenizer: Tokenizer): + """Compile the FSM that drives the regex-guided generation. 
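# A self-contained toy illustrating what LogitsProcessor.__call__ above does:
# every token id outside the FSM's allowed set is pushed to -inf, so the sampler
# can only pick structurally valid continuations (the real processor additionally
# resets the EOS logit to 0). Vocabulary size and allowed ids are made-up values.
import math

import numpy as np
import torch

vocab_size = 8
scores = np.random.randn(vocab_size).astype(np.float32)  # raw next-token logits
allowed_tokens = [1, 4, 5]  # stand-in for fsm.allowed_token_ids(state)

mask = torch.full((vocab_size,), -math.inf).numpy()
mask[allowed_tokens] = 0.0
biased_scores = scores + mask

# Only the allowed ids keep a finite score.
print([i for i, s in enumerate(biased_scores) if np.isfinite(s)])  # [1, 4, 5]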
+ + Parameters + ---------- + regex_string + A string that represents a regular expression + tokenizer + An instance of `Tokenizer` + + """ + fsm = RegexFSM(regex_string, tokenizer) + super().__init__(tokenizer, fsm) + + +class JSONLogitsProcessor(RegexLogitsProcessor): + def __init__(self, schema: Union[str, dict], tokenizer: Tokenizer): + """Compile the FSM that drives the JSON-guided generation. + + Parameters + ---------- + schema + A JSON schema that encodes the structure we want the model to generate + tokenizer + An instance of `Tokenizer` + + """ + # TODO: Why is this needed? We are using regexes + if isinstance(schema, dict): + schema = json.dumps(schema) + regex_string = build_regex_from_object(schema) + super().__init__(regex_string, tokenizer) + + +class CFGLogitsProcessor(LogitsProcessor): + def __init__(self, cfg_str: str, tokenizer: Tokenizer): + """Compile the FSM that drives the CFG-guided generation. + + Parameters + ---------- + cfg_str + A string that represents a grammar + tokenizer + An instance of `Tokenizer` + + """ + fsm = CFGFSM(cfg_str, tokenizer) + super().__init__(tokenizer, fsm) diff --git a/outlines/generate/regex.py b/outlines/generate/regex.py index 16daaac93..50104a033 100644 --- a/outlines/generate/regex.py +++ b/outlines/generate/regex.py @@ -3,8 +3,10 @@ from outlines.fsm.fsm import RegexFSM from outlines.generate.api import SequenceGenerator +from outlines.generate.processors import RegexLogitsProcessor from outlines.generate.samplers import Sampler, multinomial from outlines.models import OpenAI +from outlines.models.llamacpp import LlamaCpp @singledispatch @@ -22,9 +24,28 @@ def regex( return generator +@regex.register(LlamaCpp) +def regex_llamacpp( + model: LlamaCpp, + regex_str: str, + max_tokens: Optional[int] = None, + sampler: Sampler = multinomial, +): + if not sampler == multinomial: + raise NotImplementedError( + r"The llama.cpp integration does not currently support any other sampling algorithm " + + "that the multinomial sampler." + ) + + logits_processor = RegexLogitsProcessor(regex_str, model.tokenizer) + model.logits_processor = logits_processor + + return model + + @regex.register(OpenAI) def regex_openai( - model, + model: OpenAI, regex_str: str, max_tokens: Optional[int] = None, sampler: Sampler = multinomial, diff --git a/outlines/generate/text.py b/outlines/generate/text.py index a688f1d1a..40e0921a0 100644 --- a/outlines/generate/text.py +++ b/outlines/generate/text.py @@ -6,6 +6,7 @@ from outlines.generate import SequenceGenerator from outlines.generate.samplers import Sampler, multinomial from outlines.models import OpenAI +from outlines.models.llamacpp import LlamaCpp @singledispatch @@ -14,7 +15,6 @@ def text( max_tokens: Optional[int] = None, stop_at: Optional[Union[str, List[str]]] = None, *, - samples: int = 1, sampler: Sampler = multinomial, ) -> SequenceGenerator: """Generate text with a `Transformer` model. @@ -43,11 +43,6 @@ def text( A `SequenceGenerator` instance that generates text. """ - if samples > 1: - raise NotImplementedError( - "It is currently impossible to generate several samples with `transformers` models." 
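# A sketch of plugging the RegexLogitsProcessor defined above straight into
# llama-cpp-python, analogous to examples/llamacpp_processor.py. The model path
# and the regular expression are illustrative assumptions.
from llama_cpp import Llama, LogitsProcessorList

from outlines.generate.processors import RegexLogitsProcessor
from outlines.models.llamacpp import LlamaCppTokenizer

llama = Llama("./phi-2.Q4_K_M.gguf")  # assumes the GGUF file is available locally
tokenizer = LlamaCppTokenizer(llama)

# Constrain the completion to an IPv4-shaped string.
ip_regex = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
processor = RegexLogitsProcessor(ip_regex, tokenizer)

text = llama.create_completion(
    "The DNS server answered with the address ",
    max_tokens=20,
    logits_processor=LogitsProcessorList([processor]),
)["choices"][0]["text"]
print(text)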
- ) - fsm = StopAtEosFSM(model.tokenizer) device = model.device @@ -58,6 +53,23 @@ def text( return generator +@text.register(LlamaCpp) +def text_llamacpp( + model: LlamaCpp, + max_tokens: Optional[int] = None, + stop_at: Optional[Union[List[str], str]] = None, + *, + sampler: Sampler = multinomial, +): + if not sampler == multinomial: + raise NotImplementedError( + r"The OpenAI API does not support any other sampling algorithm " + + "that the multinomial sampler." + ) + + return model + + @text.register(OpenAI) def text_openai( model: OpenAI, diff --git a/outlines/models/llamacpp.py b/outlines/models/llamacpp.py index c51f600f8..be51ca06d 100644 --- a/outlines/models/llamacpp.py +++ b/outlines/models/llamacpp.py @@ -1,79 +1,30 @@ -import ctypes -from typing import List, Optional, Tuple, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np import torch + +# TODO: in order to make sub classing work we need to move the installation check here +from llama_cpp import Llama from numpy.typing import NDArray from outlines.models.tokenizer import Tokenizer +if TYPE_CHECKING: + from outlines.generate.processors import LogitsProcessor + -class LlamaCpp: +class LlamaCpp(Llama): """Represents a `llama_cpp` model.""" def __init__( - self, llama_instance, model, tokenizer, device, context_params, **kwargs + self, model, logits_processor: Optional["LogitsProcessor"] = None, **kwargs ): - self.device = device - self.llama_instance = llama_instance - self.tokenizer = tokenizer - - # Note: the concept of padding does not exist in llama.cpp as a batched sequence is just - # a flat array of tokens that can be assigned to one or more sequences. - # To make it compatible with the transformers inspired tokenizer interface - # we need a padding token to homogenize to token_ids tensor. 
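# A sketch of how the reworked LlamaCpp class (now a llama_cpp.Llama subclass)
# is meant to be used: it keeps Llama's constructor while exposing the device,
# tokenizer and logits_processor attributes Outlines expects. The model path is
# an assumption; the __call__ pattern mirrors the updated tests below.
from outlines.models.llamacpp import LlamaCpp

model = LlamaCpp("./phi-2.Q4_K_M.gguf")  # same constructor arguments as llama_cpp.Llama

print(model.device)            # "cpu", hard-coded in __init__
print(model.logits_processor)  # None until generate.regex/json/cfg attaches one
print(type(model.tokenizer))   # LlamaCppTokenizer wrapping this instance

# __call__ takes a batch of token ids plus an attention mask and returns the
# stacked next-token logits for each sequence.
input_ids, attention_mask = model.tokenizer.encode("Hello")
logits, _ = model(input_ids, attention_mask)
print(logits.shape)            # (batch_size, n_vocab)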
- self.pad_token_id = -1 - - self.n_past = 0 - self.n_vocab = kwargs.pop("n_vocab") - - self.ctx = llama_instance.llama_new_context_with_model(model, context_params) - - def forward(self, input_ids: torch.LongTensor, *_): - """Compute a forward pass through the llama_cpp model.""" - if input_ids.ndim == 2: - seq_tensor = input_ids[:, self.n_past :] - elif input_ids.ndim == 1: - seq_tensor = input_ids.view(1, -1)[:, self.n_past :] - else: - raise Exception("Only one and two dimensional inputs allowed.") - - tokens_total = torch.numel(seq_tensor[seq_tensor != self.pad_token_id]) - batch = self.llama_instance.llama_batch_init(tokens_total, 0, 1) - - seq_token_ids = [] - for seq_idx, seq in enumerate(seq_tensor): - for token_pos, token_id in enumerate(seq): - if token_id == self.pad_token_id: - break - batch.token[batch.n_tokens] = token_id.item() - batch.pos[batch.n_tokens] = token_pos - batch.seq_id[batch.n_tokens][0] = seq_idx - batch.n_seq_id[batch.n_tokens] = 1 - batch.logits[batch.n_tokens] = False - - batch.n_tokens += 1 - self.n_past += 1 - - batch.logits[batch.n_tokens - 1] = True - seq_token_ids.append(batch.n_tokens - 1) - - if self.llama_instance.llama_decode(self.ctx, batch) != 0: - print("Error decoding") + super().__init__(model, **kwargs) - all_logits = [] - for seq_token in seq_token_ids: - logits = self.llama_instance.llama_get_logits_ith(self.ctx, seq_token) - logits_list = (ctypes.c_float * self.n_vocab)( - *[logits[token_id] for token_id in range(self.n_vocab)] - ) - logits_tensor = torch.tensor(logits_list) - all_logits.append(logits_tensor) - - self.llama_instance.llama_batch_free(batch) + self.device = "cpu" - stacked_logits = torch.stack(all_logits) - return stacked_logits, None + self.logits_processor = logits_processor + self.tokenizer = LlamaCppTokenizer(self) def __call__( self, @@ -81,51 +32,48 @@ def __call__( attention_mask: torch.LongTensor, past_key_values: Optional[Tuple] = None, ) -> torch.FloatTensor: - logits, kv_cache = self.forward(input_ids, attention_mask, past_key_values) - next_token_logits = logits + all_logits = [] + + for i in range(input_ids.size(0)): + super().eval(input_ids[i, self.n_tokens :]) - return next_token_logits, kv_cache + logits = super().eval_logits + logits_tensor = torch.FloatTensor(list(logits)) + all_logits.append(logits_tensor[0, :]) + + return torch.stack(all_logits), None + + def stream(self): + raise NotImplementedError class LlamaCppTokenizer(Tokenizer): - def __init__(self, llama_instance, model, model_name: str, **kwargs): - self.model_name = model_name - self.llama_instance = llama_instance + def __init__(self, model, **kwargs): + self.model = model self.is_llama = False + self.model_name = self.model.model_path - self.model = model - self.n_vocab = kwargs.pop("n_vocab") + self.tokenizer = self.model.tokenizer() - self.eos_token_id = llama_instance.llama_token_eos(model) - self.eos_token = self._get_eos_token() + self.eos_token_id = self.model.token_eos() + self.bos_token_id = self.model.token_bos() self.pad_token_id = -1 - self.bos_token_id = llama_instance.llama_token_eos(model) - self.nl_token_id = llama_instance.llama_token_nl(model) + self.eos_token = self.tokenizer.decode([self.eos_token_id]) + self.bos_token = self.tokenizer.decode([self.bos_token_id]) + + self.n_vocab = self.model.n_vocab() self.vocabulary = {} self._create_vocabulary() - self.n_past = 0 - - self.special_tokens = { - self.eos_token_id, - self.pad_token_id, - self.bos_token_id, - self.nl_token_id, - } + self.special_tokens = {} def 
_create_vocabulary(self): for t in range(self.n_vocab): - size = 32 - buffer = (ctypes.c_char * size)() - n = self.llama_instance.llama_token_to_piece( - self.model, self.llama_instance.llama_token(t), buffer, size - ) - try: - token_piece = buffer[:n].decode("utf-8") + token_piece = self.tokenizer.decode([t]) self.vocabulary[token_piece] = t except Exception as e: - print(f"Failed to convert token ({buffer[:n]}): {e}") + print(f"Failed to convert token: {e}") continue def encode( @@ -136,27 +84,9 @@ def encode( else: prompts = [prompt] - max_len = 0 token_ids = [] for p in prompts: - embd_inp = (self.llama_instance.llama_token * (len(p) + 1))() - - n_of_tok = self.llama_instance.llama_tokenize( - model=self.model, - text=bytes(str(p), "utf-8"), - text_len=len(embd_inp), - tokens=embd_inp, - n_max_tokens=len(embd_inp), - add_bos=self.n_past == 0, - special=False, - ) - - self.n_past += n_of_tok - - if n_of_tok > max_len: - max_len = n_of_tok - - embd_inp = embd_inp[:n_of_tok] + embd_inp = self.tokenizer.encode(p) token_ids.append(np.array(embd_inp)) max_len = np.max([len(a) for a in token_ids]) @@ -176,39 +106,22 @@ def encode( return token_ids, torch.ones_like(token_ids) def decode(self, token_ids: NDArray[np.int64]) -> List[str]: + if isinstance(token_ids, torch.Tensor): + token_ids = token_ids.numpy() if isinstance(token_ids, list): - token_ids = np.array(token_ids) - if token_ids.ndim == 1: - token_ids = [token_ids] - - pieces = [] - for tokens in token_ids: - seq = [] - for id in tokens: - size = 32 - buffer = (ctypes.c_char * size)() - n = self.llama_instance.llama_token_to_piece( - self.model, self.llama_instance.llama_token(id), buffer, size - ) - - token_piece = buffer[:n].decode("utf-8") # type: ignore - - seq.append(token_piece) - - pieces.append("".join(seq)) - - return pieces - - def _get_eos_token(self): - size = 32 - buffer = (ctypes.c_char * size)() - n = self.llama_instance.llama_token_to_piece( - self.model, self.llama_instance.llama_token(self.eos_token_id), buffer, size - ) - - token_piece = buffer[:n].decode("utf-8") - - return token_piece + return [self.tokenizer.decode(np.array(t).tolist()) for t in token_ids] + elif isinstance(token_ids, np.ndarray): + if token_ids.ndim == 1: + return [self.tokenizer.decode(token_ids.tolist())] + elif token_ids.ndim == 2: + return [ + self.tokenizer.decode(np.array(t).tolist()) + for t in token_ids.tolist() + ] + else: + return [] + else: + return [] def convert_token_to_string(self, token: str) -> str: return token @@ -228,77 +141,4 @@ def llamacpp( model_kwargs: dict = {}, tokenizer_kwargs: dict = {}, ): - try: - import llama_cpp - except ImportError: - raise ImportError( - "The `llama-cpp-python` library needs to be installed in order to use LlamaCpp." 
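# A quick round-trip through the simplified LlamaCppTokenizer, which now
# delegates to llama-cpp-python's own tokenizer. The model path is an assumption.
from llama_cpp import Llama

from outlines.models.llamacpp import LlamaCppTokenizer

tokenizer = LlamaCppTokenizer(Llama("./phi-2.Q4_K_M.gguf"))

token_ids, attention_mask = tokenizer.encode("The quick brown fox")
print(token_ids.shape)  # (1, sequence_length); batched prompts are padded with -1

# decode() accepts 2-D torch tensors and numpy arrays alike and always returns
# a list of strings (cf. the updated tests below).
print(tokenizer.decode(token_ids))          # torch.Tensor input
print(tokenizer.decode(token_ids.numpy()))  # numpy input
print(repr(tokenizer.eos_token), tokenizer.eos_token_id)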
- ) - - if device is None: - device = "cpu" - - llama_cpp.llama_backend_init(numa=False) - - model_params = llama_cpp.llama_model_default_params() - - if "cuda" in device: - model_params.n_gpu_layers = 999 - else: - model_params.n_gpu_layers = model_kwargs.pop( - "n_gpu_layers", model_params.n_gpu_layers - ) - - if "tensor_split" in model_kwargs.keys(): - tensor_split = model_kwargs.get("tensor_split") - if isinstance(tensor_split, list): - tensor_split_arr = (ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES)( - *[t for t in tensor_split] - ) - model_params.tensor_split = tensor_split_arr - - context_params = llama_cpp.llama_context_default_params() - context_params.n_batch = model_kwargs.pop("n_batch", context_params.n_batch) - context_params.n_ctx = model_kwargs.pop("n_ctx", context_params.n_ctx) - context_params.n_threads = model_kwargs.pop("n_threads", context_params.n_threads) - context_params.n_threads_batch = model_kwargs.pop( - "n_threads_batch", context_params.n_threads_batch - ) - context_params.rope_scaling_type = model_kwargs.pop( - "rope_scaling_type", context_params.rope_scaling_type - ) - context_params.rope_freq_base = model_kwargs.pop( - "rope_freq_base", context_params.rope_freq_base - ) - context_params.rope_freq_scale = model_kwargs.pop( - "rope_freq_scale", context_params.rope_freq_scale - ) - context_params.yarn_ext_factor = model_kwargs.pop( - "yarn_ext_factor", context_params.yarn_ext_factor - ) - context_params.yarn_attn_factor = model_kwargs.pop( - "yarn_attn_factor", context_params.yarn_attn_factor - ) - context_params.yarn_beta_fast = model_kwargs.pop( - "yarn_beta_fast", context_params.yarn_beta_fast - ) - context_params.yarn_beta_slow = model_kwargs.pop( - "yarn_beta_slow", context_params.yarn_beta_slow - ) - context_params.yarn_orig_ctx = model_kwargs.pop( - "yarn_orig_ctx", context_params.yarn_orig_ctx - ) - context_params.offload_kqv = model_kwargs.pop( - "offload_kqv", context_params.offload_kqv - ) - - model = llama_cpp.llama_load_model_from_file( - model_name.encode("utf-8"), model_params - ) - - model_kwargs["n_vocab"] = llama_cpp.llama_n_vocab(model) - tokenizer_kwargs["n_vocab"] = model_kwargs.get("n_vocab") - - tokenizer = LlamaCppTokenizer(llama_cpp, model, model_name, **tokenizer_kwargs) - - return LlamaCpp(llama_cpp, model, tokenizer, "cpu", context_params, **model_kwargs) + return LlamaCpp(model_name, **model_kwargs) diff --git a/tests/models/test_llama_cpp.py b/tests/models/test_llama_cpp.py index 68e998239..7843f8e2b 100644 --- a/tests/models/test_llama_cpp.py +++ b/tests/models/test_llama_cpp.py @@ -43,6 +43,15 @@ def test_tokenizer(model_download): text = tokenizer.decode(np.array([0, 1, 2])) assert isinstance(text, list) + text = tokenizer.decode(np.array([[0, 1, 2]])) + assert isinstance(text, list) + + text = tokenizer.decode(torch.tensor([0, 1, 2])) + assert isinstance(text, list) + + text = tokenizer.decode(torch.tensor([[0, 1, 2]])) + assert isinstance(text, list) + def test_model(model_download): model = llamacpp(TEST_MODEL) @@ -53,14 +62,14 @@ def test_model(model_download): assert logits.ndim == 2 assert logits.shape[0] == 1 - model.n_past = 0 + model.reset() input_ids = torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]]) logits, kv_cache = model(input_ids, torch.ones_like(input_ids)) assert logits.ndim == 2 assert logits.shape[0] == 3 - model.n_past = 0 + model.reset() input_ids = torch.tensor([[0, 1, 2], [3, -1, -1]]) logits, kv_cache = model(input_ids, torch.ones_like(input_ids))
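# A possible follow-up test in the style of tests/models/test_llama_cpp.py above,
# checking that a RegexLogitsProcessor leaves only FSM-allowed tokens reachable.
# It reuses the TEST_MODEL path and the model_download fixture from that file;
# the regex and the assertions are illustrative, not part of the test suite.
import numpy as np

from outlines.generate.processors import RegexLogitsProcessor
from outlines.models.llamacpp import llamacpp


def test_regex_logits_processor(model_download):
    model = llamacpp(TEST_MODEL)
    processor = RegexLogitsProcessor(r"[0-9]+", model.tokenizer)

    scores = np.zeros(model.tokenizer.n_vocab, dtype=np.float32)
    input_ids = np.array([model.tokenizer.bos_token_id], dtype=np.int64)

    biased = processor(input_ids, scores)

    # Digit tokens should stay reachable while most of the vocabulary is masked.
    assert np.isfinite(biased).any()
    assert np.isinf(biased).any()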