Send API request with application/protobuf content-type. (#11)
kooyunmo authored Sep 5, 2023
1 parent 10b3344 commit 3e732f7
Showing 11 changed files with 390 additions and 11 deletions.
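The commit title describes sending API requests with the `application/protobuf` content type. The sketch below illustrates that flow using the generated module added in this commit; the deployment URL and auth token are hypothetical placeholders, not values from this repository.

```python
# Minimal sketch of the protobuf request flow this commit enables.
# The URL and credential below are hypothetical placeholders; the module
# and message names come from the generated code added in this commit.
import requests

from periflow.schema.api.v1.codegen import completion_pb2

request = completion_pb2.V1CompletionsRequest(
    prompt="Hello, my name is",
    max_tokens=16,
)

response = requests.post(
    "https://example.periflow.ai/v1/completions",  # placeholder URL
    data=request.SerializeToString(),              # protobuf-encoded body
    headers={
        "Content-Type": "application/protobuf",    # the content type this commit adds
        "Authorization": "Bearer <token>",         # placeholder credential
    },
)
```

A protobuf body avoids JSON encoding overhead, which matters for requests carrying long token lists or flattened embedding vectors like the fields added below.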
2 changes: 1 addition & 1 deletion docs/docs/cli/checkpoint/upload.mdx
@@ -143,4 +143,4 @@ as checkpoint file (`*.h5`).
| **`--source-path`**, **`-p`** | TEXT | Path to source file or directory to upload | - | ✅ |
| `--iteration` | INTEGER | The iteration number of the checkpoint. | None | ❌ |
| `--attr-file`, `-f` | TEXT | Path to the file containing the checkpoint attributes. The file should be in YAML format. | None | ❌ |
- | `--max-workers`, `-w` | INTEGER | The number of threads to upload files. | 12 | ❌ |
+ | `--max-workers`, `-w` | INTEGER | The number of threads to upload files. | 14 | ❌ |
8 changes: 8 additions & 0 deletions docs/docs/sdk/api/completion.mdx
@@ -68,12 +68,16 @@ The `options` argument takes a `V1CompletionOptions` object, which has the following fields.
| `stop` | `Optional[List[str]]` | `None` |
| `stop_tokens` | `Optional[List[TokenSequence]]` <br></br> `(TokenSequence: {"tokens": List[int]})` | `None` |
| `seed` | `Optional[List[int]]` | `None` |
| `token_index_to_replace` | `Optional[List[int]]` | `None` |
| `embedding_to_replace` | `Optional[List[float]]` | `None` |
| `beam_search_type` | `Optional[BeamSearchType]` | `None` |
| `beam_compat_pre_normalization` | `Optional[bool]` | `None` |
| `beam_compat_no_post_normalization` | `Optional[bool]` | `None` |
| `bad_words` | `Optional[List[str]]` | `None` |
| `bad_word_tokens` | `Optional[List[TokenSequence]]` <br></br> `(TokenSequence: {"tokens": List[int]})` | `None` |
| `include_output_logits` | `Optional[bool]` | `None` |
| `include_output_logprobs` | `Optional[bool]` | `None` |
| `forced_output_tokens` | `Optional[List[int]]` | `None` |
| `eos_token` | `Optional[List[int]]` | `None` |

The following are descriptions of each field.
@@ -100,10 +104,14 @@ The following are descriptions of each field.
- **stop**: When one of the stop phrases appears in the generation result, the API stops generation. The phrase is included in the generated result. If you are using beam search, all of the active beams must contain the stop phrase for generation to terminate. Before checking whether a stop phrase is included in the result, the phrase is converted into tokens; for example, "clear" and " clear" can produce different token sequences because of the prepended space character. We therefore recommend using `stop_tokens`, which is unambiguous. Defaults to an empty list.
- **stop_tokens**: Same as the `stop` field above, but receives token sequences instead of text phrases. A `TokenSequence` is a dict with the key `"tokens"` and a `List[int]` value.
- **seed**: Seed to control the random procedure. If none is given, the API generates a seed randomly, uses it for sampling, and returns the seed along with the generated result. When using the `n` argument, you can pass a list of seed values to control each of the independent generations.
- **token_index_to_replace**: A list of token indices at which to replace the embeddings of the input tokens provided via either `tokens` or `prompt`.
- **embedding_to_replace**: A list of flattened embedding vectors used to replace the tokens at the indices specified via `token_index_to_replace`.
- **beam_search_type**: Which beam search type to use. `DETERMINISTIC` means the standard, deterministic beam search, which is similar to Hugging Face's [`beam_search`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationMixin.beam_search). Arguments for controlling random sampling, such as `top_k` and `top_p`, are not allowed with this option. `STOCHASTIC` means stochastic beam search (see [Kool et al. (2019)](https://proceedings.mlr.press/v97/kool19a.html) for details). `NAIVE_SAMPLING` is similar to Hugging Face's [`beam_sample`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationMixin.beam_sample). Defaults to `DETERMINISTIC`.
- **bad_words**: Text phrases that must not be generated. For a bad-word phrase containing N tokens, if the first N-1 tokens appear at the end of the generated result, the logit for the phrase's last token is set to -inf. We recommend using `bad_word_tokens`, which is unambiguous (see the description of the `stop` field for the reason). Defaults to an empty list.
- **bad_word_tokens**: Same as the `bad_words` field above, but receives token sequences instead of text phrases. A `TokenSequence` is a dict with the key `"tokens"` and a `List[int]` value. This is similar to Hugging Face's [`bad_words_ids`](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/text_generation#transformers.GenerationConfig.bad_words_ids) argument.
- **include_output_logits**: Whether to include the output logits in the generation output.
- **include_output_logprobs**: Whether to include the output logprobs in the generation output.
- **forced_output_tokens**: A token sequence that is forced as the generation output. This option can be used when evaluating the model on datasets with multiple-choice problems (e.g., [HellaSwag](https://huggingface.co/datasets/hellaswag), [MMLU](https://huggingface.co/datasets/cais/mmlu)). Use this option together with `include_output_logprobs` to get the logprobs for evaluation, as in the sketch after this list.
- **eos_token**: A list of end-of-sentence (EOS) token IDs.
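
The evaluation-oriented fields above can be combined as in this minimal sketch. It assumes the SDK exposes `V1CompletionOptions` at the module path shown later in this diff; all token IDs are made-up placeholders.

```python
from periflow.schema.api.v1.completion import V1CompletionOptions

# Score a fixed candidate answer, e.g. one choice of a multiple-choice problem.
# All token IDs below are made-up placeholders.
options = V1CompletionOptions(
    forced_output_tokens=[464, 6891],  # candidate answer, pre-tokenized
    include_output_logprobs=True,      # return logprobs for the forced tokens
    eos_token=[2],                     # end-of-sentence token ID
)
```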

:::note
3 changes: 3 additions & 0 deletions periflow/schema/api/v1/codegen/__init__.py
@@ -0,0 +1,3 @@
# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved.

"""PeriFlow V1 API Schemas Auto-generated."""
40 changes: 40 additions & 0 deletions periflow/schema/api/v1/codegen/completion_pb2.py
@@ -0,0 +1,40 @@
# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved.

# pylint: disable-all

# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: completion.proto
"""Generated protocol buffer code."""
from __future__ import annotations

from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder

# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x10\x63ompletion.proto\x12\x04orca"\xdd\x0c\n\x14V1CompletionsRequest\x12\x13\n\x06stream\x18\x01 \x01(\x08H\x00\x88\x01\x01\x12\x13\n\x06prompt\x18\x03 \x01(\tH\x01\x88\x01\x01\x12\x0e\n\x06tokens\x18\x04 \x03(\x05\x12!\n\x14timeout_microseconds\x18\x05 \x01(\x05H\x02\x88\x01\x01\x12\x17\n\nmax_tokens\x18\x06 \x01(\x05H\x03\x88\x01\x01\x12\x1d\n\x10max_total_tokens\x18\x07 \x01(\x05H\x04\x88\x01\x01\x12\x17\n\nmin_tokens\x18\x08 \x01(\x05H\x05\x88\x01\x01\x12\x1d\n\x10min_total_tokens\x18\t \x01(\x05H\x06\x88\x01\x01\x12\x0e\n\x01n\x18\n \x01(\x05H\x07\x88\x01\x01\x12\x16\n\tnum_beams\x18\x0b \x01(\x05H\x08\x88\x01\x01\x12\x1b\n\x0elength_penalty\x18\x0c \x01(\x02H\t\x88\x01\x01\x12\x1b\n\x0e\x65\x61rly_stopping\x18\x0f \x01(\x08H\n\x88\x01\x01\x12\x1c\n\x0fno_repeat_ngram\x18\x11 \x01(\x05H\x0b\x88\x01\x01\x12$\n\x17\x65ncoder_no_repeat_ngram\x18\x12 \x01(\x05H\x0c\x88\x01\x01\x12\x1f\n\x12repetition_penalty\x18\x13 \x01(\x02H\r\x88\x01\x01\x12\'\n\x1a\x65ncoder_repetition_penalty\x18" \x01(\x02H\x0e\x88\x01\x01\x12\x18\n\x0btemperature\x18\x14 \x01(\x02H\x0f\x88\x01\x01\x12\x12\n\x05top_k\x18\x15 \x01(\x05H\x10\x88\x01\x01\x12\x12\n\x05top_p\x18\x16 \x01(\x02H\x11\x88\x01\x01\x12\x0c\n\x04stop\x18\x17 \x03(\t\x12=\n\x0bstop_tokens\x18\x18 \x03(\x0b\x32(.orca.V1CompletionsRequest.TokenSequence\x12\x0c\n\x04seed\x18\x1a \x03(\x04\x12\x1e\n\x16token_index_to_replace\x18\x1b \x03(\x05\x12\x1c\n\x14\x65mbedding_to_replace\x18\x1c \x03(\x02\x12H\n\x10\x62\x65\x61m_search_type\x18\x1d \x01(\x0e\x32).orca.V1CompletionsRequest.BeamSearchTypeH\x12\x88\x01\x01\x12*\n\x1d\x62\x65\x61m_compat_pre_normalization\x18\x1e \x01(\x08H\x13\x88\x01\x01\x12.\n!beam_compat_no_post_normalization\x18\x1f \x01(\x08H\x14\x88\x01\x01\x12\x11\n\tbad_words\x18 \x03(\t\x12\x41\n\x0f\x62\x61\x64_word_tokens\x18! \x03(\x0b\x32(.orca.V1CompletionsRequest.TokenSequence\x12"\n\x15include_output_logits\x18/ \x01(\x08H\x15\x88\x01\x01\x12$\n\x17include_output_logprobs\x18\x32 \x01(\x08H\x16\x88\x01\x01\x12\x1c\n\x14\x66orced_output_tokens\x18\x33 \x03(\x05\x12\x11\n\teos_token\x18. \x03(\x05\x1a\x1f\n\rTokenSequence\x12\x0e\n\x06tokens\x18\x01 \x03(\x05"G\n\x0e\x42\x65\x61mSearchType\x12\x11\n\rDETERMINISTIC\x10\x00\x12\x0e\n\nSTOCHASTIC\x10\x01\x12\x12\n\x0eNAIVE_SAMPLING\x10\x02\x42\t\n\x07_streamB\t\n\x07_promptB\x17\n\x15_timeout_microsecondsB\r\n\x0b_max_tokensB\x13\n\x11_max_total_tokensB\r\n\x0b_min_tokensB\x13\n\x11_min_total_tokensB\x04\n\x02_nB\x0c\n\n_num_beamsB\x11\n\x0f_length_penaltyB\x11\n\x0f_early_stoppingB\x12\n\x10_no_repeat_ngramB\x1a\n\x18_encoder_no_repeat_ngramB\x15\n\x13_repetition_penaltyB\x1d\n\x1b_encoder_repetition_penaltyB\x0e\n\x0c_temperatureB\x08\n\x06_top_kB\x08\n\x06_top_pB\x13\n\x11_beam_search_typeB \n\x1e_beam_compat_pre_normalizationB$\n"_beam_compat_no_post_normalizationB\x18\n\x16_include_output_logitsB\x1a\n\x18_include_output_logprobs"3\n\x11V1TokenizeRequest\x12\x13\n\x06prompt\x18\x02 \x01(\tH\x00\x88\x01\x01\x42\t\n\x07_prompt"%\n\x13V1DetokenizeRequest\x12\x0e\n\x06tokens\x18\x02 \x03(\x05\x62\x06proto3'
)

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "completion_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_V1COMPLETIONSREQUEST"]._serialized_start = 27
_globals["_V1COMPLETIONSREQUEST"]._serialized_end = 1656
_globals["_V1COMPLETIONSREQUEST_TOKENSEQUENCE"]._serialized_start = 1090
_globals["_V1COMPLETIONSREQUEST_TOKENSEQUENCE"]._serialized_end = 1121
_globals["_V1COMPLETIONSREQUEST_BEAMSEARCHTYPE"]._serialized_start = 1123
_globals["_V1COMPLETIONSREQUEST_BEAMSEARCHTYPE"]._serialized_end = 1194
_globals["_V1TOKENIZEREQUEST"]._serialized_start = 1658
_globals["_V1TOKENIZEREQUEST"]._serialized_end = 1709
_globals["_V1DETOKENIZEREQUEST"]._serialized_start = 1711
_globals["_V1DETOKENIZEREQUEST"]._serialized_end = 1748
# @@protoc_insertion_point(module_scope)
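
A small round-trip sketch with the generated module above, exercising the newly added fields; the token IDs and embedding values are placeholders.

```python
from periflow.schema.api.v1.codegen import completion_pb2

# Build a request that replaces the embedding of the token at index 1
# with a caller-supplied vector (values are placeholders).
req = completion_pb2.V1CompletionsRequest(
    tokens=[5, 17, 42],
    token_index_to_replace=[1],
    embedding_to_replace=[0.1, 0.2, 0.3],
    stop_tokens=[completion_pb2.V1CompletionsRequest.TokenSequence(tokens=[2])],
    beam_search_type=completion_pb2.V1CompletionsRequest.DETERMINISTIC,
)

payload = req.SerializeToString()  # bytes for an application/protobuf body

# Parsing the bytes back recovers the same message.
parsed = completion_pb2.V1CompletionsRequest()
parsed.ParseFromString(payload)
assert list(parsed.token_index_to_replace) == [1]
```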
192 changes: 192 additions & 0 deletions periflow/schema/api/v1/codegen/completion_pb2.pyi
@@ -0,0 +1,192 @@
# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved.

from __future__ import annotations

from typing import ClassVar as _ClassVar
from typing import Iterable as _Iterable
from typing import Mapping as _Mapping
from typing import Optional as _Optional
from typing import Union as _Union

from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf.internal import containers as _containers
from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper

DESCRIPTOR: _descriptor.FileDescriptor

class V1CompletionsRequest(_message.Message):
__slots__ = [
"stream",
"prompt",
"tokens",
"timeout_microseconds",
"max_tokens",
"max_total_tokens",
"min_tokens",
"min_total_tokens",
"n",
"num_beams",
"length_penalty",
"early_stopping",
"no_repeat_ngram",
"encoder_no_repeat_ngram",
"repetition_penalty",
"encoder_repetition_penalty",
"temperature",
"top_k",
"top_p",
"stop",
"stop_tokens",
"seed",
"token_index_to_replace",
"embedding_to_replace",
"beam_search_type",
"beam_compat_pre_normalization",
"beam_compat_no_post_normalization",
"bad_words",
"bad_word_tokens",
"include_output_logits",
"include_output_logprobs",
"forced_output_tokens",
"eos_token",
]

class BeamSearchType(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
__slots__ = []
DETERMINISTIC: _ClassVar[V1CompletionsRequest.BeamSearchType]
STOCHASTIC: _ClassVar[V1CompletionsRequest.BeamSearchType]
NAIVE_SAMPLING: _ClassVar[V1CompletionsRequest.BeamSearchType]
DETERMINISTIC: V1CompletionsRequest.BeamSearchType
STOCHASTIC: V1CompletionsRequest.BeamSearchType
NAIVE_SAMPLING: V1CompletionsRequest.BeamSearchType

class TokenSequence(_message.Message):
__slots__ = ["tokens"]
TOKENS_FIELD_NUMBER: _ClassVar[int]
tokens: _containers.RepeatedScalarFieldContainer[int]
def __init__(self, tokens: _Optional[_Iterable[int]] = ...) -> None: ...
STREAM_FIELD_NUMBER: _ClassVar[int]
PROMPT_FIELD_NUMBER: _ClassVar[int]
TOKENS_FIELD_NUMBER: _ClassVar[int]
TIMEOUT_MICROSECONDS_FIELD_NUMBER: _ClassVar[int]
MAX_TOKENS_FIELD_NUMBER: _ClassVar[int]
MAX_TOTAL_TOKENS_FIELD_NUMBER: _ClassVar[int]
MIN_TOKENS_FIELD_NUMBER: _ClassVar[int]
MIN_TOTAL_TOKENS_FIELD_NUMBER: _ClassVar[int]
N_FIELD_NUMBER: _ClassVar[int]
NUM_BEAMS_FIELD_NUMBER: _ClassVar[int]
LENGTH_PENALTY_FIELD_NUMBER: _ClassVar[int]
EARLY_STOPPING_FIELD_NUMBER: _ClassVar[int]
NO_REPEAT_NGRAM_FIELD_NUMBER: _ClassVar[int]
ENCODER_NO_REPEAT_NGRAM_FIELD_NUMBER: _ClassVar[int]
REPETITION_PENALTY_FIELD_NUMBER: _ClassVar[int]
ENCODER_REPETITION_PENALTY_FIELD_NUMBER: _ClassVar[int]
TEMPERATURE_FIELD_NUMBER: _ClassVar[int]
TOP_K_FIELD_NUMBER: _ClassVar[int]
TOP_P_FIELD_NUMBER: _ClassVar[int]
STOP_FIELD_NUMBER: _ClassVar[int]
STOP_TOKENS_FIELD_NUMBER: _ClassVar[int]
SEED_FIELD_NUMBER: _ClassVar[int]
TOKEN_INDEX_TO_REPLACE_FIELD_NUMBER: _ClassVar[int]
EMBEDDING_TO_REPLACE_FIELD_NUMBER: _ClassVar[int]
BEAM_SEARCH_TYPE_FIELD_NUMBER: _ClassVar[int]
BEAM_COMPAT_PRE_NORMALIZATION_FIELD_NUMBER: _ClassVar[int]
BEAM_COMPAT_NO_POST_NORMALIZATION_FIELD_NUMBER: _ClassVar[int]
BAD_WORDS_FIELD_NUMBER: _ClassVar[int]
BAD_WORD_TOKENS_FIELD_NUMBER: _ClassVar[int]
INCLUDE_OUTPUT_LOGITS_FIELD_NUMBER: _ClassVar[int]
INCLUDE_OUTPUT_LOGPROBS_FIELD_NUMBER: _ClassVar[int]
FORCED_OUTPUT_TOKENS_FIELD_NUMBER: _ClassVar[int]
EOS_TOKEN_FIELD_NUMBER: _ClassVar[int]
stream: bool
prompt: str
tokens: _containers.RepeatedScalarFieldContainer[int]
timeout_microseconds: int
max_tokens: int
max_total_tokens: int
min_tokens: int
min_total_tokens: int
n: int
num_beams: int
length_penalty: float
early_stopping: bool
no_repeat_ngram: int
encoder_no_repeat_ngram: int
repetition_penalty: float
encoder_repetition_penalty: float
temperature: float
top_k: int
top_p: float
stop: _containers.RepeatedScalarFieldContainer[str]
stop_tokens: _containers.RepeatedCompositeFieldContainer[
V1CompletionsRequest.TokenSequence
]
seed: _containers.RepeatedScalarFieldContainer[int]
token_index_to_replace: _containers.RepeatedScalarFieldContainer[int]
embedding_to_replace: _containers.RepeatedScalarFieldContainer[float]
beam_search_type: V1CompletionsRequest.BeamSearchType
beam_compat_pre_normalization: bool
beam_compat_no_post_normalization: bool
bad_words: _containers.RepeatedScalarFieldContainer[str]
bad_word_tokens: _containers.RepeatedCompositeFieldContainer[
V1CompletionsRequest.TokenSequence
]
include_output_logits: bool
include_output_logprobs: bool
forced_output_tokens: _containers.RepeatedScalarFieldContainer[int]
eos_token: _containers.RepeatedScalarFieldContainer[int]
def __init__(
self,
stream: bool = ...,
prompt: _Optional[str] = ...,
tokens: _Optional[_Iterable[int]] = ...,
timeout_microseconds: _Optional[int] = ...,
max_tokens: _Optional[int] = ...,
max_total_tokens: _Optional[int] = ...,
min_tokens: _Optional[int] = ...,
min_total_tokens: _Optional[int] = ...,
n: _Optional[int] = ...,
num_beams: _Optional[int] = ...,
length_penalty: _Optional[float] = ...,
early_stopping: bool = ...,
no_repeat_ngram: _Optional[int] = ...,
encoder_no_repeat_ngram: _Optional[int] = ...,
repetition_penalty: _Optional[float] = ...,
encoder_repetition_penalty: _Optional[float] = ...,
temperature: _Optional[float] = ...,
top_k: _Optional[int] = ...,
top_p: _Optional[float] = ...,
stop: _Optional[_Iterable[str]] = ...,
stop_tokens: _Optional[
_Iterable[_Union[V1CompletionsRequest.TokenSequence, _Mapping]]
] = ...,
seed: _Optional[_Iterable[int]] = ...,
token_index_to_replace: _Optional[_Iterable[int]] = ...,
embedding_to_replace: _Optional[_Iterable[float]] = ...,
beam_search_type: _Optional[
_Union[V1CompletionsRequest.BeamSearchType, str]
] = ...,
beam_compat_pre_normalization: bool = ...,
beam_compat_no_post_normalization: bool = ...,
bad_words: _Optional[_Iterable[str]] = ...,
bad_word_tokens: _Optional[
_Iterable[_Union[V1CompletionsRequest.TokenSequence, _Mapping]]
] = ...,
include_output_logits: bool = ...,
include_output_logprobs: bool = ...,
forced_output_tokens: _Optional[_Iterable[int]] = ...,
eos_token: _Optional[_Iterable[int]] = ...,
) -> None: ...

class V1TokenizeRequest(_message.Message):
__slots__ = ["prompt"]
PROMPT_FIELD_NUMBER: _ClassVar[int]
prompt: str
def __init__(self, prompt: _Optional[str] = ...) -> None: ...

class V1DetokenizeRequest(_message.Message):
__slots__ = ["tokens"]
TOKENS_FIELD_NUMBER: _ClassVar[int]
tokens: _containers.RepeatedScalarFieldContainer[int]
def __init__(self, tokens: _Optional[_Iterable[int]] = ...) -> None: ...
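
The stub above also types the tokenize and detokenize messages; a minimal construction sketch (the token IDs are placeholders):

```python
from periflow.schema.api.v1.codegen import completion_pb2

tokenize_req = completion_pb2.V1TokenizeRequest(prompt="Hello, world!")
detokenize_req = completion_pb2.V1DetokenizeRequest(tokens=[15496, 11, 995])

# Like V1CompletionsRequest, both serialize to bytes suitable for an
# application/protobuf request body.
assert isinstance(tokenize_req.SerializeToString(), bytes)
```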
6 changes: 6 additions & 0 deletions periflow/schema/api/v1/completion.py
@@ -44,6 +44,12 @@ class V1CompletionOptions(BaseModel):
stop: Optional[List[str]] = None # List of stop words.
stop_tokens: Optional[List[TokenSequence]] = None # List of stop tokens.
seed: Optional[List[int]] = None # Seed.
token_index_to_replace: Optional[
List[int]
] = None # List of token indices where to replace embeddings.
embedding_to_replace: Optional[
List[float]
] = None # List of flattened embedding vectors to replace the tokens.
beam_search_type: Optional[BeamSearchType] = None # Beam search type.
beam_compat_pre_normalization: Optional[bool] = None
beam_compat_no_post_normalization: Optional[bool] = None