v0.1.16 (#26)
Co-authored-by: SeungAh Lee <seungah.lee@friendli.ai>
Co-authored-by: Sungwoo Cho <swcho@friendli.ai>
3 people authored Sep 5, 2023
1 parent b53f356 commit 0ef04d3
Showing 14 changed files with 171 additions and 131 deletions.
9 changes: 5 additions & 4 deletions docs/docs/cli/deployment/create.mdx
@@ -14,9 +14,9 @@ The deployment settings are described in a configuration YAML file, and the path
passed to the `-f` option. The following is an example YAML file:

```yaml
-orca_config:
-  max_batch_size: 384
-  max_token_count: 12288
+max_batch_size: 384
+max_token_count: 12288
+max_num_tokens_to_replace: 0
```
:::tip
@@ -68,7 +68,8 @@ And a TokenSequence type is a dict with the key 'tokens' and the value type List
| **`--name`**, **`-n`** | TEXT | The name of deployment. | - | ✅ |
| **`--cloud`**, **`-c`** | CHOICE: [aws, azure, gcp] | Type of cloud. | - | ✅ |
| **`--region`**, **`-r`** | TEXT | Region of cloud. | - | ✅ |
-| **`--vm-type`**, **`-v`** | CHOICE: [g5.xlarge, a2-highgpu-1g, a2-ultragpu-1g, a2-ultragpu-2g, a2-ultragpu-4g, a2-ultragpu-8g] | The VM type for the deployment. | - | ✅ |
+| **`--gpu-type`**, **`-g`** | CHOICE: [a10g, a100, a100-80g] | The GPU type for the deployment. | - | ✅ |
+| **`--num-gpus`**, **`-ng`** | INTEGER | The number of GPUs for the deployment. Equal to the tensor parallelism degree. | - | ✅ |
| `--config-file`, `-f` | FILENAME | Path to configuration file. | None | ❌ |
| `--type`, `-t` | CHOICE: [dev, prod] | Type of deployment. | DeploymentType.PROD | ❌ |
| `--description`, `-d` | TEXT | Deployment description. | None | ❌ |
@@ -1,5 +1,5 @@
{
-  "label": "pf vm",
+  "label": "pf gpu",
  "collapsible": true,
  "collapsed": false
}
11 changes: 11 additions & 0 deletions docs/docs/cli/gpu/list.mdx
@@ -0,0 +1,11 @@
# pf gpu list

## Usage

```bash
pf gpu list
```

## Summary

List available GPUs.
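
For readers who want to exercise the new command without a shell, here is a minimal sketch using Typer's test runner (assuming the `periflow.cli.gpu` module layout introduced in this commit, and that credentials are already configured):

```python
from typer.testing import CliRunner

from periflow.cli.gpu import app  # module added in this commit

runner = CliRunner()
# Invoke the `list` subcommand of the gpu sub-app.
result = runner.invoke(app, ["list"])
print(result.stdout)  # renders the "Serving GPU instances" table
```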
11 changes: 0 additions & 11 deletions docs/docs/cli/vm/list.mdx

This file was deleted.

16 changes: 10 additions & 6 deletions docs/docs/sdk/resource/deployment.mdx
@@ -19,7 +19,8 @@ create(
    name: str,
    cloud: CloudType,
    region: str,
-    vm_type: VMType,
+    gpu_type: GpuType,
+    num_gpus: int,
    config: Dict[str, Any],
    deployment_type: DeploymentType = DeploymentType.PROD,
    description: Optional[str] = None,
@@ -41,6 +42,8 @@ Creates a new deployment.
| `name` | `str` | The name of deployment. | - | ✅ |
| `cloud` | `CloudType` | Type of cloud provider. | - | ✅ |
| `region` | `str` | Cloud region to create a deployment. | - | ✅ |
+| `gpu_type` | `GpuType` | Type of GPU. | - | ✅ |
+| `num_gpus` | `int` | The number of GPUs. | - | ✅ |
-| `vm_type` | `VMType` | Type of VM. | - | ✅ |
| `config` | `Dict[str, Any]` | Deployment configuration. | - | ✅ |
| `deployment_type` | `DeploymentType` | Type of deployment. Defaults to DeploymentType.PROD. | `DeploymentType.PROD` | ❌ |
@@ -82,13 +85,15 @@ pf.init(
config = {
    "max_batch_size": 256,
    "max_token_count": 8146,
+    "max_num_tokens_to_replace": 0,
}
deployment = pf.Deployment.create(
    checkpoint_id="YOUR_CHECKPOINT_ID",
    name="my-deployment",
    cloud="gcp",
    region="asia-northeast3",
-    vm_type="a2-highgpu-1g",
+    gpu_type="a100",
+    num_gpus=1,
    config=config,
)
```
@@ -97,10 +102,9 @@ The format of `config` should be:

```python
{
-    "orca_config": {
-        "max_batch_size": Optional[int],
-        "max_token_count": Optional[int],
-    }
+    "max_batch_size": Optional[int],
+    "max_token_count": Optional[int],
+    "max_num_tokens_to_replace": Optional[int],
}
```
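
Since every key is `Optional`, a minimal config can set just one of them; a sketch (the omitted keys are filled with defaults by the CLI logic in `periflow/cli/deployment.py`, shown further down):

```python
# A minimal sketch: only one key set; the rest fall back to defaults.
config = {"max_batch_size": 128}
```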

@@ -35,7 +35,7 @@ You can use PeriFlow Container trial for a period of two weeks free of charge.
In the next step, you will need to pull the Docker container image.
You can find instructions on how to do this in the guide on the [PeriFlow Container webpage](https://container.periflow.ai).

-If you're new to PeriFlow Container and you don't yet have an account, sign up to the [waitlist](https://waitlist.periflow.ai/container) and we'll let you know as soon as we open up new account.
+If you're new to PeriFlow Container and you don't yet have an account, sign up for the [preview](https://preview.periflow.ai/container) and we'll let you know as soon as we open up new accounts.

:::info
### Requirement for converting Hugging Face model checkpoints (Optional)
@@ -165,7 +165,7 @@ The following tables are the options for encoder-decoder models: T5, BlenderBot
| `--num-heads` | INT | Number of attention heads. | - ||
| `--head-size` | INT | Attention head size. | - ||
| `--hidden-size` | INT | Hidden size of model. | `num-heads * head-size` ||
| `--ff-intermediate-size` | INT | Feed forward intermediate size. | `hidden-size * 4` | |
| `--max-input-length` | INT | Max input token length. | - ||
| `--max-output-length` | INT | Max output token length. | - ||
| `--num-pos-emb-buckets` | INT | Number of position embedding buckets. Only for models with T5-style relative pos embedding. | - | <ul><li> T5: ✅ </li><li> BlenderBot: ❌ </li></ul> |
43 changes: 29 additions & 14 deletions periflow/cli/deployment.py
@@ -15,7 +15,7 @@
from dateutil.parser import parse

from periflow.client.user import UserGroupProjectClient
-from periflow.enums import CloudType, DeploymentSecurityLevel, DeploymentType, VMType
+from periflow.enums import CloudType, DeploymentSecurityLevel, DeploymentType, GpuType
from periflow.errors import (
    AuthenticationError,
    EntityTooLargeError,
@@ -63,6 +63,7 @@
"config.region",
"config.orca_config.max_batch_size",
"config.orca_config.max_token_count",
"config.orca_config.max_num_tokens_to_replace",
],
headers=[
"ID",
@@ -86,6 +87,7 @@
"Region",
"Max batch size",
"Max token count",
"Max num tokens to replace",
],
extra_fields=["error"],
extra_headers=["error"],
@@ -422,8 +424,14 @@ def create(
    ),
    cloud: CloudType = typer.Option(..., "--cloud", "-c", help="Type of cloud."),
    region: str = typer.Option(..., "--region", "-r", help="Region of cloud."),
-    vm_type: VMType = typer.Option(
-        ..., "--vm-type", "-v", help="The VM type for the deployment."
+    gpu_type: GpuType = typer.Option(
+        ..., "--gpu-type", "-g", help="The GPU type for the deployment."
    ),
+    num_gpus: int = typer.Option(
+        ...,
+        "--num-gpus",
+        "-ng",
+        help="The number of GPUs for the deployment. Equal to the tensor parallelism degree.",
+    ),
    config_file: Optional[typer.FileText] = typer.Option(
        None, "--config-file", "-f", help="Path to configuration file."
@@ -471,9 +479,9 @@
passed to the `-f` option. The following is an example YAML file:
```yaml
-orca_config:
-  max_batch_size: 384
-  max_token_count: 12288
+max_batch_size: 384
+max_token_count: 12288
+max_num_tokens_to_replace: 0
```
:::tip
@@ -523,17 +531,23 @@ def create(
    if config_file:
        try:
            config = yaml.safe_load(config_file)
-            if default_request_config_file is not None:
-                default_request_config = yaml.safe_load(default_request_config_file)
        except yaml.YAMLError as e:
            secho_error_and_exit(
                f"Error occurred while parsing engine config file... {e}"
            )
-    else:
-        config["orca_config"] = {
-            "max_batch_size": DEFAULT_MAX_BATCH_SIZE,
-            "max_token_count": DEFAULT_MAX_TOKEN_COUNT,
-        }
+
+    if "max_batch_size" not in config:
+        config["max_batch_size"] = DEFAULT_MAX_BATCH_SIZE
+    if "max_token_count" not in config:
+        config["max_token_count"] = DEFAULT_MAX_TOKEN_COUNT
+
+    if default_request_config_file is not None:
+        try:
+            default_request_config = yaml.safe_load(default_request_config_file)
+        except yaml.YAMLError as e:
+            secho_error_and_exit(
+                f"Error occurred while parsing default request config file... {e}"
+            )
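
As a sanity check of the new default-filling behavior, here is a self-contained sketch (not part of the commit; the default values are hypothetical stand-ins for the package's real constants):

```python
# Hypothetical stand-ins for periflow's DEFAULT_MAX_BATCH_SIZE / DEFAULT_MAX_TOKEN_COUNT.
DEFAULT_MAX_BATCH_SIZE = 256
DEFAULT_MAX_TOKEN_COUNT = 8192

# A partial config, as might be loaded from the user's YAML file.
config = {"max_batch_size": 384}

# Mirrors the CLI logic above: only the missing keys receive defaults.
if "max_batch_size" not in config:
    config["max_batch_size"] = DEFAULT_MAX_BATCH_SIZE
if "max_token_count" not in config:
    config["max_token_count"] = DEFAULT_MAX_TOKEN_COUNT

assert config == {"max_batch_size": 384, "max_token_count": 8192}
```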

    try:
        deployment = DeploymentAPI.create(
@@ -542,7 +556,8 @@
            deployment_type=deployment_type,
            cloud=cloud,
            region=region,
-            vm_type=vm_type,
+            gpu_type=gpu_type,
+            num_gpus=num_gpus,
            config=config,
            description=description,
            default_request_config=default_request_config,
65 changes: 65 additions & 0 deletions periflow/cli/gpu.py
@@ -0,0 +1,65 @@
# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved.

"""PeriFlow GPU CLI."""

from __future__ import annotations

import typer

from periflow.client.deployment import PFSVMClient
from periflow.enums import GpuType
from periflow.formatter import TableFormatter

app = typer.Typer(
    no_args_is_help=True,
    context_settings={"help_option_names": ["-h", "--help"]},
    add_completion=False,
)

serving_gpu_formatter = TableFormatter(
    name="Serving GPU instances",
    fields=[
        "cloud",
        "region",
        "gpu_type",
        "supported_num_gpus",
    ],
    headers=[
        "Cloud",
        "Region",
        "GPU type",
        "Supported #GPUs",
    ],
)


# pylint: disable=redefined-builtin
@app.command()
def list():
    """List available GPUs."""
    pfs_vm_client = PFSVMClient()
    response = pfs_vm_client.list_vms()
    vm_dict = {}

    def _gpu_key(nodegroup_list_dict, nodegroup) -> str:
        return f'{nodegroup_list_dict["cloud"].upper()}-{nodegroup_list_dict["region"]}\
-{nodegroup["vm"]["gpu_type"].upper()}'

    for nodegroup_list_dict in response:
        for nodegroup in nodegroup_list_dict["nodegroup_list"]:
            if nodegroup["vm"]["gpu_type"] in [gpu_type.value for gpu_type in GpuType]:
                gpu_key = _gpu_key(nodegroup_list_dict, nodegroup)
                if gpu_key in vm_dict:
                    vm_dict[gpu_key][
                        "supported_num_gpus"
                    ] += f', {nodegroup["vm"]["total_gpus"]}'
                else:
                    vm_dict[gpu_key] = {
                        "cloud": nodegroup_list_dict["cloud"].upper(),
                        "region": nodegroup_list_dict["region"],
                        "vm": nodegroup["vm"],
                        "gpu_type": nodegroup["vm"]["gpu_type"].upper(),
                        "supported_num_gpus": str(nodegroup["vm"]["total_gpus"]),
                    }

    serving_gpu_formatter.render(vm_dict.values())
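
Since the grouping logic above is easy to misread, here is a standalone sketch (not part of the commit) of how `list` folds multiple node groups into one table row per (cloud, region, GPU type) key; the payload is a hypothetical stand-in for what `PFSVMClient.list_vms()` returns:

```python
# Hypothetical response payload mimicking PFSVMClient.list_vms().
response = [
    {
        "cloud": "aws",
        "region": "us-east-1",
        "nodegroup_list": [
            {"vm": {"gpu_type": "a10g", "total_gpus": 1}},
            {"vm": {"gpu_type": "a10g", "total_gpus": 4}},
        ],
    }
]

vm_dict = {}
for group in response:
    for nodegroup in group["nodegroup_list"]:
        key = f'{group["cloud"].upper()}-{group["region"]}-{nodegroup["vm"]["gpu_type"].upper()}'
        if key in vm_dict:
            # Same cloud/region/GPU type: append this node group's GPU count.
            vm_dict[key]["supported_num_gpus"] += f', {nodegroup["vm"]["total_gpus"]}'
        else:
            vm_dict[key] = {
                "cloud": group["cloud"].upper(),
                "region": group["region"],
                "gpu_type": nodegroup["vm"]["gpu_type"].upper(),
                "supported_num_gpus": str(nodegroup["vm"]["total_gpus"]),
            }

# One merged row: {'cloud': 'AWS', 'region': 'us-east-1', 'gpu_type': 'A10G',
#                  'supported_num_gpus': '1, 4'}
print(vm_dict["AWS-us-east-1-A10G"])
```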
4 changes: 2 additions & 2 deletions periflow/cli/main.py
@@ -11,7 +11,7 @@
from requests import HTTPError, Response

from periflow.auth import TokenType, clear_tokens, get_token, update_token
-from periflow.cli import checkpoint, credential, deployment, group, key, project, vm
+from periflow.cli import checkpoint, credential, deployment, gpu, group, key, project
from periflow.client.project import ProjectClient
from periflow.client.user import UserClient, UserGroupClient, UserMFAClient
from periflow.context import (
@@ -38,7 +38,7 @@

app.add_typer(credential.app, name="credential", help="Manage credentials")
app.add_typer(checkpoint.app, name="checkpoint", help="Manage checkpoints")
-app.add_typer(vm.app, name="vm", help="Manage VMs")
+app.add_typer(gpu.app, name="gpu", help="Manage GPUs")
app.add_typer(deployment.app, name="deployment", help="Manage deployments")
app.add_typer(project.app, name="project", help="Manage projects")
app.add_typer(group.app, name="org", help="Manage organizations")
67 changes: 0 additions & 67 deletions periflow/cli/vm.py

This file was deleted.

13 changes: 5 additions & 8 deletions periflow/configurator/deployment.py
@@ -83,13 +83,10 @@ def validation_schema(self) -> dict:
        return {
            "type": "object",
            "properties": {
-                "orca_config": {
-                    "type": "object",
-                    "properties": {
-                        "max_batch_size": {"type": "integer"},
-                        "max_token_count": {"type": "integer"},
-                    },
-                },
+                "max_batch_size": {"type": "integer"},
+                "max_token_count": {"type": "integer"},
+                "max_num_tokens_to_replace": {"type": "integer"},
            },
-            "required": ["orca_config"],
+            "minProperties": 1,
            "additionalProperties": False,
        }
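
As a quick check of the flattened schema, the following sketch (not part of the commit) validates configs with the `jsonschema` package; `minProperties: 1` rejects an empty config, while `additionalProperties: False` rejects the old nested `orca_config` layout:

```python
from jsonschema import ValidationError, validate

# The flattened schema from the commit above.
schema = {
    "type": "object",
    "properties": {
        "max_batch_size": {"type": "integer"},
        "max_token_count": {"type": "integer"},
        "max_num_tokens_to_replace": {"type": "integer"},
    },
    "minProperties": 1,
    "additionalProperties": False,
}

validate(instance={"max_batch_size": 384}, schema=schema)  # passes

try:
    # The pre-0.1.16 nested layout is now rejected.
    validate(instance={"orca_config": {"max_batch_size": 384}}, schema=schema)
except ValidationError as e:
    print(e.message)
```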