diff --git a/docs/docs/cli/deployment/create.mdx b/docs/docs/cli/deployment/create.mdx
index d8b483e6..ccc57607 100644
--- a/docs/docs/cli/deployment/create.mdx
+++ b/docs/docs/cli/deployment/create.mdx
@@ -14,9 +14,9 @@
 The deployment settings are described in a configuration YAML file, and the path
 passed to the `-f` option. The following is an example YAML file:
 
 ```yaml
-orca_config:
-  max_batch_size: 384
-  max_token_count: 12288
+max_batch_size: 384
+max_token_count: 12288
+max_num_tokens_to_replace: 0
 ```
 :::tip
@@ -68,7 +68,8 @@ And a TokenSequence type is a dict with the key 'tokens' and the value type List
 | **`--name`**, **`-n`** | TEXT | The name of deployment. | - | ✅ |
 | **`--cloud`**, **`-c`** | CHOICE: [aws, azure, gcp] | Type of cloud. | - | ✅ |
 | **`--region`**, **`-r`** | TEXT | Region of cloud. | - | ✅ |
-| **`--vm-type`**, **`-v`** | CHOICE: [g5.xlarge, a2-highgpu-1g, a2-ultragpu-1g, a2-ultragpu-2g, a2-ultragpu-4g, a2-ultragpu-8g] | The VM type for the deployment. | - | ✅ |
+| **`--gpu-type`**, **`-g`** | CHOICE: [a10g, a100, a100-80g] | The GPU type for the deployment. | - | ✅ |
+| **`--num-gpus`**, **`-ng`** | INTEGER | The number of GPUs for the deployment. Equal to the tensor parallelism degree. | - | ✅ |
 | `--config-file`, `-f` | FILENAME | Path to configuration file. | None | ❌ |
 | `--type`, `-t` | CHOICE: [dev, prod] | Type of deployment. | DeploymentType.PROD | ❌ |
 | `--description`, `-d` | TEXT | Deployment description. | None | ❌ |
diff --git a/docs/docs/cli/vm/_category_.json b/docs/docs/cli/gpu/_category_.json
similarity index 68%
rename from docs/docs/cli/vm/_category_.json
rename to docs/docs/cli/gpu/_category_.json
index 08074b22..6a93aac0 100644
--- a/docs/docs/cli/vm/_category_.json
+++ b/docs/docs/cli/gpu/_category_.json
@@ -1,5 +1,5 @@
 {
-  "label": "pf vm",
+  "label": "pf gpu",
   "collapsible": true,
   "collapsed": false
 }
\ No newline at end of file
diff --git a/docs/docs/cli/gpu/list.mdx b/docs/docs/cli/gpu/list.mdx
new file mode 100644
index 00000000..131a7fdf
--- /dev/null
+++ b/docs/docs/cli/gpu/list.mdx
@@ -0,0 +1,11 @@
+# pf gpu list
+
+## Usage
+
+```bash
+pf gpu list
+```
+
+## Summary
+
+List available GPUs.
diff --git a/docs/docs/cli/vm/list.mdx b/docs/docs/cli/vm/list.mdx
deleted file mode 100644
index 67249930..00000000
--- a/docs/docs/cli/vm/list.mdx
+++ /dev/null
@@ -1,11 +0,0 @@
-# pf vm list
-
-## Usage
-
-```bash
-pf vm list
-```
-
-## Summary
-
-List up available VMs.
diff --git a/docs/docs/sdk/resource/deployment.mdx b/docs/docs/sdk/resource/deployment.mdx
index b54c1a26..72ac60e4 100644
--- a/docs/docs/sdk/resource/deployment.mdx
+++ b/docs/docs/sdk/resource/deployment.mdx
@@ -19,7 +19,8 @@ create(
     name: str,
     cloud: CloudType,
     region: str,
-    vm_type: VMType,
+    gpu_type: GpuType,
+    num_gpus: int,
     config: Dict[str, Any],
     deployment_type: DeploymentType = DeploymentType.PROD,
     description: Optional[str] = None,
@@ -41,6 +42,7 @@ Creates a new deployment.
 | `name` | `str` | The name of deployment. | - | ❌ |
 | `cloud` | `CloudType` | Type of cloud provider. | - | ❌ |
 | `region` | `str` | Cloud region to create a deployment. | - | ❌ |
-| `vm_type` | `VMType` | Type of VM. | - | ❌ |
+| `gpu_type` | `GpuType` | Type of GPU. | - | ❌ |
+| `num_gpus` | `int` | The number of GPUs. | - | ❌ |
 | `config` | `Dict[str, Any]` | Deployment configuration. | - | ❌ |
 | `deployment_type` | `DeploymentType` | Type of deployment. Defaults to DeploymentType.PROD. | `DeploymentType.PROD` | ✅ |
@@ -82,13 +85,15 @@ pf.init(
 config = {
     "max_batch_size": 256,
     "max_token_count": 8146,
+    "max_num_tokens_to_replace": 0,
 }
 deployment = pf.Deployment.create(
     checkpoint_id="YOUR_CHECKPOINT_ID",
     name="my-deployment",
     cloud="gcp",
     region="asia-northeast3",
-    vm_type="a2-highgpu-1g",
+    gpu_type="a100",
+    num_gpus=1,
     config=config,
 )
 ```
@@ -97,10 +102,9 @@
 The format of `config` should be:
 
 ```python
 {
-  "orca_config": {
-    "max_batch_size": Optioanl[int],
-    "max_token_count": Optioanl[int],
-  }
+  "max_batch_size": Optional[int],
+  "max_token_count": Optional[int],
+  "max_num_tokens_to_replace": Optional[int],
 }
 ```
diff --git a/docs/docs/tutorials/container/how_to_run_periflow_container.mdx b/docs/docs/tutorials/container/how_to_run_periflow_container.mdx
index 1b733067..ba126600 100644
--- a/docs/docs/tutorials/container/how_to_run_periflow_container.mdx
+++ b/docs/docs/tutorials/container/how_to_run_periflow_container.mdx
@@ -35,7 +35,7 @@
 You can use PeriFlow Container trial for a period of two weeks free of charge.
 In the next step, you will need to pull the Docker container image.
 You can find instructions on how to do this in the guide on the [PeriFlow Container webpage](https://container.periflow.ai).
-If you're new to PeriFlow Container and you don't yet have an account, sign up to the [waitlist](https://waitlist.periflow.ai/container) and we'll let you know as soon as we open up new account.
+If you're new to PeriFlow Container and you don't yet have an account, sign up for the [preview](https://preview.periflow.ai/container) and we'll let you know as soon as we open up new accounts.
 
 :::info
 ### Requirement for converting Hugging Face model checkpoints (Optional)
@@ -165,7 +165,7 @@ The following tables are the options for encoder decoder models: T5, BlenderBot
 | `--num-heads` | INT | Number of attention heads. | - | ✅ |
 | `--head-size` | INT | Attention head size | - | ✅ |
 | `--hidden-size` | INT | Hidden size of model. | `num-heads * head-size` | ❌ |
-| `--ff-intermediate-size` | INT | Feed forward intermediate size. | `hidden-size * 4` | ✅ |
+| `--ff-intermediate-size` | INT | Feed forward intermediate size. | `hidden-size * 4` | ❌ |
 | `--max-input-length` | INT | Max input token length. | - | ✅ |
 | `--max-output-length` | INT | Max output token length. | - | ✅ |
 | `--num-pos-emb-buckets` | INT | Number of position embedding buckets. Only for models with T5-style relative pos embedding. | - | |
diff --git a/periflow/cli/deployment.py b/periflow/cli/deployment.py
index d7119fd5..f748c199 100644
--- a/periflow/cli/deployment.py
+++ b/periflow/cli/deployment.py
@@ -15,7 +15,7 @@
 from dateutil.parser import parse
 
 from periflow.client.user import UserGroupProjectClient
-from periflow.enums import CloudType, DeploymentSecurityLevel, DeploymentType, VMType
+from periflow.enums import CloudType, DeploymentSecurityLevel, DeploymentType, GpuType
 from periflow.errors import (
     AuthenticationError,
     EntityTooLargeError,
@@ -63,6 +63,7 @@
         "config.region",
         "config.orca_config.max_batch_size",
         "config.orca_config.max_token_count",
+        "config.orca_config.max_num_tokens_to_replace",
     ],
     headers=[
         "ID",
@@ -86,6 +87,7 @@
         "Region",
         "Max batch size",
         "Max token count",
+        "Max num tokens to replace",
     ],
     extra_fields=["error"],
     extra_headers=["error"],
@@ -422,8 +424,14 @@ def create(
     ),
     cloud: CloudType = typer.Option(..., "--cloud", "-c", help="Type of cloud."),
     region: str = typer.Option(..., "--region", "-r", help="Region of cloud."),
-    vm_type: VMType = typer.Option(
-        ..., "--vm-type", "-v", help="The VM type for the deployment."
+    gpu_type: GpuType = typer.Option(
+        ..., "--gpu-type", "-g", help="The GPU type for the deployment."
+    ),
+    num_gpus: int = typer.Option(
+        ...,
+        "--num-gpus",
+        "-ng",
+        help="The number of GPUs for the deployment. Equal to the tensor parallelism degree.",
     ),
     config_file: Optional[typer.FileText] = typer.Option(
         None, "--config-file", "-f", help="Path to configuration file."
@@ -471,9 +479,9 @@
     passed to the `-f` option. The following is an example YAML file:
 
     ```yaml
-    orca_config:
-      max_batch_size: 384
-      max_token_count: 12288
+    max_batch_size: 384
+    max_token_count: 12288
+    max_num_tokens_to_replace: 0
     ```
 
     :::tip
@@ -523,17 +531,23 @@ def create(
     if config_file:
         try:
             config = yaml.safe_load(config_file)
-            if default_request_config_file is not None:
-                default_request_config = yaml.safe_load(default_request_config_file)
         except yaml.YAMLError as e:
             secho_error_and_exit(
                 f"Error occurred while parsing engine config file... {e}"
             )
-    else:
-        config["orca_config"] = {
-            "max_batch_size": DEFAULT_MAX_BATCH_SIZE,
-            "max_token_count": DEFAULT_MAX_TOKEN_COUNT,
-        }
+    config = config or {}
+    if "max_batch_size" not in config:
+        config["max_batch_size"] = DEFAULT_MAX_BATCH_SIZE
+    if "max_token_count" not in config:
+        config["max_token_count"] = DEFAULT_MAX_TOKEN_COUNT
+
+    if default_request_config_file is not None:
+        try:
+            default_request_config = yaml.safe_load(default_request_config_file)
+        except yaml.YAMLError as e:
+            secho_error_and_exit(
+                f"Error occurred while parsing default request config file... {e}"
+            )
 
     try:
         deployment = DeploymentAPI.create(
@@ -540,9 +554,10 @@ def create(
             checkpoint_id=checkpoint_id,
             name=name,
             deployment_type=deployment_type,
             cloud=cloud,
             region=region,
-            vm_type=vm_type,
+            gpu_type=gpu_type,
+            num_gpus=num_gpus,
             config=config,
             description=description,
             default_request_config=default_request_config,
diff --git a/periflow/cli/gpu.py b/periflow/cli/gpu.py
new file mode 100644
index 00000000..7ff90e87
--- /dev/null
+++ b/periflow/cli/gpu.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved.
+ +"""PeriFlow GPU CLI.""" + +from __future__ import annotations + +import typer + +from periflow.client.deployment import PFSVMClient +from periflow.enums import GpuType +from periflow.formatter import TableFormatter + +app = typer.Typer( + no_args_is_help=True, + context_settings={"help_option_names": ["-h", "--help"]}, + add_completion=False, +) + +serving_gpu_formatter = TableFormatter( + name="Serving GPU instances", + fields=[ + "cloud", + "region", + "gpu_type", + "supported_num_gpus", + ], + headers=[ + "Cloud", + "Region", + "GPU type", + "Supported #GPUs", + ], +) + + +# pylint: disable=redefined-builtin +@app.command() +def list(): + """List up available GPUs.""" + pfs_vm_client = PFSVMClient() + response = pfs_vm_client.list_vms() + vm_dict = {} + + def _gpu_key(nodegroup_list_dict, nodegroup) -> str: + return f'{nodegroup_list_dict["cloud"].upper()}-{nodegroup_list_dict["region"]}\ + -{nodegroup["vm"]["gpu_type"].upper()}' + + for nodegroup_list_dict in response: + for nodegroup in nodegroup_list_dict["nodegroup_list"]: + if nodegroup["vm"]["gpu_type"] in [gpu_type.value for gpu_type in GpuType]: + gpu_key = _gpu_key(nodegroup_list_dict, nodegroup) + if gpu_key in vm_dict: + vm_dict[gpu_key][ + "supported_num_gpus" + ] += f', {nodegroup["vm"]["total_gpus"]}' + else: + vm_dict[gpu_key] = { + "cloud": nodegroup_list_dict["cloud"].upper(), + "region": nodegroup_list_dict["region"], + "vm": nodegroup["vm"], + "gpu_type": nodegroup["vm"]["gpu_type"].upper(), + "supported_num_gpus": str(nodegroup["vm"]["total_gpus"]), + } + + serving_gpu_formatter.render(vm_dict.values()) diff --git a/periflow/cli/main.py b/periflow/cli/main.py index ed5812e4..04540c6c 100644 --- a/periflow/cli/main.py +++ b/periflow/cli/main.py @@ -11,7 +11,7 @@ from requests import HTTPError, Response from periflow.auth import TokenType, clear_tokens, get_token, update_token -from periflow.cli import checkpoint, credential, deployment, group, key, project, vm +from periflow.cli import checkpoint, credential, deployment, gpu, group, key, project from periflow.client.project import ProjectClient from periflow.client.user import UserClient, UserGroupClient, UserMFAClient from periflow.context import ( @@ -38,7 +38,7 @@ app.add_typer(credential.app, name="credential", help="Manage credentials") app.add_typer(checkpoint.app, name="checkpoint", help="Manage checkpoints") -app.add_typer(vm.app, name="vm", help="Manage VMs") +app.add_typer(gpu.app, name="gpu", help="Manage GPUs") app.add_typer(deployment.app, name="deployment", help="Manage deployments") app.add_typer(project.app, name="project", help="Manage projects") app.add_typer(group.app, name="org", help="Manage organizations") diff --git a/periflow/cli/vm.py b/periflow/cli/vm.py deleted file mode 100644 index c2c7151f..00000000 --- a/periflow/cli/vm.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2022-present, FriendliAI Inc. All rights reserved. 
- -"""PeriFlow VM CLI.""" - -from __future__ import annotations - -import typer - -from periflow.client.deployment import PFSVMClient -from periflow.enums import GpuType -from periflow.formatter import TableFormatter - -app = typer.Typer( - no_args_is_help=True, - context_settings={"help_option_names": ["-h", "--help"]}, - add_completion=False, -) -quota_app = typer.Typer( - no_args_is_help=True, - context_settings={"help_option_names": ["-h", "--help"]}, - add_completion=False, - deprecated=True, -) - -serving_vm_formatter = TableFormatter( - name="Serving VM instances", - fields=[ - "cloud", - "region", - "gpu_type", - "vm.name", - "vm.total_gpus", - "vm.vcpu", - "memory", - ], - headers=[ - "Cloud", - "Region", - "GPU type", - "VM type", - "GPU", - "vCPUs", - "Memory (GiB)", - ], -) - - -# pylint: disable=redefined-builtin -@app.command() -def list(): - """List up available VMs.""" - pfs_vm_client = PFSVMClient() - response = pfs_vm_client.list_vms() - - vm_dict_list = [ - { - "cloud": nodegroup_list_dict["cloud"].upper(), - "region": nodegroup_list_dict["region"], - "vm": nodegroup["vm"], - "gpu_type": nodegroup["vm"]["gpu_type"].upper(), - "memory": int(nodegroup["vm"]["cpu_memory"]), - } - for nodegroup_list_dict in response - for nodegroup in nodegroup_list_dict["nodegroup_list"] - if nodegroup["vm"]["gpu_type"] in [gpu_type.value for gpu_type in GpuType] - ] - serving_vm_formatter.render(vm_dict_list) diff --git a/periflow/configurator/deployment.py b/periflow/configurator/deployment.py index fbf18637..330f12d6 100644 --- a/periflow/configurator/deployment.py +++ b/periflow/configurator/deployment.py @@ -83,13 +83,10 @@ def validation_schema(self) -> dict: return { "type": "object", "properties": { - "orca_config": { - "type": "object", - "properties": { - "max_batch_size": {"type": "integer"}, - "max_token_count": {"type": "integer"}, - }, - }, + "max_batch_size": {"type": "integer"}, + "max_token_count": {"type": "integer"}, + "max_num_tokens_to_replace": {"type": "integer"}, }, - "required": ["orca_config"], + "minProperties": 1, + "additionalProperties": False, } diff --git a/periflow/sdk/resource/deployment.py b/periflow/sdk/resource/deployment.py index 5bb1fede..6cf82b2d 100644 --- a/periflow/sdk/resource/deployment.py +++ b/periflow/sdk/resource/deployment.py @@ -32,8 +32,8 @@ CloudType, DeploymentSecurityLevel, DeploymentType, + GpuType, ServiceTier, - VMType, ) from periflow.errors import ( AuthenticationError, @@ -48,7 +48,7 @@ from periflow.sdk.resource.base import ResourceAPI from periflow.utils.format import extract_datetime_part, extract_deployment_id_part from periflow.utils.fs import download_file, upload_file -from periflow.utils.maps import cloud_vm_map, vm_num_gpu_map +from periflow.utils.maps import cloud_gpu_map, gpu_num_map from periflow.utils.validate import validate_enums @@ -61,7 +61,8 @@ def create( name: str, cloud: CloudType, region: str, - vm_type: VMType, + gpu_type: GpuType, + num_gpus: int, config: Dict[str, Any], deployment_type: DeploymentType = DeploymentType.PROD, description: Optional[str] = None, @@ -78,6 +79,8 @@ def create( name (str): The name of deployment. cloud (CloudType): Type of cloud provider. region (str): Cloud region to create a deployment. + gpu_type (GpuType): Type of GPU. + num_gpus (int): The number of GPUs. vm_type (VMType): Type of VM. config (Dict[str, Any]): Deployment configuration. deployment_type (DeploymentType, optional): Type of deployment. Defaults to DeploymentType.PROD. 
@@ -110,13 +113,15 @@
         config = {
             "max_batch_size": 256,
             "max_token_count": 8146,
+            "max_num_tokens_to_replace": 0,
         }
         deployment = pf.Deployment.create(
             checkpoint_id="YOUR_CHECKPOINT_ID",
             name="my-deployment",
             cloud="gcp",
             region="asia-northeast3",
-            vm_type="a2-highgpu-1g",
+            gpu_type="a100",
+            num_gpus=1,
             config=config,
         )
         ```
@@ -125,10 +130,9 @@
         The format of `config` should be:
 
         ```python
         {
-          "orca_config": {
-            "max_batch_size": Optioanl[int],
-            "max_token_count": Optioanl[int],
-          }
+          "max_batch_size": Optional[int],
+          "max_token_count": Optional[int],
+          "max_num_tokens_to_replace": Optional[int],
         }
         ```
@@ -157,7 +161,7 @@
         """
         # pylint: disable=too-many-statements
         cloud = validate_enums(cloud, CloudType)
-        vm_type = validate_enums(vm_type, VMType)
+        gpu_type = validate_enums(gpu_type, GpuType)
         deployment_type = validate_enums(deployment_type, DeploymentType)
         security_level = validate_enums(security_level, DeploymentSecurityLevel)
 
@@ -187,13 +191,19 @@
                 f"Should be min_replicas('{min_replicas}') <= max_replicas('{max_replicas}')."
             )
 
-        if vm_type not in cloud_vm_map[cloud]:
+        if gpu_type not in cloud_gpu_map[cloud]:
+            raise InvalidConfigError(
+                f"GPU type {gpu_type.value} is not supported in cloud {cloud.value}."
+            )
+
+        if num_gpus not in gpu_num_map[gpu_type]:
             raise InvalidConfigError(
-                f"VM type {vm_type.value} is not supported in cloud {cloud.value}."
+                f"num_gpus={num_gpus} is not supported for GPU type {gpu_type.value}."
             )
 
         deploy_configurator = OrcaDeploymentConfigurator(config=config)
         deploy_configurator.validate()
+        config = {"orca_config": config}
 
         if default_request_config is not None:
             drc_configurator = DRCConfigurator(config=default_request_config)
@@ -240,7 +250,6 @@
                 file_client.make_misc_file_uploaded(misc_file_id=file_id)
                 config["orca_config"]["default_request_config_id"] = file_id
 
-        num_gpus = vm_num_gpu_map[vm_type]
         config["orca_config"]["num_devices"] = num_gpus
 
         config["scaler_config"] = {}
@@ -252,7 +261,7 @@
             "model_id": str(checkpoint_id),
             "deployment_type": deployment_type.value,
             "name": name,
-            "vm": {"name": vm_type.value},
+            "vm": {"gpu_type": gpu_type.value},
             "cloud": cloud.value,
             "region": region,
             "total_gpus": num_gpus,
diff --git a/periflow/utils/maps.py b/periflow/utils/maps.py
index 94245455..4f221b2c 100644
--- a/periflow/utils/maps.py
+++ b/periflow/utils/maps.py
@@ -8,7 +8,7 @@
 
 from pydantic import BaseModel
 
-from periflow.enums import CloudType, CredType, VMType
+from periflow.enums import CloudType, CredType, GpuType, VMType
 from periflow.schema.resource.v1.credential import (
     V1AzureBlobCredential,
     V1GCSCredential,
@@ -49,6 +49,20 @@
 }
 
 
+cloud_gpu_map: Dict[CloudType, list[GpuType]] = {
+    CloudType.AWS: [GpuType.A10G],
+    CloudType.AZURE: [GpuType.A100_80G],
+    CloudType.GCP: [GpuType.A100],
+}
+
+
+gpu_num_map: Dict[GpuType, list[int]] = {
+    GpuType.A10G: [1],
+    GpuType.A100_80G: [1, 2, 4, 8],
+    GpuType.A100: [1],
+}
+
+
 vm_num_gpu_map: Dict[VMType, int] = {
     VMType.G5_XLARGE: 1,
     VMType.A2_HIGHGPU_1G: 1,
diff --git a/tests/unit_tests/client/test_deployment.py b/tests/unit_tests/client/test_deployment.py
index 9287e1b8..cf49afd8 100644
--- a/tests/unit_tests/client/test_deployment.py
+++ b/tests/unit_tests/client/test_deployment.py
@@ -133,7 +133,8 @@ def test_deployment_client_create_deployment(
         "model_id": "ffffffff-ffff-ffff-ffff-ffffffffffff",
         "deployment_type": DeploymentType.DEV,
         "name": "test_deployment",
-        "vm_type": "g5.xlarge",
+        "gpu_type": "a10g",
+        "num_gpus": 1,
         "cloud": "aws",
         "region": "test_region",
     }
@@ -165,7 +166,8 @@ def test_deployment_client_create_deployment(
         "model_id": "ffffffff-ffff-ffff-ffff-ffffffffffff",
         "deployment_type": DeploymentType.DEV,
         "name": "test_deployment",
-        "vm_type": "g5.xlarge",
+        "gpu_type": "a10g",
+        "num_gpus": 1,
         "cloud": "aws",
         "region": "test_region",
         "num_replicas": 2,
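
For reference, the reworked `validation_schema` in `periflow/configurator/deployment.py` flattens the old nested `orca_config` object into top-level keys and rejects unknown keys via `additionalProperties: False`. The sketch below is a minimal illustration of that behavior, not part of the patch; it assumes validation is performed with the `jsonschema` package (the patch shows only the schema dict, not the validator that consumes it).

```python
# Minimal sketch: exercising the new flat engine-config schema.
# Assumption: the `jsonschema` package is installed and is what the
# configurator's `validation_schema` property feeds into.
from jsonschema import ValidationError, validate

ENGINE_CONFIG_SCHEMA = {
    "type": "object",
    "properties": {
        "max_batch_size": {"type": "integer"},
        "max_token_count": {"type": "integer"},
        "max_num_tokens_to_replace": {"type": "integer"},
    },
    "minProperties": 1,
    "additionalProperties": False,
}

# The new flat format passes validation.
validate(
    instance={"max_batch_size": 384, "max_token_count": 12288},
    schema=ENGINE_CONFIG_SCHEMA,
)

# The old nested format is now rejected: "orca_config" is an unknown
# top-level key, which `additionalProperties: False` forbids.
try:
    validate(
        instance={"orca_config": {"max_batch_size": 384}},
        schema=ENGINE_CONFIG_SCHEMA,
    )
except ValidationError as err:
    print(f"rejected: {err.message}")
```

Note that `minProperties: 1` also rejects an empty config, which is consistent with the CLI filling in the `max_batch_size` and `max_token_count` defaults before `OrcaDeploymentConfigurator` runs its validation.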