[CI] Add unittest for llumlet and backends (#14)
ZeldaHuang committed Aug 26, 2024
1 parent 6b7b099 commit 70b8c9a
Showing 14 changed files with 744 additions and 5 deletions.
6 changes: 3 additions & 3 deletions docs/Quickstart.md
@@ -15,7 +15,7 @@ cd llumnix
make install
```

If you want to use gloo as migration backend, please install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).
If you want to use gloo as migration backend, please refer to [this link](https://github.com/ZeldaHuang/pygloo/blob/main/.github/workflows/ubuntu_basic.yml#L24C1-L26C1) to install [Bazel](https://github.com/bazelbuild/bazel) >= 5.1.0. Then, run `make pygloo` to install [pygloo](https://github.com/ZeldaHuang/pygloo).

Note: Using conda is not recommended, as it cannot properly handle pygloo's dependency on gcc libstdc++.so.6: version GLIBCXX_3.4.30.
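For reference, below is a minimal sketch of one way to satisfy the Bazel prerequisite before running `make pygloo`. It is an assumption-laden example (x86_64 Linux, `wget` available, Bazel obtained via the Bazelisk launcher) rather than the exact steps from the linked workflow; adjust it to your environment.
```
# Assumption: x86_64 Linux host; Bazelisk fetches a recent Bazel (>= 5.1.0).
wget https://github.com/bazelbuild/bazelisk/releases/latest/download/bazelisk-linux-amd64 -O /usr/local/bin/bazel
chmod +x /usr/local/bin/bazel
bazel --version        # confirm the reported version is >= 5.1.0

# Then build and install pygloo from the llumnix repository root.
make pygloo
```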

@@ -45,7 +45,7 @@ python -m llumnix.entrypoints.vllm.api_server \
Upon starting the server, Llumnix's components are automatically configured.
In addition to the server arguments provided above, it's necessary to specify both the Llumnix arguments and the vLLM arguments. For detailed configuration options, please consult the documentation for [Llumnix arguments](./Arguments.md) and [vLLM arguments](https://docs.vllm.ai/en/v0.4.2/models/engine_args.html).

2. Launch multiple servers and connect to the Llumnix cluster. Llumnix uses Ray to manage multiple vLLM servers and instances. You need to configure the following environment variables for Llumnix to correctly set up the cluster.
```
# Configure on all nodes.
export HEAD_NODE_IP=$HEAD_NODE_IP_ADDRESS
```
@@ -66,7 +66,7 @@ When you include the --launch-ray-cluster option in Llumnix's serving deployment


# Benchmarking
We provide a benchmarking example to walk you through the usage of Llumnix.
First, you should start the server to launch Llumnix and backend LLM engine instances:
```
HEAD_NODE=1 python -m llumnix.entrypoints.vllm.api_server \
```
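Once the server is up, a quick smoke test can confirm it is serving before running the benchmark. The snippet below is hypothetical: the endpoint, port, and payload fields follow vLLM's simple api_server convention and are assumptions here, so verify them against your Llumnix deployment.
```
# Hypothetical smoke test; check the actual endpoint and request fields for your version.
curl http://localhost:8000/generate \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Hello, my name is", "max_tokens": 16}'
```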
12 changes: 12 additions & 0 deletions tests/__init__.py
@@ -0,0 +1,12 @@
# Copyright (c) 2024, Alibaba Group;
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
12 changes: 12 additions & 0 deletions tests/backends/__init__.py
@@ -0,0 +1,12 @@
# Copyright (c) 2024, Alibaba Group;
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
12 changes: 12 additions & 0 deletions tests/backends/vllm/__init__.py
@@ -0,0 +1,12 @@
# Copyright (c) 2024, Alibaba Group;
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
91 changes: 91 additions & 0 deletions tests/backends/vllm/test_llm_engine.py
@@ -0,0 +1,91 @@
# Copyright (c) 2024, Alibaba Group;
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from unittest.mock import MagicMock

from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput,
                           SequenceStatus, SamplerOutput)
from vllm import EngineArgs
from vllm.engine.output_processor.single_step import SingleStepOutputProcessor
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.utils import Counter

from llumnix.backends.vllm.llm_engine import LLMEngineLlumnix
from llumnix.backends.vllm.executor import LlumnixRayGPUExecutor, SimGPUExecutor
from llumnix.backends.profiling import LatencyMemData

from .utils import create_dummy_prompt, initialize_scheduler


class MockEngine(LLMEngineLlumnix):
def __init__(self, executor_class=None, *args, **kwargs):
self.scheduler = initialize_scheduler()
detokenizer = MagicMock(spec=Detokenizer)
stop_checker = MagicMock(spec=StopChecker)
seq_counter = Counter()
self.executor_class = executor_class

        self.output_processor = SingleStepOutputProcessor(self.scheduler.scheduler_config, detokenizer, self.scheduler, seq_counter, stop_checker)


def test_llm_engine_process_model_outputs():

llm_engine = MockEngine()
_, seq_group_0 = create_dummy_prompt(
"0", prompt_length=7, block_size=4
)
_, seq_group_1 = create_dummy_prompt(
"1", prompt_length=7, block_size=4
)
llm_engine.scheduler.add_seq_group(seq_group_0)
llm_engine.scheduler.add_seq_group(seq_group_1)
metas, out = llm_engine.scheduler.schedule()

seqs = [seq_group_0.get_seqs()[0], seq_group_1.get_seqs()[0]]

outputs = [
SequenceGroupOutput(
samples=[
SequenceOutput(
parent_seq_id=seq.seq_id,
output_token=1,
logprobs={1: Logprob(0.0)},
)
],
prompt_logprobs=None,
) for seq in seqs
]
sampler_outputs = [SamplerOutput(outputs=outputs)]

scheduled_seq_groups = out.scheduled_seq_groups
    # normal case: all requests are processed
    ret = llm_engine._process_model_outputs(sampler_outputs, scheduled_seq_groups, [], metas)
assert len(ret) == 2
metas, out = llm_engine.scheduler.schedule()
scheduled_seq_groups = out.scheduled_seq_groups
    seqs[0].status = SequenceStatus.WAITING
    # migration case: a request stopped during last-stage migration is no longer processed
    ret = llm_engine._process_model_outputs(sampler_outputs, scheduled_seq_groups, [], metas)
assert len(ret) == 1

def test_llm_engine_from_engine_args():
engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
llm_engine = MockEngine.from_engine_args(engine_args, instance_id="0", migration_config=None)
assert llm_engine.executor_class == LlumnixRayGPUExecutor

    latency_data = LatencyMemData({}, {}, {})
llm_engine = MockEngine.from_engine_args(engine_args, instance_id="0", migration_config=None, latency_mem=latency_data)
assert llm_engine.executor_class == SimGPUExecutor
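To exercise the new engine tests locally, a minimal sketch (assuming pytest is installed and that vLLM, Ray, and llumnix are importable in the current environment):
```
# Run only the engine tests added in this commit.
pytest tests/backends/vllm/test_llm_engine.py -v
```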
144 changes: 144 additions & 0 deletions tests/backends/vllm/test_migration.py
@@ -0,0 +1,144 @@
# Copyright (c) 2024, Alibaba Group;
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import torch
import time
import ray
from ray.util.queue import Queue as RayQueue
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

from vllm import EngineArgs, SamplingParams
from vllm.utils import random_uuid

from llumnix.backends.vllm.llm_engine import BackendVLLM
from llumnix.llumlet.llumlet import Llumlet
from llumnix.backends.utils import BackendType
from llumnix.config import MigrationConfig
from llumnix.server_info import ServerInfo

from .test_llm_engine import MockEngine
from .utils import create_dummy_prompt

TEST_PROMPTS = ["hello world, ",
"Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n",
"Write a short story about a robot that dreams for the first time.\n",
"Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n",
"Swahili: 'The early bird catches the worm.'\n"]

class MockBackendVLLM(BackendVLLM):
def __init__(self):
self.engine = MockEngine()

class MockLlumlet(Llumlet):
def __init__(self):
self.instance_id = "0"
self.backend_engine = MockBackendVLLM()

@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
def test_migration_correctness():
ray.init(namespace="llumnix", ignore_reinit_error=True)
    engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
    id_rank_map = {"0": 0, "1": 1}
    migration_config = MigrationConfig("LCFS", "gloo", 16, 1, 4, 5, 20)
que = RayQueue(actor_options={
"scheduling_strategy": NodeAffinitySchedulingStrategy(
node_id=ray.get_runtime_context().get_node_id(),
soft=False,)
})
    server_info = ServerInfo("0", que)

    llumlet_0: Llumlet = Llumlet.from_args(
False,
True,
ray.get_runtime_context().get_node_id(),
"0",
BackendType.VLLM,
1,
migration_config,
engine_args,)

    llumlet_1: Llumlet = Llumlet.from_args(
False,
True,
ray.get_runtime_context().get_node_id(),
"1",
BackendType.VLLM,
1,
migration_config,
engine_args,
)
while True:
        res = ray.get([llumlet_0.is_ready.remote(), llumlet_1.is_ready.remote()])
if all(res):
break
    ray.get([llumlet_0.execute_engine_method.remote("_run_workers", "rebuild_migration_backend", id_rank_map, "llumnix"),
             llumlet_1.execute_engine_method.remote("_run_workers", "rebuild_migration_backend", id_rank_map, "llumnix")])
    # migrating out of an empty instance should return no requests
res = ray.get(llumlet_0.migrate_out.remote("instance_1"))
assert not res

    # helper: run the prompt once without migration for a reference output, then again with migration, and compare
def test_correctness(prompt):
sampling_params = SamplingParams(top_k=1, temperature=0, ignore_eos=True, max_tokens=100)
request_id0 = random_uuid()
llumlet_0.generate.remote(request_id0, server_info, prompt, sampling_params)
request_output_queue = que
origin_output = None
finished = False
while not finished:
qsize = ray.get(request_output_queue.actor.qsize.remote())
request_outputs = ray.get(request_output_queue.actor.get_nowait_batch.remote(qsize))
for request_output in request_outputs:
origin_output = request_output.outputs[0]
finished = request_output.finished

request_id1 = random_uuid()
llumlet_0.generate.remote(request_id1, server_info, prompt, sampling_params)
        # wait until prefill is done
while True:
if ray.get(llumlet_0.execute_engine_method.remote("get_last_running_request")):
break
# migrate request
res = ray.get(llumlet_0.migrate_out.remote("instance_1"))
assert len(res) == 1
request_output_queue = que
output = None
finished = False
while not finished:
qsize = ray.get(request_output_queue.actor.qsize.remote())
request_outputs = ray.get(request_output_queue.actor.get_nowait_batch.remote(qsize))
for request_output in request_outputs:
if request_output.request_id != request_id1:
continue
output = request_output.outputs[0]
finished = request_output.finished
assert output.text == origin_output.text
assert output.cumulative_logprob == origin_output.cumulative_logprob
for prompt in TEST_PROMPTS:
test_correctness(prompt)
ray.shutdown()

def test_clear_migration_states():
llumlet = MockLlumlet()
llumlet.backend_engine.pre_alloc("0", 1)
num_gpu_blocks = 8
block_size = 4

llumlet.clear_migration_states(is_migrate_in=True)
assert len(llumlet.backend_engine.pre_alloc("0", num_gpu_blocks)) == num_gpu_blocks
    _, seq_group = create_dummy_prompt("0", 7, block_size)
llumlet.backend_engine.add_migrating_out_request_last_stage(seq_group)
llumlet.clear_migration_states(is_migrate_in=False)
assert llumlet.backend_engine.get_last_running_request() is not None
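The migration tests are heavier than the engine tests; a sketch of a local run, assuming a machine with at least two GPUs and Ray installed (otherwise test_migration_correctness is skipped by its skipif marker):
```
# Needs >= 2 GPUs for the migration-correctness case; it is skipped otherwise.
pytest tests/backends/vllm/test_migration.py -v
```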