fix

AlibabaPAI · Sep 12, 2024 · 1f7dea5
1 parent d631720
commit 1f7dea5
Show file tree

Hide file tree

Showing 10 changed files with 17 additions and 11,782 deletions.
diff --git a/Makefile b/Makefile
@@ -29,7 +29,7 @@ lint: check_pylint_installed check_pytest_installed
 
 .PHONY: test
 test: check_pytest_installed
-	@pytest -x --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
+	@pytest -x -s --ignore=third_party/ --ignore=tests/e2e_test --disable-warnings
 
 #################### pygloo install for gloo migration backend begin ####################
 

diff --git a/benchmark_len.png b/benchmark_len.png
diff --git a/conftest.py b/conftest.py
@@ -6,7 +6,7 @@
 def pytest_sessionstart(session):
     subprocess.run(["ray", "stop", "--force"], check=True)
     sleep(3)
-    subprocess.run(["ray", "start", "--head"], check=True)
+    subprocess.run(["ray", "start", "--head", "--disable-usage-stats", "--port=30050"], check=True)
     sleep(3)
 
 def pytest_sessionfinish(session, exitstatus):

diff --git a/instance_37037.out b/instance_37037.out
diff --git a/instance_37038.out b/instance_37038.out
diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py
@@ -166,6 +166,8 @@ def step(self) -> None:
         instance_info.step_id = next(self.step_counter)
         instance_info.timestamp = time.time()
         instance_info.latency = self.model_executor.last_inference_latency
+
+        self.scheduler.scheduler_lock.acquire()
         seq_groups = self.scheduler.running
         if seq_groups:
             tot_blocks = []
@@ -174,7 +176,8 @@ def step(self) -> None:
                 tot_blocks.extend(blocks)
             tot_blocks = set(tot_blocks)
             instance_info.num_blocks_last_running_request = len(tot_blocks)
-
+        self.scheduler.scheduler_lock.release()
+
         if request_outputs:
             self._put_request_outputs_to_server(request_outputs, server_infos)
         self.instance_info = instance_info
@@ -257,8 +260,8 @@ def commit_dst_request(self, backend_request: SequenceGroupLlumnix) -> None:
         logger.info("add seq {} to block table".format(seq.seq_id))
         pre_alloc_blocks = self.engine.scheduler.pre_alloc_cache_dict.pop(backend_request.request_id)
         self.engine.scheduler.block_manager.add_block_table(pre_alloc_blocks, seq.seq_id)
-        self.add_running_request(backend_request)
         backend_request.reset_migration_args()
+        self.add_running_request(backend_request)
 
     def send_blocks(self, dst_ray_actor: "ray.actor.ActorHandle", src_blocks: List[int], dst_blocks: List[int]) -> None:
         ray.get(dst_ray_actor.execute_engine_method.remote("_run_workers",

diff --git a/llumnix/backends/vllm/worker.py b/llumnix/backends/vllm/worker.py
@@ -95,7 +95,7 @@ def init_migration(self, instance_id: str, migration_config: MigrationConfig, sr
         self.instance_id = instance_id
         self.global_world_size = 0
         self.global_rank = -1
-        self.migration_config = migration_config
+        # self.migration_config = migration_config
         self.migration_backend: MigrationBackendBase = get_migration_backend(migration_config=migration_config,
                                                                              cache_engine=self.cache_engine,
                                                                              worker_handle_list=src_worker_handle_list,

diff --git a/tests/e2e_test/test_bench.py b/tests/e2e_test/test_bench.py
@@ -15,7 +15,6 @@
 import json
 import os
 import subprocess
-import unittest
 import pytest
 import ray
 import torch
@@ -36,6 +35,7 @@ def generate_bench_command(ip_ports: str, model: str, num_prompts: int, dataset_
         f"--backend vLLM "
         f"--tokenizer {model} "
         f"--trust_remote_code "
+        f"--log_filename bench_{ip_ports} "
         f"--random_prompt_count {num_prompts} "
         f"--dataset_type {dataset_type} "
         f"--dataset_path {dataset_path} "
@@ -96,21 +96,14 @@ def parse_log_file():
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model", ['/mnt/model/Qwen-7B'])
 async def test_simple_benchmark(model):
-    # clear state
-    shutdown_llumnix_service()
-
-    os.environ['HEAD_NODE_IP'] = "127.0.0.1"
-    os.environ['HEAD_NODE'] = "1"
-    launch_ray_cluster(ray_cluster_port=30050)
-
     device_count = torch.cuda.device_count()
     base_port = 37037
     for i in range(device_count):
         launch_command = generate_launch_command(result_filename=str(base_port+i)+".out",
                                                  launch_ray_cluster=False, port=base_port+i, model=model)
         subprocess.run(launch_command, shell=True, check=True)
 
-    await asyncio.sleep(90)
+    await asyncio.sleep(60)
 
     async def run_bench_command(command):
         process = await asyncio.create_subprocess_shell(command)
@@ -126,11 +119,11 @@ async def run_bench_command(command):
                                                results_filename=f"{base_port+i}.out")
         tasks.append(run_bench_command(bench_command))
 
+    await asyncio.wait(tasks, timeout=60*30)
+
     parse_log_file()
 
     shutdown_llumnix_service()
     clear_ray_state()
+    assert 1==0
     await asyncio.sleep(10)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/e2e_test/test_e2e.py b/tests/e2e_test/test_e2e.py
@@ -28,6 +28,8 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool
         f"nohup python -m llumnix.entrypoints.vllm.api_server "
         f"--host {ip} "
         f"--port {port} "
+        f"--disable-init-instance-by-manager "
+        f"--disable-fixed-node-init-instance "
         f"--initial-instances {instances_num} "
         f"--enable-migration "
         f"--model {model} "
@@ -42,7 +44,7 @@ def generate_launch_command(result_filename: str = "", launch_ray_cluster: bool
         f"--tensor-parallel-size 1 "
         f"--request-output-queue-port {1234+port} "
         f"{'--launch-ray-cluster ' if launch_ray_cluster else ''}"
-        f"{'> '+result_filename if len(result_filename)> 0 else ''} &"
+        f"{'> instance_'+result_filename if len(result_filename)> 0 else ''} &"
     )
     return command
 
@@ -107,7 +109,6 @@ async def test_e2e(model):
     }
 
     # generate llumnix outputs
-    shutdown_llumnix_service()
     launch_llumnix_service(model, max_model_len=max_model_len)
     await asyncio.sleep(60)
 

diff --git a/tools/bench_test.sh b/tools/bench_test.sh
@@ -3,4 +3,4 @@ set -ex
 
 nvidia-docker run --rm -t --net host --ipc host -v ${PWD}:/workspace -v /mnt:/mnt -w /workspace \
   registry.cn-beijing.aliyuncs.com/llumnix/llumnix-dev:20240909_action_678a439 \
-  bash -c "pip install -e . > /dev/null && pytest ./tests/e2e_test/test_bench.py"
+  bash -c "pip install -e . > /dev/null && pytest -s ./tests/e2e_test/test_bench.py"