diff --git a/Makefile b/Makefile
index 4954032..ccb4f4e 100644
--- a/Makefile
+++ b/Makefile
@@ -25,7 +25,7 @@ lint: check_pylint_installed
 
 .PHONY: test
 test: check_pytest_installed
-	@pytest -q -x --ignore=third_party/ --disable-warnings
+	@pytest -x --ignore=third_party/ --disable-warnings
 
 #################### pygloo install for gloo migration backend begin ####################
 
diff --git a/llumnix/backends/vllm/llm_engine.py b/llumnix/backends/vllm/llm_engine.py
index 97b3c61..b7c1ab4 100644
--- a/llumnix/backends/vllm/llm_engine.py
+++ b/llumnix/backends/vllm/llm_engine.py
@@ -111,6 +111,8 @@ def _process_model_outputs(
             scheduled_seq_groups = new_scheduled_seq_groups
             output[0].outputs = new_output
             seq_group_metadata_list = new_seq_group_metadata_list
+        for ignored_seq_group in ignored_seq_groups:
+            server_info_list.append(ignored_seq_group.server_info)
         request_outputs = super()._process_model_outputs(output, scheduled_seq_groups, ignored_seq_groups, seq_group_metadata_list)
         # TODO(ZeldaHuang) Use LlumnixRequestOutput to store llumnix output args.
         return request_outputs, server_info_list
@@ -149,7 +151,8 @@ def add_request(self, request_id: str, server_info: ServerInfo, *args, **kwargs)
         super().add_request(request_id, *args, **kwargs)
         seq_group = self.scheduler.waiting[-1]
         self.scheduler.waiting[-1] = SequenceGroupLlumnix(request_id, server_info, [seq_group.get_seqs()[0]], seq_group.sampling_params,
-                                                            seq_group.metrics.arrival_time, seq_group.lora_request, seq_group.multi_modal_data)
+                                                          seq_group.metrics.arrival_time, seq_group.lora_request, seq_group.multi_modal_data)
+        self.scheduler.scheduler_lock.release()
 
     def _put_request_output_to_server(self, request_outputs, server_infos: List[ServerInfo]) -> None:
         server_request_outputs = defaultdict(list)
diff --git a/llumnix/backends/vllm/scheduler.py b/llumnix/backends/vllm/scheduler.py
index 2de57b7..aed61fe 100644
--- a/llumnix/backends/vllm/scheduler.py
+++ b/llumnix/backends/vllm/scheduler.py
@@ -195,7 +195,7 @@ def _get_instance_info(self) -> InstanceInfo:
             instance_info.inference_type = self.running[-1].inference_type
         # TODO(ZeldaHuang) adapt chunked-prefill
         instance_info.num_batched_tokens = sum([seq_group.request_len for seq_group in self.running])\
-                                            if instance_info.inference_type == RequestInferenceType.PREFILL else len(instance_info.running_seq_lens)
+                                           if instance_info.inference_type == RequestInferenceType.PREFILL else len(instance_info.running_seq_lens)
         instance_info.finished_request_ids = [seq_group.request_id for seq_group in self.running if seq_group.is_finished()]
         return instance_info
 
@@ -205,8 +205,10 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
         self.update_instance_info_callback(self._get_instance_info())
         return seq_group_metadata_list, scheduler_outputs
 
-    @scheduler_lock
     def add_seq_group(self, *args, **kwargs):
+        # The scheduler lock is manually released at the end of LLMEngineLlumnix.add_request.
+        # pylint: disable=R1732
+        self.scheduler_lock.acquire()
         return super().add_seq_group(*args, **kwargs)
 
     @scheduler_lock
diff --git a/llumnix/backends/vllm/sequence.py b/llumnix/backends/vllm/sequence.py
index 96cd238..1c0c445 100644
--- a/llumnix/backends/vllm/sequence.py
+++ b/llumnix/backends/vllm/sequence.py
@@ -15,6 +15,7 @@
 
 from llumnix.llumlet.request import LlumnixRequest, RequestInferenceType
 
+
 class SequenceGroupLlumnix(SequenceGroup, LlumnixRequest):
     def __init__(self, request_id, server_info, *args, **kwargs) -> None:
         SequenceGroup.__init__(self, request_id, *args, **kwargs)
diff --git a/tests/backends/vllm/test_llm_engine.py b/tests/backends/vllm/test_llm_engine.py
index e6e8977..d6e4b14 100644
--- a/tests/backends/vllm/test_llm_engine.py
+++ b/tests/backends/vllm/test_llm_engine.py
@@ -97,7 +97,8 @@ def test_llm_engine_add_requset():
     engine_args = EngineArgs(model="facebook/opt-125m", worker_use_ray=True)
     llm_engine = LLMEngineLlumnix.from_engine_args(engine_args, instance_id="0", migration_config=None, latency_mem=MagicMock(sepc=LatencyMemData))
     sampling_params = SamplingParams(top_k=1, temperature=0, ignore_eos=True, max_tokens=100)
-    llm_engine.add_request("0", None,"prompt", sampling_params)
+    llm_engine.scheduler.scheduler_lock = MagicMock()
+    llm_engine.add_request("0", None, "prompt", sampling_params)
     assert len(llm_engine.scheduler.waiting) == 1
     assert llm_engine.scheduler.waiting[-1].request_id == "0"
     assert isinstance(llm_engine.scheduler.waiting[-1], LlumnixRequest)
\ No newline at end of file
diff --git a/tests/backends/vllm/utils.py b/tests/backends/vllm/utils.py
index 0558155..b49cb32 100644
--- a/tests/backends/vllm/utils.py
+++ b/tests/backends/vllm/utils.py
@@ -23,18 +23,26 @@
 from llumnix.backends.vllm.scheduler import SchedulerLlumnix
 from llumnix.backends.vllm.sequence import SequenceGroupLlumnix
 
+
+class SchedulerLlumnixTest(SchedulerLlumnix):
+    def add_seq_group(self, *args, **kwargs):
+        ret = super().add_seq_group(*args, **kwargs)
+        self.scheduler_lock.release()
+        return ret
+
+
 def initialize_scheduler(*,
                          max_num_seqs=1000,
                          max_token_budget=1000,
                          max_model_len=1000,
-                         lora_config=None) -> SchedulerLlumnix:
+                         lora_config=None) -> SchedulerLlumnixTest:
     block_size = 4
     scheduler_config = SchedulerConfig(max_token_budget, max_num_seqs, max_model_len)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
-    scheduler = SchedulerLlumnix(scheduler_config, cache_config, lora_config)
+    scheduler = SchedulerLlumnixTest(scheduler_config, cache_config, lora_config)
     scheduler.update_instance_info_callback = MagicMock()
     return scheduler
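
Note on the locking change above: `SchedulerLlumnix.add_seq_group` now acquires `scheduler_lock` without releasing it, and `LLMEngineLlumnix.add_request` releases the lock only after it has swapped the freshly queued `SequenceGroup` for a `SequenceGroupLlumnix`, so the scheduler never observes the half-initialized entry. The sketch below is a minimal, self-contained illustration of that handoff pattern; `ToyScheduler`, `ToyEngine`, and the dict used as a stand-in for a sequence group are hypothetical names for illustration, not the real vLLM/Llumnix classes.

```python
import threading


class ToyScheduler:
    """Hypothetical stand-in for SchedulerLlumnix (illustration only)."""

    def __init__(self):
        self.waiting = []
        self.scheduler_lock = threading.Lock()

    def add_seq_group(self, seq_group):
        # Acquire but do not release: the caller finishes the enqueue and
        # releases the lock once the entry is fully initialized.
        self.scheduler_lock.acquire()
        self.waiting.append(seq_group)


class ToyEngine:
    """Hypothetical stand-in for LLMEngineLlumnix (illustration only)."""

    def __init__(self):
        self.scheduler = ToyScheduler()

    def add_request(self, request_id, server_info, prompt):
        # add_seq_group leaves scheduler_lock held...
        self.scheduler.add_seq_group({"request_id": request_id, "prompt": prompt})
        # ...so this swap is invisible to any code honoring the lock...
        plain = self.scheduler.waiting[-1]
        self.scheduler.waiting[-1] = {**plain, "server_info": server_info}
        # ...and only now is the lock released, mirroring the release at the
        # end of LLMEngineLlumnix.add_request in the diff.
        self.scheduler.scheduler_lock.release()


if __name__ == "__main__":
    engine = ToyEngine()
    engine.add_request("0", server_info=None, prompt="prompt")
    assert engine.scheduler.waiting[-1]["request_id"] == "0"
    assert not engine.scheduler.scheduler_lock.locked()  # handoff completed
```

This handoff is also why the tests cooperate with it: `test_llm_engine.py` replaces `scheduler_lock` with a `MagicMock`, and `tests/backends/vllm/utils.py` adds `SchedulerLlumnixTest`, which releases the lock itself, because those paths call `add_seq_group` without going through `LLMEngineLlumnix.add_request`.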