Compare commits
1 Commits
codex/remo
...
woosuk/min
| Author | SHA1 | Date | |
|---|---|---|---|
| 2705f03cad |
@ -334,7 +334,8 @@ class Worker(WorkerBase):
|
|||||||
self.model_runner._dummy_run(size,
|
self.model_runner._dummy_run(size,
|
||||||
skip_eplb=True,
|
skip_eplb=True,
|
||||||
remove_lora=False)
|
remove_lora=False)
|
||||||
self.model_runner.maybe_remove_all_loras(self.model_runner.lora_config)
|
if self.lora_config is not None:
|
||||||
|
self.model_runner.maybe_remove_all_loras(self.lora_config)
|
||||||
|
|
||||||
# Warmup and tune the kernels used during model execution before
|
# Warmup and tune the kernels used during model execution before
|
||||||
# cuda graph capture.
|
# cuda graph capture.
|
||||||
@ -427,6 +428,9 @@ class Worker(WorkerBase):
|
|||||||
self,
|
self,
|
||||||
scheduler_output: "SchedulerOutput",
|
scheduler_output: "SchedulerOutput",
|
||||||
) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
|
) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
|
||||||
|
if len(get_pp_group().ranks) == 1:
|
||||||
|
return self.model_runner.execute_model(scheduler_output)
|
||||||
|
|
||||||
intermediate_tensors = None
|
intermediate_tensors = None
|
||||||
forward_pass = scheduler_output.total_num_scheduled_tokens > 0
|
forward_pass = scheduler_output.total_num_scheduled_tokens > 0
|
||||||
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
||||||
@ -445,8 +449,6 @@ class Worker(WorkerBase):
|
|||||||
|
|
||||||
output = self.model_runner.execute_model(scheduler_output,
|
output = self.model_runner.execute_model(scheduler_output,
|
||||||
intermediate_tensors)
|
intermediate_tensors)
|
||||||
if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
|
|
||||||
return output
|
|
||||||
|
|
||||||
assert isinstance(output, IntermediateTensors)
|
assert isinstance(output, IntermediateTensors)
|
||||||
parallel_config = self.vllm_config.parallel_config
|
parallel_config = self.vllm_config.parallel_config
|
||||||
|
|||||||
Reference in New Issue
Block a user