diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index f0c00bcaae..6040683c68 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -23,7 +23,7 @@ def create_test_prompts(
     2 requests for base model, 4 requests for the LoRA. We define 2
     different LoRA adapters (using the same model for demo purposes).
     Since we also set `max_loras=1`, the expectation is that the requests
-    with the second LoRA adapter will be ran after all requests with the
+    with the second LoRA adapter will be run after all requests with the
     first adapter have finished.
     """
     return [
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 502bfd3900..3e4d0d250a 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -31,7 +31,7 @@ class PyNcclCommunicator:
             group: the process group to work on. If None, it will use the
                 default process group.
             device: the device to bind the PyNcclCommunicator to. If None,
-                it will be bind to f"cuda:{local_rank}".
+                it will be bound to f"cuda:{local_rank}".
             library_path: the path to the NCCL library. If None, it will
                 use the default library path.
         It is the caller's responsibility to make sure each communicator
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index b89aee99c8..fc96c2ac92 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -939,8 +939,8 @@ def get_pipeline_model_parallel_group():
 def graph_capture(device: torch.device):
     """
     `graph_capture` is a context manager which should surround the code that
-    is capturing the CUDA graph. Its main purpose is to ensure that the
-    some operations will be run after the graph is captured, before the graph
+    is capturing the CUDA graph. Its main purpose is to ensure that some
+    operations will be run after the graph is captured, before the graph
     is replayed. It returns a `GraphCaptureContext` object which contains
     the necessary data for the graph capture. Currently, it only contains
     the stream that the graph capture is running on. This stream is set to the
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 73329cdf70..992f141bef 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -165,7 +165,7 @@ class PythonicToolParser(ToolParser):
                     index] += delta.function.arguments
 
             # HACK: serving_chat.py inspects the internal state of tool parsers
-            # when determining it's final streaming delta, automatically
+            # when determining its final streaming delta, automatically
             # adding autocompleted JSON.
             # These two lines avoid that nonsense while ensuring finish_reason
             # is set to tool_calls when at least one tool is called.
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index 582ae3e12c..23f618b1a5 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -7,7 +7,7 @@ import torch.nn.functional as F
 
 def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
     """
-    Compute the histogram of a int32 tensor. The bin edges are defined by the
+    Compute the histogram of an int32 tensor. The bin edges are defined by the
     min and max values, with step = 1.
     """
     assert input.dtype == torch.int32, "input must be of torch.int32 dtype."
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index 04a06e5f9d..41fd272397 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -544,7 +544,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
                                                       vision_embeddings)
             input_ids = None
 
-        # up until here we have a inputs_embeds 100% numerical identity
+        # up until here we have an inputs_embeds 100% numerical identity
         # between the OG HF Transformers implementation and ours
         hidden_states = self.llm(
             input_ids=input_ids,
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
index 0b0d66ae77..b5e4d727bf 100644
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -43,7 +43,7 @@ class ConformerEncoderLayer(nn.Module):
             if set different to 0, the number of
             depthwise_seperable_out_channel will be used as a channel_out
             of the second conv1d layer.
-            otherwise, it equal to 0, the second conv1d layer is skipped.
+            otherwise, it equals to 0, the second conv1d layer is skipped.
         depthwise_multiplier: int
             number of input_dim channels duplication. this value
             will be used to compute the hidden channels of the Conv1D.
@@ -115,7 +115,7 @@ class ConformerEncoderLayer(nn.Module):
             we recalculate activation in backward.
             default "".
         export: bool, optional
-            if set to True, it remove the padding from convolutional layers
+            if set to True, it removes the padding from convolutional layers
             and allow the onnx conversion for inference.
             default False.
         use_pt_scaled_dot_product_attention: bool, optional
@@ -686,7 +686,7 @@ class ConformerEncoder(TransformerEncoderBase):
             only work for glu_in_attention !=0
             default "swish".
         export: bool, optional
-            if set to True, it remove the padding from convolutional layers
+            if set to True, it removes the padding from convolutional layers
             and allow the onnx conversion for inference.
             default False.
         activation_checkpointing: str, optional
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py
index c4890d8427..5953550382 100644
--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
@@ -258,7 +258,7 @@ class DepthWiseSeperableConv1d(nn.Module):
             if set different to 0, the number of
             depthwise_seperable_out_channel will be used as a channel_out
             of the second conv1d layer.
-            otherwise, it equal to 0, the second conv1d layer is skipped.
+            otherwise, it equals to 0, the second conv1d layer is skipped.
         kernel_size: int
             kernel_size
         depthwise_multiplier: int
diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py
index d215e5d8bf..c06aa56744 100644
--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -1022,7 +1022,7 @@ def _extractNVMLErrorsAsClasses():
     Each NVML Error gets a new NVMLError subclass. This way try,except blocks
     can filter appropriate exceptions more easily.
 
-    NVMLError is a parent class. Each NVML_ERROR_* gets it's own subclass.
+    NVMLError is a parent class. Each NVML_ERROR_* gets its own subclass.
     e.g. NVML_ERROR_ALREADY_INITIALIZED will be turned into NVMLError_AlreadyInitialized
     '''
     this_module = sys.modules[__name__]
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index 9a7243b126..090fefa142 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 class NemotronConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a
-    [`NemotronModel`]. It is used to instantiate an Nemotron model
+    [`NemotronModel`]. It is used to instantiate a Nemotron model
     according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar
     configuration to that of the Nemotron-8B.
diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py
index 027f291154..581bed5716 100644
--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -38,7 +38,7 @@ class NemotronHConfig(PretrainedConfig):
             passed when calling [`NemotronHModel`]
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether the model's input and output word embeddings should be
-            tied. Note that this is only relevant if the model has a output
+            tied. Note that this is only relevant if the model has an output
             word embedding layer.
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py
index 557d251c45..0077a7a8ce 100644
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -55,7 +55,7 @@ class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-
 
 class OvisProcessor(ProcessorMixin):
     r"""
-    Constructs a Ovis processor which wraps a Ovis image processor and a Qwen2 tokenizer into a single processor.
+    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
     [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
     [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
     Args:
diff --git a/vllm/transformers_utils/processors/ovis2_5.py b/vllm/transformers_utils/processors/ovis2_5.py
index d3273257ff..282e9cb211 100644
--- a/vllm/transformers_utils/processors/ovis2_5.py
+++ b/vllm/transformers_utils/processors/ovis2_5.py
@@ -41,7 +41,7 @@ class Ovis2_5ProcessorKwargs(ProcessingKwargs,
 
 class Ovis2_5Processor(ProcessorMixin):
     r"""
-    Constructs a Ovis processor which wraps a Ovis image processor
+    Constructs an Ovis processor which wraps an Ovis image processor
     and a Qwen2 tokenizer into a single processor.
     [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and
     [`Qwen2TokenizerFast`].
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index fbcf2cb50d..b92e396d45 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -107,7 +107,7 @@ def _find_longest_matched_ngram_and_propose_tokens(
     longest_ngram = 0
     position = 0
 
-    # lps[0] always equal to 0, we starts with index 1
+    # lps[0] always equal to 0, we start with index 1
     prev_lps = 0
     i = 1
     while i < total_token: