diff --git a/.gitignore b/.gitignore index e49d1d6ba6..88a42a5c0f 100644 --- a/.gitignore +++ b/.gitignore @@ -200,5 +200,5 @@ benchmarks/**/*.json actionlint shellcheck*/ -# Ingore moe/marlin_moe gen code +# Ignore moe/marlin_moe gen code csrc/moe/marlin_moe_wna16/kernel_* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a105b0e14c..e13738d671 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,12 +20,10 @@ repos: args: [--output-format, github, --fix] - id: ruff-format files: ^(.buildkite|benchmarks|examples)/.* -- repo: https://github.com/codespell-project/codespell - rev: v2.4.1 +- repo: https://github.com/crate-ci/typos + rev: v1.32.0 hooks: - - id: codespell - additional_dependencies: ['tomli'] - args: ['--toml', 'pyproject.toml'] + - id: typos - repo: https://github.com/PyCQA/isort rev: 6.0.1 hooks: diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 0257d8ff16..82862fea7f 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -137,8 +137,8 @@ FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, } template -FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data, - const int size) { +FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data, + const int size) { T max = max_data[0]; for (int i = 1; i < size; ++i) { max = max >= max_data[i] ? max : max_data[i]; @@ -634,7 +634,7 @@ struct paged_attention_v2_impl { if (partition_num == 1) continue; - reducePartitonSoftmax( + reducePartitionSoftmax( max_logits + seq_idx * num_heads * max_num_partitions + head_idx * max_num_partitions, exp_sums + seq_idx * num_heads * max_num_partitions + diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 9a613ba588..3952c43cbc 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - // non-temproal load + // non-temporal load explicit FP16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} @@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const void* ptr) : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - // non-temproal load + // non-temporal load explicit BF16Vec16(bool, void* ptr) : reg(_mm256_stream_load_si256((__m256i*)ptr)) {} @@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec { // normal load explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} - // non-temproal load + // non-temporal load explicit FP32Vec16(bool, void* ptr) : reg((__m512)_mm512_stream_load_si512(ptr)) {} @@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec { // normal load explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {} - // non-temproal load + // non-temporal load explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {} void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); } @@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec { _mm512_mask_storeu_epi8(ptr, mask, reg); } - // non-temproal save + // non-temporal save void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); } }; #endif diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 68f429fac1..a77471a7f2 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -12,7 +12,7 @@ void moe_permute( const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& topk_weights, //[n_token, topk] 
torch::Tensor& topk_ids, // [n_token, topk] - const torch::Tensor& token_expert_indicies, // [n_token, topk] + const torch::Tensor& token_expert_indices, // [n_token, topk] const std::optional& expert_map, // [n_expert] int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, @@ -27,15 +27,15 @@ void moe_permute( "expert_first_token_offset must be int64"); TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, "topk_ids must be int32"); - TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int, - "token_expert_indicies must be int32"); + TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, + "token_expert_indices must be int32"); TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, "src_row_id2dst_row_id_map must be int32"); TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, "expert_first_token_offset shape != n_local_expert+1") TORCH_CHECK( - src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(), - "token_expert_indicies shape must be same as src_row_id2dst_row_id_map"); + src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), + "token_expert_indices shape must be same as src_row_id2dst_row_id_map"); auto n_token = input.sizes()[0]; auto n_hidden = input.sizes()[1]; auto align_block_size_value = @@ -71,7 +71,7 @@ void moe_permute( expert_map_ptr, n_expert, stream); } // expert sort topk expert id and scan expert id get expert_first_token_offset - sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indicies), + sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices), get_ptr(permuted_experts_id), get_ptr(dst_row_id2src_row_id_map), get_ptr(expert_first_token_offset), n_token, @@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor, void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indicies, + const torch::Tensor& token_expert_indices, const std::optional& expert_map, int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, @@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, void moe_unpermute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& token_expert_indicies, + const torch::Tensor& token_expert_indices, const std::optional& expert_map, int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 10be47966f..dea5b1f21e 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indicies, \ + gating_output, nullptr, topk_weights, topk_indices, \ token_expert_indices, num_tokens, topk, 0, num_experts, \ stream); @@ -433,7 +433,7 @@ template void topkGatingSoftmaxKernelLauncher( const float* gating_output, float* topk_weights, - IndType* topk_indicies, + IndType* topk_indices, int* token_expert_indices, float* softmax_workspace, const int num_tokens, @@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher( moeSoftmax<<>>( gating_output, nullptr, softmax_workspace, num_experts); moeTopK<<>>( - softmax_workspace, 
nullptr, topk_weights, topk_indicies, token_expert_indices, + softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices, num_experts, topk, 0, num_experts); } } diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index a74eb3720c..d6ef4940b6 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def( "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," - "Tensor token_expert_indicies, Tensor? expert_map, int n_expert," + "Tensor token_expert_indices, Tensor? expert_map, int n_expert," "int n_local_expert," "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index 572894064d..eca5d328b0 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma { static constexpr int A_CPY_VEC = decltype(max_common_vector(tCsA, tCrA_load)){}; - static constexpr int COVERSION_WIDTH = + static constexpr int CONVERSION_WIDTH = std::min(A_CPY_VEC, int(size<0>(tCrA_mma))); auto load_A_to_registers = [&](int read_stage) { @@ -1026,8 +1026,8 @@ struct MacheteCollectiveMma { // PIPELINED MAIN LOOP // - auto convert_A = [&, a_vec = Int{}](int k_block, - int read_stage) { + auto convert_A = [&, a_vec = Int{}](int k_block, + int read_stage) { load_extra_info_to_registers(partitioned_extra_info, copy_partitions_extra_info, k_block, read_stage); diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index e31aa01626..6212570c79 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB - // then this is not goint to work! + // then this is not going to work! //---------------------------------------------------- __shared__ scalar_t s[max_lds_len]; @@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB - // then this is not goint to work! + // then this is not going to work! //---------------------------------------------------- __shared__ scalar_t s[max_lds_len]; @@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp); uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; @@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! 
if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; @@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB - // then this is not goint to work! + // then this is not going to work! //---------------------------------------------------- __shared__ scalar_t s[max_lds_len]; @@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) //---------------------------------------------------- uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; @@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; kBase = 0; - // Check whether there will be fragmenation! + // Check whether there will be fragmentation! // This will happen only for the last wave! if (m < M && (m + YTILE) >= M) { uint32_t startColumn = M - YTILE; diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index 3dcaa6373f..d053ecc8dd 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) { uint32_t const m = 1; // Set M to 1 for compression uint32_t const n = a.size(1); - // Note: For correctess, the compressed format must be invariant in: + // Note: For correctness, the compressed format must be invariant in: // - M, the flattened number of tokens // - Whether output dtype is fp16 or bf16 // - CUTLASS epilogues diff --git a/pyproject.toml b/pyproject.toml index 307878f7e3..e8c2403af0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,10 +137,6 @@ exclude = [ 'vllm/attention/ops/.*\.py$' ] -[tool.codespell] -ignore-words-list = "dout, te, indicies, subtile, ElementE" -skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*" - [tool.isort] skip_glob = [ ".buildkite/*", diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 1e4ee571f1..508056ea19 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -223,7 +223,7 @@ def test_async_tp_pass_correctness( "VLLM_USE_V1": "1", } - aysnc_tp_args = [ + async_tp_args = [ *common_args, "--tensor-parallel-size", str(tp_size), @@ -242,7 +242,7 @@ def test_async_tp_pass_correctness( ] compare_two_settings(model_id, - aysnc_tp_args, + async_tp_args, tp_args, async_tp_env, tp_env, diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index f296c81e17..93222b564e 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -437,8 +437,8 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, "enable_prefix_caching": True, }]) @pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_after_evition_start(baseline_llm_generator, - test_llm_generator): +def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, + test_llm_generator): """Verify block manager v2 with auto prefix caching could works normal even when eviction started. With APC enabled, all blocks are held by native block at the beginning. 
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 3429a858dd..4d67eea226 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -33,8 +33,8 @@ BLOCK_SIZE = 16 @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, - batch_size, seed, backend, monkeypatch): +def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, + batch_size, seed, backend, monkeypatch): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). @@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, backend, monkeypatch): """ - This is similar to test_sliding_window_retrival, however, it doesn't + This is similar to test_sliding_window_retrieval, however, it doesn't compare against the v1 block manager since v1 doesn't support chunked prefill with sliding window. diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index db78a9d556..5e8e5f9767 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -594,8 +594,8 @@ def test_decode_schedule_preempted(): # should be preempted. 1 will also be preempted. budget = create_token_budget() output = scheduler._schedule_running(budget, curr_loras) - remainig_running = scheduler.running - assert len(remainig_running) == 0 + remaining_running = scheduler.running + assert len(remaining_running) == 0 assert len(output.decode_seq_groups) == 1 assert len(output.prefill_seq_groups) == 0 assert output.decode_seq_groups[0].seq_group.request_id == "0" diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index daa4a78c93..6e32887f5e 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" assert chatml_jinja_path.exists() # Define models, templates, and their corresponding expected outputs -MODEL_TEMPLATE_GENERATON_OUTPUT = [ +MODEL_TEMPLATE_GENERATION_OUTPUT = [ ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user Hello<|im_end|> <|im_start|>assistant @@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike(): @pytest.mark.parametrize( "model,template,add_generation_prompt,continue_final_message,expected_output", - MODEL_TEMPLATE_GENERATON_OUTPUT) + MODEL_TEMPLATE_GENERATION_OUTPUT) def test_get_gen_prompt(model, template, add_generation_prompt, continue_final_message, expected_output): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index e508505c2b..7895076155 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -72,8 +72,8 @@ def test_copy_blocks( # destination blocks. 
assert 2 * num_mappings <= num_blocks src_blocks = random.sample(range(num_blocks), num_mappings) - remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) + remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining_blocks, 2 * num_mappings) block_mapping: list[tuple[int, int]] = [] for i in range(num_mappings): src = src_blocks[i] @@ -189,12 +189,12 @@ def test_reshape_and_cache( # Run the reference implementation. reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies_lst = block_indicies.cpu().tolist() + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices_lst = block_indices.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies_lst[i] + block_idx = block_indices_lst[i] block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] @@ -322,12 +322,12 @@ def test_reshape_and_cache_flash( kv_dtype=kv_cache_dtype) # Run the reference implementation. - block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies_lst = block_indicies.cpu().tolist() + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices_lst = block_indices.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies_lst[i] + block_idx = block_indices_lst[i] block_offset = block_offsets_lst[i] if kv_cache_layout == "NHD": cloned_key_cache[block_idx, block_offset, :, :] = key[i] diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py index 8efc701f3a..a2e6986460 100644 --- a/tests/kernels/attention/test_encoder_decoder_attn.py +++ b/tests/kernels/attention/test_encoder_decoder_attn.py @@ -46,7 +46,7 @@ CUDA_DEVICE = "cuda:0" MAX_DEC_SEQ_LENS = [128] MAX_ENC_SEQ_LENS = [128] -# Narrow teest-cases for unsupported-scenario +# Narrow test-cases for unsupported-scenario # tests HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]] diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index db0fdcbf5e..d1fd960bf1 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -39,10 +39,10 @@ def rotary_embedding_opcheck(rot, @pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("seq_len", [11, 1024]) @pytest.mark.parametrize("use_key", [True, False]) -@pytest.mark.parametrize("head_stride_is_contingous", [True, False]) +@pytest.mark.parametrize("head_stride_is_contiguous", [True, False]) def test_rotary_embedding_opcheck(dist_init, device, max_position, is_neox_style, rotary_dim, head_size, - seq_len, use_key, head_stride_is_contingous): + seq_len, use_key, head_stride_is_contiguous): batch_size = 1 base = 10000 num_heads = 7 @@ -52,7 +52,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, positions = torch.randint(0, max_position, (batch_size, seq_len), device=device) - head_stride = head_size + (64 if head_stride_is_contingous else 0) + head_stride = head_size + (64 if head_stride_is_contiguous else 0) 
query = torch.randn(batch_size, seq_len, @@ -72,7 +72,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, # if we have a contiguous head stride, test the alternate # [..., num_heads * head_dim] shape/layout - if head_stride_is_contingous: + if head_stride_is_contiguous: rotary_embedding_opcheck( rot, positions, query.flatten(start_dim=-2), key.flatten(start_dim=-2) if use_key else None) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index abed1252a3..ccf0ff6abd 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -107,15 +107,15 @@ def generate_random_inputs(batch_size, return A, dt, X, B, C -def generate_continous_batched_examples(example_lens_by_batch, - num_examples, - full_length, - last_taken, - exhausted, - n_heads, - d_head, - itype, - device='cuda'): +def generate_continuous_batched_examples(example_lens_by_batch, + num_examples, + full_length, + last_taken, + exhausted, + n_heads, + d_head, + itype, + device='cuda'): # this function generates a random examples of certain length # and then cut according to "example_lens_by_batch" and feed @@ -269,11 +269,10 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, exhausted: dict = {} # map: eg -> boolean indicating example is exhausted states = None - for Y_min, cu_seqlens, seq_idx, (A, dt, X, B, - C) in generate_continous_batched_examples( - cases, num_examples, seqlen, - last_taken, exhausted, n_heads, - d_head, itype): + for Y_min, cu_seqlens, seq_idx, ( + A, dt, X, B, C) in generate_continuous_batched_examples( + cases, num_examples, seqlen, last_taken, exhausted, n_heads, + d_head, itype): chunk_indices, chunk_offsets = \ _query_start_loc_to_chunk_indices_offsets( diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transformers_model.py similarity index 100% rename from tests/lora/test_transfomers_model.py rename to tests/lora/test_transformers_model.py diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py index 7d8acab5e8..b4c7718401 100644 --- a/tests/models/language/generation/test_bart.py +++ b/tests/models/language/generation/test_bart.py @@ -118,7 +118,7 @@ def run_test( # default to enforce_eager=True if enforce_eager # is left unspecified. However, the # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-exisitng + # enforce_eager=False (a behavior which a number of already-existing # decoder-only unit tests expect), so when testing an encoder/decoder # model we must explicitly specify enforce_eager=True in the VllmRunner # constructor. diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index 418471b8e5..119841470b 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str): size=(batch_size, 1), dtype=torch.int64) # The target probaility distribution is a temperature zero distribution - # with zero entroy. Since our draft token ids don't match the probability + # with zero entropy. Since our draft token ids don't match the probability # 1.0 tokens in the target distribution we will reject all of them and # fallback to the greedy sampling for selecting 1 token for each sequence. # Verify the same. 
diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index 9893946142..fd838285ab 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under various number of speculative tokens. With those tests, we can say at least, EAGLE would not break the -correctess for the target model outputs. +correctness for the target model outputs. """ import pytest diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index 064a6e10ae..bc9501bd57 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under various number of speculative tokens. With those tests, we can say at least, Medusa would not break the -correctess for the target model outputs. +correctness for the target model outputs. """ import pytest diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py index d4d4d519b7..d9c7be8ffe 100644 --- a/tests/spec_decode/e2e/test_mtp_correctness.py +++ b/tests/spec_decode/e2e/test_mtp_correctness.py @@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under various number of speculative tokens. With those tests, we can say at least, mtp would not break the -correctess for the target model outputs. +correctness for the target model outputs. """ import pytest diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index c10329a9ba..5aefc1df84 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed: * Test greedy equality under preemption * Test greedy equality under various ngram sizes / speculative sizes -With those tests, we can say at least, ngram spec would not break the correctess -for the target model outputs. +With those tests, we can say at least, ngram spec would not break the +correctness for the target model outputs. """ import pytest diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index d8882b1d94..277ea3c838 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -30,7 +30,7 @@ model_config = { ]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) -def test_sliding_window_retrival(monkeypatch, model, batch_size, seed): +def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). 
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 622ab6f35d..a0bcb8f602 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -7,7 +7,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( from .utils import create_request, create_scheduler, create_vllm_config -def test_basic_inferface(): +def test_basic_interface(): """Unit test for basic NixlConnector interface functionality.""" vllm_config = create_vllm_config() @@ -25,7 +25,7 @@ def test_basic_inferface(): scheduler.add_request(request) - # Remote Prefill, triggers NixlConnectorMetdata. + # Remote Prefill, triggers NixlConnectorMetadata. scheduler_output = scheduler.schedule() kv_connector_metadata = scheduler_output.kv_connector_metadata assert kv_connector_metadata is not None diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py index 085b2ee097..0b135613ff 100644 --- a/tests/v1/sample/test_logprobs_e2e.py +++ b/tests/v1/sample/test_logprobs_e2e.py @@ -32,7 +32,7 @@ def test_prompt_logprobs_e2e(): ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" -def test_promt_logprobs_e2e_server(): +def test_prompt_logprobs_e2e_server(): with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server: url = f"{remote_server.url_for('v1')}/completions" diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index a5e61128d1..ec33d334ab 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -209,32 +209,32 @@ def test_multi_step_model_runner_input(): received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict( tensor_dict, attn_backend=attn_backend)) - receieved_frozen_input = received_model_input.frozen_model_input + received_frozen_input = received_model_input.frozen_model_input # Check that received copy has correct values. assert isinstance(received_model_input, StatefulModelInput) - assert receieved_frozen_input.input_tokens is not None - assert (receieved_frozen_input.input_tokens == + assert received_frozen_input.input_tokens is not None + assert (received_frozen_input.input_tokens == frozen_model_input.input_tokens).all() - assert receieved_frozen_input.input_positions is not None - assert (receieved_frozen_input.input_positions == + assert received_frozen_input.input_positions is not None + assert (received_frozen_input.input_positions == frozen_model_input.input_positions).all() - assert receieved_frozen_input.multi_modal_kwargs is None + assert received_frozen_input.multi_modal_kwargs is None assert (frozen_model_input.multi_modal_kwargs == frozen_model_input.multi_modal_kwargs) - assert receieved_frozen_input.lora_requests is None - assert (receieved_frozen_input.lora_requests == + assert received_frozen_input.lora_requests is None + assert (received_frozen_input.lora_requests == frozen_model_input.lora_requests) - assert receieved_frozen_input.lora_mapping is None + assert received_frozen_input.lora_mapping is None assert ( - receieved_frozen_input.lora_mapping == frozen_model_input.lora_mapping) + received_frozen_input.lora_mapping == frozen_model_input.lora_mapping) for field in dataclasses.fields(AttentionMetadata): - assert getattr(receieved_frozen_input.attn_metadata, field.name, + assert getattr(received_frozen_input.attn_metadata, field.name, None) == getattr(attn_metadata, field.name, None) # For sampling metadata, only selected_token_indices is copied. 
- assert (receieved_frozen_input.sampling_metadata.selected_token_indices == + assert (received_frozen_input.sampling_metadata.selected_token_indices == sampling_metadata.selected_token_indices) - assert receieved_frozen_input.sampling_metadata.seq_groups is None + assert received_frozen_input.sampling_metadata.seq_groups is None # check non frozen fields assert received_model_input.is_last_step == model_input.is_last_step diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 7368ae9531..7386cdd9f7 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -116,7 +116,7 @@ def ReadTargets(log, show_all): # If ninja.exe is rudely halted then the .ninja_log file may be # corrupt. Silently continue. continue - start, end, _, name, cmdhash = parts # Ignore restat. + start, end, _, name, cmdhash = parts # Ignore restart. # Convert from integral milliseconds to float seconds. start = int(start) / 1000.0 end = int(end) / 1000.0 diff --git a/typos.toml b/typos.toml new file mode 100644 index 0000000000..f51ce2f362 --- /dev/null +++ b/typos.toml @@ -0,0 +1,179 @@ +[files] +# these files may be written in non english words +extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", + "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*", + "vllm/third_party/*"] +ignore-hidden = true +ignore-files = true +ignore-dot = true +ignore-vcs = true +ignore-global = true +ignore-parent = true + +[default] +binary = false +check-filename = false +check-file = true +unicode = true +ignore-hex = true +identifier-leading-digits = false +locale = "en" +extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw", + ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*", + ".*ot.*", ".*[Tt]h[rR].*"] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[default.extend-identifiers] +bbc5b7ede = "bbc5b7ede" +womens_doubles = "womens_doubles" +v_2nd = "v_2nd" +splitted_input = "splitted_input" +NOOPs = "NOOPs" +typ = "typ" +nin_shortcut = "nin_shortcut" +UperNetDecoder = "UperNetDecoder" +subtile = "subtile" +cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin" +SFOuput = "SFOuput" +# huggingface transformers repo uses these words +depthwise_seperable_out_channel = "depthwise_seperable_out_channel" +DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d" +depthwise_seperable_CNN = "depthwise_seperable_CNN" + +[default.extend-words] +iy = "iy" +tendencias = "tendencias" +# intel cpu features +tme = "tme" +dout = "dout" +Pn = "Pn" +arange = "arange" + +[type.py] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.py.extend-identifiers] +arange = "arange" +NDArray = "NDArray" +EOFError = "EOFError" + +[type.py.extend-words] + +[type.cpp] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.cpp.extend-identifiers] +countr_one = "countr_one" + +[type.cpp.extend-words] + +[type.rust] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.rust.extend-identifiers] +flate2 = "flate2" + +[type.rust.extend-words] +ser = "ser" + +[type.lock] +extend-glob = [] +check-file = false +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.lock.extend-identifiers] + +[type.lock.extend-words] + +[type.jl] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + 
+[type.jl.extend-identifiers] + +[type.jl.extend-words] +modul = "modul" +egals = "egals" +usig = "usig" +egal = "egal" + +[type.go] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.go.extend-identifiers] +flate = "flate" + +[type.go.extend-words] + +[type.css] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.css.extend-identifiers] +nd = "nd" + +[type.css.extend-words] + +[type.man] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.man.extend-identifiers] +Nd = "Nd" + +[type.man.extend-words] + +[type.cert] +extend-glob = [] +check-file = false +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.cert.extend-identifiers] + +[type.cert.extend-words] + +[type.sh] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.sh.extend-identifiers] +stap = "stap" +ot = "ot" + +[type.sh.extend-words] + +[type.vimscript] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.vimscript.extend-identifiers] +windo = "windo" + +[type.vimscript.extend-words] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 92de1f5efa..d6bbfbc328 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1550,10 +1550,10 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor, def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, - token_expert_indicies: torch.Tensor, + token_expert_indices: torch.Tensor, gating_output: torch.Tensor) -> None: - torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, - token_expert_indicies, gating_output) + torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, token_expert_indices, + gating_output) def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor], diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index e3f02a1936..34e059067d 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -373,7 +373,7 @@ class CommonAttentionState(AttentionState): f"Expected attn_backend name to be either 'XFORMERS'," \ f"'ROCM_FLASH', or 'FLASH_ATTN', but " \ f"got '{self.runner.attn_backend.get_name()}'" - self._add_additonal_input_buffers_for_enc_dec_model( + self._add_additional_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers @@ -427,7 +427,7 @@ class CommonAttentionState(AttentionState): attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture attn_metadata.num_encoder_tokens = 0 - def _add_additonal_input_buffers_for_enc_dec_model( + def _add_additional_input_buffers_for_enc_dec_model( self, attn_metadata, input_buffers: Dict[str, Any]): """ Saves additional input buffers specific to the encoder-decoder model diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index e5dcdf9a07..92004de030 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -40,7 +40,7 @@ class Internlm2ToolParser(ToolParser): request.skip_special_tokens = False return request - def get_argments(self, obj): + def get_arguments(self, obj): if "parameters" in obj: return obj.get("parameters") elif "arguments" in obj: 
@@ -119,9 +119,9 @@ class Internlm2ToolParser(ToolParser): # now we know we're on the same tool call and we're streaming # arguments else: - prev_arguments = self.get_argments( + prev_arguments = self.get_arguments( self.prev_tool_call_arr[self.current_tool_id]) - cur_arguments = self.get_argments(tool_call_arr) + cur_arguments = self.get_arguments(tool_call_arr) # not arguments generated if not cur_arguments and not prev_arguments: @@ -170,7 +170,7 @@ class Internlm2ToolParser(ToolParser): # check to see if the name is defined and has been sent. if so, # stream the name - otherwise keep waiting # finish by setting old and returning None as base case - tool_call_arr["arguments"] = self.get_argments(tool_call_arr) + tool_call_arr["arguments"] = self.get_arguments(tool_call_arr) self.prev_tool_call_arr = [tool_call_arr] return delta except Exception: diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 66e037a97d..3d0c583175 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1202,7 +1202,7 @@ class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA): multiple LoRA adapters with a specialized kernel. Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding - which can handle multi lora adapters in a specialied kernel. + which can handle multi lora adapters in a specialized kernel. """ def __init__(self, base_layer: RotaryEmbedding) -> None: diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 0b0a7989f3..8430cb9186 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -68,11 +68,11 @@ def convert_mapping( LoRA indices. sampler_indices: Tensor of shape [batch_size] mapping requests to LoRA indices for sampler. For generation, this will be the - same as base_indicies. For prefill, this will map requests + same as base_indices. For prefill, this will map requests to LoRA indices. sampler_indices_padded: Tensor of shape [batch_size] mapping requests to LoRA indices for sampler with padding. - Same as sampler_indicies, but -1 is replaced with + Same as sampler_indices, but -1 is replaced with max_loras. embeddings_indices: Tensor of shape [2, batch_size] mapping requests to embedding indices. 
First row is for embeddings diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 6d9ea53878..cd3b0b3907 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -319,7 +319,7 @@ class MambaMixer2(CustomOp): n_groups == 1, # if there was only one group ) intermediate_settings = (intermediate_size, 0, False) - head_setings = (self.num_heads, 0, False) + head_settings = (self.num_heads, 0, False) # - the weight already has a "weight_loader" attribute # which set_weight_attrs will raise if we do not @@ -372,7 +372,7 @@ class MambaMixer2(CustomOp): intermediate_settings, group_shard_settings, group_shard_settings, - head_setings, # for dt + head_settings, # for dt ], self.tp_size, tp_rank, diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py index 58bfb661d3..ad58a9918f 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -516,7 +516,7 @@ def _chunk_state_varlen_kernel( offs_n[None, :] * stride_chunk_states_dstate) else: - # - this seems repetitve, buts its to help the compiler + # - this seems repetitive, buts its to help the compiler if start_idx < pid_c * chunk_size: past_states_ptrs = chunk_states_ptr + ( offs_m[:, None] * stride_chunk_states_hdim + diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index a694a19174..1fdf7d174e 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -219,7 +219,7 @@ def per_token_group_quant_int8( quantized tensor along with the scaling factor used for quantization. Args: - x: The input tenosr with ndim >= 2. + x: The input tensor with ndim >= 2. group_size: The group size used for quantization. eps: The minimum to avoid dividing zero. dtype: The dype of output tensor. Note that only `torch.int8` diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 068a4e355f..3146c35a4e 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): self.target_modules.append( name.replace(rep_name, sub_name)) # Add original module name even if the module has stacked map, - # in case model has a mixture of disk-merged and disk-splitted + # in case model has a mixture of disk-merged and disk-split # weights with same last name. self.target_modules.append(name) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 0de5de5e83..804a2f1785 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module): self.num_heads = (self.total_num_heads // tensor_model_parallel_world_size) self.head_dim = hidden_size // self.total_num_heads - self.postion_embedding = position_embedding + self.position_embedding = position_embedding self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings @@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module): quant_config=quant_config, ) # Create the alibi slopes and slice them. 
- if self.postion_embedding == "ALIBI": + if self.position_embedding == "ALIBI": tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads head_end = (tp_rank + 1) * self.num_heads @@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module): ) -> torch.Tensor: qkv, _ = self.W_pack(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - if self.postion_embedding != "ALIBI": + if self.position_embedding != "ALIBI": q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 765718e575..d8c01f83ed 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.image_newline = nn.Parameter( torch.randn(self.projector_config.n_embed) * embed_std) # This is a typo in original implementation - self.view_seperator = nn.Parameter( + self.view_separator = nn.Parameter( torch.randn(self.projector_config.n_embed) * embed_std) else: raise ValueError( @@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): if self.global_view_pos == "head": global_local_features = torch.cat([ global_features, - self.view_seperator[None, :], + self.view_separator[None, :], local_features, ]) else: global_local_features = torch.cat([ local_features, - self.view_seperator[None, :], + self.view_separator[None, :], global_features, ]) diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 2219321457..d219b5228a 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -197,7 +197,7 @@ class EAGLE(nn.Module): return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B + # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a # Llama model. Here's a compatible version with the same weights: # https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 23e2517079..18cb6ea68d 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, kwargs["has_images"] = True # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. # This is a HACK. Fix this. 
- start_idices = (positions == 0).cpu().nonzero() - num_seqs = len(start_idices) + start_indices = (positions == 0).cpu().nonzero() + num_seqs = len(start_indices) seq_lens = [] for i in range(num_seqs): - start_idx = start_idices[i].item() + start_idx = start_indices[i].item() if i < num_seqs - 1: - end_idx = start_idices[i + 1].item() + end_idx = start_indices[i + 1].item() else: end_idx = len(input_ids) seq_lens.append(end_idx - start_idx) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a852be66bd..9fb73261cd 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -52,7 +52,7 @@ class Llama4MoE(nn.Module): renormalize: bool, ) -> tuple[torch.Tensor, torch.Tensor]: router_scores, router_indices = fast_topk(gating_output, topk, dim=-1) - # psuedo-standard is that the router scores are floats + # pseudo-standard is that the router scores are floats router_scores = torch.sigmoid(router_scores.float()) return (router_scores, router_indices.to(torch.int32)) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 3183c762d2..c8ad358c62 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -114,9 +114,9 @@ class MixtralMoE(nn.Module): f"Tensor parallel size {self.tp_size} is greater than " f"the number of experts {self.num_total_experts}.") # Split experts equally between ranks - self.expert_indicies = np.array_split(range( - self.num_total_experts), self.tp_size)[self.rank].tolist() - if not self.expert_indicies: + self.expert_indices = np.array_split(range(self.num_total_experts), + self.tp_size)[self.rank].tolist() + if not self.expert_indices: raise ValueError( f"Rank {self.rank} has no experts assigned to it.") @@ -125,7 +125,7 @@ class MixtralMoE(nn.Module): config.hidden_size, config.intermediate_size, quant_config=quant_config) - if idx in self.expert_indicies else None + if idx in self.expert_indices else None for idx in range(self.num_total_experts) ]) self.gate = ReplicatedLinear(config.hidden_size, @@ -146,7 +146,7 @@ class MixtralMoE(nn.Module): routing_weights /= routing_weights.sum(dim=-1, keepdim=True) final_hidden_states = None - for expert_idx in self.expert_indicies: + for expert_idx in self.expert_indices: expert_layer = self.experts[expert_idx] expert_mask = (selected_experts == expert_idx) expert_weights = (routing_weights * expert_mask).sum(dim=-1, diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 5c11d54c61..770e08aa2a 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo): def get_image_size_with_most_features(self) -> ImageSize: height, width = self.get_hf_processor().get_image_size() hs = self.get_hf_config().visual_tokenizer_config.hidden_stride - # NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code + # NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code # https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96 return ImageSize(width=width * hs * 9, height=height * hs * 9) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 533655fd52..754ddda233 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module): self.num_q_per_kv = self.num_heads // 
self.num_key_value_heads if self.tp_size > 1: assert self.num_key_value_heads % self.tp_size == 0 - self.num_kv_heads_per_partion = max( + self.num_kv_heads_per_partition = max( 1, self.num_key_value_heads // self.tp_size) self.num_heads_per_partition = self.num_heads // self.tp_size @@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module): bs_params = { 'max_seqlen': self.max_position_embeddings, 'num_heads': self.num_heads_per_partition, - "num_kv_heads": self.num_kv_heads_per_partion, + "num_kv_heads": self.num_kv_heads_per_partition, "block_size": self.sparse_block_size, "local_blocks": self.local_blocks, "vert_stride": self.vert_stride, @@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module): self.attn = Attention(self.num_heads_per_partition, self.head_dim, self.scale, - num_kv_heads=self.num_kv_heads_per_partion, + num_kv_heads=self.num_kv_heads_per_partition, cache_config=cache_config, quant_config=quant_config, blocksparse_params=bs_params, @@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module): # NOTE: this is required by RotaryEmbed, which indeed does not have to # TODO: allow 3D QK for rotary forward q = q.reshape(-1, self.head_dim * self.num_heads_per_partition) - k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) - v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion) + k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition) + v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index ae7a8a732c..0b0d66ae77 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -41,7 +41,7 @@ class ConformerEncoderLayer(nn.Module): for the last pointwise conv after swish activation. depthwise_seperable_out_channel: int if set different to 0, the number of - depthwise_seperable_out_channel will be used as a + depthwise_seperable_out_channel will be used as a channel_out of the second conv1d layer. otherwise, it equal to 0, the second conv1d layer is skipped. 
depthwise_multiplier: int @@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module): (Multi-Head Attention), 1 = typical Multi-Head Attention, 1 < attn_group_sizes < attention_heads = Grouped-Query Attention - attn_group_sizes = attenion_heads = Multi-Query Attention + attn_group_sizes = attention_heads = Multi-Query Attention """ def __init__( @@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module): 1 = typical Multi-Head Attention, 1 < attention_group_size < attention_heads = Grouped-Query Attention - attention_group_size = attenion_heads = Multi-Query Attention + attention_group_size = attention_heads = Multi-Query Attention """ def __init__( @@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase): 1 = typical Multi-Head Attention, 1 < attention_group_size < attention_heads = Grouped-Query Attention - attention_group_size = attenion_heads = Multi-Query Attention + attention_group_size = attention_heads = Multi-Query Attention """ extra_multi_layer_output_idxs: list[int] diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index dddd19c746..cdb7e0d18d 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -147,15 +147,15 @@ class mp(torch.autograd.Function): grad_at_output = grad_at_output * multiplier - grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1) - grad_at_scores_expaned.scatter_add_( + grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1) + grad_at_scores_expanded.scatter_add_( dim=-1, index=selected_experts, src=grad_at_output, ) return ( - grad_at_scores_expaned, + grad_at_scores_expanded, None, None, None, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 11a25f8515..5e61d460fa 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata( Returns: list[str]: List of item modalities in order of their positions in the input sequence. - list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from + list[PlaceholderRange]: Sorted list of all PlaceholderRanges from mm_positions. Optional[list[str]]: Sorted list of all hashes from mm_hashes if given, None otherwise. 
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index 4fe76d0df6..557d251c45 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"] + valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 17123d2b48..5860368298 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): num_decode_tokens=0, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps= - None, # FIXME(kzawora): mutli-modality will not work here + None, # FIXME(kzawora): multi-modality will not work here enable_kv_scales_calculation=False, ) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index cc0cc855e7..0680e60b52 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput): assert fmi.input_tokens.shape[0] >= self.num_seqs fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - # Update frozen_model_input::input_positons. + # Update frozen_model_input::input_positions. assert fmi.input_positions is not None assert fmi.input_positions.shape[0] >= self.num_seqs fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 5f1535271b..336bc0bcec 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -798,9 +798,9 @@ class ModelWrapper(nn.Module): """ batch_size, seq_len = token_ids.shape # Calculate the positions to sample from. - start_indicies = torch.arange( + start_indices = torch.arange( batch_size, dtype=torch.int32, device=input_lens.device) * seq_len - logits_indices = start_indicies + input_lens - 1 + logits_indices = start_indices + input_lens - 1 attn_metadata = get_forward_context().attn_metadata # FIXME(woosuk): This is a temporary hack to avoid using the existing @@ -822,14 +822,14 @@ class ModelWrapper(nn.Module): num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape slot_mapping = attn_metadata.slot_mapping slot_mapping = slot_mapping.flatten() - head_indicies = torch.arange(0, - num_kv_heads, - device=slot_mapping.device, - dtype=slot_mapping.dtype) - head_indicies *= block_size * num_blocks + head_indices = torch.arange(0, + num_kv_heads, + device=slot_mapping.device, + dtype=slot_mapping.dtype) + head_indices *= block_size * num_blocks slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view( -1, num_kv_heads) - slot_mapping = slot_mapping + head_indicies.view(1, -1) + slot_mapping = slot_mapping + head_indices.view(1, -1) slot_mapping = slot_mapping.flatten() attn_metadata.slot_mapping = slot_mapping
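For reference on the most substantive rename in this patch (token_expert_indicies -> token_expert_indices in the MoE top-k gating path), a minimal usage sketch follows. This is an illustrative sketch, not part of the patch: it assumes a CUDA build of vLLM with the compiled _moe_C extension available through vllm._custom_ops, and the shapes, dtypes, and sizes are arbitrary choices that mirror the [n_token, topk] comments in csrc/moe/moe_permute_unpermute_op.cu above.

# Hypothetical sketch: exercising the renamed token_expert_indices
# argument of vllm._custom_ops.topk_softmax (CUDA build assumed).
import torch

from vllm import _custom_ops as ops

n_token, n_expert, topk = 4, 8, 2

# Router logits produced by the gating layer (float32, on GPU).
gating_output = torch.randn(n_token, n_expert,
                            dtype=torch.float32, device="cuda")

# Output buffers that the kernel fills in place.
topk_weights = torch.empty(n_token, topk,
                           dtype=torch.float32, device="cuda")
topk_ids = torch.empty(n_token, topk, dtype=torch.int32, device="cuda")
token_expert_indices = torch.empty(n_token, topk,
                                   dtype=torch.int32, device="cuda")

ops.topk_softmax(topk_weights, topk_ids, token_expert_indices,
                 gating_output)

# topk_weights now holds the softmax-normalized routing weights and
# topk_ids the selected expert id for each token.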