From e92676ef4e69b7f8c60e201683d4bdb45ba99b48 Mon Sep 17 00:00:00 2001 From: yewentao256 Date: Wed, 27 Aug 2025 21:36:03 +0000 Subject: [PATCH] update for fp8 Signed-off-by: yewentao256 --- CMakeLists.txt | 2 +- csrc/attention/attention_kernels.cuh | 4 ++-- csrc/cache_kernels.cu | 4 ++-- csrc/layernorm_quant_kernels.cu | 2 +- csrc/quantization/activation_kernels.cu | 2 +- csrc/quantization/fused_kernels/quant_conversions.cuh | 2 +- csrc/quantization/{ => w8a8}/fp8/amd/quant_utils.cuh | 2 +- csrc/quantization/{ => w8a8}/fp8/common.cu | 2 +- csrc/quantization/{ => w8a8}/fp8/common.cuh | 0 csrc/quantization/{ => w8a8}/fp8/nvidia/quant_utils.cuh | 2 +- csrc/rocm/attention.cu | 2 +- csrc/rocm/skinny_gemms.cu | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) rename csrc/quantization/{ => w8a8}/fp8/amd/quant_utils.cuh (99%) rename csrc/quantization/{ => w8a8}/fp8/common.cu (99%) rename csrc/quantization/{ => w8a8}/fp8/common.cuh (100%) rename csrc/quantization/{ => w8a8}/fp8/nvidia/quant_utils.cuh (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45fe62e4e5..fb645b1837 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -244,7 +244,7 @@ set(VLLM_EXT_SRC "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/w8a8/int8/scaled_quant.cu" - "csrc/quantization/fp8/common.cu" + "csrc/quantization/w8a8/fp8/common.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/activation_kernels.cu" diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index 57382c1ddc..052ff168ce 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -28,10 +28,10 @@ #ifdef USE_ROCM #include - #include "../quantization/fp8/amd/quant_utils.cuh" + #include "../quantization/w8a8/fp8/amd/quant_utils.cuh" typedef __hip_bfloat16 __nv_bfloat16; #else - #include "../quantization/fp8/nvidia/quant_utils.cuh" + #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh" #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index b3a985c2d5..21381c30d0 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -8,9 +8,9 @@ #include "quantization/vectorization_utils.cuh" #ifdef USE_ROCM - #include "quantization/fp8/amd/quant_utils.cuh" + #include "quantization/w8a8/fp8/amd/quant_utils.cuh" #else - #include "quantization/fp8/nvidia/quant_utils.cuh" + #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh" #endif #include diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu index 0fd5849d96..8f074e8872 100644 --- a/csrc/layernorm_quant_kernels.cu +++ b/csrc/layernorm_quant_kernels.cu @@ -6,7 +6,7 @@ */ #include "type_convert.cuh" -#include "quantization/fp8/common.cuh" +#include "quantization/w8a8/fp8/common.cuh" #include "dispatch_utils.h" #include diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 8bc2b9bff3..74dcc5dca0 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -7,7 +7,7 @@ #include "../cuda_compat.h" #include "dispatch_utils.h" -#include "quantization/fp8/common.cuh" +#include "quantization/w8a8/fp8/common.cuh" namespace vllm { diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh index 4e6118e52e..2b1eb1d568 100644 --- a/csrc/quantization/fused_kernels/quant_conversions.cuh +++ b/csrc/quantization/fused_kernels/quant_conversions.cuh @@ -6,7 +6,7 @@ #include "quantization/vectorization.cuh" // TODO(luka/varun):refactor common.cuh to use this file instead -#include "quantization/fp8/common.cuh" +#include "quantization/w8a8/fp8/common.cuh" namespace vllm { diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh similarity index 99% rename from csrc/quantization/fp8/amd/quant_utils.cuh rename to csrc/quantization/w8a8/fp8/amd/quant_utils.cuh index e51a4e14e5..81f5cb83f3 100644 --- a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/w8a8/fp8/amd/quant_utils.cuh @@ -5,7 +5,7 @@ #include #include -#include "../../../attention/attention_dtypes.h" +#include "../../../../attention/attention_dtypes.h" namespace vllm { #ifdef USE_ROCM diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/w8a8/fp8/common.cu similarity index 99% rename from csrc/quantization/fp8/common.cu rename to csrc/quantization/w8a8/fp8/common.cu index 5fe5dd04bd..66dc0610ad 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/w8a8/fp8/common.cu @@ -1,6 +1,6 @@ #include "common.cuh" #include "dispatch_utils.h" -#include "../vectorization_utils.cuh" +#include "quantization/vectorization_utils.cuh" #include #include diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/w8a8/fp8/common.cuh similarity index 100% rename from csrc/quantization/fp8/common.cuh rename to csrc/quantization/w8a8/fp8/common.cuh diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh similarity index 99% rename from csrc/quantization/fp8/nvidia/quant_utils.cuh rename to csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh index f8cd1dcba4..5a397b3ef8 100644 --- a/csrc/quantization/fp8/nvidia/quant_utils.cuh +++ b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh @@ -1,6 +1,6 @@ #pragma once -#include "../../../attention/attention_dtypes.h" +#include "../../../../attention/attention_dtypes.h" #include #include #include diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index e3a0e15f53..8db0baeba8 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -23,7 +23,7 @@ #include #include "../attention/dtype_fp8.cuh" -#include "../quantization/fp8/amd/quant_utils.cuh" +#include "../quantization/w8a8/fp8/amd/quant_utils.cuh" #if defined(__HIPCC__) && \ (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index eb47139208..b8a1b43975 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -11,7 +11,7 @@ #include "../cuda_compat.h" #include "dispatch_utils.h" -#include "quantization/fp8/common.cuh" +#include "quantization/w8a8/fp8/common.cuh" #if defined(__HIPCC__) && \ (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))