Enable scaled FP8 (e4m3fn) KV cache on ROCm (AMD GPU) (#3290)

Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: HaiShaw <hixiao@gmail.com> Co-authored-by: AdrianAbeyta <Adrian.Abeyta@amd.com> Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com> Co-authored-by: root <root@gt-pla-u18-08.pla.dcgpu> Co-authored-by: mawong-amd <156021403+mawong-amd@users.noreply.github.com> Co-authored-by: ttbachyinsda <ttbachyinsda@outlook.com> Co-authored-by: guofangze <guofangze@kuaishou.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2024-04-03 16:15:55 -05:00
parent 3dcb3e8b98
commit 2ff767b513
41 changed files with 2592 additions and 142 deletions
--- a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json
+++ b/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json
@ -0,0 +1,90 @@
+{
+    "model_type": "llama",
+    "kv_cache": {
+        "dtype": "float8_e4m3fn",
+        "scaling_factor": {
+            "0": {
+                "0": 0.0230364128947258,
+                "1": 0.01979283057153225,
+                "2": 0.0241350457072258,
+                "3": 0.0308314748108387,
+                "4": 0.0430733822286129,
+                "5": 0.0370396226644516,
+                "6": 0.0306222103536129,
+                "7": 0.0357491634786129,
+                "8": 0.0358189195394516,
+                "9": 0.0443289652466774,
+                "10": 0.0433175228536129,
+                "11": 0.0416782945394516,
+                "12": 0.0366908498108387,
+                "13": 0.0432477705180645,
+                "14": 0.0410505048930645,
+                "15": 0.0457589291036129,
+                "16": 0.0418526791036129,
+                "17": 0.0432477705180645,
+                "18": 0.0469447560608387,
+                "19": 0.0514787957072258,
+                "20": 0.0541294664144516,
+                "21": 0.0587681382894516,
+                "22": 0.0625,
+                "23": 0.0585588738322258,
+                "24": 0.0600237175822258,
+                "25": 0.0588030144572258,
+                "26": 0.0531180277466774,
+                "27": 0.06396484375,
+                "28": 0.0603027381002903,
+                "29": 0.0582101047039032,
+                "30": 0.0625348836183548,
+                "31": 0.0585588738322258,
+                "32": 0.0582798570394516,
+                "33": 0.0575125589966774,
+                "34": 0.0590820349752903,
+                "35": 0.0614188089966774,
+                "36": 0.0631975457072258,
+                "37": 0.0615931935608387,
+                "38": 0.0601283498108387,
+                "39": 0.0571986623108387,
+                "40": 0.0670340433716774,
+                "41": 0.0523507259786129,
+                "42": 0.0547223798930645,
+                "43": 0.0631975457072258,
+                "44": 0.0663713738322258,
+                "45": 0.0603376142680645,
+                "46": 0.0652204304933548,
+                "47": 0.0734514519572258,
+                "48": 0.0693708211183548,
+                "49": 0.0725446492433548,
+                "50": 0.0627790242433548,
+                "51": 0.0691266804933548,
+                "52": 0.0688825398683548,
+                "53": 0.068429134786129,
+                "54": 0.0605119988322258,
+                "55": 0.0799386203289032,
+                "56": 0.0853097140789032,
+                "57": 0.0661969929933548,
+                "58": 0.0689871683716774,
+                "59": 0.0724051371216774,
+                "60": 0.0541643425822258,
+                "61": 0.0626743882894516,
+                "62": 0.0628487765789032,
+                "63": 0.0607212632894516,
+                "64": 0.0589076466858387,
+                "65": 0.0451660193502903,
+                "66": 0.0453055277466774,
+                "67": 0.0414341539144516,
+                "68": 0.0385044664144516,
+                "69": 0.0414341539144516,
+                "70": 0.0466308631002903,
+                "71": 0.0399693101644516,
+                "72": 0.0437011756002903,
+                "73": 0.0434221550822258,
+                "74": 0.0428989976644516,
+                "75": 0.0401785746216774,
+                "76": 0.0431082621216774,
+                "77": 0.0484444759786129,
+                "78": 0.0417829267680645,
+                "79": 0.0418178029358387
+            }
+        }
+    }
+}
--- a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json
+++ b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json
@ -0,0 +1,42 @@
+{
+    "model_type": "llama",
+    "kv_cache": {
+        "dtype": "float8_e4m3fn",
+        "scaling_factor": {
+            "0": {
+                "0": 0.0152239128947258,
+                "1": 0.0188860222697258,
+                "2": 0.0354178324341774,
+                "3": 0.0376674123108387,
+                "4": 0.0418526791036129,
+                "5": 0.0433175228536129,
+                "6": 0.0397600457072258,
+                "7": 0.0424455925822258,
+                "8": 0.0415387861430645,
+                "9": 0.0408412404358387,
+                "10": 0.0395856611430645,
+                "11": 0.0377371683716774,
+                "12": 0.0400739423930645,
+                "13": 0.040771484375,
+                "14": 0.0393415205180645,
+                "15": 0.0369001142680645,
+                "16": 0.03857421875,
+                "17": 0.0387486070394516,
+                "18": 0.0403180830180645,
+                "19": 0.0396205373108387,
+                "20": 0.0375627800822258,
+                "21": 0.0407366082072258,
+                "22": 0.0432477705180645,
+                "23": 0.0377022884786129,
+                "24": 0.0399693101644516,
+                "25": 0.0374581478536129,
+                "26": 0.0413295216858387,
+                "27": 0.0442243330180645,
+                "28": 0.0424804724752903,
+                "29": 0.0456891767680645,
+                "30": 0.0409109964966774,
+                "31": 0.0482352152466774
+            }
+        }
+    }
+}
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@ -32,7 +32,7 @@ HEAD_SIZES = [64, 80, 96, 112, 128, 256

 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
-KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
+KV_CACHE_DTYPE = ["auto", "fp8"]
 SEEDS = [0]
 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
@ -172,6 +172,9 @@ def test_paged_attention(
                                                device)
    key_cache, value_cache = key_caches[0], value_caches[0]

+    # Using default kv_scale
+    kv_scale = 1.0
+
    # Call the paged attention kernel.
    output = torch.empty_like(query)
    if version == "v1":
@ -188,6 +191,7 @@ def test_paged_attention(
            max_context_len,
            alibi_slopes,
            kv_cache_dtype,
+            kv_scale,
        )
    elif version == "v2":
        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
@ -219,12 +223,13 @@ def test_paged_attention(
            max_context_len,
            alibi_slopes,
            kv_cache_dtype,
+            kv_scale,
        )
    else:
        raise AssertionError(f"Unknown version: {version}")

    # Run the reference implementation.
-    if kv_cache_dtype == "fp8_e5m2":
+    if kv_cache_dtype == "fp8":
        # Convert cache data back to dtype.
        x = 16 // torch.tensor([], dtype=dtype).element_size()
        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
@ -232,14 +237,14 @@ def test_paged_attention(
        dequantized_key_cache = torch.empty(size=key_cache_shape,
                                            dtype=dtype,
                                            device=device)
-        cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache)
+        cache_ops.convert_fp8(key_cache, dequantized_key_cache)
        key_cache = dequantized_key_cache

        value_cache_shape = value_cache.shape
        dequantized_value_cache = torch.empty(size=value_cache_shape,
                                              dtype=dtype,
                                              device=device)
-        cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache)
+        cache_ops.convert_fp8(value_cache, dequantized_value_cache)
        value_cache = dequantized_value_cache

    ref_output = torch.empty_like(query)
@ -263,7 +268,8 @@ def test_paged_attention(

    # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
    # so we use a relaxed tolerance for the test.
-    if kv_cache_dtype == "fp8_e5m2":
+    atol, rtol = 1e-3, 1e-5
+    if kv_cache_dtype == "fp8":
        atol, rtol = 1e-2, 1e-5
    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)

--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@ -5,6 +5,7 @@ import pytest
 import torch

 from vllm._C import cache_ops
+from vllm.utils import is_hip

 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@ -23,7 +24,7 @@ SEEDS = [0]
 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
-KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
+KV_CACHE_DTYPE = ["auto", "fp8"]


@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@ -105,6 +106,7 @@ def test_copy_blocks(
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
 def test_reshape_and_cache(
    kv_cache_factory,
@ -116,7 +118,10 @@ def test_reshape_and_cache(
    dtype: torch.dtype,
    seed: int,
    device: str,
+    kv_cache_dtype: str,
 ) -> None:
+    if not is_hip() and kv_cache_dtype == "fp8":
+        pytest.skip()  # This test is not tuned for e5m2 cuda precision
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
@ -132,17 +137,33 @@ def test_reshape_and_cache(

    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
-                                                num_heads, head_size, dtype,
-                                                None, seed, device)
+                                                num_heads, head_size,
+                                                kv_cache_dtype, dtype, seed,
+                                                device)
    key_cache, value_cache = key_caches[0], value_caches[0]

    # Clone the KV caches.
-    cloned_key_cache = key_cache.clone()
-    cloned_value_cache = value_cache.clone()
+    if kv_cache_dtype == "fp8":
+        cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(key_cache, cloned_key_cache)
+        cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(value_cache, cloned_value_cache)
+    else:
+        cloned_key_cache = key_cache.clone()
+        cloned_value_cache = value_cache.clone()
+
+    # Using default kv_scale
+    kv_scale = 1.0

    # Call the reshape_and_cache kernel.
    cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
-                                slot_mapping, "auto")
+                                slot_mapping, kv_cache_dtype, kv_scale)
+
+    if kv_cache_dtype == "fp8":
+        result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(key_cache, result_key_cache)
+        result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
+        cache_ops.convert_fp8(value_cache, result_value_cache)

    # Run the reference implementation.
    reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
@ -156,8 +177,18 @@ def test_reshape_and_cache(
        cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
        cloned_value_cache[block_idx, :, :, block_offset] = value[i]

-    assert torch.allclose(key_cache, cloned_key_cache)
-    assert torch.allclose(value_cache, cloned_value_cache)
+    if kv_cache_dtype == "fp8":
+        assert torch.allclose(result_key_cache,
+                              cloned_key_cache,
+                              atol=0.001,
+                              rtol=0.1)
+        assert torch.allclose(result_value_cache,
+                              cloned_value_cache,
+                              atol=0.001,
+                              rtol=0.1)
+    else:
+        assert torch.allclose(key_cache, cloned_key_cache)
+        assert torch.allclose(value_cache, cloned_value_cache)


@pytest.mark.parametrize("direction", COPYING_DIRECTION)
@ -169,6 +200,7 @@ def test_reshape_and_cache(
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
 def test_swap_blocks(
    kv_cache_factory,
@ -181,7 +213,12 @@ def test_swap_blocks(
    dtype: torch.dtype,
    seed: int,
    device: str,
+    kv_cache_dtype: str,
 ) -> None:
+    if kv_cache_dtype == "fp8" and "cpu" in direction:
+        pytest.skip()
+    if not is_hip() and kv_cache_dtype == "fp8":
+        pytest.skip()  # This test is not tuned for e5m2 cuda precision
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
@ -202,13 +239,13 @@ def test_swap_blocks(

    # Create the KV caches on the first device.
    src_key_caches, src_value_caches = kv_cache_factory(
-        num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
-        src_device)
+        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
+        seed, src_device)

    # Create the KV caches on the second device.
    dist_key_caches, dist_value_caches = kv_cache_factory(
-        num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
-        dst_device)
+        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
+        seed, dst_device)

    src_key_caches_clone = src_key_caches[0].clone()
    src_value_caches_clone = src_value_caches[0].clone()
@ -223,3 +260,40 @@ def test_swap_blocks(
                              dist_key_caches[0][dst].cpu())
        assert torch.allclose(src_value_caches_clone[src].cpu(),
                              dist_value_caches[0][dst].cpu())
+
+
+@pytest.mark.skipif(not is_hip(), reason="FP8 conversion test requires e4m3")
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_fp8_conversion(  # Round trip: dtype -> FP8 (uint8 buffer) -> dtype via cache_ops.convert_fp8.
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    random.seed(seed)  # Seed the Python, torch and torch.cuda RNGs so the random input is repeatable.
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+    low = -224.0  # NOTE(review): bounds presumably chosen to stay inside the FP8 representable range - confirm.
+    high = 224.0
+    shape = (num_blocks, num_heads, head_size, block_size)
+    cache = torch.empty(shape, dtype=dtype, device=device)
+    cache.uniform_(low, high)  # Overwrite the uninitialized tensor with uniform random values between low and high.
+
+    cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)  # Same-shape uint8 destination for the converted values.
+    cache_ops.convert_fp8(cache, cache_fp8)  # Forward conversion: source `cache` -> destination `cache_fp8`.
+
+    converted_cache = torch.empty_like(cache)  # Destination in the original dtype for the reverse conversion.
+    cache_ops.convert_fp8(cache_fp8, converted_cache)  # Reverse conversion: `cache_fp8` -> `converted_cache`.
+
+    assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)  # Loose rtol: the round trip is lossy, not exact.