add compile

2024-07-26 19:29:36 -07:00
655 changed files with 14566 additions and 54280 deletions
--- a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@ -9,4 +9,3 @@ tasks:
    value: 0.664
 limit: 1000
 num_fewshot: 5
-trust_remote_code: True
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
-model_name: "HandH1998/QQQ-Llama-3-8b-g128"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.419
-  - name: "exact_match,flexible-extract"
-    value: 0.416
-limit: 1000
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml
@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
-model_name: "mgoin/Minitron-4B-Base-FP8"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
+model_name: "nvidia/Minitron-4B-Base"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.252
  - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.252
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -4,7 +4,6 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base-FP8.yaml
+Minitron-4B-Base.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
-Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -14,7 +14,7 @@ import lm_eval
 import numpy
 import yaml

-RTOL = 0.05
+RTOL = 0.02
 TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@ -23,12 +23,9 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


 def launch_lm_eval(eval_config):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-
    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true," \
-                 f"trust_remote_code={trust_remote_code}"
+                 f"add_bos_token=true"

    results = lm_eval.simple_evaluate(
        model="vllm",
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -34,18 +34,17 @@ See  [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan

 Performance benchmark will be triggered when:
 - A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
+- Every commit for those PRs with `perf-benchmarks` label.

 Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
+- Every commit for those PRs with `nightly-benchmarks` label.




 ## Performance benchmark details

-
-See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
+See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.


 #### Latency test
@ -69,7 +68,7 @@ Here is an example of one test inside `latency-tests.json`:

 In this example:
 -  The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-  The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+-  The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Note: use underscores `_` instead of dashes `-` when specifying the command line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.

--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -21,7 +21,7 @@ steps:
          containers:
          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
            resources:
              limits:
                nvidia.com/gpu: 8
@ -42,20 +42,20 @@ steps:
          - name: devshm
            emptyDir:
              medium: Memory
-  # - label: "H100"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #       command:
-  #       - bash
-  #       - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-  #       mount-buildkite-agent: true
-  #       propagate-environment: true
-  #       ipc: host
-  #       gpus: all
-  #       environment:
-  #       - VLLM_USAGE_SOURCE
-  #       - HF_TOKEN
+  - label: "H100"
+    agents:
+      queue: H100
+    plugins:
+    - docker#v5.11.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN

--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -34,15 +34,6 @@ check_hf_token() {
  fi
 }

-ensure_sharegpt_downloaded() {
-  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
-  if [ ! -f "$FILE" ]; then
-    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
-  else
-    echo "$FILE already exists."
-  fi
-}
-
 json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
@ -68,38 +59,40 @@ wait_for_server() {
    done' && return 0 || return 1
 }

-kill_processes_launched_by_current_bash() {
-  # Kill all python processes launched from current bash script
-  current_shell_pid=$$
-  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
-  if [ -n "$processes" ]; then
-    echo "Killing the following processes matching '$1':"
-    echo "$processes"
-    echo "$processes" | xargs kill -9
-  else
-    echo "No processes found matching '$1'."
-  fi
-}
-
 kill_gpu_processes() {
+  # kill all processes on GPU.
+  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
+  if [ -z "$pids" ]; then
+      echo "No GPU processes found."
+  else
+      for pid in $pids; do
+          kill -9 "$pid"
+          echo "Killed process with PID: $pid"
+      done

-  ps -aux
-  lsof -t -i:8000 | xargs -r kill -9
-  pkill -f pt_main_thread
-  # this line doesn't work now
-  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-  pkill -f python3
-  pkill -f /usr/bin/python3
+      echo "All GPU processes have been killed."
+  fi

+  # Killing by PID sometimes does not work reliably, so also kill every process running python or python3;
+  # this is acceptable because we are inside a container anyway.
+  pkill -9 -f python
+  pkill -9 -f python3

-  # wait until GPU memory usage smaller than 1GB
-  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
+  # waiting for GPU processes to be fully killed
+  # loop while nvidia-smi returns any processes
+  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
    sleep 1
+    echo "Waiting for GPU processes to be killed"
  done

  # remove vllm config file
  rm -rf ~/.config/vllm

+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }

 upload_to_buildkite() {
@ -117,7 +110,7 @@ upload_to_buildkite() {
  fi

  # Use the determined command to annotate and upload artifacts
-  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
+  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }

@ -169,7 +162,7 @@ run_latency_tests() {
        latency_command: $latency,
        gpu_type: $gpu
      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"
@ -179,6 +172,7 @@ run_latency_tests() {
  done
 }

+
 run_throughput_tests() {
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases
@ -226,7 +220,7 @@ run_throughput_tests() {
        throughput_command: $command,
        gpu_type: $gpu
      }')
-    echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
+    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"
@ -258,6 +252,7 @@ run_serving_tests() {
      continue
    fi

+
    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    client_params=$(echo "$params" | jq -r '.client_parameters')
@ -335,7 +330,7 @@ run_serving_tests() {
          client_command: $client,
          gpu_type: $gpu
        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

    done

@ -352,7 +347,6 @@ main() {
  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
-  (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
@ -361,7 +355,7 @@ main() {

  # prepare for benchmarking
  cd benchmarks || exit 1
-  ensure_sharegpt_downloaded
+  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
@ -371,6 +365,7 @@ main() {
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

+
  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -174,8 +174,8 @@ if __name__ == "__main__":
    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:

-        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-                                "performance-benchmarks-descriptions.md")
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@ -1,42 +1,47 @@

 ## Latency tests

+This test suite aims to test vllm's end-to-end latency under a controlled setup.
+
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).

+### Latency benchmarking results

 {latency_tests_markdown_table}

-
 ## Throughput tests

+This test suite aims to test vllm's throughput.
+
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.

+### Throughput benchmarking results

 {throughput_tests_markdown_table}

-
 ## Serving tests

+This test suite aims to test vllm's real serving metrics.
+
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).

+### Serving benchmarking results

 {serving_tests_markdown_table}

-
 ## json version of the benchmarking tables

 This section contains the data of the markdown tables above in JSON format. 
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@ -2,7 +2,7 @@
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
@ -12,7 +12,7 @@
    {
        "test_name": "latency_llama70B_tp4",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num-iters-warmup": 5,
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -3,7 +3,7 @@
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
@ -11,7 +11,7 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -22,7 +22,7 @@
        "test_name": "serving_llama70B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
@ -30,7 +30,7 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -55,26 +55,5 @@
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
-    },
-    {
-        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
-        "qps_list": [2],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "disable_log_requests": "", 
-            "tensor_parallel_size": 4,
-            "swap_space": 16, 
-            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
-            "num_speculative_tokens": 4,
-            "speculative_draft_tensor_parallel_size": 1,
-            "use_v2_block_manager": ""
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200 
-        }
    }
-]
+]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@ -2,7 +2,7 @@
    {
        "test_name": "throughput_llama8B_tp1",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -13,7 +13,7 @@
    {
        "test_name": "throughput_llama70B_tp4",
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,27 +1,9 @@
 steps:
-  - label: "Build wheel - CUDA 12.1"
+  - label: "Build wheel - CUDA {{matrix.cuda_version}}"
    agents:
      queue: cpu_queue
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      # rename the files to change linux -> manylinux1
-      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - block: "Build CUDA 11.8 wheel"
-    key: block-build-cu118-wheel
-  
-  - label: "Build wheel - CUDA 11.8"
-    depends_on: block-build-cu118-wheel
-    agents:
-      queue: cpu_queue
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      # rename the files to change linux -> manylinux1
@ -30,3 +12,8 @@ steps:
      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
    env:
      DOCKER_BUILDKIT: "1"
+    matrix:
+      setup:
+        cuda_version:
+          - "11.8.0"
+          - "12.1.0"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -55,7 +55,8 @@ while true; do
 done

 echo "--- Pulling container" 
-image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN}
+image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull ${image_name}

--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -22,8 +22,8 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pip install pytest Pillow protobuf
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported

 # online inference
 docker exec cpu-test bash -c "
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -5,49 +5,11 @@
 # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
 # to generate the final pipeline yaml file.

-# Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, 
-#     in this case, commands must be specified. the first command runs on first host, the second
-#     command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
-
-# When adding a test
-# - If the test belong to an existing group, add it there
-# - If the test is short, add to any existing step
-# - If the test takes more than 10min, then it is okay to create a new step. 
-#   Note that all steps execute in parallel. 

 steps:
-##### fast check tests  #####
-
- label: Documentation Build # 2min
-  working_dir: "/vllm-workspace/test_docs/docs"
+- label: Async Engine, Inputs, Utils, Worker Test
  fast_check: true
-  no_gpu: True
-  commands:
-  - pip install -r requirements-docs.txt
-  - SPHINXOPTS=\"-W\" make html
-  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/dev/sampling_params.html
-
- label: Async Engine, Inputs, Utils, Worker Test # 15min
-  fast_check: true
-  source_file_dependencies:
-  - vllm/
-  - tests/async_engine
-  - tests/test_inputs
-  - tests/multimodal
-  - tests/test_utils
-  - tests/worker
+  fast_check_only: true
  commands:
  - pytest -v -s async_engine # Async Engine
  - pytest -v -s test_inputs.py
@ -55,347 +17,274 @@ steps:
  - pytest -v -s test_utils.py # Utils
  - pytest -v -s worker # Worker

- label: Basic Correctness Test # 30min
-  #mirror_hardwares: [amd]
+- label: Metrics, Tracing Test
  fast_check: true
-  source_file_dependencies:
-  - vllm/
-  - tests/basic_correctness
+  fast_check_only: true
  commands:
+  - pytest -v -s metrics # Metrics
+  - "pip install \
+      opentelemetry-sdk \
+      opentelemetry-api \
+      opentelemetry-exporter-otlp \
+      opentelemetry-semantic-conventions-ai" # Tracing
+  - pytest -v -s tracing
+
+- label: Regression Test
+  mirror_hardwares: [amd]
+  fast_check: true
+  command: pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: AsyncEngine Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s async_engine
+
+- label: Basic Correctness Test
+  mirror_hardwares: [amd]
+  fast_check: true
+  commands:
+  # This flashinfer installation will fail on AMD ROCm, so it is set as optional.
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-  
- label: Core Test # 10min
+
+- label: Core Test
  mirror_hardwares: [amd]
  fast_check: true
-  source_file_dependencies:
-  - vllm/core
-  - vllm/distributed
-  - tests/core
  commands:
  - pytest -v -s core
+  - pytest -v -s distributed/test_parallel_state.py

- label: Entrypoints Test # 20min
-  working_dir: "/vllm-workspace/tests"
-  fast_check: true
+- label: Distributed Comm Ops Test
  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
  commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
-  - pytest -v -s entrypoints/llm
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py

- label: Distributed Tests (4 GPUs) # 10min
+- label: 2 Node Tests (4 GPUs in total)
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+
+- label: Distributed Tests (2 GPUs)
+  mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  commands:
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
+
+- label: Distributed Tests (4 GPUs)
+  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  fast_check: true
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/core/
-  - tests/distributed
-  - tests/spec_decode/e2e/test_integration_dist_tp4
  commands:
  - pytest -v -s distributed/test_pynccl.py
+  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
+  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Metrics, Tracing Test # 10min
-  num_gpus: 2 
-  fast_check: true
-  source_file_dependencies:
-  - vllm/
-  - tests/metrics
-  - tests/tracing
+- label: Pipeline Parallelism Test
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
  commands:
-  - pytest -v -s metrics 
-  - "pip install \
-      'opentelemetry-sdk>=1.26.0,<1.27.0' \
-      'opentelemetry-api>=1.26.0,<1.27.0' \
-      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
-      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
-  - pytest -v -s tracing
+  - pytest -v -s distributed/test_pipeline_parallel.py

-##### fast check tests  #####
-#####  1 GPU test  #####
-
- label: Regression Test # 5min
+- label: Engine Test
  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/test_regression
-  command: pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
-
- label: Engine Test # 10min
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/engine
-  - tests/tokenization
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: Examples Test # 12min
-  working_dir: "/vllm-workspace/examples"
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/entrypoints
-  - examples/
+- label: Entrypoints Test
+  fast_check: true
+  mirror_hardwares: [amd]
+
  commands:
-    - pip install awscli tensorizer # for llava example and tensorizer test
+  - pytest -v -s entrypoints/llm
+  - pytest -v -s entrypoints/openai
+
+- label: Examples Test
+  working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
+  commands:
+    # install aws cli for llava_example.py
+    # install tensorizer for tensorize_vllm_model.py
+    - pip install awscli tensorizer
    - python3 offline_inference.py
    - python3 cpu_offload.py
-    - python3 offline_inference_chat.py
    - python3 offline_inference_with_prefix.py
    - python3 llm_engine_example.py
-    - python3 offline_inference_vision_language.py
+    - python3 llava_example.py
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference_encoder_decoder.py

- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-    - pip install -e ./plugins/vllm_add_dummy_model
-    - pytest -v -s models/test_oot_registration.py # it needs a clean process
-    - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
-
- label: torch compile integration test
-  source_file_dependencies:
-  - vllm/
-  commands:
-    - pytest -v -s ./compile/test_full_graph.py
-
-
- label: Vision Language Models Test # 42min
+- label: Inputs Test
  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
+  commands:
+    - pytest -v -s test_inputs.py
+    - pytest -v -s multimodal
+
+- label: Kernels Test %N
+  #mirror_hardwares: [amd]
+  commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4
+
+- label: Models Test
+  #mirror_hardwares: [amd]
+  commands:
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+    - pytest -v -s models -m \"not vlm\"
+
+- label: Vision Language Models Test
+  mirror_hardwares: [amd]
  commands:
    - pytest -v -s models -m vlm

- label: Prefix Caching Test # 7min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/prefix_caching
+- label: Prefix Caching Test
+  mirror_hardwares: [amd]
  commands:
    - pytest -v -s prefix_caching

- label: Samplers Test # 18min
-  source_file_dependencies:
-  - vllm/model_executor/layers
-  - vllm/sampling_metadata.py
-  - tests/samplers
-  commands:
-    - pytest -v -s samplers
-    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+- label: Samplers Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s samplers

- label: LogitsProcessor Test # 5min
+- label: LogitsProcessor Test
  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/model_executor/layers
-  - tests/test_logits_processor
  command: pytest -v -s test_logits_processor.py

- label: Speculative decoding tests # 22min
-  source_file_dependencies:
-  - vllm/spec_decode
-  - tests/spec_decode
+- label: Utils Test
+  commands:
+    - pytest -v -s test_utils.py
+    - pytest -v -s test_embedded_commit.py
+
+- label: Worker Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s worker
+
+- label: Speculative decoding tests
+  #mirror_hardwares: [amd]
  commands:
    # See https://github.com/vllm-project/vllm/issues/5152
    - export VLLM_ATTENTION_BACKEND=XFORMERS
    - pytest -v -s spec_decode

- label: LoRA Test %N # 30min each
-  source_file_dependencies:
-  - vllm/lora
-  - csrc/punica
-  - tests/lora
+- label: LoRA Test %N
+  #mirror_hardwares: [amd]
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4

- label: Kernels Test %N # 30min each
-  source_file_dependencies:
-  - csrc/
-  - vllm/attention
-  - tests/kernels
-  commands:
-    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
-
- label: Tensorizer Test # 11min
-  soft_fail: true
-  source_file_dependencies:
-  - vllm/model_executor/model_loader
-  - tests/tensorizer_loader
-  commands:
-    - apt-get install -y curl libsodium23
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s tensorizer_loader
-
- label: Benchmarks # 9min
-  working_dir: "/vllm-workspace/.buildkite"
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - benchmarks/
-  commands:
-  - pip install aiohttp
-  - bash run-benchmarks.sh
-
- label: Quantization Test # 15min
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  - tests/quantization
-  command: pytest -v -s quantization
-
- label: LM Eval Small Models # 53min
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
-  commands:
-  - pip install lm-eval
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-small.txt -t 1
-
-#####  1 GPU test  #####
-#####  multi gpus test  #####
-
- label: Distributed Comm Ops Test # 7min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/distributed
-  - tests/distributed
-  commands:
-  - pytest -v -s distributed/test_comm_ops.py
-  - pytest -v -s distributed/test_shm_broadcast.py
-
- label: 2 Node Tests (4 GPUs in total) # 16min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  num_nodes: 2
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  commands:
-  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-
- label: Distributed Tests (2 GPUs) # 28min
+- label: LoRA Long Context (Distributed)
  #mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s distributed/test_multimodal_broadcast.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s distributed/test_distributed_oot.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
-
- label: Multi-step Tests (4 GPUs) # 21min
-  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
-  source_file_dependencies:
-  - vllm/model_executor/layers/sampler.py
-  - vllm/sequence.py
-  - vllm/worker/worker_base.py
-  - vllm/worker/worker.py
-  - vllm/worker/multi_step_worker.py
-  - vllm/worker/model_runner_base.py
-  - vllm/worker/model_runner.py
-  - vllm/worker/multi_step_model_runner.py
-  - vllm/engine
-  - tests/multi_step
-  commands:
-  - pytest -v -s multi_step/test_correctness.py
-
- label: Pipeline Parallelism Test # 23min
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/distributed/
-  - vllm/engine/
-  - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  commands:
-  - pytest -v -s distributed/test_pp_cudagraph.py
-  - pytest -v -s distributed/test_pipeline_parallel.py
-
- label: LoRA Long Context (Distributed) # 11min
  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/lora
-  - csrc/punica
-  - tests/lora/test_long_context
  commands:
    # FIXIT: find out which code initializes CUDA before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s -x lora/test_long_context.py

- label: Weight Loading Multiple GPU Test
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  source_file_dependencies:
-  - vllm/
-  - tests/weight_loading
+- label: Tensorizer Test
+  #mirror_hardwares: [amd]
+  soft_fail: true
+  fast_check: true
  commands:
-    - bash weight_loading/run_model_weight_loading_test.sh
+    - apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s tensorizer_loader

+- label: Metrics Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s metrics

-##### multi gpus test #####
-##### A100 test #####
+- label: Quantization Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s quantization

- label: Distributed Tests (A100) # optional
-  gpu: a100
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/
+- label: Tracing Test
  commands: 
-  # NOTE: don't test llama model here, it seems hf implementation is buggy
-  # see https://github.com/vllm-project/vllm/pull/5689 for details
-  - pytest -v -s distributed/test_custom_all_reduce.py
-  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s -x lora/test_mixtral.py
+    - "pip install \
+        opentelemetry-sdk \
+        opentelemetry-api \
+        opentelemetry-exporter-otlp \
+        opentelemetry-semantic-conventions-ai"
+    - pytest -v -s tracing

- label: LM Eval Large Models # optional
+- label: Benchmarks
+  working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
+  commands:
+  - pip install aiohttp
+  - bash run-benchmarks.sh
+
+- label: LM Eval Small Models
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+
+- label: LM Eval Large Models
  gpu: a100
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  source_file_dependencies:
-  - csrc/
-  - vllm/model_executor/layers/quantization
  commands:
  - pip install lm-eval
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+
+- label: Documentation Build
+  working_dir: "/vllm-workspace/test_docs/docs"
+  fast_check: true
+  no_gpu: True
+  commands:
+  - pip install -r requirements-docs.txt
+  - SPHINXOPTS=\"-W\" make html
+
+- label: Distributed Tests (A100)
+  gpu: a100
+  num_gpus: 4
+  commands: 
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s -x lora/test_mixtral.py
--- a/.dockerignore
+++ b/.dockerignore
@ -1,4 +1 @@
 vllm/*.so
-/.venv
-/build
-dist
--- a/.github/ISSUE_TEMPLATE/100-documentation.yml
+++ b/.github/ISSUE_TEMPLATE/100-documentation.yml
@ -20,10 +20,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@ -38,10 +38,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@ -36,10 +36,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/400-bug
+++ b/.github/ISSUE_TEMPLATE/400-bug
@ -20,14 +20,9 @@ body:
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
-      <details>
-      <summary>The output of `python collect_env.py`</summary>
-
      ```text
-      Your output of `python collect_env.py` here
+      The output of `python collect_env.py`
      ```
-      
-      </details>
  validations:
    required: true
 - type: textarea
@ -89,10 +84,3 @@ body:
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/500-feature
+++ b/.github/ISSUE_TEMPLATE/500-feature
@ -29,10 +29,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/600-new
+++ b/.github/ISSUE_TEMPLATE/600-new
@ -31,10 +31,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/700-performance
+++ b/.github/ISSUE_TEMPLATE/700-performance
@ -50,10 +50,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@ -47,10 +47,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/ISSUE_TEMPLATE/800-misc
+++ b/.github/ISSUE_TEMPLATE/800-misc
@ -19,10 +19,3 @@ body:
  attributes:
    value: >
      Thanks for contributing 🎉!
- type: checkboxes
-  id: askllm
-  attributes:
-    label: Before submitting a new issue...
-    options:
-      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
-        required: true
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@ -30,11 +30,12 @@ jobs:
      run: |
        EXCLUDES=(
            'csrc/moe/topk_softmax_kernels.cu'
-            'csrc/quantization/gguf/ggml-common.h'
-            'csrc/quantization/gguf/dequantize.cuh'
-            'csrc/quantization/gguf/vecdotq.cuh'
-            'csrc/quantization/gguf/mmq.cuh'
-            'csrc/quantization/gguf/mmvq.cuh'
+            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+            'csrc/punica/bgmv/bgmv_config.h'
+            'csrc/punica/bgmv/bgmv_impl.cuh'
+            'csrc/punica/bgmv/vec_dtypes.cuh'
+            'csrc/punica/punica_ops.cu'
+            'csrc/punica/type_convert.h'
        )
        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
@ -25,23 +25,29 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install mypy==1.11.1
+        pip install mypy==1.9.0
        pip install types-setuptools
        pip install types-PyYAML
        pip install types-requests
        pip install types-setuptools
    - name: Mypy
      run: |
-        mypy
-        mypy tests --follow-imports skip
-        mypy vllm/attention --follow-imports skip
-        mypy vllm/core --follow-imports skip
-        mypy vllm/distributed --follow-imports skip
-        mypy vllm/engine  --follow-imports skip
-        mypy vllm/executor --follow-imports skip
-        mypy vllm/lora --follow-imports skip
-        mypy vllm/model_executor  --follow-imports skip
-        mypy vllm/prompt_adapter --follow-imports skip
-        mypy vllm/spec_decode --follow-imports skip
-        mypy vllm/worker --follow-imports skip
+        mypy tests --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/attention --config-file pyproject.toml
+        mypy vllm/core --config-file pyproject.toml
+        mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/engine  --config-file pyproject.toml
+        mypy vllm/entrypoints --config-file pyproject.toml
+        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/inputs --config-file pyproject.toml
+        mypy vllm/logging --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
+        mypy vllm/model_executor  --config-file pyproject.toml
+        mypy vllm/multimodal --config-file pyproject.toml
+        mypy vllm/platforms --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
+        mypy vllm/transformers_utils --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
+        mypy vllm/worker --config-file pyproject.toml

--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -48,8 +48,8 @@ jobs:
      fail-fast: false
      matrix:
          os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
-          pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
+          python-version: ['3.8', '3.9', '3.10', '3.11']
+          pytorch-version: ['2.3.1']  # Must be the most recent version that meets requirements-cuda.txt.
          cuda-version: ['11.8', '12.1']

    steps:
--- a/.github/workflows/remove_label_not_ready_comment.yml
+++ b/.github/workflows/remove_label_not_ready_comment.yml
@ -1,23 +0,0 @@
-name: Remove ready Label on notready Comment
-
-on:
-  issue_comment:
-    types: [created]
-
-jobs:
-  add-ready-label:
-    runs-on: ubuntu-latest
-    if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready')
-    steps:
-        -   name: Remove ready label
-            uses: actions/github-script@v5
-            with:
-                script: |
-                    github.rest.issues.removeLabel({
-                        owner: context.repo.owner,
-                        repo: context.repo.repo,
-                        issue_number: context.issue.number,
-                        name: 'ready'
-                    })
-            env:
-                GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements-cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
+# Make sure punica is built for the release (for LoRA)
+export VLLM_INSTALL_PUNICA_KERNELS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 # Build
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
--- a/.gitignore
+++ b/.gitignore
@ -87,9 +87,6 @@ target/
 profile_default/
 ipython_config.py

-# generated files
-**/generated/**
-
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
@ -192,4 +189,4 @@ _build/
 hip_compat.h

 # Benchmark dataset
-benchmarks/*.json
+*.json
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -10,7 +10,6 @@ build:

 sphinx:
   configuration: docs/source/conf.py
-   fail_on_warning: true

 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.26)
+cmake_minimum_required(VERSION 3.21)

 project(vllm_extensions LANGUAGES CXX)

@ -10,14 +10,11 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

-# Suppress potential warnings about unused manually-specified variables
-set(ignoreMe "${VLLM_PYTHON_PATH}")
-
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")

 # Supported NVIDIA architectures.
 set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
@ -35,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")

 #
@ -69,39 +66,6 @@ endif()
 #
 find_package(Torch REQUIRED)

-#
-# Add the `default` target which detects which extensions should be
-# built based on platform/architecture.  This is the same logic that
-# setup.py uses to select which extensions should be built and should
-# be kept in sync.
-#
-# The `default` target makes direct use of cmake easier since knowledge
-# of which extensions are supported has been factored in, e.g.
-#
-# mkdir build && cd build
-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
-# cmake --build . --target default
-#
-add_custom_target(default)
-message(STATUS "Enabling core extension.")
-
-# Define _core_C extension
-#  built for (almost) every target platform, (excludes TPU and Neuron)
-
-set(VLLM_EXT_SRC
-  "csrc/core/torch_bindings.cpp")
-
-define_gpu_extension_target(
-  _core_C
-  DESTINATION vllm
-  LANGUAGE CXX
-  SOURCES ${VLLM_EXT_SRC}
-  COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
-  USE_SABI 3
-  WITH_SOABI)
-
-add_dependencies(default _core_C)
-
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@ -110,7 +74,7 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
        include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
    else()
-        return()
+        message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
    endif()
    return()
 endif()
@ -168,7 +132,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
 endif()

 #
-# Define other extension targets
+# Define extension targets
 #

 #
@ -192,13 +156,12 @@ set(VLLM_EXT_SRC

 if(VLLM_GPU_LANG STREQUAL "CUDA")
  include(FetchContent)
-  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+  # Request the header-only CUTLASS configuration before it is fetched.
+  # The variable name and its value must be separate arguments: writing
+  # `SET(NAME=ON)` operates on a variable literally named
+  # "CUTLASS_ENABLE_HEADERS_ONLY=ON" and leaves the real option unset.
+  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
  FetchContent_Declare(
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        # CUTLASS 3.5.1
-        GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9 
-        GIT_PROGRESS TRUE
+        # CUTLASS 3.5.0
+        GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
  )
  FetchContent_MakeAvailable(cutlass)

@ -207,11 +170,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
    "csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
-    "csrc/quantization/gguf/gguf_kernel.cu"
    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/custom_all_reduce.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
@ -230,51 +191,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
          "-gencode arch=compute_90a,code=sm_90a")
  endif()

-  #
-  # Machete kernels
-
-  # The machete kernels only work on hopper and require CUDA 12.0 or later.
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
-    #
-    # For the Machete kernels we automatically generate sources for various 
-    # preselected input type pairs and schedules.
-    # Generate sources:
-    execute_process(
-      COMMAND ${CMAKE_COMMAND} -E env 
-      PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH 
-        ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
-      RESULT_VARIABLE machete_generation_result
-      OUTPUT_VARIABLE machete_generation_output
-      OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-      ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
-    )
-
-    if (NOT machete_generation_result EQUAL 0)
-      message(FATAL_ERROR "Machete generation failed."
-                          " Result: \"${machete_generation_result}\"" 
-                          "\nCheck the log for details: "
-                          "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
-    else()
-      message(STATUS "Machete generation completed successfully.")
-    endif()
-
-    # Add machete generated sources
-    file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
-    list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
-    message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
-
-    set_source_files_properties(
-          ${MACHETE_GEN_SOURCES}
-          PROPERTIES
-          COMPILE_FLAGS
-          "-gencode arch=compute_90a,code=sm_90a")
-  endif()
-
-  # Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
-  #  raise an error if the user that this was built with an incompatible 
-  #  CUDA version)
-  list(APPEND VLLM_EXT_SRC
-    csrc/quantization/machete/machete_pytorch.cu)
 endif()

 define_gpu_extension_target(
@ -284,7 +200,7 @@ define_gpu_extension_target(
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

@ -306,7 +222,76 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

+#
+# _punica_C extension
+#

+set(VLLM_PUNICA_EXT_SRC
+  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
+  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
+  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
+  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
+  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
+  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
+  "csrc/punica/punica_ops.cu"
+  "csrc/punica/torch_bindings.cpp")
+
+#
+# Copy GPU compilation flags+update for punica
+#
+set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
+  "-D__CUDA_NO_HALF_OPERATORS__"
+  "-D__CUDA_NO_HALF_CONVERSIONS__"
+  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
+  "-D__CUDA_NO_HALF2_OPERATORS__")
+
+#
+# Filter out CUDA architectures < 8.0 for punica.
+#
+if (${VLLM_GPU_LANG} STREQUAL "CUDA")
+  set(VLLM_PUNICA_GPU_ARCHES)
+  foreach(ARCH ${VLLM_GPU_ARCHES})
+    string_to_ver(CODE_VER ${ARCH})
+    if (CODE_VER GREATER_EQUAL 8.0)
+      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
+    endif()
+  endforeach()
+  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+  set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+endif()
+
+if (VLLM_PUNICA_GPU_ARCHES)
+  define_gpu_extension_target(
+    _punica_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${VLLM_PUNICA_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
+    USE_SABI 3
+    WITH_SOABI)
+else()
+  message(WARNING "Unable to create _punica_C target because none of the "
+    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
+endif()
+
+#
+# Add the `default` target which detects which extensions should be
+# built based on platform/architecture.  This is the same logic that
+# setup.py uses to select which extensions should be built and should
+# be kept in sync.
+#
+# The `default` target makes direct use of cmake easier since knowledge
+# of which extensions are supported has been factored in, e.g.
+#
+# mkdir build && cd build
+# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
+# cmake --build . --target default
+#
+add_custom_target(default)

 if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
@ -315,4 +300,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling moe extension.")
  add_dependencies(default _moe_C)

+  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
+  # VLLM_INSTALL_PUNICA_KERNELS is set to a true value (e.g. 1 or ON) in
+  # the environment, and there are supported target arches.
+  #
+  # The environment value is expanded with "$ENV{...}": a bare ENV{...}
+  # operand in if() is not treated as an environment lookup and always
+  # evaluates to false. The quotes keep the expression well-formed when the
+  # environment variable is unset (it expands to an empty, false string).
+  if (VLLM_PUNICA_GPU_ARCHES AND
+      ("$ENV{VLLM_INSTALL_PUNICA_KERNELS}" OR VLLM_INSTALL_PUNICA_KERNELS))
+    message(STATUS "Enabling punica extension.")
+    add_dependencies(default _punica_C)
+  endif()
 endif()
--- a/Dockerfile
+++ b/Dockerfile
@ -9,23 +9,28 @@ ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
+
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.10
+
 ENV DEBIAN_FRONTEND=noninteractive

-# Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo \
+    && apt-get install -y ccache software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version
+
+RUN apt-get update -y \
+    && apt-get install -y git curl sudo
+
+# Install pip s.t. it will be compatible with our PYTHON_VERSION
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
+RUN python3 -m pip --version

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -37,7 +42,6 @@ WORKDIR /workspace

 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-cuda.txt
@ -57,19 +61,23 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build

+ARG PYTHON_VERSION=3.10
+
 # install build dependencies
 COPY requirements-build.txt requirements-build.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-build.txt

+# install compiler cache to speed up compilation leveraging local or remote caching
+RUN apt-get update -y && apt-get install -y ccache
+
 # files and directories related to build wheels
 COPY csrc csrc
 COPY setup.py setup.py
 COPY cmake cmake
 COPY CMakeLists.txt CMakeLists.txt
 COPY requirements-common.txt requirements-common.txt
-COPY requirements-adag.txt requirements-adag.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm vllm
@ -80,13 +88,13 @@ ENV MAX_JOBS=${max_jobs}
 # number of threads used by nvcc
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1

 ARG buildkite_commit
 ENV BUILDKITE_COMMIT=${buildkite_commit}

 ARG USE_SCCACHE
-ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
-ARG SCCACHE_REGION_NAME=us-west-2
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/pip \
    if [ "$USE_SCCACHE" = "1" ]; then \
@ -95,9 +103,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
-        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
-        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
-        && export SCCACHE_IDLE_TIMEOUT=0 \
+        && if [ "$CUDA_VERSION" = "11.8.0" ]; then \
+            export SCCACHE_BUCKET=vllm-build-sccache-2; \
+           else \
+            export SCCACHE_BUCKET=vllm-build-sccache; \
+           fi \
+        && export SCCACHE_REGION=us-west-2 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
@ -149,24 +160,23 @@ FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.10
 WORKDIR /vllm-workspace
-ENV DEBIAN_FRONTEND=noninteractive

-RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
-    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
-
-# Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ccache software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
-    && python3 --version && python3 -m pip --version
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version
+
+RUN apt-get update -y \
+    && apt-get install -y python3-pip git vim curl libibverbs-dev
+
+# Install pip s.t. it will be compatible with our PYTHON_VERSION
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}
+RUN python3 -m pip --version

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -184,8 +194,7 @@ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamb
    python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir

 RUN --mount=type=cache,target=/root/.cache/pip \
-    . /etc/environment && \
-    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
 #################### vLLM installation IMAGE ####################


--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -2,49 +2,37 @@

 FROM ubuntu:22.04 AS cpu-test-1

-RUN --mount=type=cache,target=/var/cache/apt \
-    apt-get update -y \
-    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+RUN apt-get update -y \
+    && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
 # intel-openmp provides additional performance improvement vs. openmp
 # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install intel-openmp
+RUN pip install intel-openmp

-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"

 RUN echo 'ulimit -c 0' >> ~/.bashrc

 RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

-ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
-    pip install --upgrade pip && \
-    pip install -r requirements-build.txt
+RUN pip install --upgrade pip \
+    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy

 FROM cpu-test-1 AS build

+COPY ./ /workspace/vllm
+
 WORKDIR /workspace/vllm

-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
-    pip install -v -r requirements-cpu.txt
-
-COPY ./ ./
+RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu

 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512
 ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/ccache \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

 WORKDIR /workspace/

--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@ -1,5 +1,5 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

 FROM $BASE_IMAGE

--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -1,7 +1,7 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.

-FROM ubuntu:22.04 AS dev
+FROM ubuntu:20.04 AS dev

 RUN apt-get update -y && \
    apt-get install -y python3-pip git
@ -13,15 +13,12 @@ COPY requirements-common.txt /workspace/vllm/
 COPY requirements-openvino.txt /workspace/vllm/

 COPY vllm/ /workspace/vllm/vllm
-COPY csrc/core /workspace/vllm/csrc/core
-COPY cmake/utils.cmake /workspace/vllm/cmake/
-COPY CMakeLists.txt /workspace/vllm/
 COPY setup.py /workspace/vllm/

 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
 # build vLLM with OpenVINO backend
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

 COPY examples/ /workspace/vllm/examples
 COPY benchmarks/ /workspace/vllm/benchmarks
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
        *"rocm-6.1"*) \
            python3 -m pip uninstall -y torch torchvision \
            && python3 -m pip install --no-cache-dir --pre \
-                torch==2.5.0.dev20240726 \
-                torchvision==0.20.0.dev20240726 \
+                torch==2.5.0.dev20240710 \
+                torchvision==0.20.0.dev20240710 \
               --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
        *) ;; esac

@ -131,7 +131,8 @@ COPY . .
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]

-
+# Make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
 # Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 # Silences the HF Tokenizers warning
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@ -1,17 +1,20 @@
-ARG NIGHTLY_DATE="20240808"
+ARG NIGHTLY_DATE="20240713"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

 FROM $BASE_IMAGE
 WORKDIR /workspace

+# Install aiohttp separately to avoid build errors.
+RUN pip install aiohttp
+# Install NumPy 1 instead of NumPy 2.
+RUN pip install "numpy<2"
 # Install the TPU and Pallas dependencies.
-RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

 # Build vLLM.
 COPY . /workspace/vllm
 ENV VLLM_TARGET_DEVICE="tpu"
-RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
-RUN cd /workspace/vllm && python3 setup.py develop
+RUN cd /workspace/vllm && python setup.py develop

 CMD ["/bin/bash"]
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,5 +1,4 @@
 include LICENSE
-include requirements-adag.txt
 include requirements-common.txt
 include requirements-cuda.txt
 include requirements-rocm.txt
--- a/README.md
+++ b/README.md
@ -10,19 +10,10 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |

 </p>

-
---
-
-**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco**
-
-We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team.
-Join us to hear the vLLM's recent update about performance.
-Register now [here](https://lu.ma/87q3nvnh) and be part of the event!
-
 ---

 *Latest News* 🔥
@ -45,12 +36,10 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with **PagedAttention**
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding
- Chunked prefill
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
+- Optimized CUDA kernels

-**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).

 vLLM is flexible and easy to use with:

@ -59,21 +48,20 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support
- Multi-lora support
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
+- (Experimental) Prefix caching support
+- (Experimental) Multi-lora support

 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral)
- Embedding Models (e.g. E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)

 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).

 ## Getting Started

-Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
+Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

 ```bash
 pip install vllm
@ -111,7 +99,6 @@ vLLM is a community project. Our compute resources for development and testing a
 - Roblox
 - RunPod
 - Sequoia Capital
- Skywork AI
 - Trainy
 - UC Berkeley
 - UC San Diego
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -225,8 +225,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        ("completions", "profile")
-    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@ -276,9 +276,8 @@ async def async_request_openai_completions(
                                    output.ttft = ttft

                                # Decoding phase
-                                else:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += data["choices"][0]["text"]
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -1,45 +1,8 @@
-"""
-Benchmark the efficiency of prefix caching.
-
-This script allows you to benchmark the performance of
-a model with and without prefix caching using either fixed prompts
-or prompts sampled from the ShareGPT dataset.
-
-Fixed example usage:
-    python benchmark_prefix_caching.py \
-        --model meta-llama/Llama-2-7b-chat-hf \
-        --enable-prefix-caching \
-        --num-prompts 1 \
-        --repeat-count 100
-
-ShareGPT example usage:
-    # This command samples 20 prompts with input lengths
-    # between 128 and 256 tokens from the ShareGPT dataset,
-    # then replicates each prompt 5 times.
-    python benchmark_prefix_caching.py \
-        --model meta-llama/Llama-2-7b-chat-hf \
-        --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
-        --enable-prefix-caching \
-        --num-prompts 20 \
-        --repeat-count 5 \
-        --input-length-range 128:256
-"""
-
-import json
-import random
 import time
-from typing import List, Optional, Tuple
-
-from transformers import PreTrainedTokenizerBase

 from vllm import LLM, SamplingParams
 from vllm.utils import FlexibleArgumentParser

-try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
-    from backend_request_func import get_tokenizer
-
 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501


@ -52,83 +15,7 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
    print(f"cost time {end_time - start_time}")


-def sample_requests(
-    dataset_path: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    input_length_range: Tuple[int, int],
-    fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
-    """Sample prompts whose token length lies in a given range.
-
-    Reads a JSON dataset of conversations, shuffles it, and collects up to
-    ``num_requests`` first-turn prompts.
-
-    Args:
-        dataset_path: Path to a JSON file; each entry must have a
-            "conversations" list whose items carry a "value" field.
-        num_requests: Maximum number of samples to return.
-        tokenizer: Callable whose result exposes ``input_ids``.
-        input_length_range: Inclusive ``(min_len, max_len)`` bounds on the
-            prompt token count.
-        fixed_output_len: If not None, used as the output length of every
-            sample instead of the tokenized completion length.
-
-    Returns:
-        A list of ``(prompt, prompt_len, output_len)`` tuples. It may hold
-        fewer than ``num_requests`` entries if the dataset is exhausted.
-
-    Raises:
-        ValueError: If ``fixed_output_len`` is given and smaller than 4.
-    """
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
-
-    # Load the dataset.
-    with open(dataset_path) as f:
-        dataset = json.load(f)
-    # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
-
-    # Shuffle the dataset.
-    random.shuffle(dataset)
-
-    min_len, max_len = input_length_range
-
-    # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
-    for i in range(len(dataset)):
-        # Stop as soon as enough samples have been collected.
-        if len(filtered_dataset) == num_requests:
-            break
-
-        # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
-        prompt_token_ids = tokenizer(prompt).input_ids
-        completion = dataset[i][1]
-        completion_token_ids = tokenizer(completion).input_ids
-        prompt_len = len(prompt_token_ids)
-        # The completion is tokenized even when fixed_output_len overrides
-        # its length.
-        output_len = len(completion_token_ids
-                         ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
-            # Prune too short sequences.
-            continue
-        # Both bounds of the length range are inclusive.
-        if min_len <= prompt_len <= max_len:
-            filtered_dataset.append((prompt, prompt_len, output_len))
-
-    return filtered_dataset
-
-
-def repeat_and_sort_requests(requests: List[Tuple[str, int, int]],
-                             repeat_count: int,
-                             sort: bool = False) -> List[str]:
-    """Repeat every request ``repeat_count`` times and order the result.
-
-    Args:
-        requests: ``(prompt, prompt_len, output_len)`` tuples.
-        repeat_count: How many copies of the request list to concatenate.
-        sort: If True, order by the second tuple element (the prompt
-            length); otherwise shuffle randomly.
-
-    Returns:
-        Only the prompt strings, in the resulting order.
-    """
-    repeated_requests = requests * repeat_count
-    if sort:
-        repeated_requests.sort(key=lambda x: x[1])
-    else:
-        random.shuffle(repeated_requests)
-    return [req[0] for req in repeated_requests]
-
-
 def main(args):
-    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
-    input_length_range = tuple(map(int, args.input_length_range.split(':')))
-
-    if args.dataset_path is not None:
-        print(f"Start to sample {args.num_prompts} prompts"
-              "from {args.dataset_path}")
-        filtered_datasets = sample_requests(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            input_length_range=input_length_range,
-            fixed_output_len=args.output_len,
-        )
-    else:
-        prompt_len = len(tokenizer(PROMPT).input_ids)
-        filtered_datasets = [(PROMPT, prompt_len, args.output_len)
-                             ] * args.num_prompts
-
    llm = LLM(model=args.model,
              tokenizer_mode='auto',
              trust_remote_code=True,
@ -137,13 +24,10 @@ def main(args):
              tensor_parallel_size=args.tensor_parallel_size,
              enable_prefix_caching=args.enable_prefix_caching)

+    num_prompts = 100
+    prompts = [PROMPT] * num_prompts
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

-    print("Testing filtered datasets")
-    prompts = repeat_and_sort_requests(filtered_datasets,
-                                       repeat_count=args.repeat_count,
-                                       sort=args.sort)
-
    print("------warm up------")
    test_prefix(
        llm=llm,
@ -161,15 +45,11 @@ def main(args):

 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
-        description=
-        'Benchmark the performance with or without automatic prefix caching.')
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
    parser.add_argument('--model',
                        type=str,
                        default='baichuan-inc/Baichuan2-13B-Chat')
-    parser.add_argument("--dataset-path",
-                        type=str,
-                        default=None,
-                        help="Path to the dataset.")
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--output-len', type=int, default=10)
    parser.add_argument('--enable-prefix-caching',
@ -178,21 +58,5 @@ if __name__ == "__main__":
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceMangerV2')
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=1,
-                        help="Number of the prompts sampled from dataset")
-    parser.add_argument('--repeat-count',
-                        type=int,
-                        default=100,
-                        help='Number of times to repeat each prompt')
-    parser.add_argument('--sort',
-                        action='store_true',
-                        help='Sort prompts by input length')
-    parser.add_argument('--input-length-range',
-                        type=str,
-                        default='128:256',
-                        help='Range of input lengths for sampling prompts,'
-                        'specified as "min:max" (e.g., "128:256").')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -295,7 +295,6 @@ def calculate_metrics(
 async def benchmark(
    backend: str,
    api_url: str,
-    base_url: str,
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
@ -303,7 +302,6 @@ async def benchmark(
    use_beam_search: bool,
    request_rate: float,
    disable_tqdm: bool,
-    profile: bool,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -328,22 +326,6 @@ async def benchmark(
            f"are correctly specified. Error: {test_output.error}")
    else:
        print("Initial test run completed. Starting main benchmark run...")
-
-    if profile:
-        print("Starting profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            prompt=test_prompt,
-            api_url=base_url + "/start_profile",
-            prompt_len=test_prompt_len,
-            output_len=test_output_len,
-            best_of=best_of,
-            use_beam_search=use_beam_search,
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler started")
-
    print(f"Traffic request rate: {request_rate}")

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@ -367,21 +349,6 @@ async def benchmark(
                             pbar=pbar)))
    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

-    if profile:
-        print("Stopping profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            prompt=test_prompt,
-            api_url=base_url + "/stop_profile",
-            prompt_len=test_prompt_len,
-            output_len=test_output_len,
-            best_of=best_of,
-            use_beam_search=use_beam_search,
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler stopped")
-
    if pbar is not None:
        pbar.close()

@ -466,10 +433,8 @@ def main(args: argparse.Namespace):

    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
-        base_url = f"{args.base_url}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
-        base_url = f"http://{args.host}:{args.port}"

    tokenizer = get_tokenizer(tokenizer_id,
                              trust_remote_code=args.trust_remote_code)
@ -541,7 +506,6 @@ def main(args: argparse.Namespace):
        benchmark(
            backend=backend,
            api_url=api_url,
-            base_url=base_url,
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
@ -549,7 +513,6 @@ def main(args: argparse.Namespace):
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
            disable_tqdm=args.disable_tqdm,
-            profile=args.profile,
        ))

    # Save config and results to json
@ -730,12 +693,6 @@ if __name__ == "__main__":
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
-    )
    parser.add_argument(
        "--save-result",
        action="store_true",
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -13,7 +13,7 @@ from weight_shapes import WEIGHT_SHAPES
 from vllm import _custom_ops as ops
 from vllm.utils import FlexibleArgumentParser

-DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
 DEFAULT_TP_SIZES = [1]

@ -32,6 +32,7 @@ def to_int8(tensor: torch.Tensor) -> torch.Tensor:

 def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

@ -43,18 +44,59 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
    raise ValueError("unsupported dtype")


+# impl
+
+
+def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                    scale_b: torch.Tensor,
+                    out_dtype: torch.dtype) -> torch.Tensor:
+    """Unscaled baseline: return ``torch.mm(a, b)``.
+
+    ``scale_a``, ``scale_b`` and ``out_dtype`` are accepted but ignored, so
+    this function can be called with the same five positional arguments as
+    the other ``*_impl`` functions.
+    """
+    return torch.mm(a, b)
+
+
+def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                     scale_b: torch.Tensor,
+                     out_dtype: torch.dtype) -> torch.Tensor:
+    """Call ``torch._scaled_mm`` with the given scales and output dtype.
+
+    ``use_fast_accum`` is not passed, so it keeps its default value.
+    """
+    return torch._scaled_mm(a,
+                            b,
+                            scale_a=scale_a,
+                            scale_b=scale_b,
+                            out_dtype=out_dtype)
+
+
+def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
+                                scale_a: torch.Tensor, scale_b: torch.Tensor,
+                                out_dtype: torch.dtype) -> torch.Tensor:
+    """Call ``torch._scaled_mm`` with ``use_fast_accum=True``.
+
+    Takes the same arguments as ``pytorch_fp8_impl``; the only difference in
+    the call is the explicit ``use_fast_accum=True`` flag.
+    """
+    return torch._scaled_mm(a,
+                            b,
+                            scale_a=scale_a,
+                            scale_b=scale_b,
+                            out_dtype=out_dtype,
+                            use_fast_accum=True)
+
+
+def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+                 scale_b: torch.Tensor,
+                 out_dtype: torch.dtype) -> torch.Tensor:
+    """Call ``ops.cutlass_scaled_mm`` with the given scales and output dtype.
+
+    No bias or other optional argument is forwarded.
+    """
+    return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
+
+
 # bench
-def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
-             **kwargs) -> TMeasurement:
+def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+             scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
+             sub_label: str, fn: Callable, description: str) -> TMeasurement:
+
    min_run_time = 1

    globals = {
-        "args": args,
-        "kwargs": kwargs,
+        "a": a,
+        "b": b,
+        "scale_a": scale_a,
+        "scale_b": scale_b,
+        "out_dtype": out_dtype,
        "fn": fn,
    }
    return TBenchmark.Timer(
-        stmt="fn(*args, **kwargs)",
+        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
        globals=globals,
        label=label,
        sub_label=sub_label,
@ -68,58 +110,19 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
-    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
-    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)

    timers = []
-    # pytorch impl - bfloat16
+    # pytorch impl
    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16),
-                 b.to(dtype=torch.bfloat16)))
-
-    # pytorch impl - float16
-    timers.append(
-        bench_fn(label, sub_label,
-                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
-                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
+        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
+                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
+                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))

    # cutlass impl
    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-
-    # cutlass with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass with azp per-tensor
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj))
-
-    # cutlass with azp per-tensor + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, None, bias))
-
-    # cutlass with azp per-token
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp))
-
-    # cutlass with azp per-token + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp, bias))
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))

    return timers

@ -130,88 +133,46 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    timers = []

    # pytorch impl w. bf16
    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
+        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
+                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
+                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16))
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True))
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 pytorch_fp8_impl_fast_accum,
+                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16))
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16,
-                 use_fast_accum=True))
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 pytorch_fp8_impl_fast_accum,
+                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))

    # cutlass impl: bf16 output
    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
    # cutlass impl: fp16 output
    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
-
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
    return timers


@ -232,6 +193,7 @@ def print_timers(timers: Iterable[TMeasurement]):

 def run(dtype: torch.dtype,
        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@ -247,6 +209,7 @@ def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[Tuple[int, int, int]],
                base_description: str,
                timestamp=None):
+
    print(f"== All Results {base_description} ====")
    print_timers(data)

@ -281,6 +244,7 @@ def run_range_bench(args):


 def run_model_bench(args):
+
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@ -1,89 +0,0 @@
-import random
-import time
-
-import torch
-
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
-
-
-@torch.inference_mode()
-def main(num_tokens: int,
-         hidden_size: int,
-         add_residual: bool,
-         dtype: torch.dtype,
-         seed: int = 0,
-         do_profile: bool = False,
-         num_warmup_iters: int = 5,
-         num_iters: int = 100) -> None:
-    """Time the ``RMSNorm`` layer on CUDA and print the per-call latency.
-
-    Args:
-        num_tokens: Number of rows of the random input.
-        hidden_size: Number of columns of the input; also passed to RMSNorm.
-        add_residual: If True, a random residual tensor is passed to the
-            layer; otherwise ``None`` is passed.
-        dtype: Data type of the layer and of the input tensors.
-        seed: Seed for ``random`` and torch (CPU and, if available, CUDA).
-        do_profile: If True, run a single profiled iteration instead of
-            ``num_iters`` timed iterations.
-        num_warmup_iters: Untimed iterations run before the measurement.
-        num_iters: Timed iterations; ignored when ``do_profile`` is set.
-    """
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
-    # Tensors created below without an explicit device land on CUDA.
-    torch.set_default_device("cuda")
-
-    layer = RMSNorm(hidden_size).to(dtype=dtype)
-    layer.weight.data.normal_(mean=1.0, std=0.1)
-    # Inputs (and the residual) are scaled down by 1 / (2 * hidden_size).
-    scale = 1 / (2 * hidden_size)
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    x *= scale
-    residual = torch.randn_like(x) * scale if add_residual else None
-
-    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        # Returns the mean wall-clock seconds per layer call. The device is
-        # synchronized before starting and before stopping the clock so that
-        # queued GPU work is included in the measurement.
-        torch.cuda.synchronize()
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.perf_counter()
-
-        for _ in range(num_iters):
-            layer(x, residual)
-        torch.cuda.synchronize()
-
-        end_time = time.perf_counter()
-        if profile:
-            # NOTE(review): this calls cudaProfilerStart() a second time;
-            # cudaProfilerStop() looks like the intended call - confirm.
-            torch.cuda.cudart().cudaProfilerStart()
-        return (end_time - start_time) / num_iters
-
-    # Warmup.
-    print("Warming up...")
-    run_benchmark = run_cuda_benchmark
-    run_benchmark(num_iters=num_warmup_iters, profile=False)
-
-    # Benchmark.
-    if do_profile:
-        latency = run_benchmark(num_iters=1, profile=True)
-    else:
-        latency = run_benchmark(num_iters=num_iters, profile=False)
-    # latency is in seconds; convert to microseconds for display.
-    print(f"Kernel running time: {latency * 1000000:.3f} us")
-
-
-# Command-line entry point: parse the benchmark options, echo them, and run
-# main() with the dtype string mapped through STR_DTYPE_TO_TORCH_DTYPE.
-if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
-        description="Benchmark the layernorm kernel.")
-    parser.add_argument("--num-tokens", type=int, default=4096)
-    parser.add_argument("--hidden-size", type=int, default=8192)
-    parser.add_argument("--add-residual", action="store_true")
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["half", "bfloat16", "float"],
-                        default="half")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--profile", action="store_true")
-    parser.add_argument("--num-warmup-iters", type=int, default=5)
-    parser.add_argument("--num-iters",
-                        type=int,
-                        default=100,
-                        help="Number of benchmark iterations. "
-                        "If --profile is set, this number is ignored")
-
-    args = parser.parse_args()
-    # Echo the parsed arguments before running.
-    print(args)
-
-    main(num_tokens=args.num_tokens,
-         hidden_size=args.hidden_size,
-         add_residual=args.add_residual,
-         dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
-         seed=args.seed,
-         do_profile=args.profile,
-         num_warmup_iters=args.num_warmup_iters,
-         num_iters=args.num_iters)
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -1,372 +0,0 @@
-import argparse
-import copy
-import itertools
-import math
-import pickle as pkl
-import time
-from typing import Callable, Iterable, List, Tuple
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
-from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
-    MarlinWorkspace)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    gptq_pack, pack_rows, quantize_weights)
-from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
-
-DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
-DEFAULT_TP_SIZES = [1]
-
-
-def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
-    """Pack quantized weights and hand them to ``ops.machete_prepack_B``.
-
-    The weights are first packed with ``pack_rows`` using
-    ``wtype.size_bits`` and the shape of ``w_q``, then re-laid out in memory
-    (column major, per the inline comment) before prepacking.
-    """
-    w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
-    w_q = w_q.t().contiguous().t()  # make col major
-    return ops.machete_prepack_B(w_q, wtype)
-
-
-def make_bench_tensors(
-    atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
-    k: int
-) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
-                                    torch.tensor]]]:
-    """Create one random activation and a list of quantized random weights.
-
-    Args:
-        atype: dtype of the activation and of the unquantized weights.
-        wtype: Quantized weight type; must be an integer type.
-        group_size: Group size forwarded to ``quantize_weights``.
-        m, n, k: The activation is ``(m, k)``; each weight is ``(k, n)``.
-
-    Returns:
-        ``(a, quantized)`` where ``a`` is the CUDA activation tensor and
-        ``quantized`` holds one ``quantize_weights`` result per generated
-        weight (4-tuples according to the return annotation).
-    """
-    assert wtype.is_integer(), "TODO: support floating point weights"
-
-    # we want to make sure that weights don't fit into L2 cache between runs so
-    #  we construct enough weights to exceed L2 cache, which is 50mb on a H100
-    #  so we target total weight size > 2*50mb
-    # Numerator: 2 * 50 MiB expressed in bits; denominator: bits per weight
-    # matrix at the quantized width.
-    num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
-
-    a = torch.randn((m, k), device="cuda", dtype=atype) * 5
-    weights = [
-        torch.randn((k, n), device="cuda", dtype=atype)
-        for _ in range(num_weights)
-    ]
-    quanitized_weights = [
-        quantize_weights(w, wtype, group_size) for w in weights
-    ]
-
-    return a, quanitized_weights
-
-
-# impl
-
-
-# bench
-def bench_fn(label: str, sub_label: str, description: str,
-             fn: Callable) -> TMeasurement:
-    """Time a zero-argument callable with ``torch.utils.benchmark.Timer``.
-
-    ``label``, ``sub_label`` and ``description`` are passed through to the
-    Timer. The measurement is taken with ``blocked_autorange`` using a
-    ``min_run_time`` of 1.
-    """
-
-    min_run_time = 1
-    return TBenchmark.Timer(
-        stmt="fn()",
-        globals={
-            "fn": fn
-        },
-        label=label,
-        sub_label=sub_label,
-        description=description,
-    ).blocked_autorange(min_run_time=min_run_time)
-
-
-def loop_over_weights(
-    a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
-                                         torch.tensor, torch.tensor]],
-    fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
-                 None]):
-    """Call ``fn(a, w_ref, w_q, w_s)`` once for every entry of ``weights``.
-
-    Each entry is unpacked into four values; the fourth is not passed to
-    ``fn``. Return values of ``fn`` are discarded.
-    """
-    for w_ref, w_q, w_s, _ in weights:
-        fn(a, w_ref, w_q, w_s)
-
-
-def bench(atype: torch.dtype,
-          wtype: ScalarType,
-          group_size: int,
-          m: int,
-          k: int,
-          n: int,
-          label: str,
-          sub_label: str,
-          benchmark_marlinv1: bool = True,
-          sweep_schedules: bool = True) -> Iterable[TMeasurement]:
-    """Benchmark one (m, k, n) problem across several GEMM implementations.
-
-    Always measured: ``torch.matmul`` on the reference weights and
-    ``ops.machete_gemm`` without an explicit schedule. Optionally measured:
-    ``ops.gptq_marlin_gemm`` (``benchmark_marlinv1``) and the fastest
-    machete schedule found by timing every supported schedule
-    (``sweep_schedules``).
-
-    Every timed callable loops over all generated weight sets, so one
-    measurement covers ``len(weights)`` GEMMs; that count is appended to
-    ``sub_label`` as ``L=...``.
-
-    Returns:
-        The list of collected measurements.
-    """
-    a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
-    sub_label += f", L={len(weights)}"
-
-    # Same tuples as `weights`, with the quantized tensor machete-prepacked.
-    weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
-                       for w_ref, w_q, w_s, w_zp in weights]
-
-    timers = []
-    # pytorch impl
-    timers.append(
-        bench_fn(
-            label, sub_label, "torch.matmul", lambda: loop_over_weights(
-                a,
-                weights,
-                lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
-            )))
-
-    if benchmark_marlinv1:
-        # The first reference weight supplies the shape and device used by
-        # the helpers below.
-        w_ref = weights[0][0]
-
-        # Empty tensors passed for the zero-point, sort-index and g_idx
-        # arguments.
-        w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
-        sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
-        g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
-
-        def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
-            w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
-            return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
-                                          wtype.size_bits)
-
-        def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
-            return marlin_permute_scales(w_s, *w_ref.shape, group_size)
-
-        weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
-                             marlinv1_permute_scales(w_s), w_zp)
-                            for w_ref, w_q, w_s, w_zp in weights]
-
-        workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
-                                    GPTQ_MARLIN_MAX_PARALLEL)
-
-        # marlinv1
-        timers.append(
-            bench_fn(
-                label, sub_label, "marlin_orig", lambda: loop_over_weights(
-                    a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
-                    gptq_marlin_gemm(a,
-                                     w_q,
-                                     w_s,
-                                     w_zp_empty,
-                                     g_idx,
-                                     sort_indices,
-                                     workspace.scratch,
-                                     wtype,
-                                     size_m=a.shape[0],
-                                     size_n=w_ref.shape[1],
-                                     size_k=w_ref.shape[0],
-                                     is_k_full=True))))
-
-    # machete
-    timers.append(
-        bench_fn(
-            label, sub_label, "machete_heuristic", lambda: loop_over_weights(
-                a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
-                    a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
-
-    if sweep_schedules:
-        print("Finding best schedule for machete")
-        best = None
-        best_schedule = None
-        schedules = ops.machete_supported_schedules(wtype)
-        for schedule in reversed(schedules):
-
-            # The default argument binds the current loop value of
-            # `schedule` to this closure.
-            def run(a, _, w_q, w_s, schedule=schedule):
-                ops.machete_gemm(a,
-                                 w_q,
-                                 wtype,
-                                 w_s,
-                                 b_group_size=group_size,
-                                 schedule=schedule)
-
-            res = bench_fn(label, sub_label, "machete_best",
-                           lambda: loop_over_weights(a, weights_machete, run))
-
-            print(f"  {res.median:5.5} ", schedule)
-            # Keep the measurement with the smallest median time.
-            if not best or res.median < best.median:
-                best = res
-                best_schedule = schedule
-        print("Best schedule:", best_schedule)
-        # NOTE(review): if `schedules` is empty, `best` is still None here
-        # and None is appended to the timers list.
-        timers.append(best)
-
-    return timers
-
-
-# runner
-def print_timers(timers: Iterable[TMeasurement]):
-    """Print the measurements as a ``torch.utils.benchmark.Compare`` table."""
-    compare = TBenchmark.Compare(timers)
-    compare.print()
-
-
-def run(dtype: torch.dtype, sweep_schedules: bool,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
-    """Run ``bench`` for every ``(m, k, n)`` and collect all measurements.
-
-    The weight type is fixed to ``scalar_types.uint4b8`` and the group size
-    to 128. The timers of each shape are printed as soon as that shape
-    finishes.
-
-    Returns:
-        A flat list of the measurements of all shapes.
-    """
-
-    results = []
-    for m, k, n in MKNs:
-        timers = bench(dtype,
-                       scalar_types.uint4b8,
-                       128,
-                       m,
-                       k,
-                       n,
-                       f"{dtype}-gemm",
-                       f"MKN=({m}x{k}x{n})",
-                       sweep_schedules=sweep_schedules)
-        print_timers(timers)
-        results.extend(timers)
-
-    return results
-
-
-# output makers
-def make_output(
-    data: Iterable[TMeasurement],
-    MKNs: Iterable[Tuple[int, int, int]],
-    base_description: str,
-    timestamp=None,
-):
-    """Print all measurements and pickle them to the working directory.
-
-    The output file is named ``{base_description}-{timestamp}.pkl``; when
-    ``timestamp`` is None the current Unix time (whole seconds) is used.
-    ``MKNs`` is accepted but not used in the body.
-    """
-
-    print(f"== All Results {base_description} ====")
-    print_timers(data)
-
-    # pickle all the results
-    timestamp = int(time.time()) if timestamp is None else timestamp
-    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(data, f)
-
-
-# argparse runners
-
-
-def run_square_bench(args):
-    """Benchmark square problems where M = K = N.
-
-    Sizes run from ``args.dim_start`` to ``args.dim_end`` inclusive, in
-    steps of ``args.dim_increment``. Results are printed and pickled via
-    ``make_output``.
-    """
-    dim_sizes = list(
-        range(args.dim_start, args.dim_end + 1, args.dim_increment))
-    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args.dtype, args.sweep_schedules, MKNs)
-
-    make_output(data, MKNs, f"square_bench-{args.dtype}")
-
-
-def run_range_bench(args):
-    """Benchmark a sweep in which any of M, K, N may be held constant.
-
-    Sizes run from ``args.dim_start`` up to, but not including,
-    ``args.dim_end`` in steps of ``args.dim_increment``. A dimension whose
-    ``*_constant`` argument is not None uses that value for every step;
-    the others follow the sweep.
-    """
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args.dtype, args.sweep_schedules, MKNs)
-
-    make_output(data, MKNs, f"range_bench-{args.dtype}")
-
-
-def run_model_bench(args):
-
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}]  {model}")
-
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
-        KNs = []
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
-            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
-            KNs.append(KN)
-        return KNs
-
-    model_bench_data = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        Ms = args.batch_sizes
-        KNs = model_shapes(model, tp_size)
-        MKNs = []
-        for m in Ms:
-            for k, n in KNs:
-                MKNs.append((m, k, n))
-
-        data = run(args.dtype, args.sweep_schedules, MKNs)
-        model_bench_data.append(data)
-
-    # Print all results
-    for data, model_tp in zip(model_bench_data, models_tps):
-        model, tp_size = model_tp
-        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
-        print_timers(data)
-
-    timestamp = int(time.time())
-
-    all_data = []
-    for d in model_bench_data:
-        all_data.extend(d)
-    # pickle all data
-    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(all_data, f)
-
-
-if __name__ == "__main__":
-
-    def to_torch_dtype(dt):
-        if dt == "bfloat16":
-            return torch.bfloat16
-        if dt == "float16":
-            return torch.float16
-        raise ValueError("unsupported dtype")
-
-    parser = FlexibleArgumentParser(
-        description="""
-Benchmark Machete GEMM.
-
-    To run square GEMMs:
-        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
-    
-    To run constant N and K and sweep M:
-        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
-    
-    To run dimensions from a model:
-        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
-    
-    Output:
-        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
-            """,  # noqa: E501
-        formatter_class=argparse.RawTextHelpFormatter,
-    )
-
-    parser.add_argument(
-        "--dtype",
-        type=to_torch_dtype,
-        required=True,
-        help="Available options are ['bfloat16', 'float16']",
-    )
-    parser.add_argument(
-        "--sweep-schedules",
-        action="store_true",
-        help="Run a sweep over all supported schedules",
-    )
-    subparsers = parser.add_subparsers(dest="cmd", required=True)
-
-    square_parser = subparsers.add_parser("square_bench")
-    square_parser.add_argument("--dim-start", type=int, required=True)
-    square_parser.add_argument("--dim-end", type=int, required=True)
-    square_parser.add_argument("--dim-increment", type=int, required=True)
-    square_parser.set_defaults(func=run_square_bench)
-
-    range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
-    range_parser.set_defaults(func=run_range_bench)
-
-    model_parser = subparsers.add_parser("model_bench")
-    model_parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=DEFAULT_MODELS,
-        choices=WEIGHT_SHAPES.keys(),
-    )
-    model_parser.add_argument("--tp-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_TP_SIZES)
-    model_parser.add_argument("--batch-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_BATCH_SIZES)
-    model_parser.set_defaults(func=run_model_bench)
-
-    args = parser.parse_args()
-    args.func(args)
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@ -7,17 +7,16 @@ from benchmark_shapes import WEIGHT_SHAPES
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
-    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
+    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    MarlinWorkspace, marlin_quantize)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
    marlin_24_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    gptq_pack, gptq_quantize_weights, sort_weights)
-from vllm.scalar_type import ScalarType
+    gptq_pack, quantize_weights, sort_weights)
 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@ -28,14 +27,13 @@ K_FULL_OPTS = [False, True]


 def bench_run(results: List[benchmark.Measurement], model: str,
-              act_order: bool, is_k_full: bool, quant_type: ScalarType,
-              group_size: int, size_m: int, size_k: int, size_n: int):
+              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
+              size_m: int, size_k: int, size_n: int):
    label = "Quant Matmul"

-    sub_label = ("{}, act={} k_full={}, q={}, g={}, "
-                 "MKN=({}x{}x{})".format(model, act_order, is_k_full,
-                                         str(quant_type), group_size, size_m,
-                                         size_k, size_n))
+    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+                 "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+                                         group_size, size_m, size_k, size_n))

    print(f"Testing: {sub_label}")

@ -52,18 +50,16 @@ def bench_run(results: List[benchmark.Measurement], model: str,
        marlin_g_idx,
        marlin_sort_indices,
        marlin_rand_perm,
-    ) = marlin_quantize(b, quant_type, group_size, act_order)
+    ) = marlin_quantize(b, num_bits, group_size, act_order)

    # Marlin_24 quant
    (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
-     marlin_24_s) = marlin_24_quantize(b, quant_type, group_size)
-
-    marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
+     marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)

    # GPTQ quant
    (w_ref, q_w, s, g_idx,
-     rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order)
-    q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
+     rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+    q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)

    # For act_order, sort the "weights" and "g_idx"
    # so that group ids are increasing
@ -77,11 +73,10 @@ def bench_run(results: List[benchmark.Measurement], model: str,

    marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
                                          GPTQ_MARLIN_24_MAX_PARALLEL)
-    marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)

    globals = {
        # Gen params
-        "quant_type": quant_type,
+        "num_bits": num_bits,
        "group_size": group_size,
        "size_m": size_m,
        "size_n": size_n,
@ -92,7 +87,6 @@ def bench_run(results: List[benchmark.Measurement], model: str,
        "marlin_w_ref": marlin_w_ref,
        "marlin_q_w": marlin_q_w,
        "marlin_s": marlin_s,
-        "marlin_zp": marlin_zp,
        "marlin_g_idx": marlin_g_idx,
        "marlin_sort_indices": marlin_sort_indices,
        "marlin_rand_perm": marlin_rand_perm,
@ -131,29 +125,19 @@ def bench_run(results: List[benchmark.Measurement], model: str,
    results.append(
        benchmark.Timer(
            stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)",  # noqa: E501
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
-            description="gptq_marlin_gemm_fp16",
+            description="gptq_marlin_gemm",
        ).blocked_autorange(min_run_time=min_run_time))

-    results.append(
-        benchmark.Timer(
-            stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)",  # noqa: E501
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="gptq_marlin_gemm_fp32",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-    if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
+    if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
            and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
        results.append(
            benchmark.Timer(
                stmt=
-                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)",  # noqa: E501
+                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)",  # noqa: E501
                globals=globals,
                label=label,
                sub_label=sub_label,
@ -163,7 +147,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
    results.append(
        benchmark.Timer(
            stmt=
-            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)",  # noqa: E501
+            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
@ -199,13 +183,12 @@ def main(args):
                           ) > 0 and is_k_full not in args.limit_k_full:
                        continue

-                    for quant_type in query_marlin_supported_quant_types(
-                            False):
-                        if len(args.limit_num_bits) > 0 and \
-                            quant_type.size_bits not in args.limit_num_bits:
+                    for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+                        if len(args.limit_num_bits
+                               ) > 0 and num_bits not in args.limit_num_bits:
                            continue

-                        for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
+                        for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
                            if len(
                                    args.limit_group_size
                            ) > 0 and group_size not in args.limit_group_size:
@ -219,8 +202,8 @@ def main(args):

                            for size_m in args.batch_sizes:
                                bench_run(results, model, act_order, is_k_full,
-                                          quant_type, group_size, size_m,
-                                          size_k, size_n)
+                                          num_bits, group_size, size_m, size_k,
+                                          size_n)

    compare = benchmark.Compare(results)
    compare.print()
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -30,36 +30,19 @@ def benchmark_config(
    hidden_size: int,
    topk: int,
    dtype: torch.dtype,
-    use_fp8_w8a8: bool,
-    use_int8_w8a16: bool,
+    use_fp8: bool,
    num_iters: int = 100,
 ) -> float:
-    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
+    init_dtype = torch.float16 if use_fp8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    if use_int8_w8a16:
-        w1 = torch.randint(-127,
-                           127, (
-                               num_experts,
-                               shard_intermediate_size,
-                               hidden_size,
-                           ),
-                           dtype=torch.int8)
-        w2 = torch.randint(-127,
-                           127, (
-                               num_experts,
-                               hidden_size,
-                               shard_intermediate_size // 2,
-                           ),
-                           dtype=torch.int8)
-    else:
-        w1 = torch.randn(num_experts,
-                         shard_intermediate_size,
-                         hidden_size,
-                         dtype=init_dtype)
-        w2 = torch.randn(num_experts,
-                         hidden_size,
-                         shard_intermediate_size // 2,
-                         dtype=init_dtype)
+    w1 = torch.randn(num_experts,
+                     shard_intermediate_size,
+                     hidden_size,
+                     dtype=init_dtype)
+    w2 = torch.randn(num_experts,
+                     hidden_size,
+                     shard_intermediate_size // 2,
+                     dtype=init_dtype)
    gating_output = torch.randn(num_iters,
                                num_tokens,
                                num_experts,
@ -69,11 +52,7 @@ def benchmark_config(
    w2_scale = None
    a1_scale = None
    a2_scale = None
-    if use_int8_w8a16:
-        w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
-                               dtype=torch.float32)
-        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
-    if use_fp8_w8a8:
+    if use_fp8:
        w1_scale = torch.randn(num_experts, dtype=torch.float32)
        w2_scale = torch.randn(num_experts, dtype=torch.float32)
        a1_scale = torch.randn(1, dtype=torch.float32)
@ -97,8 +76,7 @@ def benchmark_config(
            renormalize=True,
            inplace=True,
            override_config=config,
-            use_fp8_w8a8=use_fp8_w8a8,
-            use_int8_w8a16=use_int8_w8a16,
+            use_fp8=use_fp8,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
@ -177,13 +155,11 @@ class BenchmarkWorker:
        hidden_size: int,
        topk: int,
        dtype: torch.dtype,
-        use_fp8_w8a8: bool,
-        use_int8_w8a16: bool,
+        use_fp8: bool,
    ) -> Tuple[Dict[str, int], float]:
        torch.cuda.manual_seed_all(self.seed)
-        dtype_str = get_config_dtype_str(dtype,
-                                         use_int8_w8a16=use_int8_w8a16,
-                                         use_fp8_w8a8=use_fp8_w8a8)
+
+        dtype_str = "float8" if use_fp8 else None
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
@ -197,8 +173,7 @@ class BenchmarkWorker:
                                   key=lambda x: abs(x - num_tokens))]
        kernel_time = benchmark_config(config, num_tokens, num_experts,
                                       shard_intermediate_size, hidden_size,
-                                       topk, dtype, use_fp8_w8a8,
-                                       use_int8_w8a16)
+                                       topk, dtype, use_fp8)
        return config, kernel_time

    def tune(
@ -209,10 +184,9 @@ class BenchmarkWorker:
        hidden_size: int,
        topk: int,
        dtype: torch.dtype,
-        use_fp8_w8a8: bool,
-        use_int8_w8a16: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
+        use_fp8: bool,
+        search_space: List[BenchmarkConfig],
+    ) -> BenchmarkConfig:
        best_config = None
        best_time = float("inf")
        for config in tqdm(search_space):
@ -224,8 +198,7 @@ class BenchmarkWorker:
                                               hidden_size,
                                               topk,
                                               dtype,
-                                               use_fp8_w8a8,
-                                               use_int8_w8a16,
+                                               use_fp8,
                                               num_iters=10)
            except triton.runtime.autotuner.OutOfResources:
                # Some configurations may be invalid and fail to compile.
@ -251,19 +224,20 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
    }


-def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
-                 shard_intermediate_size: int, hidden_size: int, topk: int,
-                 dtype: torch.dtype, use_fp8_w8a8: bool,
-                 use_int8_w8a16: bool) -> None:
-    dtype_str = get_config_dtype_str(dtype,
-                                     use_int8_w8a16=use_int8_w8a16,
-                                     use_fp8_w8a8=use_fp8_w8a8)
-
+def save_configs(
+    configs: Dict[int, BenchmarkConfig],
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8: bool,
+) -> None:
+    dtype_str = "float8" if use_fp8 else None
    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
    # is the intermediate size after silu_and_mul.
    filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
                                    dtype_str)
-
    print(f"Writing best config to {filename}...")
    with open(filename, "w") as f:
        json.dump(configs, f, indent=4)
@ -279,11 +253,6 @@ def main(args: argparse.Namespace):
        topk = config.ffn_config.moe_top_k
        intermediate_size = config.ffn_config.ffn_hidden_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] == "JambaForCausalLM":
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
        # Default: Mixtral.
        E = config.num_local_experts
@ -293,8 +262,7 @@ def main(args: argparse.Namespace):

    hidden_size = config.hidden_size
    dtype = config.torch_dtype
-    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
-    use_int8_w8a16 = args.dtype == "int8_w8a16"
+    use_fp8 = args.dtype == "fp8"

    if args.batch_size is None:
        batch_sizes = [
@ -326,21 +294,21 @@ def main(args: argparse.Namespace):
        start = time.time()
        configs = _distribute(
            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space)
+                      topk, dtype, use_fp8, search_space)
                     for batch_size in batch_sizes])
        best_configs = {
            M: sort_config(config)
            for M, config in zip(batch_sizes, configs)
        }
        save_configs(best_configs, E, shard_intermediate_size, hidden_size,
-                     topk, dtype, use_fp8_w8a8, use_int8_w8a16)
+                     topk, dtype, use_fp8)
        end = time.time()
        print(f"Tuning took {end - start:.2f} seconds")
    else:
-        outputs = _distribute(
-            "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
-                           topk, dtype, use_fp8_w8a8, use_int8_w8a16)
-                          for batch_size in batch_sizes])
+        outputs = _distribute("benchmark",
+                              [(batch_size, E, shard_intermediate_size,
+                                hidden_size, topk, dtype, use_fp8)
+                               for batch_size in batch_sizes])

        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
            print(f"Batch size: {batch_size}, config: {config}")
@ -355,7 +323,7 @@ if __name__ == "__main__":
    parser.add_argument("--tp-size", "-tp", type=int, default=2)
    parser.add_argument("--dtype",
                        type=str,
-                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
+                        choices=["auto", "fp8"],
                        default="auto")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -175,7 +175,7 @@ if __name__ == '__main__':
    parser.add_argument("--num-kv-heads", type=int, default=8)
    parser.add_argument("--head-size",
                        type=int,
-                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 128, 192, 256],
                        default=128)
    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
    parser.add_argument("--use-alibi", action="store_true")
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@ -1,103 +0,0 @@
-import random
-import time
-
-import torch
-
-from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
-
-
-@torch.inference_mode()
-def main(num_tokens: int,
-         hidden_size: int,
-         static_scale: bool,
-         quant_dtype: torch.dtype,
-         dtype: torch.dtype,
-         seed: int = 0,
-         do_profile: bool = False,
-         num_warmup_iters: int = 5,
-         num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
-    torch.set_default_device("cuda")
-
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
-
-    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.perf_counter()
-
-        for _ in range(num_iters):
-            if quant_dtype == torch.int8:
-                ops.scaled_int8_quant(x, scale)
-            else:
-                ops.scaled_fp8_quant(x, scale)
-        torch.cuda.synchronize()
-
-        end_time = time.perf_counter()
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        return (end_time - start_time) / num_iters
-
-    # Warmup.
-    print("Warming up...")
-    run_benchmark = run_cuda_benchmark
-    run_benchmark(num_iters=num_warmup_iters, profile=False)
-
-    # Benchmark.
-    if do_profile:
-        latency = run_benchmark(num_iters=1, profile=True)
-    else:
-        latency = run_benchmark(num_iters=num_iters, profile=False)
-    print(f"Kernel running time: {latency * 1000000:.3f} us")
-
-
-if __name__ == '__main__':
-
-    def to_torch_dtype(dt):
-        if dt == "int8":
-            return torch.int8
-        if dt == "fp8":
-            return torch.float8_e4m3fn
-        raise ValueError(f"Unsupported dtype: {dt}")
-
-    parser = FlexibleArgumentParser(
-        description="Benchmark the quantization (fp8 or int8) kernel.")
-    parser.add_argument("--num-tokens", type=int, default=4096)
-    parser.add_argument("--hidden-size", type=int, default=8192)
-    parser.add_argument("--static-scale", action="store_true")
-    parser.add_argument("--quant-dtype",
-                        type=str,
-                        choices=["fp8", "int8"],
-                        default="int8")
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["half", "bfloat16", "float"],
-                        default="half")
-
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--profile", action="store_true")
-    parser.add_argument("--num-warmup-iters", type=int, default=5)
-    parser.add_argument("--num-iters",
-                        type=int,
-                        default=100,
-                        help="Number of benchmark iterations. "
-                        "If --profile is set, this number is ignored")
-
-    args = parser.parse_args()
-    print(args)
-
-    main(num_tokens=args.num_tokens,
-         hidden_size=args.hidden_size,
-         static_scale=args.static_scale,
-         quant_dtype=to_torch_dtype(args.quant_dtype),
-         dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
-         seed=args.seed,
-         do_profile=args.profile,
-         num_warmup_iters=args.num_warmup_iters,
-         num_iters=args.num_iters)
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -94,7 +94,7 @@ if __name__ == '__main__':
    parser.add_argument("--num-heads", type=int, default=8)
    parser.add_argument("--head-size",
                        type=int,
-                        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+                        choices=[64, 80, 96, 112, 128, 192, 256],
                        default=128)
    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
    parser.add_argument("--dtype",
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@ -1,64 +0,0 @@
-import math
-import pickle
-import re
-from collections import defaultdict
-from typing import List
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-from torch.utils.benchmark import Measurement as TMeasurement
-
-from vllm.utils import FlexibleArgumentParser
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('filename', type=str)
-
-    args = parser.parse_args()
-
-    with open(args.filename, 'rb') as f:
-        data: List[TMeasurement] = pickle.load(f)
-
-    results = defaultdict(lambda: list())
-    for v in data:
-        result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
-        if result is not None:
-            KN = result.group(1)
-        else:
-            raise Exception("MKN not found")
-        result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label)
-        if result is not None:
-            M = result.group(1)
-        else:
-            raise Exception("MKN not found")
-
-        kernel = v.task_spec.description
-        results[KN].append({
-            "kernel": kernel,
-            "batch_size": M,
-            "median": v.median
-        })
-
-    rows = int(math.ceil(len(results) / 2))
-    fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
-    axs = axs.flatten()
-    axs_idx = 0
-    for shape, data in results.items():
-        plt.sca(axs[axs_idx])
-        df = pd.DataFrame(data)
-        sns.lineplot(data=df,
-                     x="batch_size",
-                     y="median",
-                     hue="kernel",
-                     style="kernel",
-                     markers=True,
-                     dashes=False,
-                     palette="Dark2")
-        plt.title(f"Shape: {shape}")
-        plt.ylabel("time (median, s)")
-        axs_idx += 1
-    plt.tight_layout()
-    plt.savefig("graph_machete_bench.pdf")
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@ -1,43 +0,0 @@
-# Weight Shapes are in the format
-# ([K, N], TP_SPLIT_DIM)
-# Example:
-#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
-#   - TP1 : K = 14336, N = 4096
-#   - TP2 : K = 7168, N = 4096
-#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
-#   - TP1 : K = 4096, N = 6144
-#   - TP4 : K = 4096, N = 1536
-
-# TP1 shapes
-WEIGHT_SHAPES = {
-    "mistralai/Mistral-7B-v0.1": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-2-7b-hf": [
-        ([4096, 12288], 1),
-        ([4096, 4096], 0),
-        ([4096, 22016], 1),
-        ([11008, 4096], 0),
-    ],
-    "meta-llama/Llama-3-8b": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-2-13b-hf": [
-        ([5120, 15360], 1),
-        ([5120, 5120], 0),
-        ([5120, 27648], 1),
-        ([13824, 5120], 0),
-    ],
-    "meta-llama/Llama-2-70b-hf": [
-        ([8192, 10240], 1),
-        ([8192, 8192], 0),
-        ([8192, 57344], 1),
-        ([28672, 8192], 0),
-    ],
-}
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -113,5 +113,6 @@ define_gpu_extension_target(
    WITH_SOABI
 )

+add_custom_target(default)
 message(STATUS "Enabling C extension.")
 add_dependencies(default _C)
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -181,7 +181,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
    #
    # The torch cmake setup hardcodes the detected architecture flags in
    # `CMAKE_CUDA_FLAGS`.  Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
-    # can't modified on a per-target basis.
+    # can't modified on a per-target basis, e.g. for the `punica` extension.
    # So, all the `-gencode` flags need to be extracted and removed from
    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
    # Since it's not possible to use `target_compiler_options` for adding target
--- a/collect_env.py
+++ b/collect_env.py
@ -65,9 +65,6 @@ DEFAULT_CONDA_PATTERNS = {
    "optree",
    "nccl",
    "transformers",
-    "zmq",
-    "nvidia",
-    "pynvml",
 }

 DEFAULT_PIP_PATTERNS = {
@ -80,9 +77,6 @@ DEFAULT_PIP_PATTERNS = {
    "onnx",
    "nccl",
    "transformers",
-    "zmq",
-    "nvidia",
-    "pynvml",
 }


@ -269,9 +263,8 @@ def get_neuron_sdk_version(run_lambda):
 def get_vllm_version():
    try:
        import vllm
-        return vllm.__version__ + "@" + vllm.__commit__
-    except Exception:
-        # old version of vllm does not have __commit__
+        return vllm.__version__
+    except ImportError:
        return 'N/A'


--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
  assert(head_size % thread_group_size == 0);

  // NOTE: alibi_slopes is optional.
@ -751,9 +751,6 @@ void paged_attention_v1_launcher(
    case 112:
      LAUNCH_PAGED_ATTENTION_V1(112);
      break;
-    case 120:
-      LAUNCH_PAGED_ATTENTION_V1(120);
-      break;
    case 128:
      LAUNCH_PAGED_ATTENTION_V1(128);
      break;
@ -865,7 +862,7 @@ void paged_attention_v2_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
  assert(head_size % thread_group_size == 0);

  // NOTE: alibi_slopes is optional.
@ -915,9 +912,6 @@ void paged_attention_v2_launcher(
    case 112:
      LAUNCH_PAGED_ATTENTION_V2(112);
      break;
-    case 120:
-      LAUNCH_PAGED_ATTENTION_V2(120);
-      break;
    case 128:
      LAUNCH_PAGED_ATTENTION_V2(128);
      break;
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@ -34,7 +34,7 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
  A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
 #pragma unroll
  for (int ii = 1; ii < N; ++ii) {
-    qk_vec = vllm::fma(q[ii], k[ii], qk_vec);
+    qk_vec = fma(q[ii], k[ii], qk_vec);
  }

  // Finalize the reduction across lanes.
--- a/csrc/attention/dtype_bfloat16.cuh
+++ b/csrc/attention/dtype_bfloat16.cuh
@ -94,7 +94,6 @@ inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
 #else
  return __bfloat1622float2(val);
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
@ -103,7 +102,6 @@ inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
 #else
  return __bfloat162bfloat162(val);
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 // Vector addition.
@ -117,7 +115,6 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
  return __hadd(a, b);
  #endif
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) {
@ -126,7 +123,6 @@ inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) {
 #else
  return __hadd2(a, b);
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) {
@ -174,7 +170,6 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
 #else
  return __hmul(a, b);
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 template <>
@ -184,7 +179,6 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
 #else
  return __hmul2(a, b);
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 template <>
@ -295,7 +289,6 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
 #else
  return __hfma2(a, b, c);
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
@ -305,7 +298,6 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
 #else
  return __hfma2(bf162bf162(a), b, c);
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) {
--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@ -1,547 +0,0 @@
-#pragma once
-
-#include <torch/custom_class.h>
-
-namespace vllm {
-
-//
-//  ScalarType can represent a wide range of floating point and integer types,
-//  in particular it can be used to represent sub-byte data types (something
-//  that torch.dtype currently does not support).
-//
-//  ScalarTypeTorch is a subclass of ScalarType that is compatible with
-//  TORCH_LIBRARY, making it accessible from Python as well meaning this class
-//  can be used as a argument for custom operators, helping to simplify these
-//  interfaces.
-//
-//  The type definitions on the Python side can be found in: vllm/_core_ext.pyi
-//  these type definitions should be kept up to date with any Python API changes
-//  here.
-//
-class ScalarType {
- public:
-  enum NanRepr : uint8_t {
-    NAN_NONE = 0,                // nans are not supported
-    NAN_IEEE_754 = 1,            // nans are: exp all 1s, mantissa not all 0s
-    NAN_EXTD_RANGE_MAX_MIN = 2,  // nans are: exp all 1s, mantissa all 1s
-
-    NAN_REPR_ID_MAX
-  };
-
-  constexpr ScalarType(uint8_t exponent, uint8_t mantissa, bool signed_,
-                       int32_t bias, bool finite_values_only = false,
-                       NanRepr nan_repr = NAN_IEEE_754)
-      : exponent(exponent),
-        mantissa(mantissa),
-        signed_(signed_),
-        bias(bias),
-        finite_values_only(finite_values_only),
-        nan_repr(nan_repr){};
-
-  static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
-    return ScalarType(0, size_bits - 1, true, bias);
-  }
-
-  static constexpr ScalarType uint(uint8_t size_bits, int32_t bias = 0) {
-    return ScalarType(0, size_bits, false, bias);
-  }
-
-  // IEEE 754 compliant floating point type
-  static constexpr ScalarType float_IEEE754(uint8_t exponent,
-                                            uint8_t mantissa) {
-    TORCH_CHECK(mantissa > 0 && exponent > 0);
-    return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754);
-  }
-
-  // IEEE 754 non-compliant floating point type
-  static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa,
-                                     bool finite_values_only,
-                                     NanRepr nan_repr) {
-    TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
-    TORCH_CHECK(mantissa > 0 && exponent > 0);
-    TORCH_CHECK(nan_repr != NAN_IEEE_754,
-                "use `float_IEEE754` constructor for floating point types that "
-                "follow IEEE 754 conventions");
-    return ScalarType(exponent, mantissa, true, 0, finite_values_only,
-                      nan_repr);
-  }
-
-  uint8_t const exponent;  // size of the exponent field (0 for integer types)
-  uint8_t const mantissa;  // size of the mantissa field (size of the integer
-                           // excluding the sign bit for integer types)
-  bool const signed_;  // flag if the type supports negative numbers (i.e. has a
-                       // sign bit)
-  int32_t const bias;  // stored values equal value + bias,
-                       // used for quantized type
-
-  // Extra Floating point info
-  bool const finite_values_only;  // i.e. no +/-inf if true
-  NanRepr const nan_repr;         // how NaNs are represented
-                                  // (not applicable for integer types)
-
-  using Id = int64_t;
-
- private:
-  // Field size in id
-  template <typename T_>
-  static constexpr size_t member_id_field_width() {
-    using T = std::decay_t<T_>;
-    return std::is_same_v<T, bool> ? 1 : sizeof(T) * 8;
-  }
-
-  template <typename Fn, typename Init, typename Member, typename... Rest>
-  static constexpr auto reduce_members_helper(Fn f, Init val, Member member,
-                                              Rest... rest) {
-    auto new_val = f(val, member);
-    if constexpr (sizeof...(rest) > 0) {
-      return reduce_members_helper(f, new_val, rest...);
-    } else {
-      return new_val;
-    };
-  }
-
-  template <typename Fn, typename Init>
-  constexpr auto reduce_members(Fn f, Init init) const {
-    // Should be in constructor order for `from_id`
-    return reduce_members_helper(f, init, exponent, mantissa, signed_, bias,
-                                 finite_values_only, nan_repr);
-  };
-
-  template <typename Fn, typename Init>
-  static constexpr auto reduce_member_types(Fn f, Init init) {
-    constexpr auto dummy_type = ScalarType(0, 0, false, 0, false, NAN_NONE);
-    return dummy_type.reduce_members(f, init);
-  };
-
-  static constexpr auto id_size_bits() {
-    return reduce_member_types(
-        [](int acc, auto member) -> int {
-          return acc + member_id_field_width<decltype(member)>();
-        },
-        0);
-  }
-
- public:
-  // unique id for this scalar type that can be computed at compile time for
-  //  c++17 template specialization this is not needed once we migrate to
-  //  c++20 and can pass literal classes as template parameters
-  constexpr Id id() const {
-    static_assert(id_size_bits() <= sizeof(Id) * 8,
-                  "ScalarType id is too large to be stored");
-
-    auto or_and_advance = [](std::pair<Id, uint32_t> result,
-                             auto member) -> std::pair<Id, uint32_t> {
-      auto [id, bit_offset] = result;
-      auto constexpr bits = member_id_field_width<decltype(member)>();
-      return {id | (int64_t(member) & ((uint64_t(1) << bits) - 1))
-                       << bit_offset,
-              bit_offset + bits};
-    };
-    return reduce_members(or_and_advance, std::pair<Id, uint32_t>{}).first;
-  }
-
-  // create a ScalarType from an id, for c++17 template specialization,
-  //  this is not needed once we migrate to c++20 and can pass literal
-  //  classes as template parameters
-  static constexpr ScalarType from_id(Id id) {
-    auto extract_and_advance = [id](auto result, auto member) {
-      using T = decltype(member);
-      auto [tuple, bit_offset] = result;
-      auto constexpr bits = member_id_field_width<T>();
-      auto extracted_val = static_cast<T>((int64_t(id) >> bit_offset) &
-                                          ((uint64_t(1) << bits) - 1));
-      auto new_tuple = std::tuple_cat(tuple, std::make_tuple(extracted_val));
-      return std::pair<decltype(new_tuple), int>{new_tuple, bit_offset + bits};
-    };
-
-    auto [tuple_args, _] = reduce_member_types(extract_and_advance,
-                                               std::pair<std::tuple<>, int>{});
-    return std::apply([](auto... args) { return ScalarType(args...); },
-                      tuple_args);
-  }
-
-  constexpr int64_t size_bits() const {
-    return mantissa + exponent + is_signed();
-  }
-  constexpr bool is_signed() const { return signed_; }
-  constexpr bool is_integer() const { return exponent == 0; }
-  constexpr bool is_floating_point() const { return exponent > 0; }
-  constexpr bool is_ieee_754() const {
-    return is_floating_point() && finite_values_only == false &&
-           nan_repr == NAN_IEEE_754;
-  }
-  constexpr bool has_nans() const {
-    return is_floating_point() && nan_repr != NAN_NONE;
-  }
-  constexpr bool has_infs() const {
-    return is_floating_point() && finite_values_only == false;
-  }
-  constexpr bool has_bias() const { return bias != 0; }
-
- private:
-  double _floating_point_max() const {
-    TORCH_CHECK(mantissa <= 52 && exponent <= 11,
-                "Cannot represent max/min as a double for type ", str());
-
-    uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1;
-    if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) {
-      max_mantissa -= 1;
-    }
-
-    uint64_t max_exponent = (uint64_t(1) << exponent) - 2;
-    if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) {
-      TORCH_CHECK(exponent < 11,
-                  "Cannot represent max/min as a double for type ", str());
-      max_exponent += 1;
-    }
-
-    // adjust the exponent to match that of a double
-    //  for now we assume the exponent bias is the standard 2^(e-1) -1, (where e
-    //  is the exponent bits), there is some precedent for non-standard biases,
-    //  example `float8_e4m3b11fnuz` here: https://github.com/jax-ml/ml_dtypes
-    //  but to avoid premature over complication we are just assuming the
-    //  standard exponent bias until there is a need to support non-standard
-    //  biases
-    uint64_t exponent_bias = (uint64_t(1) << (exponent - 1)) - 1;
-    uint64_t exponent_bias_double = (uint64_t(1) << 10) - 1;  // double e = 11
-
-    uint64_t max_exponent_double =
-        max_exponent - exponent_bias + exponent_bias_double;
-
-    // shift the mantissa into the position for a double and
-    // the exponent
-    uint64_t double_raw =
-        (max_mantissa << (52 - mantissa)) | (max_exponent_double << 52);
-
-    return *reinterpret_cast<double*>(&double_raw);
-  }
-
-  constexpr std::variant<int64_t, double> _raw_max() const {
-    if (is_floating_point()) {
-      return {_floating_point_max()};
-    } else {
-      TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
-                  "Cannot represent max as a int64_t");
-      return {(int64_t(1) << mantissa) - 1};
-    }
-  }
-
-  constexpr std::variant<int64_t, double> _raw_min() const {
-    if (is_floating_point()) {
-      TORCH_CHECK(is_signed(),
-                  "We currently assume all floating point types are signed");
-      constexpr uint64_t sign_bit_double = (uint64_t(1) << 63);
-
-      double max = _floating_point_max();
-      uint64_t max_raw = *reinterpret_cast<uint64_t*>(&max);
-      uint64_t min_raw = max_raw | sign_bit_double;
-      return {*reinterpret_cast<double*>(&min_raw)};
-    } else {
-      TORCH_CHECK(!is_signed() || size_bits() <= 64,
-                  "Cannot represent min as a int64_t");
-      if (is_signed()) {
-        // set the top bit to 1 (i.e. INT64_MIN) and the rest to 0
-        // then perform an arithmetic shift right to set all the bits above
-        // (size_bits() - 1) to 1
-        return {INT64_MIN >> (64 - size_bits())};
-      } else {
-        return {int64_t(0)};
-      }
-    }
-  }
-
- public:
-  // Max representable value for this scalar type.
-  // (accounting for bias if there is one)
-  constexpr std::variant<int64_t, double> max() const {
-    return std::visit(
-        [this](auto x) -> std::variant<int64_t, double> { return {x - bias}; },
-        _raw_max());
-  }
-
-  // Min representable value for this scalar type.
-  // (accounting for bias if there is one)
-  constexpr std::variant<int64_t, double> min() const {
-    return std::visit(
-        [this](auto x) -> std::variant<int64_t, double> { return {x - bias}; },
-        _raw_min());
-  }
-
-  std::string str() const {
-    /* naming generally follows: https://github.com/jax-ml/ml_dtypes
-     * for floating point types (leading f) the scheme is:
-     *  `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
-     *  flags:
-     *  - no-flags: means it follows IEEE 754 conventions
-     *  - f: means finite values only (no infinities)
-     *  - n: means nans are supported (non-standard encoding)
-     * for integer types the scheme is:
-     *  `[u]int<size_bits>[b<bias>]`
-     *  - if bias is not present it means its zero
-     */
-    if (is_floating_point()) {
-      auto ret = "float" + std::to_string(size_bits()) + "_e" +
-                 std::to_string(exponent) + "m" + std::to_string(mantissa);
-      if (!is_ieee_754()) {
-        if (finite_values_only) {
-          ret += "f";
-        }
-        if (nan_repr != NAN_NONE) {
-          ret += "n";
-        }
-      }
-      return ret;
-    } else {
-      auto ret = ((is_signed()) ? "int" : "uint") + std::to_string(size_bits());
-      if (has_bias()) {
-        ret += "b" + std::to_string(bias);
-      }
-      return ret;
-    }
-  }
-
-  constexpr bool operator==(ScalarType const& other) const {
-    return mantissa == other.mantissa && exponent == other.exponent &&
-           bias == other.bias && signed_ == other.signed_ &&
-           finite_values_only == other.finite_values_only &&
-           nan_repr == other.nan_repr;
-  }
-};
-
-// Create a TORCH_LIBRARY compatible version of ScalarType (i.e. inherit from
-//  torch::CustomClassHolder), we use multiple inheritance here since we cannot
-//  have ScalarType inherit from torch::CustomClassHolder and have a constexpr
-//  constructor at the same time (torch::CustomClassHolder does not have a
-//  constexpr destructor)
-// See also:
-// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
-class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType {
- public:
-  ScalarTypeTorch(int64_t exponent, int64_t mantissa, int64_t bias,
-                  bool _signed)
-      : ScalarType(exponent, mantissa, bias, _signed){};
-
-  ScalarTypeTorch(ScalarType type) : ScalarType(type){};
-
-  using Base = ScalarType;
-  using Self = ScalarTypeTorch;
-  using SelfPtr = c10::intrusive_ptr<Self>;
-
-  static void check_size_bits(int64_t size_bits, bool signed_) {
-    TORCH_CHECK(
-        size_bits <=
-            std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
-        "size_bits bit width is too large to be represented");
-  }
-
-  static void check_bias(int64_t bias) {
-    using Bias = decltype(std::declval<Self>().bias);
-    TORCH_CHECK(bias <= std::numeric_limits<Bias>::max() &&
-                    bias >= std::numeric_limits<Bias>::min(),
-                "bias too large or small to be represented");
-  }
-
-  static void check_exponent(int64_t exponent) {
-    TORCH_CHECK(
-        exponent <=
-            std::numeric_limits<decltype(std::declval<Self>().exponent)>::max(),
-        "exponent bit width is too large to be represented");
-  }
-
-  static void check_mantissa(int64_t mantissa) {
-    TORCH_CHECK(
-        mantissa <=
-            std::numeric_limits<decltype(std::declval<Self>().mantissa)>::max(),
-        "mantissa bit width is too large to be represented");
-  }
-
-  static SelfPtr int_(int64_t size_bits, c10::optional<int64_t> bias) {
-    check_size_bits(size_bits, true);
-    check_bias(bias.value_or(0));
-    return c10::make_intrusive<Self>(
-        ScalarType::int_(size_bits, bias.value_or(0)));
-  }
-
-  static SelfPtr uint(int64_t size_bits, c10::optional<int64_t> bias) {
-    check_size_bits(size_bits, true);
-    check_bias(bias.value_or(0));
-    return c10::make_intrusive<Self>(
-        ScalarType::uint(size_bits, bias.value_or(0)));
-  }
-
-  static SelfPtr float_IEEE754(int64_t exponent, int64_t mantissa) {
-    check_mantissa(mantissa);
-    check_exponent(exponent);
-    return c10::make_intrusive<Self>(
-        ScalarType::float_IEEE754(exponent, mantissa));
-  }
-
-  static SelfPtr float_(int64_t exponent, int64_t mantissa,
-                        bool finite_values_only, int64_t nan_repr) {
-    check_mantissa(mantissa);
-    check_exponent(exponent);
-    return c10::make_intrusive<Self>(ScalarType::float_(
-        exponent, mantissa, finite_values_only, NanRepr(nan_repr)));
-  }
-
-  // This needs to be implemented and throw a TypeError in order for
-  // PyTorch's opcheck to work on ops that use ScalarTypes.
-  int64_t len() const {
-    throw c10::TypeError("__len__ not implemented");
-    return 0;
-  }
-
-  // Serialize a ScalarType into a tuple of pairs.  Where each pair
-  // is a (fieldname, value).
-  // For simplicity, we are just going to convert to a ScalarTypeId.
-  std::tuple<std::tuple<std::string, int64_t>> obj_flatten() const {
-    return {{"ScalarType", id()}};
-  }
-
-  // Deserialize a scalar type that has been serialized by obj_flatten,
-  // ostensibly from a tuple of (member name, value) pairs, but in reality
-  // just a ScalarTypeId.
-  static SelfPtr obj_unflatten(
-      std::tuple<std::tuple<std::string, int64_t>> const& flat_type) {
-    return c10::make_intrusive<Self>(
-        from_id(std::get<1>(std::get<0>(flat_type))));
-  }
-
-  template <typename T>
-  static void bind_readonly_property(torch::class_<Self>& cls,
-                                     std::string const& name, T Base::*field) {
-    auto getter_func_helper = [field = std::move(field)](SelfPtr const& self) {
-      if constexpr (std::is_member_function_pointer_v<decltype(field)>) {
-        return (self.get()->*field)();
-      } else {
-        return self.get()->*field;
-      }
-    };
-
-    auto getter_func = [field = std::move(field),
-                        getter_func_helper = std::move(getter_func_helper)](
-                           SelfPtr const& self) {
-      auto val = getter_func_helper(self);
-      // upconvert uint8_t, int32_t etc. to int64_t for python
-      if constexpr (std::is_integral_v<T>) {
-        return static_cast<int64_t>(val);
-      } else {
-        return val;
-      }
-    };
-
-    cls.def_property(name, getter_func);
-  }
-
-  template <typename MemberFunc, typename Cls>
-  static void bind_function(torch::class_<Self>& cls, const std::string& name,
-                            MemberFunc Cls::*member) {
-    cls.def(name, [member = std::move(member)](SelfPtr const& self) {
-      return (self.get()->*member)();
-    });
-  }
-
-  template <typename Func>
-  static void bind_function(torch::class_<Self>& cls, const std::string& name,
-                            Func func) {
-    cls.def(name, func);
-  }
-
-  template <typename Func>
-  static void bind_static_function(torch::class_<Self>& cls,
-                                   const std::string& name, Func func) {
-    cls.def_static(name, func);
-  }
-
-  static void bind_class(torch::Library& lib) {
-    auto cls = lib.class_<ScalarTypeTorch>("ScalarType")
-                   .def(torch::init<int64_t, int64_t, int64_t, bool>());
-
-    // Bind Properties
-    bind_readonly_property(cls, "mantissa", &Base::mantissa);
-    bind_readonly_property(cls, "exponent", &Base::exponent);
-    bind_readonly_property(cls, "bias", &Base::bias);
-    bind_readonly_property(cls, "signed", &Base::is_signed);
-    bind_readonly_property(cls, "size_bits", &Base::size_bits);
-
-    // Bind member functions
-    bind_function(cls, "is_signed", &Base::is_signed);
-    bind_function(cls, "is_integer", &Base::is_integer);
-    bind_function(cls, "is_floating_point", &Base::is_floating_point);
-    bind_function(cls, "is_ieee_754", &Base::is_ieee_754);
-    bind_function(cls, "has_nans", &Base::has_nans);
-    bind_function(cls, "has_infs", &Base::has_infs);
-    bind_function(cls, "has_bias", &Base::has_bias);
-
-    bind_function(cls, "max", [](SelfPtr const& self) {
-      return std::visit([](auto arg) { return c10::IValue(arg); },
-                        self.get()->max());
-    });
-    bind_function(cls, "min", [](SelfPtr const& self) {
-      return std::visit([](auto arg) { return c10::IValue(arg); },
-                        self.get()->min());
-    });
-
-    bind_function(cls, "__len__", &ScalarTypeTorch::len);
-    bind_function(cls, "__str__", &Base::str);
-    bind_function(cls, "__eq__", [](SelfPtr const& self, SelfPtr const& other) {
-      return *self == *other;
-    });
-    bind_function(cls, "__repr__", [](SelfPtr const& self) {
-      return "ScalarType." + self.get()->str();
-    });
-
-    bind_function(cls, "__obj_flatten__", &ScalarTypeTorch::obj_flatten);
-    bind_static_function(cls, "__obj_unflatten__",
-                         &ScalarTypeTorch::obj_unflatten);
-
-    // Bind static functions (convenience constructors)
-    bind_static_function(cls, "int_", &ScalarTypeTorch::int_);
-    bind_static_function(cls, "uint", &ScalarTypeTorch::uint);
-    bind_static_function(cls, "float_IEEE754", &ScalarTypeTorch::float_IEEE754);
-    bind_static_function(cls, "float_", &ScalarTypeTorch::float_);
-  }
-};
-
-using ScalarTypeId = int64_t;
-using ScalarTypeTorchPtr = c10::intrusive_ptr<ScalarTypeTorch>;
-
-// "rust style" names generally following:
-//   https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70
-static inline constexpr auto kS4 = ScalarType::int_(4);
-static inline constexpr auto kU4 = ScalarType::uint(4);
-static inline constexpr auto kU4B8 = ScalarType::uint(4, 8);
-static inline constexpr auto kS8 = ScalarType::int_(8);
-static inline constexpr auto kU8 = ScalarType::uint(8);
-static inline constexpr auto kU8B128 = ScalarType::uint(8, 128);
-
-static inline constexpr auto kFE3M2f =
-    ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
-static inline constexpr auto kFE4M3fn =
-    ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
-static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
-static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
-static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);
-
-// Fixed width style names, generally following:
-//  https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L47-L57
-static inline constexpr auto kInt4 = kS4;
-static inline constexpr auto kUint4 = kU4;
-static inline constexpr auto kUint4b8 = kU4B8;
-static inline constexpr auto kInt8 = kS8;
-static inline constexpr auto kUint8 = kU8;
-static inline constexpr auto kUint8b128 = kU8B128;
-
-static inline constexpr auto kFloat6_e3m2f = kFE3M2f;
-static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn;
-static inline constexpr auto kFloat8_e5m2 = kFE5M2;
-static inline constexpr auto kFloat16_e8m7 = kFE8M7;
-static inline constexpr auto kFloat16_e5m10 = kFE5M10;
-
-// colloquial names
-static inline constexpr auto kHalf = kFE5M10;
-static inline constexpr auto kFloat16 = kHalf;
-static inline constexpr auto kBFloat16 = kFE8M7;
-
-static inline constexpr auto kFloat16Id = kFloat16.id();
-};  // namespace vllm
--- a/csrc/core/torch_bindings.cpp
+++ b/csrc/core/torch_bindings.cpp
@ -1,16 +0,0 @@
-#include <torch/library.h>
-
-#include "scalar_type.hpp"
-#include "registration.h"
-
-// Note the CORE exstension will be built for (almost) all hardware targets so
-// new additions must account for this. (currently not built for TPU and Neuron)
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) {
-  // ScalarType, a custom class for representing data types that supports
-  // quantized types, declared here so it can be used when creating interfaces
-  // for custom ops.
-  vllm::ScalarTypeTorch::bind_class(lib);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -1,6 +1,6 @@
 #include "cache.h"
 #include "ops.h"
-#include "core/registration.h"
+#include "registration.h"

 #include <torch/library.h>

--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@ -1,15 +1,5 @@
 #pragma once

-#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
-  #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
-  #define DEVICE_INLINE __forceinline__ __device__
-  #define HOST_INLINE __forceinline__ __host__
-#else
-  #define HOST_DEVICE_INLINE inline
-  #define DEVICE_INLINE inline
-  #define HOST_INLINE inline
-#endif
-
 int64_t get_device_attribute(int64_t attribute, int64_t device_id);

 int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
--- a/csrc/cutlass_extensions/cute_utils.cuh
+++ b/csrc/cutlass_extensions/cute_utils.cuh
@ -1,68 +0,0 @@
-#pragma once
-
-#include <cute/tensor.hpp>
-#include <torch/all.h>
-namespace cute {
-
-////////////////////////////////////////////////////////////////////
-// layout utils
-////////////////////////////////////////////////////////////////////
-
-// Permute layout based on indices, example:
-//   permute_layout<1, 0>(layout) will swap the two dimensions
-//   permute_layout<0, 2, 1>(layout) will swap the last two dimensions
-template <size_t... I, typename Layout>
-CUTE_HOST_DEVICE static constexpr auto permute_layout(Layout l) {
-  static_assert(rank(l) == sizeof...(I), "Invalid permutation, rank mismatch");
-  return cute::make_layout(cute::get<I>(l)...);
-}
-
-// is the layout f(x) = x
-template <typename Layout>
-CUTE_HOST_DEVICE static constexpr bool is_identity_layout() {
-  if constexpr (std::is_same_v<Layout, void>)
-    return true;
-  else {
-    constexpr auto coalesced_layout = coalesce(Layout{});
-    if constexpr (rank(coalesced_layout) == 1 &&
-                  stride<0>(coalesced_layout) == 1) {
-      return true;
-    }
-    return false;
-  }
-}
-
-////////////////////////////////////////////////////////////////////
-// Pointer utils
-////////////////////////////////////////////////////////////////////
-
-template <class PointerType>
-static constexpr auto get_logical_ptr(PointerType* ptr) {
-  if constexpr (cute::sizeof_bits_v<PointerType> < 8) {
-    return cute::subbyte_iterator<PointerType>(ptr);
-  } else {
-    return ptr;
-  }
-}
-
-////////////////////////////////////////////////////////////////////
-// Misc utils
-////////////////////////////////////////////////////////////////////
-
-template <typename T, typename Elements>
-CUTE_HOST_DEVICE static constexpr auto create_auto_vectorizing_copy() {
-  constexpr auto bits = sizeof_bits_v<T> * Elements{};
-  if constexpr (bits % 128 == 0) {
-    return AutoVectorizingCopyWithAssumedAlignment<128>{};
-  } else if constexpr (bits % 64 == 0) {
-    return AutoVectorizingCopyWithAssumedAlignment<64>{};
-  } else if constexpr (bits % 32 == 0) {
-    return AutoVectorizingCopyWithAssumedAlignment<32>{};
-  } else if constexpr (bits % 16 == 0) {
-    return AutoVectorizingCopyWithAssumedAlignment<16>{};
-  } else {
-    return AutoVectorizingCopyWithAssumedAlignment<8>{};
-  }
-}
-
-};  // namespace cute
--- a/csrc/cutlass_extensions/torch_utils.hpp
+++ b/csrc/cutlass_extensions/torch_utils.hpp
@ -1,154 +0,0 @@
-#pragma once
-
-#include <torch/all.h>
-
-#include "cute/layout.hpp"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/bfloat16.h"
-#include "cutlass/half.h"
-
-using ColumnMajor = typename cutlass::layout::ColumnMajor;
-using RowMajor = typename cutlass::layout::RowMajor;
-
-namespace cute {
-
-namespace detail {
-
-template <class T, class F, class G, int... I>
-CUTE_HOST_DEVICE constexpr auto tapply_with_idx(T&& t, F&& f, G&& g,
-                                                seq<I...>) {
-  return g(f(cute::get<I>(static_cast<T&&>(t)), I)...);
-}
-
-template <class F, int... I>
-CUTE_HOST_DEVICE constexpr auto make_shape_from_idx(F&& f, seq<I...>) {
-  return make_shape(f(I)...);
-}
-
-};  // namespace detail
-
-template <class T, class F>
-CUTE_HOST_DEVICE constexpr auto transform_with_idx(T const& t, F&& f) {
-  if constexpr (cute::is_tuple<T>::value) {
-    return detail::tapply_with_idx(
-        t, f, [](auto const&... a) { return cute::make_tuple(a...); },
-        tuple_seq<T>{});
-  } else {
-    return f(t);
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-// calls: make_shape(f(0), f(1), ..., f(N-1))
-template <int N, class F>
-CUTE_HOST_DEVICE constexpr auto make_shape_from_idx(F&& f) {
-  return detail::make_shape_from_idx(f, make_seq<N>{});
-}
-
-};  // namespace cute
-
-// Make a layout from a tensor with `rank(Stride{})`, where the shape is the
-// shape of the passed in tensor and the strides are of type `Stride` and
-// contain the strides of the passed in tensor, checking that any static strides
-// in `Stride{}` match the strides of the passed in tensor.
-// If `tensor.dim() < rank(Stride{})`, the shape is padded with 1s and the extra
-// strides are set to be 0 or 1.
-template <typename Stride>
-static inline auto make_cute_layout(torch::Tensor const& tensor,
-                                    std::string_view name = "tensor") {
-  TORCH_CHECK(tensor.dim() <= rank(Stride{}));
-  auto stride = cute::transform_with_idx(
-      Stride{}, [&](auto const& stride_ele, auto const& idx) {
-        using StrideEle = std::decay_t<decltype(stride_ele)>;
-
-        if (idx < tensor.dim()) {
-          if constexpr (cute::is_static_v<StrideEle>) {
-            TORCH_CHECK(StrideEle::value == tensor.stride(idx), "Expected ",
-                        name, ".stride(", idx, ") to be ", StrideEle::value);
-            return StrideEle{};
-          } else {
-            return tensor.stride(idx);
-          }
-        } else {
-          // Extra strides are assumed to be 0 or 1
-          if constexpr (cute::is_static_v<StrideEle>) {
-            static_assert(StrideEle::value == 0 || StrideEle::value == 1);
-          }
-          return StrideEle{};
-        }
-      });
-
-  auto shape = cute::make_shape_from_idx<rank(Stride{})>([&](auto const& idx) {
-    if (idx < tensor.dim())
-      return tensor.size(idx);
-    else
-      return int64_t(1);
-  });
-
-  return make_layout(shape, stride);
-}
-
-template <typename Stride>
-static inline auto maybe_make_cute_layout(
-    c10::optional<torch::Tensor> const& tensor,
-    std::string_view name = "tensor") {
-  using Layout = decltype(make_cute_layout<Stride>(*tensor));
-
-  if (tensor) {
-    return std::optional<Layout>{make_cute_layout<Stride>(*tensor, name)};
-  } else {
-    return std::optional<Layout>{};
-  }
-}
-
-//
-//  Torch Type to Cutlass Type (equivalent_cutlass_type)
-//
-
-template <typename T>
-struct equivalent_cutlass_type {
-  using type = T;
-};
-
-template <typename T>
-using equivalent_cutlass_type_t = typename equivalent_cutlass_type<T>::type;
-
-template <>
-struct equivalent_cutlass_type<c10::Half> {
-  using type = cutlass::half_t;
-};
-
-template <>
-struct equivalent_cutlass_type<c10::BFloat16> {
-  using type = cutlass::bfloat16_t;
-};
-
-//
-// equivalent_scalar_t (basically inverse of equivalent_cutlass_type)
-//
-
-// Return a `c10::CppTypeToScalarType<T>` compatible type, i.e. get the C++ from
-// c10 that is equivalent to T, e.g.: `cutlass::half_t -> c10::Half`
-template <typename T>
-struct equivalent_scalar_type {
-  using type = T;
-};
-
-template <typename T>
-using equivalent_scalar_type_t = typename equivalent_scalar_type<T>::type;
-
-template <>
-struct equivalent_scalar_type<cutlass::half_t> {
-  using type = c10::Half;
-};
-
-template <>
-struct equivalent_scalar_type<cutlass::bfloat16_t> {
-  using type = c10::BFloat16;
-};
-
-// get equivalent c10::ScalarType tag from compile time type
-template <typename T>
-static inline constexpr c10::ScalarType equivalent_scalar_type_v =
-    c10::CppTypeToScalarType<equivalent_scalar_type_t<T>>::value;
--- a/csrc/cutlass_extensions/vllm_collective_builder.cuh
+++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh
@ -1,43 +0,0 @@
-#pragma once
-
-#include "cutlass/gemm/collective/collective_builder.hpp"
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-//
-// VLLMCollectiveBuilder is a wrapper around CollectiveBuilder that allows for
-// for custom kernel tags, allowing you to build custom collectives. Without
-// touching the cutlass library headers, using `CutlassKernelTag` will mean it
-// will resort to using the standard cutlass collective builder.
-//
-
-// Use the default Cutlass collective builder, i.e. use an unmodified cutless
-// collective
-struct CutlassKernelTag {};
-
-template <class KernelTag, class ArchTag, class OpClass, class ElementA,
-          class GmemLayoutA, int AlignmentA, class ElementB, class GmemLayoutB,
-          int AlignmentB, class ElementAccumulator, class TileShape_MNK,
-          class ClusterShape_MNK, class StageCountType,
-          class KernelScheduleType, class Enable = void>
-struct VLLMCollectiveBuilder {
-  static_assert(sizeof(ElementA) == 0,
-                "Could not build a collective for given parameters.");
-};
-
-template <class ArchTag, class OpClass, class ElementA, class GmemLayoutA,
-          int AlignmentA, class ElementB, class GmemLayoutB, int AlignmentB,
-          class ElementAccumulator, class TileShape_MNK, class ClusterShape_MNK,
-          class StageCountType, class KernelScheduleType>
-struct VLLMCollectiveBuilder<
-    CutlassKernelTag, ArchTag, OpClass, ElementA, GmemLayoutA, AlignmentA,
-    ElementB, GmemLayoutB, AlignmentB, ElementAccumulator, TileShape_MNK,
-    ClusterShape_MNK, StageCountType, KernelScheduleType> {
-  using CollectiveOp = typename CollectiveBuilder<
-      ArchTag, OpClass, ElementA, GmemLayoutA, AlignmentA, ElementB,
-      GmemLayoutB, AlignmentB, ElementAccumulator, TileShape_MNK,
-      ClusterShape_MNK, StageCountType, KernelScheduleType>::CollectiveOp;
-};
-
-};  // namespace cutlass::gemm::collective
--- a/csrc/cutlass_extensions/vllm_custom_types.cuh
+++ b/csrc/cutlass_extensions/vllm_custom_types.cuh
@ -1,50 +0,0 @@
-#pragma once
-
-#include "cutlass/integer_subbyte.h"
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int Bits, int Bias, bool Signed = false>
-struct vllm_biased_integer_subbyte : public integer_subbyte<Bits, Signed> {
-  using Base = integer_subbyte<Bits, Signed>;
-
-  using Storage = typename Base::Storage;
-  using xint_t = typename Base::xint_t;
-
-  using Base::bits_mask_;
-  using Base::sign_mask_;
-  using Base::storage;
-
-  //
-  // Methods
-  //
-
-  /// No operation
-  vllm_biased_integer_subbyte() = default;
-
-  /// Conversion from integer type
-  CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(int value)
-      : Base(value) {}
-  CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(unsigned value)
-      : Base(value) {}
-  CUTLASS_HOST_DEVICE explicit vllm_biased_integer_subbyte(double value)
-      : Base(value) {}
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-// "GPTQ" types, i.e. symmetric quantization
-using vllm_uint4b8_t = vllm_biased_integer_subbyte<4, 8>;      // u4b8
-using vllm_uint8b128_t = vllm_biased_integer_subbyte<8, 128>;  // u8b128
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int Bits, int Bias, bool Signed>
-struct sizeof_bits<vllm_biased_integer_subbyte<Bits, Bias, Signed>> {
-  static constexpr int value = Bits;
-};
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@ -1,49 +0,0 @@
-import enum
-from typing import Dict, Union
-
-from cutlass_library import *
-
-#
-#   Extend cutlass library with custom types, and missing values
-#
-
-
-class VLLMDataType(enum.Enum):
-    u4b8 = enum_auto()
-    u8b128 = enum_auto()
-
-
-class MixedInputKernelScheduleType(enum.Enum):
-    TmaWarpSpecializedMixedInput = enum_auto()
-    TmaWarpSpecializedPingpongMixedInput = enum_auto()
-    TmaWarpSpecializedCooperativeMixedInput = enum_auto()
-
-
-VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = {
-    **DataTypeNames,  # type: ignore
-    **{
-        VLLMDataType.u4b8: "u4b8",
-        VLLMDataType.u8b128: "u8b128",
-    }
-}
-
-VLLMDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = {
-    **DataTypeTag,  # type: ignore
-    **{
-        VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
-        VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t",
-    }
-}
-
-VLLMKernelScheduleTag: Dict[Union[
-    MixedInputKernelScheduleType, KernelScheduleType], str] = {
-        **KernelScheduleTag,  # type: ignore
-        **{
-            MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedMixedInput",
-            MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput",
-            MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput:
-            "cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput",
-        }
-    }
--- a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
+++ b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh
@ -1,795 +0,0 @@
-#pragma once
-
-#include "cutlass/numeric_conversion.h"
-#include "cutlass_extensions/vllm_custom_types.cuh"
-#include "cutlass_extensions/cute_utils.cuh"
-
-// this file extends:
-//   https://github.com/NVIDIA/cutlass/blob/cutlass-3.5.0/include/cutlass/numeric_conversion.h
-// with vllm specific type conversions, namely: vllm_uint4b8_t, vllm_uint8b128_t
-// as well as adds interleaved numeric array converters for specific types.
-// (interleaved numeric array converters can be more efficient for subbyte
-// types)
-
-namespace cutlass {
-
-// InterleavedNumericArrayConverter is like NumericArrayConverter but also
-// deinterleaves converted elements based on IlvBlkLayout, interleaving can
-// make subbyte converts more efficient by allowing for efficient extraction
-// of subbyte elements from a 32bit register.
-template <typename IlvBlkLayout, typename T, typename S, int N,
-          FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
-          class Enable = void>
-struct InterleavedNumericArrayConverter {
-  using Converter = NumericArrayConverter<T, S, N, Round>;
-
-  using result_type = typename Converter::result_type;
-  using source_type = typename Converter::source_type;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    CUTE_INVALID_CONTROL_PATH(
-        "InterleavedNumericArrayConverter not implemented\n");
-    return {};
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-template <typename IlvBlkLayout, typename T, typename S, int N,
-          FloatRoundStyle Round>
-struct InterleavedNumericArrayConverter<
-    IlvBlkLayout, T, S, N, Round,
-    std::enable_if_t<is_identity_layout<IlvBlkLayout>()>> {
-  using Converter = NumericArrayConverter<T, S, N, Round>;
-
-  using result_type = typename Converter::result_type;
-  using source_type = typename Converter::source_type;
-
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return Converter::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// TODO (LucasWilkinson): Implement
-// for Array<cutlass::float8_e4m3fn, N> <= Array<vllm_uint4b8_t, N>
-
-// ....
-
-template <typename RegConvert32bit, typename T, typename S, int N>
-struct ArrayConverterPacked32Bit {
-  using result_type = Array<T, N>;
-  using source_type = Array<S, N>;
-
-  using result_packed_8_t = Array<T, 8>;
-  using result_packed_4_t = Array<T, 4>;
-  using result_packed_2_t = Array<T, 2>;
-  using src_packed_8_t = Array<S, 8>;
-  using src_packed_4_t = Array<S, 4>;
-  using src_packed_2_t = Array<S, 2>;
-
-  static_assert(N % 2 == 0, "N must be a multiple of 2");
-  static_assert(cutlass::sizeof_bits_v<S> >= 4);  // TODO: add 16 packed sources
-  static_assert(32 % cutlass::sizeof_bits_v<S> == 0);
-  static constexpr auto src_elems_per_32bit_reg =
-      32 / cutlass::sizeof_bits_v<S>;
-
-  // Maybe not Valid. ScalarConverter will not actually work unless
-  // NumericConverter<T, S, Round> is implemented. However it won't be used
-  // anyways since we assert N % 2 == 0, just here for compliance with
-  // VectorizedConverter.
-  using ScalarConverter = NumericConverter<T, S>;
-
-  template <typename PackedSrc>
-  CUTLASS_DEVICE static uint32_t to_reg(PackedSrc const& source) {
-    if constexpr (sizeof(PackedSrc) == 1) {
-      return static_cast<uint32_t>(reinterpret_cast<const uint8_t&>(source));
-    } else if constexpr (sizeof(PackedSrc) == 2) {
-      return static_cast<uint32_t>(reinterpret_cast<const uint16_t&>(source));
-    } else {
-      static_assert(sizeof(PackedSrc) == 4);
-      return reinterpret_cast<const uint32_t&>(source);
-    }
-  }
-
-  // The core converter uses bit tricks to construct a known FP16 number, then
-  // does a subtraction in FP16 for the final result.
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE static PackedResultType packed_convert(
-      PackedSrcType const& source) {
-    static_assert(PackedSrcType::kElements == PackedResultType::kElements);
-    static_assert(PackedResultType::kElements == 2 ||
-                      PackedResultType::kElements == 4 ||
-                      PackedResultType::kElements == 8,
-                  "Invalid PackedResultType must be 2, 4 or 8.");
-    static_assert(std::is_same_v<typename PackedSrcType::Element, S>);
-    static_assert(std::is_same_v<typename PackedResultType::Element, T>);
-
-    return RegConvert32bit::template convert<PackedResultType>(to_reg(source));
-  }
-
-  friend class detail::VectorizedConverter;
-
- public:
-  CUTLASS_DEVICE static result_type convert(source_type const& source) {
-    result_type result;
-    using ConverterType =
-        ArrayConverterPacked32Bit<RegConvert32bit,
-                                  typename result_type::Element,
-                                  typename source_type::Element, N>;
-
-    if constexpr (src_elems_per_32bit_reg >= 8) {
-      detail::VectorizedConverter::convert<
-          ConverterType, result_packed_8_t, src_packed_8_t, result_packed_4_t,
-          src_packed_4_t, result_packed_2_t, src_packed_2_t>(result, source);
-    } else if constexpr (src_elems_per_32bit_reg >= 4) {
-      detail::VectorizedConverter::convert<ConverterType, result_packed_4_t,
-                                           src_packed_4_t, result_packed_2_t,
-                                           src_packed_2_t>(result, source);
-    } else {
-      detail::VectorizedConverter::convert<ConverterType, result_packed_2_t,
-                                           src_packed_2_t>(result, source);
-    }
-
-    return result;
-  }
-};
-
-// for Array<cutlass::half_t, N> <= Array<vllm_uint4b8_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::half_t, vllm_uint4b8_t, N, Round> {
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<vllm_uint4b8_t, N>;
-
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src) {
-      using RegArray =
-          cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2,
-                                sizeof(PackedResultType)>;
-      RegArray r;
-
-      // Below constructs the following temporary:
-      // fp16s_01 = {0x00, i4_01, 0x00, i4_01}
-      // fp16s_23 = {0x00, i4_23, 0x00, i4_23}
-      // fp16s_45 = {0x00, i4_45, 0x00, i4_45}
-      // fp16s_67 = {0x00, i4_67, 0x00, i4_67}
-      // We use inline asm instead of __byte_perm intrinsic since we don't want
-      // the documented (& 0x7) on the index. NVCC might be able to optimize it
-      // out since the index is a constexpr, but we choose to be safe about it
-      // here.
-      uint32_t prmt_indices[4] = {0x4040, 0x4141, 0x4242, 0x4343};
-      static_assert(RegArray::kElements <= 4,
-                    "Too many inputs for F16 -> I4 vector converter");
-      CUTLASS_PRAGMA_UNROLL
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        asm volatile(
-            "{\n"
-            "  prmt.b32 %0, %1, %2, %3;\n"
-            "}\n"
-            : "=r"(r[ii])
-            : "r"(src), "n"(0), "r"(prmt_indices[ii]));
-      }
-
-      // Since the stored 4bit values are biased by 8 we get stored_val = (x+8)
-      //  we are trying to construct x and a fp16 value
-      // The below XOR does the following:
-      //  1) Sets the exponent bits of the FP16 to the correct value for the
-      //  FP16 magic_num. We will be constructing {1024+16*(x1+8), 1024+(x0+8)},
-      //  where x1 in the high nibble and x0 is the low nibble then using hfma
-      //  to subtract 1032 from that
-      // The AND does the following:
-      //  1) Clear the set bits for the int4 we will ignore.
-      // We use lop3 so that we can use 1 instruction for AND and XOR.
-      static constexpr uint32_t xor_mask = 0x64006400;
-      static constexpr uint32_t and_mask = 0xFFF0FF0F;
-      static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
-
-      // For each operand, computes:
-      // r[i] = (r[i] & and_mask) ^ xor_mask
-      CUTLASS_PRAGMA_UNROLL
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii])
-            : "n"(and_mask), "n"(xor_mask), "n"(immLut));
-      }
-
-      // We will issue 2 hfmas that do the following:
-      // {x1, x0} = {1024+16*(x1+8), 1024+(x0+8)} * {1/16, 1} - {72, 1032}
-      //          = {x1 + 1152, x0 + 1032} * {1/16, 1} - {72, 1032}
-      static constexpr uint32_t hfma_bias_rep = 0xD480E408;   // {72, 1032}
-      static constexpr uint32_t hfma_scale_rep = 0x2C003C00;  // {1 / 16, 1}
-
-      const half2& hfma_bias = reinterpret_cast<const half2&>(hfma_bias_rep);
-      const half2& hfma_scale = reinterpret_cast<const half2&>(hfma_scale_rep);
-      CUTLASS_PRAGMA_UNROLL
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
-        fp16x2_val = __hfma2(hfma_scale, fp16x2_val, hfma_bias);
-      }
-
-      return reinterpret_cast<PackedResultType&>(r);
-    };
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// for Array<cutlass::half_t, N> <= Array<vllm_uint4b8_t, N>
-//   for IlvdLayout: (2, 4):(4, 1)
-template <FloatRoundStyle Round, int N>
-struct InterleavedNumericArrayConverter<Layout<Shape<_2, _4>, Stride<_4, _1>>,
-                                        cutlass::half_t, vllm_uint4b8_t, N,
-                                        Round, void> {
-  using IlvdLayout = Layout<Shape<_2, _4>, Stride<_4, _1>>;
-  static_assert(N % size(IlvdLayout{}) == 0);
-
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<vllm_uint4b8_t, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
- private:
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src) {
-      using RegArray =
-          cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2,
-                                sizeof(PackedResultType)>;
-      RegArray r;
-
-      static_assert(PackedResultType::kElements <= size(IlvdLayout{}));
-      static constexpr uint32_t xor_mask = 0x64006400;
-
-      for (int ii = 0; ii < RegArray::kElements; ii += 2) {
-        auto src_ = src >> (4 * (ii));
-        r[ii + 0] = src_;
-        r[ii + 1] = src_;
-
-        static constexpr uint32_t and_xor_imm_lut = (0xf0 & 0xcc) ^ 0xaa;
-
-        static constexpr uint32_t low_nib_mask = 0x000F000F;
-        static constexpr uint32_t high_nib_mask = 0x00F000F0;
-
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii + 0])
-            : "n"(low_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut));
-
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii + 1])
-            : "n"(high_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut));
-
-        // For low nibble:
-        //  {x1, x0} = {1024+(x1+8), 1024+(x0+8)} * {1, 1} - {1032, 1032}
-        // For high nibble:
-        //  {x1, x0} = {1024+16*(x1+8), 1024+16*(x0+8)} * {1/16, 1/16}
-        //             - {72, 72}
-        static constexpr uint32_t low_nib_bias = 0x64086408;    // {1032, 1032}
-        static constexpr uint32_t high_nib_scale = 0x2C002C00;  // {1/16, 1/16}
-        static constexpr uint32_t high_nib_bias = 0xD480D480;   // {-72, -72}
-
-        {
-          half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]);
-          fp16x2_val =
-              __hsub2(fp16x2_val, reinterpret_cast<const half2&>(low_nib_bias));
-        }
-
-        {
-          half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]);
-          fp16x2_val = __hfma2(fp16x2_val,
-                               reinterpret_cast<const half2&>(high_nib_scale),
-                               reinterpret_cast<const half2&>(high_nib_bias));
-        }
-      }
-
-      return reinterpret_cast<PackedResultType&>(r);
-    };
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// for Array<cutlass::half_t, N> <= Array<uint4_t, N>
-//   for IlvdLayout: (2, 4):(4, 1)
-template <FloatRoundStyle Round, int N>
-struct InterleavedNumericArrayConverter<Layout<Shape<_2, _4>, Stride<_4, _1>>,
-                                        cutlass::half_t, uint4_t, N, Round,
-                                        void> {
-  using IlvdLayout = Layout<Shape<_2, _4>, Stride<_4, _1>>;
-  static_assert(N % size(IlvdLayout{}) == 0);
-
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<uint4_t, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
- private:
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src) {
-      using RegArray =
-          cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2,
-                                sizeof(PackedResultType)>;
-      RegArray r;
-
-      static_assert(PackedResultType::kElements <= size(IlvdLayout{}));
-      static constexpr uint32_t xor_mask = 0x64006400;
-
-      for (int ii = 0; ii < RegArray::kElements; ii += 2) {
-        auto src_ = src >> (4 * (ii));
-        r[ii + 0] = src_;
-        r[ii + 1] = src_;
-
-        static constexpr uint32_t and_xor_imm_lut = (0xf0 & 0xcc) ^ 0xaa;
-
-        static constexpr uint32_t low_nib_mask = 0x000F000F;
-        static constexpr uint32_t high_nib_mask = 0x00F000F0;
-
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii + 0])
-            : "n"(low_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut));
-
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii + 1])
-            : "n"(high_nib_mask), "n"(xor_mask), "n"(and_xor_imm_lut));
-
-        // For low nibble:
-        //  {x1, x0} = {1024+x1, 1024+x0} - {1024, 1024}
-        // For high nibble:
-        //  {x1, x0} = {1024+16*x1, 1024+16*x0} * {1/16, 1/16} - {64, 64}
-        static constexpr uint32_t low_nib_bias = 0x64006400;    // {1024, 1024}
-        static constexpr uint32_t high_nib_scale = 0x2C002C00;  // {1/16, 1/16}
-        static constexpr uint32_t high_nib_bias = 0xD400D400;   // {-64, -64}
-
-        {
-          half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]);
-          fp16x2_val =
-              __hsub2(fp16x2_val, reinterpret_cast<const half2&>(low_nib_bias));
-        }
-
-        {
-          half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]);
-          fp16x2_val = __hfma2(fp16x2_val,
-                               reinterpret_cast<const half2&>(high_nib_scale),
-                               reinterpret_cast<const half2&>(high_nib_bias));
-        }
-      }
-
-      return reinterpret_cast<PackedResultType&>(r);
-    };
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// for Array<cutlass::half_t, N> <= Array<vllm_uint8b128_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::half_t, vllm_uint8b128_t, N, Round> {
-  using result_type = Array<cutlass::half_t, N>;
-  using source_type = Array<vllm_uint8b128_t, N>;
-
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src) {
-      // Hold output FP16s in reg. We need 1 reg for every 2 elements
-      using RegArray =
-          cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2,
-                                sizeof(PackedResultType)>;
-      RegArray r;
-
-      uint32_t const prmt_indices[2] = {0x5150, 0x5352};
-      static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
-
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        asm volatile("prmt.b32 %0,%1,%2,%3;\n"
-                     : "=r"(r[ii])
-                     : "r"(src), "n"(start_byte_for_fp16),
-                       "r"(prmt_indices[ii]));
-      }
-
-      // -128 is folded into bias subtraction, i.e. the 0x80 in the low bytes
-      static constexpr uint32_t bias_rep = 0x64806480;
-      const half2& bias = reinterpret_cast<const half2&>(bias_rep);
-      CUTLASS_PRAGMA_UNROLL
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]);
-        fp16x2_val = __hsub2(fp16x2_val, bias);
-      }
-
-      return reinterpret_cast<PackedResultType&>(r);
-    };
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// for Array<cutlass::float, N> <= Array<vllm_uint8b128_t, N>
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<float, vllm_uint8b128_t, N, Round> {
-  using result_type = Array<float, N>;
-  using source_type = Array<vllm_uint8b128_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
- private:
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src) {
-      PackedResultType r;
-
-      // __byte_perm simulates the add.u32 0x4B000000 to every u8 element of
-      // u8x4 source and stores the result in r (without introducing extra
-      // cvt.u32.u8 instruction)
-      uint32_t const prmt_indices[4] = {0x7650, 0x7651, 0x7652, 0x7653};
-      uint32_t* result_as_int = reinterpret_cast<uint32_t*>(&r);
-      for (int ii = 0; ii < PackedResultType::kElements; ++ii) {
-        result_as_int[ii] = __byte_perm(src, 0x4B000000, prmt_indices[ii]);
-        // Subtract the magic number 0x4B000000 from tmp in floating-point
-        // arithmetic to obtain final result
-        r[ii] -= (8388608.f + 128.f);  // fold in -128 bias
-      }
-
-      return r;
-    };
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-
-// for Array<cutlass::bfloat16_t, N> <= Array<vllm_uint4b8_t, N> (stored as x + 8)
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::bfloat16_t, vllm_uint4b8_t, N, Round> {
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<vllm_uint4b8_t, N>;
-
-  static FloatRoundStyle const round_style = Round;
-
- private:
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src_reg) {
-      // Packed BF16 outputs live in 32-bit regs: 1 reg for every 2 elements
-      using RegArray =
-          cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2,
-                                sizeof(PackedResultType)>;
-      RegArray r;
-      uint32_t src_reg_shifted = src_reg >> 4;  // byte-high nibbles -> low
-
-      // r[ii]: low half <- byte ii of src_reg, high half <- byte ii of shifted
-      uint32_t const prmt_indices[4] = {0xF4F0, 0xF5F1, 0xF6F2, 0xF7F3};
-      static_assert(RegArray::kElements <= 4,
-                    "Too many inputs for uint4b8_t -> BF16 vector converter");
-      CUTLASS_PRAGMA_UNROLL
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        asm volatile(
-            "{\n"
-            "  prmt.b32 %0, %1, %2, %3;\n"
-            "}\n"
-            : "=r"(r[ii])
-            : "r"(src_reg), "r"(src_reg_shifted), "r"(prmt_indices[ii]));
-      }
-
-      // The stored 4-bit values are biased by 8, i.e. stored_val = (x + 8);
-      //  we want to reconstruct x as a BF16 value.
-      // The AND/XOR below does the following:
-      //  1) Keeps one nibble per 16-bit half and sets the BF16 exponent bits
-      //  to the magic number 128, giving {128 + (x1+8), 128 + (x0+8)};
-      //  subtracting 136 afterwards yields {x1, x0}
-      static constexpr uint32_t xor_mask = 0x43004300;
-      static constexpr uint32_t and_mask = 0x000F000F;
-      static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
-
-      // For each operand, computes (immLut encodes (a & b) ^ c):
-      // r[i] = (r[i] & and_mask) ^ xor_mask
-      CUTLASS_PRAGMA_UNROLL
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii])
-            : "n"(and_mask), "n"(xor_mask), "n"(immLut));
-      }
-
-      // Remove the bias from both BF16 lanes of each register with __hsub2:
-      // resulting register:
-      // {hi_bf16 - 136, lo_bf16 - 136}
-
-      // This is the BF16 {136, 136} represented as an integer.
-      static constexpr uint32_t bias_rep = 0x43084308;
-      const __nv_bfloat162& bias =
-          reinterpret_cast<const __nv_bfloat162&>(bias_rep);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]);
-        bf16x2_val = __hsub2(bf16x2_val, bias);
-      }
-
-      return reinterpret_cast<PackedResultType&>(r);
-    }
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// for Array<cutlass::bfloat16_t, N> <= Array<vllm_uint4b8_t, N> (stored as x + 8)
-//   for IlvdLayout: (2, 4):(4, 1)
-template <FloatRoundStyle Round, int N>
-struct InterleavedNumericArrayConverter<Layout<Shape<_2, _4>, Stride<_4, _1>>,
-                                        cutlass::bfloat16_t, vllm_uint4b8_t, N,
-                                        Round, void> {
-  using IlvdLayout = Layout<Shape<_2, _4>, Stride<_4, _1>>;
-  static_assert(N % size(IlvdLayout{}) == 0);
-
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<vllm_uint4b8_t, N>;
-
- private:
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src) {
-      using RegArray =
-          cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2,
-                                sizeof(PackedResultType)>;
-      RegArray r;
-
-      static_assert(PackedResultType::kElements <= size(IlvdLayout{}));
-      static constexpr uint32_t or_mask = 0x43004300;
-
-      // Unlike float16, whose mantissa is large enough to hold 2 nibbles,
-      // the bfloat16 mantissa only fits one, so each pass converts a single
-      // nibble per 16-bit half
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        r[ii] = src >> (4 * ii);  // nibble ii -> bits 3:0, ii+4 -> 19:16
-
-        static constexpr uint32_t and_or_imm_lut = (0xf0 & 0xcc) | 0xaa;
-        static constexpr uint32_t low_nib_mask = 0x000F000F;
-
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii + 0])
-            : "n"(low_nib_mask), "n"(or_mask), "n"(and_or_imm_lut));
-
-        // Remove the bias from both lanes (BF16 despite the fp16x2_val name):
-        //  {x1, x0} = {128+(x1+8), 128+(x0+8)} * {1, 1} - {136, 136}
-        static constexpr uint32_t low_nib_bias = 0x43084308;  // {136, 136}
-
-        {
-          __nv_bfloat162& fp16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]);
-          fp16x2_val =
-              __hsub2(fp16x2_val,
-                      reinterpret_cast<const __nv_bfloat162&>(low_nib_bias));
-        }
-      }
-
-      return reinterpret_cast<PackedResultType&>(r);
-    };
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// for Array<cutlass::bfloat16_t, N> <= Array<uint4_t, N> (unbiased nibbles)
-//   for IlvdLayout: (2, 4):(4, 1)
-template <FloatRoundStyle Round, int N>
-struct InterleavedNumericArrayConverter<Layout<Shape<_2, _4>, Stride<_4, _1>>,
-                                        cutlass::bfloat16_t, uint4_t, N, Round,
-                                        void> {
-  using IlvdLayout = Layout<Shape<_2, _4>, Stride<_4, _1>>;
-  static_assert(N % size(IlvdLayout{}) == 0);
-
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<uint4_t, N>;
-
- private:
-  struct RegConvert {
-    template <typename PackedResultType>
-    CUTLASS_DEVICE static PackedResultType convert(uint32_t src) {
-      using RegArray =
-          cutlass::AlignedArray<uint32_t, PackedResultType::kElements / 2,
-                                sizeof(PackedResultType)>;
-      RegArray r;
-
-      static_assert(PackedResultType::kElements <= size(IlvdLayout{}));
-      static constexpr uint32_t or_mask = 0x43004300;
-
-      // Unlike float16, whose mantissa is large enough to hold 2 nibbles,
-      // the bfloat16 mantissa only fits one, so each pass converts a single
-      // nibble per 16-bit half
-      for (int ii = 0; ii < RegArray::kElements; ++ii) {
-        r[ii] = src >> (4 * ii);  // nibble ii -> bits 3:0, ii+4 -> 19:16
-
-        static constexpr uint32_t and_or_imm_lut = (0xf0 & 0xcc) | 0xaa;
-        static constexpr uint32_t low_nib_mask = 0x000F000F;
-
-        asm volatile(
-            "{\n"
-            "  lop3.b32 %0, %0, %1, %2, %3;\n"
-            "}\n"
-            : "+r"(r[ii])
-            : "n"(low_nib_mask), "n"(or_mask), "n"(and_or_imm_lut));
-
-        // Strip the magic 128 from both lanes (BF16 despite fp16x2_val name):
-        //  {x1, x0} = {128 + x1, 128 + x0} * {1, 1} - {128, 128}
-        static constexpr uint32_t low_nib_bias = 0x43004300;  // {128, 128}
-
-        {
-          __nv_bfloat162& fp16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]);
-          fp16x2_val =
-              __hsub2(fp16x2_val,
-                      reinterpret_cast<const __nv_bfloat162&>(low_nib_bias));
-        }
-      }
-
-      return reinterpret_cast<PackedResultType&>(r);
-    };
-  };
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    return ArrayConverterPacked32Bit<RegConvert, typename result_type::Element,
-                                     typename source_type::Element,
-                                     N>::convert(source);
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-// for Array<cutlass::bfloat16_t, N> <= Array<vllm_uint8b128_t, N> (via float)
-template <FloatRoundStyle Round, int N>
-struct NumericArrayConverter<cutlass::bfloat16_t, vllm_uint8b128_t, N, Round> {
-  using result_type = Array<cutlass::bfloat16_t, N>;
-  using source_type = Array<vllm_uint8b128_t, N>;
-  static FloatRoundStyle const round_style = Round;
-
- private:
-  using result_packed_4_t = Array<cutlass::bfloat16_t, 4>;
-  using result_packed_2_t = Array<cutlass::bfloat16_t, 2>;
-  using src_packed_4_t = Array<vllm_uint8b128_t, 4>;
-  using src_packed_2_t = Array<vllm_uint8b128_t, 2>;
-
-  // Not valid and not supported; present only to satisfy the interface and
-  //  avoid a compile error. ScalarConverter will not actually work until
-  //  NumericConverter<cutlass::bfloat16_t, vllm_uint8b128_t, Round> is
-  //  implemented
-  using ScalarConverter =
-      NumericConverter<cutlass::bfloat16_t, vllm_uint8b128_t, Round>;
-
-  template <typename PackedResultType, typename PackedSrcType>
-  CUTLASS_DEVICE static PackedResultType packed_convert(
-      PackedSrcType const& source) {
-    static_assert(
-        (platform::is_same<PackedSrcType, src_packed_2_t>::value &&
-         platform::is_same<PackedResultType, result_packed_2_t>::value) ||
-            (platform::is_same<PackedSrcType, src_packed_4_t>::value &&
-             platform::is_same<PackedResultType, result_packed_4_t>::value),
-        "Invalid PackedSrcType/PackedResultType must be 2 or 4 to use private "
-        "convert dispatch.");
-
-    NumericArrayConverter<float, vllm_uint8b128_t, PackedResultType::kElements,
-                          Round>
-        convert_uint8_to_f32;  // stage 1: vllm_uint8b128_t -> float
-    Array<float, PackedResultType::kElements> tmp =
-        convert_uint8_to_f32(source);
-    NumericArrayConverter<cutlass::bfloat16_t, float,
-                          PackedResultType::kElements, Round>
-        convert_f32_to_bf16_;  // stage 2: float -> bfloat16
-    return convert_f32_to_bf16_(tmp);
-  }
-
-  friend class detail::VectorizedConverter;  // presumably calls packed_convert
-
- public:
-  CUTLASS_DEVICE
-  static result_type convert(source_type const& source) {
-    result_type result;
-    using ConverterType =
-        NumericArrayConverter<typename result_type::Element,
-                              typename source_type::Element, N, Round>;
-    detail::VectorizedConverter::convert<ConverterType, result_packed_4_t,
-                                         src_packed_4_t, result_packed_2_t,
-                                         src_packed_2_t>(result, source);
-
-    return result;
-  }
-
-  CUTLASS_DEVICE
-  result_type operator()(source_type const& s) const { return convert(s); }
-};
-
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -3,16 +3,13 @@
 #include <c10/cuda/CUDAGuard.h>

 #include "dispatch_utils.h"
+#include "reduction_utils.cuh"
 #ifndef USE_ROCM
  #include <cuda_bf16.h>
  #include <cuda_fp16.h>
-  #include <cub/util_type.cuh>
-  #include <cub/cub.cuh>
 #else
  #include <hip/hip_bf16.h>
  #include <hip/hip_fp16.h>
-  #include <hipcub/util_type.hpp>
-  #include <hipcub/hipcub.hpp>

 using __nv_bfloat16 = __hip_bfloat16;
 using __nv_bfloat162 = __hip_bfloat162;
@ -34,11 +31,7 @@ __global__ void rms_norm_kernel(
    const float x = (float)input[blockIdx.x * hidden_size + idx];
    variance += x * x;
  }
-
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
-
+  variance = blockReduceSum<float>(variance);
  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
  }
@ -235,11 +228,12 @@ fused_add_rms_norm_kernel(
    variance += temp.sum_squares();
    residual_v[id] = temp;
  }
-
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
-
+  /* Keep the following if-else block in sync with the
+     calculation of max_block_size in fused_add_rms_norm */
+  if (num_tokens < 256) {
+    variance = blockReduceSum<float, 1024>(variance);
+  } else
+    variance = blockReduceSum<float, 256>(variance);
  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
  }
@ -274,11 +268,12 @@ fused_add_rms_norm_kernel(
    variance += x * x;
    residual[blockIdx.x * hidden_size + idx] = z;
  }
-
-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStore;
-  variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x);
-
+  /* Keep the following if-else block in sync with the
+     calculation of max_block_size in fused_add_rms_norm */
+  if (num_tokens < 256) {
+    variance = blockReduceSum<float, 1024>(variance);
+  } else
+    variance = blockReduceSum<float, 256>(variance);
  if (threadIdx.x == 0) {
    s_variance = rsqrtf(variance / hidden_size + epsilon);
  }
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -1,4 +1,4 @@
-#include "core/registration.h"
+#include "registration.h"
 #include "moe_ops.h"

 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -3,8 +3,6 @@
 #include <optional>
 #include <torch/library.h>

-#include "core/scalar_type.hpp"
-
 void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
@ -63,12 +61,12 @@ void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,
                        const torch::Tensor& scales,
-                        const std::vector<int64_t>& codebook_partition_sizes,
+                        const torch::Tensor& codebook_partition_sizes,
                        const std::optional<torch::Tensor>& bias);

-torch::Tensor aqlm_dequant(
-    const torch::Tensor& codes, const torch::Tensor& codebooks,
-    const std::vector<int64_t>& codebook_partition_sizes);
+torch::Tensor aqlm_dequant(const torch::Tensor& codes,
+                           const torch::Tensor& codebooks,
+                           const torch::Tensor& codebook_partition_sizes);

 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
@ -83,41 +81,19 @@ torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                          torch::Tensor& b_scales, torch::Tensor& workspace,
                          int64_t size_m, int64_t size_n, int64_t size_k);

-namespace machete {
-
-std::vector<std::string> supported_schedules(
-    vllm::ScalarTypeTorchPtr const& btype);
-
-torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B,
-                   vllm::ScalarTypeTorchPtr const& btype,
-                   c10::optional<torch::Tensor> const& scales,
-                   c10::optional<torch::Tensor> const& zeros,
-                   c10::optional<int64_t> group_size,
-                   c10::optional<torch::Tensor> const& C,
-                   c10::optional<double> alpha, c10::optional<double> beta,
-                   c10::optional<std::string> schedule);
-
-torch::Tensor prepack_B(torch::Tensor const& B,
-                        vllm::ScalarTypeTorchPtr const& btype);
-
-};  // namespace machete
-
 torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                  torch::Tensor& b_meta,
                                  torch::Tensor& b_scales,
-                                  torch::Tensor& workspace,
-                                  vllm::ScalarTypeTorchPtr const& b_q_type,
+                                  torch::Tensor& workspace, int64_t num_bits,
                                  int64_t size_m, int64_t size_n,
                                  int64_t size_k);

 torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& b_zeros,
                               torch::Tensor& g_idx, torch::Tensor& perm,
-                               torch::Tensor& workspace,
-                               vllm::ScalarTypeTorchPtr const& b_q_type,
+                               torch::Tensor& workspace, int64_t num_bits,
                               int64_t size_m, int64_t size_n, int64_t size_k,
-                               bool is_k_full, bool has_zp,
-                               bool use_fp32_reduce);
+                               bool is_k_full, bool has_zp);

 torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
@ -126,15 +102,6 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
 torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
                                int64_t size_n, int64_t num_bits);

-torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
-                              int64_t n);
-
-torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
-                                  int64_t type, int64_t row);
-
-torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type,
-                              int64_t row);
-
 torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                              torch::Tensor& b_scales, torch::Tensor& workspace,
                              int64_t num_bits, int64_t size_m, int64_t size_n,
@ -147,21 +114,6 @@ void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
                       torch::Tensor const& b_scales,
                       c10::optional<torch::Tensor> const& bias);

-void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
-                           torch::Tensor const& b,
-                           torch::Tensor const& a_scales,
-                           torch::Tensor const& b_scales,
-                           torch::Tensor const& azp_adj,
-                           c10::optional<torch::Tensor> const& azp,
-                           c10::optional<torch::Tensor> const& bias);
-
-torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
-                              torch::Tensor const& b_q_weight,
-                              torch::Tensor const& s_tok,
-                              torch::Tensor const& s_ch,
-                              torch::Tensor const& s_group,
-                              torch::Tensor& workspace, int64_t size_m,
-                              int64_t size_n, int64_t size_k);
 #endif

 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
--- a/csrc/punica/LICENSE
+++ b/csrc/punica/LICENSE
@ -0,0 +1,217 @@
+Contains code from https://github.com/punica-ai/punica
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+------------------------------------------------------------------------------------
+
+This product bundles various third-party components under other open source licenses.
+This section summarizes those components and their licenses. See licenses/
+for text of these licenses.
+
+
+Apache-2.0
+* third_party/nvbench (with LLVM exception)
+* third_party/flashinfer
+
+BSD-3-Clause:
+* third_party/cutlass
--- a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu
@ -0,0 +1,5 @@
+#include "bgmv_config.h"
+#include "bgmv_impl.cuh"
+
+FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu
@ -0,0 +1,5 @@
+#include "bgmv_config.h"
+#include "bgmv_impl.cuh"
+
+FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16)
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_bfloat16, float, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@ -0,0 +1,218 @@
+#pragma once
+
+// Declaration of the bgmv_kernel host launcher. The definition is in
+// bgmv_impl.cuh; the bgmv_*.cu files explicitly instantiate it for the
+// (feat_in, feat_out, dtype) combinations enumerated by the macros in this
+// header.
+//
+//   feat_in, feat_out      compile-time input / output feature sizes.
+//   Y                      output buffer; results are added to its contents.
+//   X                      input buffer.
+//   W                      weight buffer.
+//   indicies               one entry per batch row (spelling matches the
+//                          definition); combined with num_layers and
+//                          layer_idx to pick the weight slice.
+//   y_offset, full_y_size  offset and row stride used when indexing Y.
+//   batch_size             number of batch rows to process.
+//   scale                  factor applied to every product.
+template <int feat_in, int feat_out, typename in_T, typename out_T,
+          typename W_T>
+void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
+                 const W_T *__restrict__ W,
+                 const int64_t *__restrict__ indicies, int64_t y_offset,
+                 int64_t full_y_size, int64_t batch_size, int64_t num_layers,
+                 int64_t layer_idx, float scale);
+
+// clang-format off
+
+// Expands f(in_T, out_T, W_T, narrow, wide) once for every supported wide
+// dimension. The final entry intentionally has no trailing backslash so the
+// macro cannot splice in whatever line happens to follow it.
+#define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \
+    f(in_T, out_T, W_T, narrow, 128) \
+    f(in_T, out_T, W_T, narrow, 256) \
+    f(in_T, out_T, W_T, narrow, 512) \
+    f(in_T, out_T, W_T, narrow, 640) \
+    f(in_T, out_T, W_T, narrow, 768) \
+    f(in_T, out_T, W_T, narrow, 896) \
+    f(in_T, out_T, W_T, narrow, 1024) \
+    f(in_T, out_T, W_T, narrow, 1152) \
+    f(in_T, out_T, W_T, narrow, 1216) \
+    f(in_T, out_T, W_T, narrow, 1280) \
+    f(in_T, out_T, W_T, narrow, 1536) \
+    f(in_T, out_T, W_T, narrow, 1664) \
+    f(in_T, out_T, W_T, narrow, 1728) \
+    f(in_T, out_T, W_T, narrow, 1792) \
+    f(in_T, out_T, W_T, narrow, 2048) \
+    f(in_T, out_T, W_T, narrow, 2240) \
+    f(in_T, out_T, W_T, narrow, 2304) \
+    f(in_T, out_T, W_T, narrow, 2368) \
+    f(in_T, out_T, W_T, narrow, 2432) \
+    f(in_T, out_T, W_T, narrow, 2560) \
+    f(in_T, out_T, W_T, narrow, 2752) \
+    f(in_T, out_T, W_T, narrow, 2816) \
+    f(in_T, out_T, W_T, narrow, 3072) \
+    f(in_T, out_T, W_T, narrow, 3328) \
+    f(in_T, out_T, W_T, narrow, 3456) \
+    f(in_T, out_T, W_T, narrow, 3584) \
+    f(in_T, out_T, W_T, narrow, 3712) \
+    f(in_T, out_T, W_T, narrow, 4096) \
+    f(in_T, out_T, W_T, narrow, 4480) \
+    f(in_T, out_T, W_T, narrow, 4608) \
+    f(in_T, out_T, W_T, narrow, 4736) \
+    f(in_T, out_T, W_T, narrow, 4864) \
+    f(in_T, out_T, W_T, narrow, 5120) \
+    f(in_T, out_T, W_T, narrow, 5504) \
+    f(in_T, out_T, W_T, narrow, 5632) \
+    f(in_T, out_T, W_T, narrow, 5888) \
+    f(in_T, out_T, W_T, narrow, 6144) \
+    f(in_T, out_T, W_T, narrow, 6400) \
+    f(in_T, out_T, W_T, narrow, 6848) \
+    f(in_T, out_T, W_T, narrow, 6912) \
+    f(in_T, out_T, W_T, narrow, 7168) \
+    f(in_T, out_T, W_T, narrow, 7424) \
+    f(in_T, out_T, W_T, narrow, 8192) \
+    f(in_T, out_T, W_T, narrow, 8960) \
+    f(in_T, out_T, W_T, narrow, 9216) \
+    f(in_T, out_T, W_T, narrow, 9472) \
+    f(in_T, out_T, W_T, narrow, 10240) \
+    f(in_T, out_T, W_T, narrow, 11008) \
+    f(in_T, out_T, W_T, narrow, 11264) \
+    f(in_T, out_T, W_T, narrow, 12288) \
+    f(in_T, out_T, W_T, narrow, 13696) \
+    f(in_T, out_T, W_T, narrow, 13824) \
+    f(in_T, out_T, W_T, narrow, 14336) \
+    f(in_T, out_T, W_T, narrow, 14784) \
+    f(in_T, out_T, W_T, narrow, 14848) \
+    f(in_T, out_T, W_T, narrow, 15360) \
+    f(in_T, out_T, W_T, narrow, 16384) \
+    f(in_T, out_T, W_T, narrow, 18944) \
+    f(in_T, out_T, W_T, narrow, 20480) \
+    f(in_T, out_T, W_T, narrow, 22016) \
+    f(in_T, out_T, W_T, narrow, 22528) \
+    f(in_T, out_T, W_T, narrow, 24576) \
+    f(in_T, out_T, W_T, narrow, 27392) \
+    f(in_T, out_T, W_T, narrow, 27648) \
+    f(in_T, out_T, W_T, narrow, 28672) \
+    f(in_T, out_T, W_T, narrow, 29568) \
+    f(in_T, out_T, W_T, narrow, 29696) \
+    f(in_T, out_T, W_T, narrow, 32000) \
+    f(in_T, out_T, W_T, narrow, 32256) \
+    f(in_T, out_T, W_T, narrow, 32512) \
+    f(in_T, out_T, W_T, narrow, 32768) \
+    f(in_T, out_T, W_T, narrow, 33024) \
+    f(in_T, out_T, W_T, narrow, 36864) \
+    f(in_T, out_T, W_T, narrow, 43264) \
+    f(in_T, out_T, W_T, narrow, 49152) \
+    f(in_T, out_T, W_T, narrow, 49408) \
+    f(in_T, out_T, W_T, narrow, 60544) \
+    f(in_T, out_T, W_T, narrow, 60672) \
+    f(in_T, out_T, W_T, narrow, 64000) \
+    f(in_T, out_T, W_T, narrow, 64256) \
+    f(in_T, out_T, W_T, narrow, 64512) \
+    f(in_T, out_T, W_T, narrow, 102400) \
+    f(in_T, out_T, W_T, narrow, 102656) \
+    f(in_T, out_T, W_T, narrow, 102912) \
+    f(in_T, out_T, W_T, narrow, 128000) \
+    f(in_T, out_T, W_T, narrow, 128256) \
+    f(in_T, out_T, W_T, narrow, 128512)
+
+
+// Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
+// and vllm/tests/lora/test_punica.py
+
+// Expands f(in_T, out_T, W_T, wide, narrow) once for every supported wide
+// dimension, i.e. it defines kernels going from the variety of dim in to the
+// narrow dim out. Used for the fully sharded column parallel LoRA A, which
+// splits the rank dim.
+// The final entry intentionally has no trailing backslash so the macro
+// cannot splice in the line that follows it.
+#define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
+    f(in_T, out_T, W_T, 128, narrow) \
+    f(in_T, out_T, W_T, 256, narrow) \
+    f(in_T, out_T, W_T, 512, narrow) \
+    f(in_T, out_T, W_T, 640, narrow) \
+    f(in_T, out_T, W_T, 768, narrow) \
+    f(in_T, out_T, W_T, 896, narrow) \
+    f(in_T, out_T, W_T, 1024, narrow) \
+    f(in_T, out_T, W_T, 1152, narrow) \
+    f(in_T, out_T, W_T, 1216, narrow) \
+    f(in_T, out_T, W_T, 1280, narrow) \
+    f(in_T, out_T, W_T, 1536, narrow) \
+    f(in_T, out_T, W_T, 1664, narrow) \
+    f(in_T, out_T, W_T, 1728, narrow) \
+    f(in_T, out_T, W_T, 1792, narrow) \
+    f(in_T, out_T, W_T, 2048, narrow) \
+    f(in_T, out_T, W_T, 2240, narrow) \
+    f(in_T, out_T, W_T, 2304, narrow) \
+    f(in_T, out_T, W_T, 2368, narrow) \
+    f(in_T, out_T, W_T, 2432, narrow) \
+    f(in_T, out_T, W_T, 2560, narrow) \
+    f(in_T, out_T, W_T, 2752, narrow) \
+    f(in_T, out_T, W_T, 2816, narrow) \
+    f(in_T, out_T, W_T, 3072, narrow) \
+    f(in_T, out_T, W_T, 3328, narrow) \
+    f(in_T, out_T, W_T, 3456, narrow) \
+    f(in_T, out_T, W_T, 3584, narrow) \
+    f(in_T, out_T, W_T, 3712, narrow) \
+    f(in_T, out_T, W_T, 4096, narrow) \
+    f(in_T, out_T, W_T, 4480, narrow) \
+    f(in_T, out_T, W_T, 4608, narrow) \
+    f(in_T, out_T, W_T, 4736, narrow) \
+    f(in_T, out_T, W_T, 4864, narrow) \
+    f(in_T, out_T, W_T, 5120, narrow) \
+    f(in_T, out_T, W_T, 5504, narrow) \
+    f(in_T, out_T, W_T, 5632, narrow) \
+    f(in_T, out_T, W_T, 5888, narrow) \
+    f(in_T, out_T, W_T, 6144, narrow) \
+    f(in_T, out_T, W_T, 6400, narrow) \
+    f(in_T, out_T, W_T, 6848, narrow) \
+    f(in_T, out_T, W_T, 6912, narrow) \
+    f(in_T, out_T, W_T, 7168, narrow) \
+    f(in_T, out_T, W_T, 7424, narrow) \
+    f(in_T, out_T, W_T, 8192, narrow) \
+    f(in_T, out_T, W_T, 8960, narrow) \
+    f(in_T, out_T, W_T, 9216, narrow) \
+    f(in_T, out_T, W_T, 9472, narrow) \
+    f(in_T, out_T, W_T, 10240, narrow) \
+    f(in_T, out_T, W_T, 11008, narrow) \
+    f(in_T, out_T, W_T, 11264, narrow) \
+    f(in_T, out_T, W_T, 12288, narrow) \
+    f(in_T, out_T, W_T, 13696, narrow) \
+    f(in_T, out_T, W_T, 13824, narrow) \
+    f(in_T, out_T, W_T, 14336, narrow) \
+    f(in_T, out_T, W_T, 14784, narrow) \
+    f(in_T, out_T, W_T, 14848, narrow) \
+    f(in_T, out_T, W_T, 15360, narrow) \
+    f(in_T, out_T, W_T, 16384, narrow) \
+    f(in_T, out_T, W_T, 18944, narrow) \
+    f(in_T, out_T, W_T, 20480, narrow) \
+    f(in_T, out_T, W_T, 22016, narrow) \
+    f(in_T, out_T, W_T, 22528, narrow) \
+    f(in_T, out_T, W_T, 24576, narrow) \
+    f(in_T, out_T, W_T, 27392, narrow) \
+    f(in_T, out_T, W_T, 27648, narrow) \
+    f(in_T, out_T, W_T, 28672, narrow) \
+    f(in_T, out_T, W_T, 29568, narrow) \
+    f(in_T, out_T, W_T, 29696, narrow) \
+    f(in_T, out_T, W_T, 32000, narrow) \
+    f(in_T, out_T, W_T, 32256, narrow) \
+    f(in_T, out_T, W_T, 32512, narrow) \
+    f(in_T, out_T, W_T, 32768, narrow) \
+    f(in_T, out_T, W_T, 33024, narrow) \
+    f(in_T, out_T, W_T, 36864, narrow) \
+    f(in_T, out_T, W_T, 43264, narrow) \
+    f(in_T, out_T, W_T, 49152, narrow) \
+    f(in_T, out_T, W_T, 49408, narrow) \
+    f(in_T, out_T, W_T, 60544, narrow) \
+    f(in_T, out_T, W_T, 60672, narrow) \
+    f(in_T, out_T, W_T, 64000, narrow) \
+    f(in_T, out_T, W_T, 64256, narrow) \
+    f(in_T, out_T, W_T, 64512, narrow) \
+    f(in_T, out_T, W_T, 102400, narrow) \
+    f(in_T, out_T, W_T, 102656, narrow) \
+    f(in_T, out_T, W_T, 102912, narrow) \
+    f(in_T, out_T, W_T, 128000, narrow) \
+    f(in_T, out_T, W_T, 128256, narrow) \
+    f(in_T, out_T, W_T, 128512, narrow)
+// Keep above in sync with FOR_BGMV_WIDE and
+// vllm/lora/layers::LogitsProcessorWithLoRA
+
+
+// Expands f(in_T, out_T, W_T, narrow, wide) for every wide dimension listed
+// in FOR_BGMV_WIDE, combined with each of the narrow dimensions 8, 16, 32
+// and 64.
+// Keep this in sync with vllm/config::LoRAConfig
+#define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
+    FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8)  \
+    FOR_BGMV_WIDE(f, in_T, out_T, W_T, 16) \
+    FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \
+    FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64)
+
+
+// Expands f for the one-directional shapes: every dimension listed in
+// FOR_INST_BGMV_NARROW paired with the narrow sizes 1, 2 and 4, followed by
+// the four explicit pairs (8, 64), (16, 64), (32, 64) and (64, 64).
+#define FOR_INST_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \
+    FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 1) \
+    FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 2) \
+    FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, 4) \
+    f(in_T, out_T, W_T, 8, 64) \
+    f(in_T, out_T, W_T, 16, 64) \
+    f(in_T, out_T, W_T, 32, 64) \
+    f(in_T, out_T, W_T, 64, 64)
+
+// clang-format on
--- a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu
@ -0,0 +1,5 @@
+#include "bgmv_config.h"
+#include "bgmv_impl.cuh"
+
+FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half)
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, nv_half, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu
@ -0,0 +1,5 @@
+#include "bgmv_config.h"
+#include "bgmv_impl.cuh"
+
+FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half)
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, nv_half, float, nv_half)
--- a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu
@ -0,0 +1,5 @@
+#include "bgmv_config.h"
+#include "bgmv_impl.cuh"
+
+FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16)
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_bfloat16, nv_bfloat16)
--- a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu
+++ b/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu
@ -0,0 +1,5 @@
+#include "bgmv_config.h"
+#include "bgmv_impl.cuh"
+
+FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half)
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, float, nv_half, nv_half)
--- a/csrc/punica/bgmv/bgmv_impl.cuh
+++ b/csrc/punica/bgmv/bgmv_impl.cuh
@ -0,0 +1,451 @@
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#ifndef USE_ROCM
+#include <cooperative_groups.h>
+#else
+#include <hip/hip_cooperative_groups.h>
+#endif
+#ifndef USE_ROCM
+#include <cuda/pipeline>
+#endif
+#include <cuda_runtime.h>
+#include <iostream>
+#include <stdio.h>
+
+#include "vec_dtypes.cuh"
+
+namespace cg = cooperative_groups;
+
+#ifdef USE_ROCM
+// Synchronous byte-by-byte copy of exactly `len` bytes from `src` to `dst`;
+// returns `dst`. `len` is a template argument, so the loop bound is a
+// compile-time constant for the unroll pragma.
+// Caveat carried over from the original: does not handle the case of long
+// datatypes.
+template <size_t len>
+__host__ __device__
+inline void* memcpy_blocking(void *dst, const void *src) {
+  auto *out_bytes = static_cast<char *>(dst);
+  const auto *in_bytes = static_cast<const char *>(src);
+#pragma unroll
+  for (size_t pos = 0; pos < len; ++pos) {
+    out_bytes[pos] = in_bytes[pos];
+  }
+  return dst;
+}
+#endif
+
+#ifndef USE_ROCM
+
+// Shrink kernel (feat_in > feat_out), CUDA build.
+//
+// Launched by bgmv_kernel with grid (feat_out, batch_size) and a (tx, ty)
+// thread block, e.g. nthrs = (32, 4). Block (j, batch_idx) forms the scaled
+// dot product of the feat_in weights starting at
+// W[(idx * feat_out + j) * feat_in] with the feat_in inputs starting at
+// X[batch_idx * feat_in], and adds it to
+// Y[batch_idx * full_y_size + y_offset + j], where
+// idx = indicies[batch_idx] * num_layers + layer_idx.
+//
+// W and X are staged through shared memory in tiles of tx * ty * vec_size
+// elements using a two-stage cuda::pipeline: the copy of tile k is issued
+// before the arithmetic on tile k - 1. The template parameter tz is not
+// referenced in the body.
+template <int feat_in, int feat_out, size_t vec_size, size_t X_copy_size,
+          size_t W_copy_size, int tx, int ty, int tz, typename in_T,
+          typename out_T, typename W_T>
+__global__ void
+bgmv_shrink_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
+                   const W_T *__restrict__ W,
+                   const int64_t *__restrict__ indicies, int64_t y_offset,
+                   int64_t full_y_size, int64_t num_layers, int64_t layer_idx,
+                   float scale) {
+  size_t batch_idx = blockIdx.y;
+  int64_t idx = indicies[batch_idx] * num_layers + layer_idx;
+  // A negative combined index skips this batch row. idx depends only on
+  // blockIdx.y, so every thread of the block takes this exit together and no
+  // thread is left waiting at a later block.sync().
+  if (idx < 0) {
+    return;
+  }
+
+  auto block = cg::this_thread_block();
+  size_t j = blockIdx.x;
+  constexpr size_t num_pipeline_stages = 2;
+  constexpr size_t tile_size = tx * ty * vec_size;
+  __shared__ W_T W_shared[num_pipeline_stages * tile_size];
+  __shared__ in_T X_shared[num_pipeline_stages * tile_size];
+  __shared__ float y_warpwise[ty];
+
+  // Start of each pipeline stage's slot inside W_shared / X_shared.
+  size_t W_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size};
+  size_t X_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size};
+  auto pipe = cuda::make_pipeline();
+
+  // pipeline load W/X and compute WX;
+  // Prologue: every thread issues the copy of its vec_size-element slice of
+  // tile 0 into stage 0.
+  pipe.producer_acquire();
+  cuda::memcpy_async(W_shared + (threadIdx.y * tx + threadIdx.x) * vec_size,
+                     W + (idx * feat_out + j) * feat_in +
+                         (threadIdx.y * tx + threadIdx.x) * vec_size,
+                     cuda::aligned_size_t<W_copy_size>(W_copy_size), pipe);
+  cuda::memcpy_async(X_shared + (threadIdx.y * tx + threadIdx.x) * vec_size,
+                     X + (batch_idx * feat_in) +
+                         (threadIdx.y * tx + threadIdx.x) * vec_size,
+                     cuda::aligned_size_t<X_copy_size>(X_copy_size), pipe);
+  pipe.producer_commit();
+  size_t copy_idx, compute_idx;
+  float y = 0.f;
+  vec_t<in_T, vec_size> x_vec;
+  vec_t<W_T, vec_size> w_vec;
+  size_t tile_idx;
+
+  // Steady state: iteration tile_idx issues the copy of tile tile_idx and
+  // consumes tile tile_idx - 1; the last tile is consumed after the loop.
+#pragma unroll
+  for (tile_idx = 1; tile_idx < (feat_in + tile_size - 1) / tile_size;
+       ++tile_idx) {
+    copy_idx = tile_idx % num_pipeline_stages;
+    // pipeline stage: async copy W fragment
+    pipe.producer_acquire();
+    // Bounds check at thread-row granularity: rows whose first element lies
+    // at or beyond feat_in issue no copy. The commit below is unconditional,
+    // so every thread commits the same number of stages.
+    if (tile_idx * tile_size + threadIdx.y * tx * vec_size < feat_in) {
+      cuda::memcpy_async(W_shared + W_shared_offset[copy_idx] +
+                             (threadIdx.y * tx + threadIdx.x) * vec_size,
+                         W + (idx * feat_out + j) * feat_in +
+                             tile_idx * tile_size +
+                             (threadIdx.y * tx + threadIdx.x) * vec_size,
+                         cuda::aligned_size_t<W_copy_size>(W_copy_size), pipe);
+      cuda::memcpy_async(X_shared + X_shared_offset[copy_idx] +
+                             (threadIdx.y * tx + threadIdx.x) * vec_size,
+                         X + (batch_idx * feat_in) + tile_idx * tile_size +
+                             (threadIdx.y * tx + threadIdx.x) * vec_size,
+                         cuda::aligned_size_t<X_copy_size>(X_copy_size), pipe);
+    }
+    pipe.producer_commit();
+
+    compute_idx = (tile_idx - 1) % num_pipeline_stages;
+    // pipeline stage: compute WX
+    pipe.consumer_wait();
+    block.sync();
+    x_vec.load(X_shared + X_shared_offset[compute_idx] +
+               (threadIdx.y * tx + threadIdx.x) * vec_size);
+    w_vec.load(W_shared + W_shared_offset[compute_idx] +
+               (threadIdx.y * tx + threadIdx.x) * vec_size);
+    // Per-thread partial dot product over its vec_size elements.
+    float sum = 0.f;
+#pragma unroll
+    for (size_t i = 0; i < vec_size; ++i) {
+      sum += float(w_vec[i]) * float(x_vec[i]) * scale;
+    }
+    // Shuffle-down tree reduction across the tx lanes of a thread row.
+#pragma unroll
+    for (size_t offset = tx / 2; offset > 0; offset /= 2) {
+      sum += __shfl_down_sync(0xffffffff, sum, offset);
+    }
+    // NOTE(review): every thread of the row stores its own `sum` into the
+    // same slot, yet after the shuffle-down loop only the threadIdx.x == 0
+    // lane appears to hold the complete row total. The ROCm variant of this
+    // kernel only lets threadIdx.x == 0 publish to y_warpwise. Confirm which
+    // lane's store is expected to land here.
+    y_warpwise[threadIdx.y] = sum;
+    block.sync();
+    // Every thread accumulates all ty row totals, so each thread carries the
+    // same running y.
+#pragma unroll
+    for (size_t i = 0; i < ty; ++i) {
+      y += y_warpwise[i];
+    }
+
+    // Keep y_warpwise intact until every thread has read it, then hand the
+    // consumed stage back to the pipeline.
+    block.sync();
+    pipe.consumer_release();
+  }
+
+  compute_idx = (tile_idx - 1) % num_pipeline_stages;
+  // final pipeline stage
+  pipe.consumer_wait();
+  block.sync();
+  x_vec.load(X_shared + X_shared_offset[compute_idx] +
+             (threadIdx.y * tx + threadIdx.x) * vec_size);
+  w_vec.load(W_shared + W_shared_offset[compute_idx] +
+             (threadIdx.y * tx + threadIdx.x) * vec_size);
+  float sum = 0.f;
+#pragma unroll
+  for (size_t i = 0; i < vec_size; ++i) {
+    sum += float(w_vec[i]) * float(x_vec[i]) * scale;
+  }
+#pragma unroll
+  for (size_t offset = tx / 2; offset > 0; offset /= 2) {
+    sum += __shfl_down_sync(0xffffffff, sum, offset);
+  }
+  // Rows that start at or beyond feat_in in the last tile contribute zero
+  // (no copy was issued for them above). The same all-lanes store as in the
+  // loop applies here.
+  y_warpwise[threadIdx.y] =
+      ((tile_idx - 1) * tile_size + threadIdx.y * tx * vec_size < feat_in)
+          ? sum
+          : 0.f;
+  block.sync();
+#pragma unroll
+  for (size_t i = 0; i < ty; ++i) {
+    y += y_warpwise[i];
+  }
+
+  block.sync();
+  pipe.consumer_release();
+
+  // write Y;
+  // Only the first thread of the block performs the (non-atomic)
+  // read-modify-write; the target element is addressed by (batch_idx, j),
+  // i.e. by this block's grid coordinates.
+  if (block.thread_rank() == 0) {
+    Y[batch_idx * full_y_size + y_offset + j] += static_cast<out_T>(y);
+  }
+}
+
+#else
+
+// Shrink kernel (feat_in > feat_out), ROCm build.
+//
+// Same signature and output location as the CUDA variant: block
+// (j = blockIdx.x, batch_idx = blockIdx.y) adds a scaled dot product of W and
+// X to Y[batch_idx * full_y_size + y_offset + j], with
+// idx = indicies[batch_idx] * num_layers + layer_idx selecting the weights.
+// This variant has no asynchronous pipeline: each thread loads its
+// vec_size-element slices straight from the X and W pointers, tile by tile.
+// X_copy_size, W_copy_size and tz are not referenced in the body.
+// convert_type, vllm_add and VLLM_SHFL_DOWN_SYNC are defined outside this
+// file section.
+template <int feat_in, int feat_out, size_t vec_size, size_t X_copy_size,
+          size_t W_copy_size, int tx, int ty, int tz, typename in_T,
+          typename out_T, typename W_T>
+__global__ void
+bgmv_shrink_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
+                   const W_T *__restrict__ W,
+                   const int64_t *__restrict__ indicies, int64_t y_offset,
+                   int64_t full_y_size, int64_t num_layers, int64_t layer_idx,
+                   float scale) {
+  size_t batch_idx = blockIdx.y;
+  int64_t idx = indicies[batch_idx] * num_layers + layer_idx;
+  // A negative combined index skips this batch row; idx depends only on
+  // blockIdx.y, so the whole block exits together.
+  if (idx < 0) {
+    return;
+  }
+
+  size_t j = blockIdx.x;
+  constexpr size_t tile_size = tx * ty * vec_size;
+  constexpr size_t num_tiles = (feat_in + tile_size - 1) / tile_size;
+  __shared__ float y_warpwise[ty];
+
+  float y = 0;
+  vec_t<in_T, vec_size> x_vec;
+  vec_t<W_T, vec_size> w_vec;
+  size_t tile_idx;
+
+#pragma unroll
+  for (tile_idx = 0; tile_idx < num_tiles; ++tile_idx) {
+    // Load only when this thread's whole vec_size-element slice lies inside
+    // feat_in; otherwise x_vec / w_vec keep whatever they held before.
+    if (tile_idx * tile_size + (threadIdx.y * tx + threadIdx.x + 1) * vec_size - 1 < feat_in) {
+      x_vec.load(X + (batch_idx * feat_in) +
+                     tile_idx * tile_size +
+                     (threadIdx.y * tx + threadIdx.x) * vec_size);
+      w_vec.load(W + (idx * feat_out + j) * feat_in +
+                     tile_idx * tile_size +
+                     (threadIdx.y * tx + threadIdx.x) * vec_size);
+    }
+
+    // Per-thread partial dot product, then a shuffle-down reduction over tx
+    // lanes. This runs for every thread, including those that skipped the
+    // load above.
+    float sum = 0.f;
+#pragma unroll
+    for (size_t i = 0; i < vec_size; ++i) {
+      sum += convert_type<W_T, float>(w_vec[i]) * convert_type<in_T, float>(x_vec[i]) * scale;
+    }
+#pragma unroll
+    for (size_t offset = tx / 2; offset > 0; offset /= 2) {
+      sum += VLLM_SHFL_DOWN_SYNC(sum, offset);
+    }
+
+    __syncthreads();
+
+    // Only threads whose slice was in range fold this tile's result into
+    // their running total.
+    if (tile_idx * tile_size + (threadIdx.y * tx + threadIdx.x + 1) * vec_size - 1 < feat_in) {
+      y += sum;
+    }
+  }
+
+  // The threadIdx.x == 0 lane of each thread row publishes that row's total.
+  if (threadIdx.x == 0) {
+    y_warpwise[threadIdx.y] = y;
+  }
+  __syncthreads();
+
+  // Every thread sums the ty row totals; only thread (0, 0) uses the result.
+  float y_write = 0.f;
+#pragma unroll
+  for (size_t i = 0; i < ty; ++i) {
+    y_write += y_warpwise[i];
+  }
+ 
+  // write Y;
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+    size_t y_idx = batch_idx * full_y_size + y_offset + j;
+    Y[y_idx] = vllm_add<out_T>(Y[y_idx], convert_type<float, out_T>(y_write));
+  }
+}
+
+#endif
+
+// Expand kernel (feat_in <= feat_out).
+//
+// Launched by bgmv_kernel with grid (feat_out / (ty * tz), batch_size) and a
+// (tx, ty, tz) thread block, e.g. nthrs = (2, 16, 4), where the launcher
+// sets tx = feat_in / vec_size. Each group of tx threads sharing
+// (threadIdx.y, threadIdx.z) produces one output element: a scaled dot
+// product of X[batch_idx * feat_in ...] with one weight row, added to
+// Y[batch_idx * full_y_size + y_offset + tile_idx * (tz * ty)
+//   + threadIdx.z * ty + threadIdx.y].
+// The weight slice is selected by
+// idx = indicies[batch_idx] * num_layers + layer_idx.
+template <int feat_in, int feat_out, size_t vec_size, int tx, int ty, int tz,
+          typename in_T, typename out_T, typename W_T>
+__global__ void
+bgmv_expand_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
+                   const W_T *__restrict__ W,
+                   const int64_t *__restrict__ indicies, int64_t y_offset,
+                   int64_t full_y_size, int64_t num_layers, int64_t layer_idx,
+                   float scale) {
+  size_t batch_idx = blockIdx.y;
+  int64_t idx = indicies[batch_idx] * num_layers + layer_idx;
+
+  // A negative combined index skips this batch row.
+  if (idx < 0) {
+    return;
+  }
+
+  auto block = cg::this_thread_block();
+  size_t tile_idx = blockIdx.x;
+
+  // load X;
+  // Indexed by threadIdx.x only, so every (y, z) group reads the same input
+  // slices.
+  vec_t<in_T, vec_size> x_vec;
+  x_vec.load(X + batch_idx * feat_in + threadIdx.x * vec_size);
+
+  // load W;
+  // Indexed by the thread's rank within the block, starting at the first
+  // weight row handled by this block (tile_idx * tz * ty).
+  vec_t<W_T, vec_size> w_vec;
+  w_vec.load(W + (idx * feat_out + tile_idx * tz * ty) * feat_in +
+             block.thread_rank() * vec_size);
+
+  // Per-thread partial dot product over vec_size elements.
+  float sum = 0.f;
+#pragma unroll
+  for (size_t i = 0; i < vec_size; ++i) {
+#ifndef USE_ROCM
+    sum += float(w_vec[i]) * float(x_vec[i]) * scale;
+#else
+    sum += convert_type<W_T, float>(w_vec[i]) * convert_type<in_T, float>(x_vec[i]) * scale;
+#endif
+  }
+
+  // Reduce across the tx threads of this tile, then broadcast the total
+  // held by the tile's thread 0 back to all of them.
+  cg::thread_block_tile g = cg::tiled_partition<tx>(block);
+#pragma unroll
+  for (size_t offset = tx / 2; offset > 0; offset /= 2) {
+    sum += g.shfl_down(sum, offset);
+  }
+  sum = g.shfl(sum, 0);
+
+  // One thread per tile performs the (non-atomic) accumulation into Y.
+  if (threadIdx.x == 0) {
+#ifndef USE_ROCM
+    Y[batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) +
+      threadIdx.z * ty + threadIdx.y] += static_cast<out_T>(sum);
+#else
+    size_t y_idx = batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) +
+                   threadIdx.z * ty + threadIdx.y;
+    Y[y_idx] = vllm_add<out_T>(Y[y_idx], convert_type<float, out_T>(sum));
+#endif
+  }
+}
+
+// Host launcher: picks the kernel variant for the compile-time
+// (feat_in, feat_out) pair and launches it on the current CUDA stream.
+//
+//  * feat_in <= feat_out: bgmv_expand_kernel with tx = feat_in / vec_size
+//    and ty chosen so that tx * ty is 32, 16 or 8 (first combination that
+//    divides evenly), on a (feat_out / (ty * tz), batch_size) grid.
+//  * feat_in >  feat_out: bgmv_shrink_kernel on a (feat_out, batch_size)
+//    grid, with the vector width and block shape chosen from the
+//    divisibility of feat_in.
+//
+// Y is accumulated into by the kernels; X, W and indicies are read only.
+// Unsupported shapes are rejected at compile time by the static_asserts.
+template <int feat_in, int feat_out, typename in_T, typename out_T,
+          typename W_T>
+void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
+                 const W_T *__restrict__ W,
+                 const int64_t *__restrict__ indicies, int64_t y_offset,
+                 int64_t full_y_size, int64_t batch_size, int64_t num_layers,
+                 int64_t layer_idx, float scale) {
+  constexpr size_t vec_size = 8;
+  constexpr int tz = 4;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if constexpr (feat_in <= feat_out) {
+    static_assert(feat_in % vec_size == 0);
+    constexpr int tx = feat_in / vec_size;
+
+    static_assert((32 % tx == 0 && feat_out % (32 / tx * tz) == 0) ||
+                  (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) ||
+                  (8 % tx == 0 && feat_out % (8 / tx * tz) == 0));
+
+    if constexpr (32 % tx == 0 && feat_out % (32 / tx * tz) == 0) {
+      constexpr int ty = 32 / tx;
+      dim3 nblks(feat_out / (ty * tz), batch_size);
+      dim3 nthrs(tx, ty, tz);
+
+      bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz>
+          <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
+                                        full_y_size, num_layers, layer_idx,
+                                        scale);
+    // `if constexpr` (not a plain `if`): the condition only involves template
+    // parameters, and discarding the untaken branch keeps the unused kernel
+    // variant from being instantiated.
+    } else if constexpr (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) {
+      constexpr int ty = 16 / tx;
+      dim3 nblks(feat_out / (ty * tz), batch_size);
+      dim3 nthrs(tx, ty, tz);
+
+      bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz>
+          <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
+                                        full_y_size, num_layers, layer_idx,
+                                        scale);
+    } else {
+      constexpr int ty = 8 / tx;
+      dim3 nblks(feat_out / (ty * tz), batch_size);
+      dim3 nthrs(tx, ty, tz);
+
+      bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz>
+          <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
+                                        full_y_size, num_layers, layer_idx,
+                                        scale);
+    }
+  } else {
+#ifndef USE_ROCM
+    // The three alternatives are feat_in % 256, % 128 and % 64; the branches
+    // below test the same values written as vec_size * 32,
+    // (vec_size / 2) * 32 and (vec_size / 2) * 16.
+    static_assert(feat_in % (vec_size * 32) == 0 ||
+                  feat_in % (vec_size * 16) == 0 ||
+                  feat_in % (vec_size * 8) == 0);
+
+    if constexpr (feat_in % (vec_size * 32) == 0) {
+      constexpr int tx = 32;
+      constexpr int ty = 4;
+
+      dim3 nblks(feat_out, batch_size);
+      dim3 nthrs(tx, ty);
+
+      bgmv_shrink_kernel<feat_in, feat_out, vec_size, vec_size * sizeof(in_T),
+                         vec_size * sizeof(W_T), tx, ty, tz>
+          <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
+                                        full_y_size, num_layers, layer_idx,
+                                        scale);
+    } else if constexpr (feat_in % (vec_size / 2 * 32) == 0) {
+      constexpr int tx = 32;
+      constexpr int ty = 4;
+
+      dim3 nblks(feat_out, batch_size);
+      dim3 nthrs(tx, ty);
+
+      bgmv_shrink_kernel<feat_in, feat_out, vec_size / 2,
+                         vec_size * sizeof(in_T) / 2,
+                         vec_size * sizeof(W_T) / 2, tx, ty, tz>
+          <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
+                                        full_y_size, num_layers, layer_idx,
+                                        scale);
+    } else if constexpr (feat_in % (vec_size / 2 * 16) == 0) {
+      constexpr int tx = 16;
+      constexpr int ty = 4;
+
+      dim3 nblks(feat_out, batch_size);
+      dim3 nthrs(tx, ty);
+
+      bgmv_shrink_kernel<feat_in, feat_out, vec_size / 2,
+                         vec_size * sizeof(in_T) / 2,
+                         vec_size * sizeof(W_T) / 2, tx, ty, tz>
+          <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,
+                                        full_y_size, num_layers, layer_idx,
+                                        scale);
+    }
+#else
+    constexpr size_t rocm_warp_size = warpSize;
+
+    // Function-local helper macros (undefined again below). The first
+    // LAUNCH_BGMV_SHRINK_KERNELS_ROCM whose tiling test passes launches the
+    // kernel; the invocations are chained with `else`.
+#define CHECK_INPUT_TILEABLE_BY(vec_size_) \
+    feat_in % (rocm_warp_size * vec_size_) == 0
+
+#define LAUNCH_BGMV_SHRINK_KERNELS_ROCM(factor_, vec_size_, tx_, ty_)       \
+    if constexpr (CHECK_INPUT_TILEABLE_BY(factor_)) {                       \
+      constexpr size_t vec_size_shrink = vec_size_;                         \
+      constexpr int tx = tx_;                                               \
+      constexpr int ty = ty_;                                               \
+      dim3 nblks(feat_out, batch_size);                                     \
+      dim3 nthrs(tx, ty);                                                   \
+      bgmv_shrink_kernel<feat_in, feat_out, vec_size_shrink,                \
+                          vec_size_shrink * sizeof(in_T),                   \
+                          vec_size_shrink * sizeof(W_T),                    \
+                          tx, ty, tz>                                       \
+          <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset,        \
+                                        full_y_size, num_layers, layer_idx, \
+                                        scale);                             \
+    }
+
+    static_assert(CHECK_INPUT_TILEABLE_BY(32) ||
+                  CHECK_INPUT_TILEABLE_BY(16) ||
+                  CHECK_INPUT_TILEABLE_BY( 8) ||
+                  CHECK_INPUT_TILEABLE_BY( 4) ||
+                  CHECK_INPUT_TILEABLE_BY( 2) ||
+                  CHECK_INPUT_TILEABLE_BY( 1));
+
+    LAUNCH_BGMV_SHRINK_KERNELS_ROCM(32, vec_size, rocm_warp_size, 32/vec_size)
+    else
+    LAUNCH_BGMV_SHRINK_KERNELS_ROCM(16, vec_size, rocm_warp_size, 16/vec_size)
+    else
+    LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 8, vec_size, rocm_warp_size,  8/vec_size)
+    else
+    LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 4, vec_size, rocm_warp_size/(vec_size/4), vec_size/4)
+    else
+    LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 2, vec_size, rocm_warp_size/(vec_size/2), vec_size/2)
+    else
+    LAUNCH_BGMV_SHRINK_KERNELS_ROCM( 1, vec_size, rocm_warp_size/(vec_size/1), vec_size/1)
+
+#undef CHECK_INPUT_TILEABLE_BY
+#undef LAUNCH_BGMV_SHRINK_KERNELS_ROCM
+#endif
+  }
+}
+
+// Explicit-instantiation helpers for bgmv_kernel.
+//
+// INST_BGMV emits one explicit instantiation for the given
+// (feat_in, feat_out) pair; the remaining template arguments are deduced
+// from the pointer types in the parameter list.
+#define INST_BGMV(feat_in, feat_out, in_T, out_T, W_T)                         \
+  template void bgmv_kernel<feat_in, feat_out>(                                \
+      out_T * __restrict__ Y, const in_T *__restrict__ X,                      \
+      const W_T *__restrict__ W, const int64_t *__restrict__ indicies,         \
+      int64_t y_offset, int64_t full_y_size, int64_t batch_size,               \
+      int64_t num_layers, int64_t layer_idx, float scale);
+
+// Adapter with the types-first argument order used by the FOR_* list macros;
+// instantiates the single direction feat_in -> feat_out.
+#define INST_BGMV_ONESIDE(in_T, out_T, W_T, feat_in, feat_out)                 \
+  INST_BGMV(feat_in, feat_out, in_T, out_T, W_T)
+
+// Instantiates both directions for a (narrow, wide) pair:
+// narrow -> wide and wide -> narrow.
+#define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide)                      \
+  INST_BGMV(narrow, wide, in_T, out_T, W_T)                                    \
+  INST_BGMV(wide, narrow, in_T, out_T, W_T)
--- a/csrc/punica/bgmv/generator.py
+++ b/csrc/punica/bgmv/generator.py
@ -0,0 +1,48 @@
+# Writes one bgmv_<input>_<output>_<weight>.cu file into the current
+# directory for every supported dtype combination.
+
+# Short dtype tags; they appear in the generated file names.
+DTYPES = ["fp16", "bf16", "fp32"]
+# C++ type spelled into the generated source for each tag.
+DTYPE_MAP = {"fp16": "nv_half", "bf16": "nv_bfloat16", "fp32": "float"}
+
+TEMPLATE = """
+#include "bgmv_config.h"
+#include "bgmv_impl.cuh"
+
+FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype})
+FOR_INST_BGMV_WIDE_NARROW(INST_BGMV_ONESIDE, {input_dtype}, {output_dtype}, {weight_dtype})
+""".lstrip()  # noqa: E501
+
+
+def _is_generated(input_dtype, output_dtype, weight_dtype):
+    """Return True when a kernel file is emitted for this dtype triple.
+
+    NOTE(woosuk): Punica itself supports mixed input/output/weight dtypes;
+    only the combinations below are generated to reduce the binary size.
+    """
+    if weight_dtype == "fp32":
+        # FP32 weights are not supported.
+        return False
+    if output_dtype == "fp32":
+        # LoRA A matrix: input and weight dtypes must match.
+        return input_dtype == weight_dtype
+    if input_dtype == "fp32":
+        # LoRA B matrix: output and weight dtypes must match.
+        return output_dtype == weight_dtype
+    # Otherwise all three dtypes must be the same.
+    return input_dtype == output_dtype == weight_dtype
+
+
+for input_dtype in DTYPES:
+    for output_dtype in DTYPES:
+        for weight_dtype in DTYPES:
+            if not _is_generated(input_dtype, output_dtype, weight_dtype):
+                continue
+
+            kernel_definition = TEMPLATE.format(
+                input_dtype=DTYPE_MAP[input_dtype],
+                output_dtype=DTYPE_MAP[output_dtype],
+                weight_dtype=DTYPE_MAP[weight_dtype])
+            filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu"
+            with open(filename, "w") as f:
+                f.write(kernel_definition)
--- a/csrc/punica/bgmv/vec_dtypes.cuh
+++ b/csrc/punica/bgmv/vec_dtypes.cuh
--- a/csrc/punica/punica_ops.cu
+++ b/csrc/punica/punica_ops.cu
@ -0,0 +1,569 @@
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cstdint>
+
+#include "type_convert.h"
+#include "../cuda_compat.h"
+#include "bgmv/bgmv_config.h"
+
+
+//====== utils ======
+
+// Verifies that tensors `a` and `b` have the same rank and the same extent in
+// every dimension. Raises through TORCH_CHECK on the first mismatch.
+// `a_name` / `b_name` are only used to label the tensors in the error text.
+inline void check_shape(const torch::Tensor &a, const torch::Tensor &b,
+                        const char *a_name, const char *b_name) {
+  TORCH_CHECK(a.dim() == b.dim(), a_name, ".dim() != ", b_name, ".dim(). ",
+              a.dim(), " vs ", b.dim());
+  for (int64_t i = 0; i < a.dim(); ++i) {
+    // Report the offending extents too, in the same "<a> vs <b>" form as the
+    // rank check above.
+    TORCH_CHECK(a.size(i) == b.size(i), a_name, ".size(", i, ") != ", b_name,
+                ".size(", i, "). ", a.size(i), " vs ", b.size(i));
+  }
+}
+
+// Combines two 32-bit values into one 64-bit key: `a` fills the upper 32 bits
+// and `b` the lower 32 bits. Usable in constant expressions (e.g. case labels).
+inline constexpr uint64_t pack_u32(uint32_t a, uint32_t b) {
+  return (static_cast<uint64_t>(a) << 32) | static_cast<uint64_t>(b);
+}
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+
+#define CHECK_DIM(d, x)                                                        \
+  TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")
+
+#define CHECK_SHAPE(a, b) check_shape(a, b, #a, #b)
+
+#define CHECK_EQ(a, b)                                                         \
+  TORCH_CHECK(a == b, "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)
+
+//====== bgmv ======
+
+// Launches the BGMV kernel specialised for the (in_features, out_features)
+// pair.
+//
+// Returns true when a kernel was launched and false when either the dtype
+// combination or the feature-size pair is not among the compiled variants;
+// the callers turn false into a "No suitable kernel" error.
+//
+// NOTE(woosuk): While Punica supports various combinations of input/output
+// data types, we limit the supported data types to reduce the binary size.
+// Accepted combinations:
+//   * float input:  output dtype == weight dtype
+//   * float output: input dtype == weight dtype
+//   * otherwise:    input, output and weight dtypes are all the same
+template <typename in_T, typename out_T, typename W_T>
+inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
+                               const int64_t *lora_indices,
+                               uint32_t in_features, uint32_t out_features,
+                               int64_t y_offset, int64_t full_y_size,
+                               int64_t batch_size, int64_t num_layers,
+                               int64_t layer_idx, float scale) {
+  constexpr bool is_input_float = std::is_same<in_T, float>::value;
+  constexpr bool is_output_float = std::is_same<out_T, float>::value;
+  constexpr bool in_matches_weight = std::is_same<in_T, W_T>::value;
+  constexpr bool out_matches_weight = std::is_same<out_T, W_T>::value;
+  constexpr bool is_supported =
+      is_input_float
+          ? out_matches_weight
+          : (is_output_float ? in_matches_weight
+                             : (in_matches_weight && out_matches_weight));
+
+  // `if constexpr` discards the kernel calls for rejected dtype combinations
+  // at compile time, instead of relying on the optimizer to drop them.
+  // NOTE(review): presumably bgmv_kernel is only instantiated for the accepted
+  // combinations (see the kernel generator script) - confirm.
+  if constexpr (!is_supported) {
+    return false;
+  } else {
+    // The feature sizes are packed into one 64-bit key so that each compiled
+    // (in, out) pair gets its own case label.
+    switch (pack_u32(in_features, out_features)) {
+#define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out)                   \
+  case pack_u32(feat_in, feat_out):                                            \
+    bgmv_kernel<feat_in, feat_out>(Y, X, W, lora_indices, y_offset,            \
+                                   full_y_size, batch_size, num_layers,        \
+                                   layer_idx, scale);                          \
+    break;
+#define CASE(_in_T, _out_T, _W_T, narrow, wide)                                \
+  CASE_ONESIDE(in_T, out_T, W_T, narrow, wide)                                 \
+  CASE_ONESIDE(in_T, out_T, W_T, wide, narrow)
+
+      FOR_BGMV_WIDE_NARROW(CASE, _, _, _)
+      FOR_INST_BGMV_WIDE_NARROW(CASE_ONESIDE, _, _, _)
+#undef CASE
+#undef CASE_ONESIDE
+    default:
+      return false;
+    }
+    return true;
+  }
+}
+
+namespace {
+
+// Largest in/out feature size for which kernel dispatch is attempted; larger
+// sizes fall straight through to the "No suitable kernel" error.
+constexpr int64_t kMaxBgmvFeatureSize = 128512;
+
+// Everything the dtype dispatch needs in order to call launch_bgmv_kernel.
+// The tensors are held by reference, so an instance must not outlive the call
+// that created it.
+struct BgmvDispatchArgs {
+  const torch::Tensor &y;
+  const torch::Tensor &x;
+  const torch::Tensor &w;
+  const torch::Tensor &indicies;
+  int64_t h_in;
+  int64_t h_out;
+  int64_t y_offset;
+  int64_t full_y_size;
+  int64_t batch_size;
+  int64_t num_layers;
+  int64_t layer_idx;
+  double scale;
+};
+
+// Calls launch_bgmv_kernel with all three element types fixed. The data
+// pointers are only fetched here, i.e. after every dtype switch has matched.
+template <typename in_T, typename out_T, typename W_T>
+bool launch_bgmv_typed(const BgmvDispatchArgs &args) {
+  return launch_bgmv_kernel(static_cast<out_T *>(args.y.data_ptr()),
+                            static_cast<in_T *>(args.x.data_ptr()),
+                            static_cast<W_T *>(args.w.data_ptr()),
+                            args.indicies.data_ptr<int64_t>(), args.h_in,
+                            args.h_out, args.y_offset, args.full_y_size,
+                            args.batch_size, args.num_layers, args.layer_idx,
+                            args.scale);
+}
+
+// Selects the weight element type from w's dtype (Half or BFloat16).
+// Returns false if w has any other dtype or no kernel was launched.
+template <typename in_T, typename out_T>
+bool dispatch_bgmv_by_weight(const BgmvDispatchArgs &args) {
+  switch (args.w.scalar_type()) {
+  case at::ScalarType::Half:
+    return launch_bgmv_typed<in_T, out_T, nv_half>(args);
+  case at::ScalarType::BFloat16:
+    return launch_bgmv_typed<in_T, out_T, nv_bfloat16>(args);
+  default:
+    return false;
+  }
+}
+
+// Selects the output element type from y's dtype (Half, BFloat16 or Float).
+template <typename in_T>
+bool dispatch_bgmv_by_output(const BgmvDispatchArgs &args) {
+  switch (args.y.scalar_type()) {
+  case at::ScalarType::Half:
+    return dispatch_bgmv_by_weight<in_T, nv_half>(args);
+  case at::ScalarType::BFloat16:
+    return dispatch_bgmv_by_weight<in_T, nv_bfloat16>(args);
+  case at::ScalarType::Float:
+    return dispatch_bgmv_by_weight<in_T, float>(args);
+  default:
+    return false;
+  }
+}
+
+// Selects the input element type from x's dtype (Half, BFloat16 or Float).
+bool dispatch_bgmv_by_input(const BgmvDispatchArgs &args) {
+  switch (args.x.scalar_type()) {
+  case at::ScalarType::Half:
+    return dispatch_bgmv_by_output<nv_half>(args);
+  case at::ScalarType::BFloat16:
+    return dispatch_bgmv_by_output<nv_bfloat16>(args);
+  case at::ScalarType::Float:
+    return dispatch_bgmv_by_output<float>(args);
+  default:
+    return false;
+  }
+}
+
+// Placement and rank checks shared by both public entry points.
+void check_bgmv_tensors(const torch::Tensor &y, const torch::Tensor &x,
+                        const torch::Tensor &w,
+                        const torch::Tensor &indicies) {
+  CHECK_INPUT(y);
+  CHECK_INPUT(x);
+  CHECK_INPUT(w);
+  CHECK_INPUT(indicies);
+
+  CHECK_DIM(2, y);
+  CHECK_DIM(2, x);
+  CHECK_DIM(4, w);
+  CHECK_DIM(1, indicies);
+}
+
+// Size checks, device guard, dtype dispatch and final error reporting shared
+// by both public entry points. Expects check_bgmv_tensors() to have passed.
+void run_bgmv(const torch::Tensor &y, const torch::Tensor &x,
+              const torch::Tensor &w, const torch::Tensor &indicies,
+              int64_t layer_idx, double scale, int64_t h_in, int64_t h_out,
+              int64_t y_offset) {
+  const int64_t B = x.size(0);
+  const int64_t num_layers = w.size(1);
+  const int64_t full_y_size = y.size(1);
+  CHECK_EQ(w.size(3), h_in);
+  CHECK_EQ(w.size(2), h_out);
+  CHECK_EQ(indicies.size(0), x.size(0));
+  CHECK_EQ(y.size(0), x.size(0));
+  // NOTE(review): layer_idx, and y_offset + h_out against y.size(1), are not
+  // range-checked here - confirm the kernels tolerate every value callers pass.
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
+
+  bool ok = false;
+  if (h_in <= kMaxBgmvFeatureSize && h_out <= kMaxBgmvFeatureSize) {
+    const BgmvDispatchArgs args{y,          x,           w,
+                                indicies,   h_in,        h_out,
+                                y_offset,   full_y_size, B,
+                                num_layers, layer_idx,   scale};
+    ok = dispatch_bgmv_by_input(args);
+  }
+  // The weight dtype is reported as well: it takes part in kernel selection.
+  TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out,
+              " dtype=", x.scalar_type(), " out_dtype=", y.scalar_type(),
+              " w_dtype=", w.scalar_type());
+}
+
+} // namespace
+
+// Runs BGMV over the full width of `y`.
+//
+// Requirements (checked, failing through TORCH_CHECK): all tensors are
+// contiguous CUDA tensors; y and x are 2-D, w is 4-D, indicies is 1-D;
+// x.size(0) == y.size(0) == indicies.size(0); w.size(3) == x.size(1) and
+// w.size(2) == y.size(1). indicies is read as int64. x and y may be Half,
+// BFloat16 or Float, w may be Half or BFloat16; a combination or feature-size
+// pair without a compiled kernel fails with "No suitable kernel.".
+void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
+                   torch::Tensor indicies, int64_t layer_idx, double scale) {
+  check_bgmv_tensors(y, x, w, indicies);
+  run_bgmv(y, x, w, indicies, layer_idx, scale, /*h_in=*/x.size(1),
+           /*h_out=*/y.size(1), /*y_offset=*/0);
+}
+
+// Same as dispatch_bgmv, except that the caller supplies the feature sizes
+// and a column offset into `y`: h_in / h_out must equal w.size(3) / w.size(2),
+// and y.size(1) is forwarded to the kernel as the full row size of y.
+void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
+                             torch::Tensor indicies, int64_t layer_idx,
+                             double scale, int64_t h_in, int64_t h_out,
+                             int64_t y_offset) {
+  check_bgmv_tensors(y, x, w, indicies);
+  run_bgmv(y, x, w, indicies, layer_idx, scale, h_in, h_out, y_offset);
+}
--- a/csrc/punica/punica_ops.h
+++ b/csrc/punica/punica_ops.h
@ -0,0 +1,11 @@
+#pragma once
+
+#include <torch/all.h>
+
+// Dispatches a BGMV kernel over the full width of `y`.
+// As enforced by the implementation in punica_ops.cu: all four tensors must be
+// contiguous CUDA tensors; y and x are 2-D, w is 4-D, indicies is 1-D (read as
+// int64); x.size(0) == y.size(0) == indicies.size(0); w.size(3) == x.size(1)
+// and w.size(2) == y.size(1). Raises if no compiled kernel matches the dtypes
+// and feature sizes.
+void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
+                   torch::Tensor indicies, int64_t layer_idx, double scale);
+
+// Variant of the BGMV dispatch in which the caller supplies the feature sizes
+// and a column offset into `y`.
+// As enforced by the implementation in punica_ops.cu: all four tensors must be
+// contiguous CUDA tensors; y and x are 2-D, w is 4-D, indicies is 1-D (read as
+// int64); x.size(0) == y.size(0) == indicies.size(0); h_in == w.size(3) and
+// h_out == w.size(2). y.size(1) is passed to the kernel as the full row size
+// of y, together with y_offset. Raises if no compiled kernel matches.
+void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
+                             torch::Tensor indicies, int64_t layer_idx,
+                             double scale, int64_t h_in, int64_t h_out,
+                             int64_t y_offset);
--- a/csrc/punica/torch_bindings.cpp
+++ b/csrc/punica/torch_bindings.cpp
@ -0,0 +1,18 @@
+#include "registration.h"
+#include "punica_ops.h"
+
+// Registers the Punica BGMV ops with the torch library named by
+// TORCH_EXTENSION_NAME. Each op is first declared by schema with m.def() and
+// then bound to its implementation from punica_ops.h for torch::kCUDA.
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
+  // `Tensor! y` marks y as mutated by the op; the op itself returns nothing.
+  m.def(
+      "dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int "
+      "layer_idx, float scale) -> ()");
+  m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv);
+
+  // Same op with explicit feature sizes and a column offset into y. The
+  // adjacent string literals are concatenated into a single schema string.
+  m.def(
+      "dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w,"
+      "Tensor indicies, int layer_idx,"
+      "float scale, int h_in, int h_out,"
+      "int y_offset) -> ()");
+  m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level);
+}
+
+// NOTE(review): defined in registration.h (not visible here); presumably
+// emits the Python module entry point for this extension - confirm.
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/punica/type_convert.h
+++ b/csrc/punica/type_convert.h
@ -0,0 +1,82 @@
+#ifndef CSRC__PUNICA__TYPE_CONVERT_H__
+#define CSRC__PUNICA__TYPE_CONVERT_H__
+
+#ifndef USE_ROCM
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#else
+
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+
+// ROCm-only section: qualifier applied to every helper below so that each is
+// callable from host and device code. Undefined again at the end of the header.
+#define __TYPE_CONVERT__HOST_DEVICE__ __host__ __device__
+
+// Map the CUDA type names used by the Punica sources onto their HIP types.
+typedef __half nv_half;
+typedef __hip_bfloat16 nv_bfloat16;
+typedef __hip_bfloat162 nv_bfloat162;
+
+// Builds a __hip_bfloat162 whose two components are both `val`.
+__TYPE_CONVERT__HOST_DEVICE__
+inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 val) {
+  return __hip_bfloat162{val, val};
+}
+
+// Builds a __hip_bfloat162 from two components, `vall` first and `valr` second.
+__TYPE_CONVERT__HOST_DEVICE__
+inline __hip_bfloat162 make_bfloat162(__hip_bfloat16 vall, __hip_bfloat16 valr) {
+  return __hip_bfloat162{vall, valr};
+}
+
+// Converts `val` from T_src to T_dst. The generic version is a plain
+// static_cast; the specializations below route half / bfloat16 <-> float
+// conversions through the corresponding intrinsics.
+template <typename T_src, typename T_dst>
+__TYPE_CONVERT__HOST_DEVICE__
+inline T_dst convert_type(T_src val) {
+  return static_cast<T_dst>(val);
+}
+
+// __half -> float
+template <>
+__TYPE_CONVERT__HOST_DEVICE__
+inline float convert_type<__half, float>(__half val) {
+  return __half2float(val);
+}
+
+// float -> __half
+template <>
+__TYPE_CONVERT__HOST_DEVICE__
+inline __half convert_type<float, __half>(float val) {
+  return __float2half(val);
+}
+
+// __hip_bfloat16 -> float
+template <>
+__TYPE_CONVERT__HOST_DEVICE__
+inline float convert_type<__hip_bfloat16, float>(__hip_bfloat16 val) {
+  return __bfloat162float(val);
+}
+
+// float -> __hip_bfloat16
+template <>
+__TYPE_CONVERT__HOST_DEVICE__
+inline __hip_bfloat16 convert_type<float, __hip_bfloat16>(float val) {
+  return __float2bfloat16(val);
+}
+
+// Adds two values of the same type. The generic version uses operator+; the
+// __half and __hip_bfloat16 specializations call the __hadd intrinsic instead.
+template <typename T>
+__TYPE_CONVERT__HOST_DEVICE__
+inline T vllm_add(T a, T b) {
+  return a + b;
+}
+
+// __half addition via __hadd.
+template <>
+__TYPE_CONVERT__HOST_DEVICE__
+inline __half vllm_add<__half>(__half a, __half b) {
+  return __hadd(a, b);
+}
+
+// __hip_bfloat16 addition via __hadd.
+template <>
+__TYPE_CONVERT__HOST_DEVICE__
+inline __hip_bfloat16 vllm_add<__hip_bfloat16>(__hip_bfloat16 a, __hip_bfloat16 b) {
+  return __hadd(a, b);
+}
+
+#undef __TYPE_CONVERT__HOST_DEVICE__
+
+#endif // USE_ROCM
+
+#endif // CSRC__PUNICA__TYPE_CONVERT_H__
--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@ -273,6 +273,8 @@ __global__ void Code2x8Dequant(
  }
  __syncthreads();

+  float res = 0;
+
  int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
  while (iters--) {
    if (pred && a_gl_rd < a_gl_end) {
@ -496,14 +498,14 @@ torch::Tensor code2x8_matmat(const torch::Tensor& input,
 }

 // Accumulate the partition sizes.
-int4 accumulate_sizes(const std::vector<int64_t>& codebook_partition_sizes) {
+int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) {
  int4 cumulative_sizes;
  auto cumulative_size = &cumulative_sizes.x;
-  size_t i = 0;
+  int i = 0;
  int last = 0;
-  assert(codebook_partition_sizes.size() <= 4);
-  for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) {
-    *cumulative_size = codebook_partition_sizes[i] + last;
+  assert(codebook_partition_sizes.size(0) <= 4);
+  for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) {
+    *cumulative_size = codebook_partition_sizes[i].item<int>() + last;
    last = *cumulative_size;
  }
  // fill in the rest with unreachable.
@ -519,12 +521,12 @@ int4 accumulate_sizes(const std::vector<int64_t>& codebook_partition_sizes) {
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,
                        const torch::Tensor& scales,
-                        const std::vector<int64_t>& codebook_partition_sizes,
+                        const torch::Tensor& codebook_partition_sizes,
                        const std::optional<torch::Tensor>& bias) {
  int4 cumulative_sizes =
      vllm::aqlm::accumulate_sizes(codebook_partition_sizes);

-  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size();
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
  int const entries = codebooks.size(1);

  if (nbooks == 1 && entries == (1 << 16)) {
@ -541,13 +543,13 @@ torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
  return {};
 }

-torch::Tensor aqlm_dequant(
-    const torch::Tensor& codes, const torch::Tensor& codebooks,
-    const std::vector<int64_t>& codebook_partition_sizes) {
+torch::Tensor aqlm_dequant(const torch::Tensor& codes,
+                           const torch::Tensor& codebooks,
+                           const torch::Tensor& codebook_partition_sizes) {
  int4 cumulative_sizes =
      vllm::aqlm::accumulate_sizes(codebook_partition_sizes);

-  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size();
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
  int const entries = codebooks.size(1);

  const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
@ -557,8 +559,7 @@ torch::Tensor aqlm_dequant(
  auto in_features = codes.size(1) * 8;
  auto out_features = codes.size(0);

-  assert(out_features == std::accumulate(codebook_partition_sizes.begin(),
-                                         codebook_partition_sizes.end(), 0));
+  // Compare, do not assign: the partition sizes must add up to out_features.
+  assert(out_features == codebook_partition_sizes.sum().item<int>());

  auto weights = torch::empty({out_features, in_features},
                              torch::TensorOptions()
--- a/csrc/quantization/awq/dequantize.cuh
+++ b/csrc/quantization/awq/dequantize.cuh
@ -95,7 +95,6 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) {

  return result;
 #endif
-  __builtin_unreachable();  // Suppress missing return statement warning
 }

 }  // namespace awq
--- a/csrc/quantization/awq/gemm_kernels.cu
+++ b/csrc/quantization/awq/gemm_kernels.cu
@ -17,6 +17,14 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
 namespace vllm {
 namespace awq {

+// Pack two half values into one unsigned: x in bits 0-15, y in bits 16-31.
+static inline __device__ __host__ unsigned __pack_half2(const half x,
+                                                        const half y) {
+  unsigned v0 = *((unsigned short*)&x);
+  unsigned v1 = *((unsigned short*)&y);
+  return (v1 << 16) | v0;
+}
+
 template <int N>
 __global__ void __launch_bounds__(64)
    gemm_forward_4bit_cuda_m16nXk32(int G, int split_k_iters,
@ -34,7 +42,11 @@ __global__ void __launch_bounds__(64)
  __shared__ half A_shared[16 * (32 + 8)];
  __shared__ half B_shared[32 * (N + 8)];

+  __shared__ half scaling_factors_shared[N];
+  __shared__ half zeros_shared[N];
+
  int j_factors1 = ((OC + N - 1) / N);
+  int blockIdx_x = 0;
  int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1);
  int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1);

@ -48,6 +60,7 @@ __global__ void __launch_bounds__(64)

  static constexpr int row_stride_warp = 32 * 8 / 32;
  static constexpr int row_stride = 2 * 32 * 8 / N;
+  bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < N;
  // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16
  bool ld_A_flag =
      (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp +
@ -132,7 +145,11 @@ __global__ void __launch_bounds__(64)
      uint32_t B_loaded =
          *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8));
      uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+      // uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N /
+      // 8)) * 8);

+      // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x
+      // % (cta_N / 8)) * 8);
      // - zero and * scale
      // TODO (Haotian): can save 4 assembly instructions if formulated as deq =
      // q * scale - zero * scale.
@ -350,11 +367,17 @@ __global__ void __launch_bounds__(64)
 __global__ void __launch_bounds__(64)
    dequantize_weights(int* __restrict__ B, half* __restrict__ scaling_factors,
                       int* __restrict__ zeros, half* __restrict__ C, int G) {
+  int j_factors1 = 4;
+  int row_stride2 = 4;
+  int split_k_iters = 1;
  static constexpr uint32_t ZERO = 0x0;
  half B_shared[32 * (128 + 8)];

  half* B_shared_ptr2 = B_shared;

+  half B_shared_warp[32];
+  int OC = 512;
+
  int N = blockDim.x * gridDim.x;  // 2
  int col = (blockIdx.x * blockDim.x + threadIdx.x);
  int row = blockIdx.y * blockDim.y + threadIdx.y;
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -3,14 +3,7 @@
 #include <cmath>

 #include "../../dispatch_utils.h"
-
-#ifndef USE_ROCM
-  #include <cub/util_type.cuh>
-  #include <cub/cub.cuh>
-#else
-  #include <hipcub/util_type.hpp>
-  #include <hipcub/hipcub.hpp>
-#endif
+#include "../../reduction_utils.cuh"

 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@ -62,10 +55,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
    absmax_val = val > absmax_val ? val : absmax_val;
  }

-  using BlockReduce = cub::BlockReduce<float, 1024>;
-  __shared__ typename BlockReduce::TempStorage reduceStorage;
-  float const block_absmax_val_maybe =
-      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
+  float const block_absmax_val_maybe = blockReduceMax(absmax_val);
  __shared__ float block_absmax_val;
  if (tid == 0) {
    block_absmax_val = block_absmax_val_maybe;
--- a/csrc/quantization/cutlass_w8a8/Epilogues.md
+++ b/csrc/quantization/cutlass_w8a8/Epilogues.md
@ -1,147 +0,0 @@
-# CUTLASS Epilogues
-
-## Introduction
-This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. 
-
-Currently, we only support symmetric quantization for weights,
-and symmetric and asymmetric quantization for activations.
-Both can be quantized per-tensor or per-channel (weights) / per-token (activations).
-
-There are 4 epilogues:
-1. ScaledEpilogue: symmetric quantization for activations, no bias.
-1. ScaledEpilogueBias: symmetric quantization for activations, supports bias.
-1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias.
-1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias.
-
-We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size.
-Instead, if no bias is passed, the epilogue will use 0 as the bias.
-That induces a redundant addition operation (and runtime check), but the performance impact is minor.
-
-## Underlying Linear Algebra
-
-More details available in the [Activation Quantization RFC](https://github.com/vllm-project/vllm/issues/3975).
-
-If $` \widehat X `$ is the quantized $` X `$, our matrices become the following
-
-```math
-A = s_a (\widehat A - J_a z_a)
-```
-```math
-B = s_b \widehat B
-```
-```math
-D = A B + C
-```
-```math
-D = s_a s_b \widehat D + C
-```
-
-Here, D is the output of the GEMM, and C is the bias.
-A is the activations and supports asymmetric quantization,
-and B is the weights and only supports symmetric quantization.
-$` s_a `$ and $` s_b `$ are the scales for activations and weights, respectively.
-$` z_a `$ is the zero-point for activations, and $` J_a `$ is the matrix of all ones with dimensions of A.
-Additional epilogues would be required to support asymmetric quantization for weights.
-
-Expanding further, we can calculate $` \widehat D `$ as follows:
-
-```math
-A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B
-```
-```math
-A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right)
-```
-```math
-\widehat D = \widehat A \widehat B - z_a J_a \widehat B
-```
-
-Note that $` \widehat A \widehat B `$ is the raw output of the GEMM,
-and $` J_a \widehat B `$ is known ahead of time.
-Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of column sums of $` \widehat B `$.
-
-## Epilogues
-
-### ScaledEpilogue
-This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$.
-The output of the GEMM is:
-
-```math
-\widehat D = \widehat A \widehat B
-```
-```math
-D = s_a s_b \widehat D
-```
-```math
-D = s_a s_b \widehat A \widehat B
-```
-
-Epilogue parameters:
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
-
-### ScaledEpilogueBias
-This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$.
-The output of the GEMM is:
-
-```math
-\widehat D = \widehat A \widehat B
-```
-```math
-D = s_a s_b \widehat D + C 
-```
-```math
-D = s_a s_b \widehat A \widehat B + C
-```
-
-
-Epilogue parameters:
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
- `bias` is the bias, is always per-channel (row-vector).
-
-### ScaledEpilogueAzp
-This epilogue computes the asymmetric per-tensor quantization for activations with bias.
-The output of the GEMM is:
-
-```math
-\widehat D = \widehat A \widehat B - z_a J_a \widehat B
-```
-```math
-D = s_a s_b \widehat D + C 
-```
-```math
-D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C
-```
-
-Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 \widehat B `$.
-That is precomputed and stored in `azp_with_adj` as a row-vector.
-
-Epilogue parameters:
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
-  - Generally this will be per-tensor as the zero-points are per-tensor.
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
- `bias` is the bias, is always per-channel (row-vector).
-
-To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel.
-
-### ScaledEpilogueAzpPerToken
-This epilogue computes the asymmetric per-token quantization for activations with bias.
-
-The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector.
-That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$.
-
-Epilogue parameters:
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
-  - Generally this will be per-token as the zero-points are per-token.
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
- `azp` is the zero-point (`z_a`), is per-token (column-vector).
- `bias` is the bias, is always per-channel (row-vector).
-
-To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel.
-
-The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM):
-```
-out = scale_a * scale_b * (Dq - azp_adj * azp) + bias
-```
--- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
+++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
@ -207,156 +207,6 @@ struct VisitorRowOrScalarBroadcast {

 };

-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// This is a modified RowBroadcast that will broadcast 0 if ptr_row is null
-template<
-  class ThreadMap,
-  class Element,
-  class StrideMNL
->
-struct VisitorRowOrZeroBroadcast {
-
-  // This struct has been modified to remove null_default (because it's always 0)
-  struct Arguments {
-    Element const* ptr_row = nullptr;
-    StrideMNL dRow = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  struct SharedStorage {};
-
-  // Global load type
-  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
-  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
-  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
-
-  CUTLASS_HOST_DEVICE
-  VisitorRowOrZeroBroadcast() { }
-
-  CUTLASS_HOST_DEVICE
-  VisitorRowOrZeroBroadcast(Params const& params, SharedStorage const& shared_storage)
-    : params_ptr(&params) { }
-
-  Params const* params_ptr;
-
-  template <class GTensor, class RTensor, class CTensor, class ProblemShape>
-  struct Callbacks : EmptyCallbacks {
-    CUTLASS_DEVICE
-    Callbacks(
-      GTensor&& tC_gRow,
-      RTensor&& tC_rRow,
-      CTensor&& tC_cRow,
-      ProblemShape problem_shape,
-      Params const* params_ptr
-    ):
-      tC_gRow(cute::forward<GTensor>(tC_gRow)),
-      tC_rRow(cute::forward<RTensor>(tC_rRow)),
-      tC_cRow(cute::forward<CTensor>(tC_cRow)),
-      n(get<1>(problem_shape)),
-      params_ptr(params_ptr) { }
-
-    GTensor tC_gRow;
-    RTensor tC_rRow;
-    CTensor tC_cRow;
-    Params const* params_ptr;
-    int n;
-
-    // This function is modified from VisitorRowBroadcast
-    CUTLASS_DEVICE void
-    begin_epilogue() {
-      clear(tC_rRow);
-      auto src_v = filter(tC_gRow);
-      auto coord_v = filter(tC_cRow);
-      auto dst_v = filter(tC_rRow);
-
-      if (params_ptr->ptr_row != nullptr) {
-        // In this case we are loading from a row vector and broadcasting
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(src_v); ++i) {
-          bool guard = get<1>(coord_v(i)) < n;
-          cutlass::arch::global_load<VecType, sizeof(VecType)>(
-              dst_v(i), (void const*)&src_v(i), guard);
-        }
-      } else {
-        // In this case we are broadcasting 0
-        VecType filled_vec;
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < VecLength; i++) {
-          reinterpret_cast<Element*>(&filled_vec)[i] = Element{0};
-        }
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(src_v); ++i) {
-          if (get<1>(coord_v(i)) < n) {
-            dst_v(i) = filled_vec;
-          }
-        }
-      }
-    }
-
-    template <class ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE auto // returns an Array
-    visit(int iter_idx, int row_idx, int column_idx, int frg_idx,
-          Array<ElementAccumulator, FragmentSize> const& frg_acc) {
-      Tensor rRow_frg = recast<Array<Element, FragmentSize>>(coalesce(tC_rRow));
-      return rRow_frg(column_idx);
-    }
-  };
-
-  template <class ProblemShape>
-  CUTLASS_DEVICE auto
-  get_callbacks(
-    gemm::GemmCoord threadblock_tile_offset,
-    int thread_idx,
-    ProblemShape problem_shape
-  ) {
-    Tensor mRow = make_tensor(
-      make_gmem_ptr(params_ptr->ptr_row),
-      problem_shape,
-      params_ptr->dRow);
-
-    // VECTOR, FRAGMENT_COLUMN
-    Tensor tC_gRow = recast<VecType>(
-      ThreadMap::partition(mRow, thread_idx, threadblock_tile_offset)
-    )(_,_,_0{},_0{},_0{},_0{});
-    Tensor tC_rRow = make_tensor_like(tC_gRow);
-
-    // Generate the pred tensor
-    Tensor cRow = make_identity_tensor(mRow.shape());
-    Tensor tC_cRow = outer_partition(
-      ThreadMap::partition(cRow, thread_idx, threadblock_tile_offset)(_,_,_0{},_0{},_0{},_0{}),
-      Shape<Int<VecLength>>{},
-      (_0{})
-    );
-
-    return Callbacks<
-      decltype(tC_gRow), decltype(tC_rRow),
-      decltype(tC_cRow), ProblemShape>(
-      cute::move(tC_gRow),
-      cute::move(tC_rRow),
-      cute::move(tC_cRow),
-      problem_shape,
-      params_ptr
-    );
-  }
-
-};
-
-
 /////////////////////////////////////////////////////////////////////////////////////////////////

 // Column vector broadcast
@ -367,7 +217,7 @@ template<
 >
 struct VisitorColOrScalarBroadcast {

-  // This struct has been modified to have a bool indicating that ptr_col is a
+  // This struct has been modified to have a bool indicating that ptr_col is a 
  // scalar that must be broadcast.
  struct Arguments {
    Element const* ptr_col = nullptr;
--- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
+++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
@ -64,6 +64,8 @@ using namespace detail;

 // Row vector broadcast
 template<
+  // Row bcast reuses the mbarriers from the epilogue subtile load pipeline, so this must be at least
+  // ceil_div(StagesC, epi tiles per CTA tile) + 1 to ensure no data races
  int Stages,
  class CtaTileShapeMNK,
  class Element,
@ -71,12 +73,14 @@ template<
  int Alignment = 128 / sizeof_bits_v<Element>
 >
 struct Sm90RowOrScalarBroadcast {
-  static_assert(Stages == 0, "Row broadcast doesn't support smem usage");
-  static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
-  static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{});
+  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
+  static_assert(
+    (cute::is_same_v<StrideMNL, Stride<_0,_1, _0>>) || // row vector broadcast, e.g. per-col alpha/bias
+    (cute::is_same_v<StrideMNL, Stride<_0,_1,int>>));  // batched row vector broadcast

-  struct SharedStorage { 
-    array_aligned<Element, size<1>(CtaTileShapeMNK{})> smem;
+  // Accumulator doesn't distribute row elements evenly amongst threads so we must buffer in smem
+  struct SharedStorage {
+    alignas(16) array_aligned<Element, size<1>(CtaTileShapeMNK{}) * Stages> smem_row;
  };

  // This struct has been modified to have a bool indicating that ptr_row is a 
@ -96,12 +100,6 @@ struct Sm90RowOrScalarBroadcast {
    return args;
  }

-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
  template <class ProblemShape>
  static size_t
  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
@ -120,15 +118,15 @@ struct Sm90RowOrScalarBroadcast {

  CUTLASS_HOST_DEVICE
  Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
-      : params(params)
-      , smem(const_cast<Element*>(shared_storage.smem.data())) { }
+      : params(params),
+        smem_row(const_cast<Element*>(shared_storage.smem_row.data())) { }

  Params params;
-  Element *smem = nullptr;
+  Element* smem_row;

  CUTLASS_DEVICE bool
  is_producer_load_needed() const {
-    return false;
+    return true;
  }

  CUTLASS_DEVICE bool
@ -141,76 +139,78 @@ struct Sm90RowOrScalarBroadcast {
    return (!params.row_broadcast && *(params.ptr_row) == Element(0));
  }

-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class CTensor, class ThrResidue, class ThrNum>
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+  template <int EpiTiles, class GTensor, class STensor>
+  struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks {
    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(
-        GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, 
-        GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, 
-        SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_,
-        CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_, Params const& params_)
-      : tGS_gRow(tGS_gRow_)
-      , tGS_sRow(tGS_sRow_)
-      , tGS_cRow(tGS_cRow_)
-      , tiled_G2S(tiled_g2s_)
-      , tSR_sRow(tSR_sRow_)
-      , tSR_rRow(tSR_rRow_)
-      , tCcRow(tCcRow_)
-      , residue_tCcRow(residue_tCcRow_)
-      , params(params_) {}
+    ProducerLoadCallbacks(GTensor&& gRow, STensor&& sRow, Params const& params)
+      : gRow(cute::forward<GTensor>(gRow)),
+        sRow(cute::forward<STensor>(sRow)),
+        params(params) {}

-    GS_GTensor tGS_gRow;                                                         // (CPY,CPY_M,CPY_N)
-    GS_STensor tGS_sRow;                                                         // (CPY,CPY_M,CPY_N)
-    GS_CTensor tGS_cRow;                                                         // (CPY,CPY_M,CPY_N)
-    Tiled_G2S tiled_G2S;
-
-    SR_STensor tSR_sRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    SR_RTensor tSR_rRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) 
-  
-    CTensor tCcRow;                                                              // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    ThrResidue residue_tCcRow;                                                   // (m, n)
-    ThrNum thr_num;
+    GTensor gRow;                                                                                 // (CTA_M,CTA_N)
+    STensor sRow;                                                                                 // (CTA_M,CTA_N,PIPE)
    Params const& params;

    CUTLASS_DEVICE void
-    begin() {
+    begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
      if (!params.row_broadcast) {
-        fill(tSR_rRow, *(params.ptr_row));
        return;
      }

-      auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
-      Tensor tGS_gRow_flt = filter_zeros(tGS_gRow);
-      Tensor tGS_sRow_flt = filter_zeros(tGS_sRow);
-      Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride()));
-
-      for (int i = 0; i < size(tGS_gRow_flt); ++i) {
-        if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) {
-          continue; // OOB of SMEM, 
-        }
-        if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) {
-          tGS_sRow_flt(i) = tGS_gRow_flt(i);
-        }
-        else {
-          tGS_sRow_flt(i) = Element(0); // Set to Zero when OOB so LDS could be issue without any preds.
-        }
+      if (issue_tma_load) {
+        // Increment the expect-tx count of the first subtile's mbarrier by the row vector's byte-size
+        constexpr uint32_t copy_bytes = size<1>(CtaTileShapeMNK{}) * sizeof_bits_v<Element> / 8;
+        cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes);
+        // Issue the TMA bulk copy
+        auto bulk_copy = Copy_Atom<SM90_BULK_COPY_AUTO, Element>{}.with(*full_mbarrier_ptr);
+        // Filter so we don't issue redundant copies over stride-0 modes
+        int bcast_pipe_index = (load_iteration / EpiTiles) % Stages;
+        copy(bulk_copy, filter(gRow), filter(sRow(_,_,bcast_pipe_index)));
      }
-      synchronize();
    }
+  };
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow);
+    Tensor gRow = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));            // (CTA_M,CTA_N)
+    Tensor sRow = make_tensor(make_smem_ptr(smem_row),                                            // (CTA_M,CTA_N,PIPE)
+                    make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages),
+                    make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{})));
+
+    constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value;
+    return ProducerLoadCallbacks<EpiTiles, decltype(gRow), decltype(sRow)>(
+      cute::move(gRow), cute::move(sRow), params);
+  }
+
+  template <int EpiTiles, class RTensor, class STensor>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(RTensor&& tCrRow, STensor&& tCsRow, Params const& params)
+      : tCrRow(cute::forward<RTensor>(tCrRow)),
+        tCsRow(cute::forward<STensor>(tCsRow)),
+        params(params) {}
+
+    RTensor tCrRow;                                                               // (CPY,CPY_M,CPY_N)
+    STensor tCsRow;                                                               // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE)
+    Params const& params;

    CUTLASS_DEVICE void
-    begin_loop(int epi_m, int epi_n) {
+    previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
+      if (!params.row_broadcast) {
+        fill(tCrRow, *(params.ptr_row));
+        return;
+      }
+
      if (epi_m == 0) { // Assumes M-major subtile loop
-        if (!params.row_broadcast) return; // Do not issue LDS when row is scalar 
-        Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n));
-        Tensor tSR_rRow_flt = filter_zeros(tSR_rRow);
-        copy(tSR_sRow_flt, tSR_rRow_flt);
+        // Filter so we don't issue redundant copies over stride-0 modes
+        // (only works if 0-strides are in same location, which is by construction)
+        int bcast_pipe_index = (load_iteration / EpiTiles) % Stages;
+        copy_aligned(filter(tCsRow(_,_,_,epi_m,epi_n,bcast_pipe_index)), filter(tCrRow));
      }
    }

@ -221,7 +221,7 @@ struct Sm90RowOrScalarBroadcast {

      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < FragmentSize; ++i) {
-        frg_row[i] = tSR_rRow(epi_v * FragmentSize + i);
+        frg_row[i] = tCrRow(epi_v * FragmentSize + i);
      }

      return frg_row;
@ -234,41 +234,17 @@ struct Sm90RowOrScalarBroadcast {
  >
  CUTLASS_DEVICE auto
  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto [m, n, k, l] = args.tile_coord_mnkl;
-    using ThreadCount = decltype(size(args.tiled_copy));

-    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow);
-    Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n));          // (CTA_M, CTA_N)
-    Tensor sRow = make_tensor(make_smem_ptr(smem), 
-        make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{}));  // (CTA_M, CTA_N)
-    //// G2S: Gmem to Smem
-    auto tiled_g2s = make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
-                                     Layout< Shape<_1, ThreadCount>, 
-                                            Stride<_0,          _1>>{}, 
-                                     Layout<_1>{});   
-    auto thr_g2s = tiled_g2s.get_slice(args.thread_idx);
-    Tensor tGS_gRow = thr_g2s.partition_S(gRow);
-    Tensor tGS_sRow = thr_g2s.partition_D(sRow);
+    Tensor sRow = make_tensor(make_smem_ptr(smem_row),                                            // (CTA_M,CTA_N,PIPE)
+                    make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages),
+                    make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{})));
+    Tensor tCsRow = sm90_partition_for_epilogue<ReferenceSrc>(                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE)
+                      sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tCrRow = make_tensor_like(take<0,3>(tCsRow));                                           // (CPY,CPY_M,CPY_N)

-    //// G2S: Coord 
-    auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})));
-    Tensor tGS_cRow = thr_g2s.partition_S(cRow);
-
-    //// S2R: Smem to Reg
-    Tensor tSR_sRow = sm90_partition_for_epilogue<ReferenceSrc>(sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow));                                           // (CPY,CPY_M,CPY_N)
-
-    return ConsumerStoreCallbacks<decltype(tGS_gRow), decltype(tGS_sRow), decltype(tGS_cRow), decltype(tiled_g2s), decltype(tSR_sRow), decltype(tSR_rRow), decltype(args.tCcD), decltype(args.residue_cD), ThreadCount>(
-      tGS_gRow, 
-      tGS_sRow, 
-      tGS_cRow, tiled_g2s, 
-      tSR_sRow, 
-      tSR_rRow, 
-      args.tCcD, 
-      args.residue_cD,
-      ThreadCount{}, 
-      params);
+    constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value;
+    return ConsumerStoreCallbacks<EpiTiles, decltype(tCrRow), decltype(tCsRow)>(
+      cute::move(tCrRow), cute::move(tCsRow), params);
  }
 };

@ -309,12 +285,6 @@ struct Sm90ColOrScalarBroadcast {
    return args;
  }

-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
  template <class ProblemShape>
  static size_t
  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
@ -1,18 +1,470 @@
 #include <stddef.h>
 #include <torch/all.h>
-#include "cutlass/cutlass.h"

-#include "scaled_mm_c2x.cuh"
-#include "scaled_mm_c2x_sm75_dispatch.cuh"
-#include "scaled_mm_c2x_sm80_dispatch.cuh"
-#include "scaled_mm_c2x_sm89_fp8_dispatch.cuh"
-#include "scaled_mm_c2x_sm89_int8_dispatch.cuh"
+#include <ATen/cuda/CUDAContext.h>
+
+// clang-format will break include orders
+// clang-format off
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/util/device_memory.h"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm_coord.h"
+#include "cutlass/arch/mma_sm75.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
+#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h"
+
+#include "broadcast_load_epilogue_c2x.hpp"
+#include "common.hpp"
+// clang-format on
+
+using namespace cute;

 /*
   This file defines quantized GEMM operations using the CUTLASS 2.x API, for
   NVIDIA GPUs with SM versions prior to sm90 (Hopper).
+
+   Epilogue functions can be defined to post-process the output before it is
+   written to GPU memory.
+   Epilogues must contain a public type named EVTCompute of type Sm80EVT,
+   as well as a static prepare_args function that constructs an
+   EVTCompute::Arguments struct.
 */

+namespace {
+
+// Wrappers around a GEMM kernel that guard against compiling the kernel body
+// for architectures that will never use it. The purpose of this is to reduce
+// the size of the compiled binary.
+// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
+// into code that will be executed on the device where it is defined.
+//
+// enable_sm75_to_sm80 derives from Kernel and shadows its static invoke():
+// the call is forwarded to Kernel::invoke only when
+// 750 <= __CUDA_ARCH__ < 800; otherwise invoke() has an empty body.
+template <typename Kernel>
+struct enable_sm75_to_sm80 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+// Architecture guard: derives from Kernel and shadows its static invoke().
+// The call is forwarded to Kernel::invoke only when
+// 800 <= __CUDA_ARCH__ < 890; otherwise invoke() has an empty body.
+template <typename Kernel>
+struct enable_sm80_to_sm89 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+// Architecture guard: derives from Kernel and shadows its static invoke().
+// The call is forwarded to Kernel::invoke only when
+// 890 <= __CUDA_ARCH__ < 900; otherwise invoke() has an empty body.
+template <typename Kernel>
+struct enable_sm89_to_sm90 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+/*
+ * This class provides the common ScaleA and ScaleB descriptors for the
+ * ScaledEpilogue and ScaledEpilogueBias classes.
+ *
+ * The aliases are protected, so they are only reachable from derived
+ * epilogue classes. ElementD is not used by the aliases defined here.
+ */
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBase {
+ protected:
+  // Visitor node that supplies the GEMM accumulator to the epilogue tree.
+  using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
+
+  // float scale for the A operand: column-or-scalar broadcast visitor with
+  // stride (1, 0, 0).
+  using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
+      OutputTileThreadMap, float, Stride<Int<1>, Int<0>, Int<0>>>;
+
+  // float scale for the B operand: row-or-scalar broadcast visitor with
+  // stride (0, 1, 0).
+  using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
+      OutputTileThreadMap, float, Stride<Int<0>, Int<1>, Int<0>>>;
+};
+
+/*
+ This epilogue function defines a quantized GEMM operation similar to
+ torch._scaled_mm.
+
+ A and B may both be either int8 or fp8_e4m3. A can be quantized per-tensor or
+ per-row. B can be quantized per-tensor or per-column.
+ Any combination of per-tensor and per-row or column is supported.
+ A and B must have symmetric quantization (zero point == 0).
+
+ So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
+ scales are applied elementwise with numpy-style broadcasting.
+
+ ScaleA and ScaleB define the epilogue functions that apply the scales for
+ the A and B operands respectively. These scales may be either per-tensor or
+ per row or column.
+
+ The visitor tree built below is ScaleA * (ScaleB * Accum): the inner product
+ is produced as float and the outer product is produced as ElementD.
+*/
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogue
+    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::ScaleA;
+  using ScaleB = typename SUPER::ScaleB;
+
+  // Inner node: float multiply, round-to-nearest.
+  using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiplies, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  // ScaleB * Accum
+  using EVTCompute0 =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
+
+  // Outer node: multiply computed in float, output element type ElementD.
+  using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiplies, ElementD, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+ public:
+  // ScaleA * (ScaleB * Accum); this is the type required by cutlass_2x_gemm.
+  using EVTCompute =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA, EVTCompute0>;
+  using ArgumentType = typename EVTCompute::Arguments;
+
+  // Builds the EVTCompute::Arguments from the two scale tensors.
+  // Both tensors are read through data_ptr<float>(), so they must hold float
+  // data. The boolean passed to each scale's Arguments is true when the tensor
+  // has more than one element -- presumably selecting vector broadcast over
+  // scalar broadcast; confirm against broadcast_load_epilogue_c2x.hpp.
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales) {
+    using ScaleAArgs = typename ScaleA::Arguments;
+    using ScaleBArgs = typename ScaleB::Arguments;
+
+    ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
+    ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
+
+    // Argument nesting mirrors the EVT nesting: inner tree first.
+    typename EVTCompute0::Arguments evt0_compute_args{b_args};
+
+    typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args};
+    return evt_compute_args;
+  }
+};
+
+/*
+ Same scaled epilogue as ScaledEpilogue, with a bias term added.
+
+ The outer compute node is a multiply_add over (ScaleA, ScaleB * Accum, Bias),
+ i.e. D = ScaleA * (ScaleB * Accum) + Bias. The bias is loaded as ElementD
+ through a row-broadcast visitor with stride (0, 1, 0).
+*/
+template <typename ElementD, typename OutputTileThreadMap>
+struct ScaledEpilogueBias
+    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
+ private:
+  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
+  using Accum = typename SUPER::Accum;
+  using ScaleA = typename SUPER::ScaleA;
+  using ScaleB = typename SUPER::ScaleB;
+
+  // Inner node: float multiply, round-to-nearest.
+  using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiplies, float, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  // ScaleB * Accum
+  using EVTCompute0 =
+      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
+
+  // Outer node: fused multiply-add computed in float, output type ElementD.
+  using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiply_add, ElementD, float,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast<
+      OutputTileThreadMap, ElementD, Stride<Int<0>, Int<1>, Int<0>>>;
+
+ public:
+  using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA,
+                                                             EVTCompute0, Bias>;
+  using ArgumentType = typename EVTCompute::Arguments;
+
+  // Builds the EVTCompute::Arguments from the scale tensors and the bias.
+  // The scales are read through data_ptr<float>(). The bias pointer is
+  // reinterpreted as ElementD* without any dtype check here, so the caller
+  // must ensure the bias dtype corresponds to ElementD.
+  static ArgumentType prepare_args(torch::Tensor const& a_scales,
+                                   torch::Tensor const& b_scales,
+                                   torch::Tensor const& bias) {
+    using ScaleAArgs = typename ScaleA::Arguments;
+    using ScaleBArgs = typename ScaleB::Arguments;
+    using BiasArgs = typename Bias::Arguments;
+
+    ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
+    ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
+    BiasArgs bias_args{static_cast<ElementD*>(bias.data_ptr()), {}};
+
+    // Argument nesting mirrors the EVT nesting: inner tree first.
+    typename EVTCompute0::Arguments evt0_compute_args{b_args};
+
+    typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args,
+                                                    bias_args};
+    return evt_compute_args;
+  }
+};
+
+// Assembles a CUTLASS 2.x GEMM type from an architecture tag, an architecture
+// guard wrapper, input/output element types, an epilogue template and the
+// tile/warp/instruction shapes.
+//
+// Template parameters:
+//   Arch             - cutlass::arch tag passed to the kernel builder.
+//   ArchGuard        - wrapper applied to the generated GemmKernel (one of the
+//                      enable_smXX_to_smYY templates in this file).
+//   ElementAB_       - element type shared by A and B.
+//   ElementD_        - element type of the output D.
+//   Epilogue_        - epilogue template instantiated with
+//                      <ElementD, OutputTileThreadMap>; must expose EVTCompute.
+//   TileShape, WarpShape, InstructionShape - cutlass GemmShape types.
+//   MainLoopStages   - stage count forwarded to the kernel builder.
+//
+// Exposed members: ElementAB, ElementD, ElementAcc, Operator,
+// OutputTileThreadMap, Epilogue, EVTCompute, D, EVTD, KernelType and Op (the
+// device-level GemmUniversalAdapter that callers launch).
+template <typename Arch, template <typename> typename ArchGuard,
+          typename ElementAB_, typename ElementD_,
+          template <typename, typename> typename Epilogue_, typename TileShape,
+          typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
+struct cutlass_2x_gemm {
+  using ElementAB = ElementAB_;
+  using ElementD = ElementD_;
+
+  // int8 inputs accumulate in int32; every other input type accumulates in
+  // float.
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  // int8 inputs use the saturating multiply-add operator; other input types
+  // use the plain multiply-add.
+  using Operator =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>,
+                                cutlass::arch::OpMultiplyAddSaturate,
+                                cutlass::arch::OpMultiplyAdd>::type;
+
+  using OutputTileThreadMap =
+      cutlass::epilogue::threadblock::OutputTileThreadLayout<
+          TileShape, WarpShape, float, 4, 1 /* epilogue stages */
+          >;
+
+  using Epilogue = Epilogue_<ElementD, OutputTileThreadMap>;
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  // Store node that writes the epilogue result as ElementD. The leading stride
+  // is a runtime int64_t; the other two strides are compile-time 1 and 0.
+  using D = cutlass::epilogue::threadblock::VisitorAuxStore<
+      OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest,
+      Stride<int64_t, Int<1>, Int<0>>>;
+
+  // Full epilogue tree: the store node D applied to the output of EVTCompute.
+  using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
+
+  // clang-format off
+  using RowMajor = typename cutlass::layout::RowMajor;
+  using ColumnMajor = typename cutlass::layout::ColumnMajor;
+  // A is row-major and B is column-major, each passed with the value 16
+  // (presumably the operand alignment in elements -- confirm against the
+  // DefaultGemmWithVisitor parameter list).
+  using KernelType =
+    ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
+      ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16,
+      ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16,
+      float, cutlass::layout::RowMajor, 4,
+      ElementAcc, float, cutlass::arch::OpClassTensorOp,
+      Arch,
+      TileShape, WarpShape, InstructionShape,
+      EVTD,
+      cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
+      MainLoopStages, Operator,
+      1 /* epilogue stages */
+      >::GemmKernel>;
+  // clang-format on
+
+  using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
+};
+
+// Builds the CUTLASS arguments for `Gemm` from the torch tensors and launches
+// the kernel on the current CUDA stream of a's device.
+//
+//   out             - output tensor; its data is written as Gemm::ElementD.
+//   a, b            - input tensors; their data is read as Gemm::ElementAB.
+//   epilogue_params - forwarded unchanged to Gemm::Epilogue::prepare_args.
+//
+// Failures reported by can_implement() or by the launch are surfaced through
+// CUTLASS_CHECK.
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b,
+                         EpilogueArgs&&... epilogue_params) {
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+
+  // Problem size: M and K come from a, N comes from b.
+  // NOTE(review): Tensor::size() returns int64_t; these are narrowed to
+  // int32_t without a range check.
+  int32_t m = a.size(0);
+  int32_t n = b.size(1);
+  int32_t k = a.size(1);
+  cutlass::gemm::GemmCoord problem_size{m, n, k};
+
+  // Leading dimensions: row stride of a and out, column stride of b.
+  int64_t lda = a.stride(0);
+  int64_t ldb = b.stride(1);
+  int64_t ldc = out.stride(0);
+
+  using StrideC = Stride<int64_t, Int<1>, Int<0>>;
+  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+
+  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  // Arguments of the store node: destination pointer and its stride.
+  typename Gemm::D::Arguments d_args{c_ptr, c_stride};
+
+  using Epilogue = typename Gemm::Epilogue;
+  auto evt_args =
+      Epilogue::prepare_args(std::forward<EpilogueArgs>(epilogue_params)...);
+
+  typename Gemm::EVTD::Arguments epilogue_args{
+      evt_args,
+      d_args,
+  };
+
+  // The positional values after epilogue_args are presumably the A/B pointers,
+  // unused C/D pointers, four batch strides and the four leading dimensions --
+  // confirm against the CUTLASS Arguments definition for this kernel.
+  typename Gemm::Op::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel,  // universal mode
+      problem_size,                                           // problem size
+      1,                                                      // batch count
+      epilogue_args,
+      a_ptr,
+      b_ptr,
+      nullptr,
+      nullptr,
+      0,
+      0,
+      0,
+      0,
+      lda,
+      ldb,
+      ldc,
+      ldc};
+
+  // Launch the CUTLASS GEMM kernel.
+  typename Gemm::Op gemm_op;
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  // NOTE(review): the workspace is allocated through CUTLASS's device_memory
+  // helper on every call rather than through the torch allocator.
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+  cutlass::Status status = gemm_op(args, workspace.get(), stream);
+  CUTLASS_CHECK(status);
+}
+
+// Runs `Gemm` when its shared-memory footprint fits within the device's
+// opt-in per-block limit, and `FallbackGemm` otherwise. If the fallback does
+// not fit either, TORCH_CHECK fails. All tensor and epilogue arguments are
+// forwarded unchanged to cutlass_gemm_caller.
+template <typename Gemm, typename FallbackGemm, typename... EpilogueArgs>
+void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
+                                  torch::Tensor const& b,
+                                  EpilogueArgs&&... args) {
+  // In some cases, the GPU isn't able to accommodate the
+  // shared memory requirements of the Gemm. In such cases, use
+  // the FallbackGemm instead.
+  // NOTE(review): the limit is queried with the literal argument 0 and cached
+  // in a function-local static, so it is computed once per template
+  // instantiation. If the argument is a device index, this ignores the device
+  // that `a` actually lives on -- confirm for heterogeneous multi-GPU hosts.
+  static const int max_shared_mem_per_block_opt_in =
+      get_cuda_max_shared_memory_per_block_opt_in(0);
+
+  // Compile-time footprints taken from each kernel's SharedStorage type.
+  size_t const gemm_shared_mem_size =
+      sizeof(typename Gemm::KernelType::SharedStorage);
+  size_t const fallback_gemm_shared_mem_size =
+      sizeof(typename FallbackGemm::KernelType::SharedStorage);
+
+  // NOTE(review): size_t is compared against int here (signed/unsigned mix).
+  if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) {
+    return cutlass_gemm_caller<Gemm>(out, a, b,
+                                     std::forward<EpilogueArgs>(args)...);
+  } else {
+    TORCH_CHECK(fallback_gemm_shared_mem_size <=
+                max_shared_mem_per_block_opt_in);
+    return cutlass_gemm_caller<FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
+// sm80 int8 GEMM configuration: 128x128x64 threadblock tile, 64x64x64 warp
+// tile, 16x8x32 instruction shape, 5 main-loop stages. Exposes the resulting
+// type as Cutlass2xGemm. Only int8_t input is accepted (static_assert).
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_default {
+  // This config is used in 2 cases,
+  //  - M in (128, inf)
+  //  - M in (64, 128] and N >= 8192
+  // Shared Memory required by this Gemm - 81920 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+// sm80 int8 GEMM configuration: 64x128x128 threadblock tile, 64x64x64 warp
+// tile, 16x8x32 instruction shape, 5 main-loop stages. Exposes the resulting
+// type as Cutlass2xGemm. Only int8_t input is accepted (static_assert).
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M64 {
+  // This config is used in 2 cases,
+  // - M in (32, 64]
+  // - M in (64, 128] and N < 8192
+  // Shared Memory required by this Gemm - 122880 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+// sm80 int8 GEMM configuration: 32x64x128 threadblock tile, 32x64x64 warp
+// tile, 16x8x32 instruction shape, 5 main-loop stages. Exposes the resulting
+// type as Cutlass2xGemm. Only int8_t input is accepted (static_assert).
+// cutlass_gemm_sm80_dispatch also uses this configuration as its fallback.
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M32 {
+  // M in (16, 32]
+  // Shared Memory required by this Gemm - 61440 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+// sm80 int8 GEMM configuration: 16x64x128 threadblock tile, 16x64x64 warp
+// tile, 16x8x32 instruction shape, 5 main-loop stages. Exposes the resulting
+// type as Cutlass2xGemm. Only int8_t input is accepted (static_assert).
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue>
+struct sm80_config_M16 {
+  // M in [1, 16]
+  // Shared Memory required by this Gemm - 51200 bytes
+  static_assert(std::is_same<InType, int8_t>());
+  using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
+  using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+  using Cutlass2xGemm =
+      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
+                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
+};
+
+}  // namespace
+
+// Selects an sm80 int8 GEMM configuration from the problem's M (and, for
+// M in (64, 128], N) and runs it through fallback_cutlass_gemm_caller.
+//
+//   out  - output tensor; N is read from out.size(1).
+//   a, b - int8 input tensors (enforced with TORCH_CHECK); M is a.size(0).
+//   args - epilogue arguments, forwarded unchanged.
+//
+// M is rounded up with next_pow_2 and clamped to at least 16 before the
+// bucket is chosen.
+template <typename InType, typename OutType,
+          template <typename, typename> typename Epilogue,
+          typename... EpilogueArgs>
+void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a,
+                                torch::Tensor const& b,
+                                EpilogueArgs&&... args) {
+  static_assert(std::is_same<InType, int8_t>());
+  TORCH_CHECK(a.dtype() == torch::kInt8);
+  TORCH_CHECK(b.dtype() == torch::kInt8);
+
+  // M128BigN aliases the default config and M128SmallN aliases the M64
+  // config; the separate names only document which bucket uses which.
+  using Cutlass2xGemmDefault =
+      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM128BigN =
+      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM128SmallN =
+      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM64 =
+      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM32 =
+      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
+  using Cutlass2xGemmM16 =
+      typename sm80_config_M16<InType, OutType, Epilogue>::Cutlass2xGemm;
+
+  // Due to shared memory requirements, some Gemms may fail to run on some
+  // GPUs. As the name indicates, the Fallback Gemm is used as an alternative
+  // in such cases.
+  // sm80_config_M16 has the least shared-memory requirement. However,
+  // based on some profiling, we select sm80_config_M32 as a better alternative
+  // performance wise.
+  using FallbackGemm =
+      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
+  if (mp2 <= 16) {
+    // M in [1, 16]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM16, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 32) {
+    // M in (16, 32]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 64) {
+    // M in (32, 64]
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 128) {
+    // M in (64, 128]
+    // The N threshold of 8192 splits this bucket between the two configs.
+    uint32_t const n = out.size(1);
+    bool const small_n = n < 8192;
+    if (small_n) {
+      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128SmallN,
+                                          FallbackGemm>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    } else {
+      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128BigN, FallbackGemm>(
+          out, a, b, std::forward<EpilogueArgs>(args)...);
+    }
+  } else {
+    // M in (128, inf)
+    return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  }
+}
+
 template <template <typename, typename> typename Epilogue,
          typename... EpilogueArgs>
 void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a,
@ -21,13 +473,20 @@ void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a,
  TORCH_CHECK(a.dtype() == torch::kInt8);
  TORCH_CHECK(b.dtype() == torch::kInt8);

+  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
+
  if (out.dtype() == torch::kBFloat16) {
-    return vllm::cutlass_gemm_sm75_dispatch<int8_t, cutlass::bfloat16_t,
-                                            Epilogue>(
+    return cutlass_gemm_caller<cutlass_2x_gemm<
+        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
+        Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
  } else {
    TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return vllm::cutlass_gemm_sm75_dispatch<int8_t, cutlass::half_t, Epilogue>(
+    return cutlass_gemm_caller<cutlass_2x_gemm<
+        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
+        Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
  }
 }
@ -42,30 +501,11 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
  if (bias) {
    TORCH_CHECK(bias->dtype() == out.dtype(),
                "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm75_epilogue<vllm::ScaledEpilogueBias>(
+    return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogueBias>(
        out, a, b, a_scales, b_scales, *bias);
  } else {
-    return cutlass_scaled_mm_sm75_epilogue<vllm::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
-  }
-}
-
-void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a,
-                                torch::Tensor const& b,
-                                torch::Tensor const& a_scales,
-                                torch::Tensor const& b_scales,
-                                torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-
-  if (azp) {
-    return cutlass_scaled_mm_sm75_epilogue<vllm::ScaledEpilogueBiasAzpToken>(
-        out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
-  } else {
-    return cutlass_scaled_mm_sm75_epilogue<vllm::ScaledEpilogueBiasAzp>(
-        out, a, b, a_scales, b_scales, azp_adj, bias);
+    return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogue>(out, a, b, a_scales,
+                                                           b_scales);
  }
 }

@ -78,12 +518,11 @@ void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a,
  TORCH_CHECK(b.dtype() == torch::kInt8);

  if (out.dtype() == torch::kBFloat16) {
-    return vllm::cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t,
-                                            Epilogue>(
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t, Epilogue>(
        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
  } else {
    TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return vllm::cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, Epilogue>(
+    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, Epilogue>(
        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
  }
 }
@ -98,30 +537,11 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
  if (bias) {
    TORCH_CHECK(bias->dtype() == out.dtype(),
                "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm80_epilogue<vllm::ScaledEpilogueBias>(
+    return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogueBias>(
        out, a, b, a_scales, b_scales, *bias);
  } else {
-    return cutlass_scaled_mm_sm80_epilogue<vllm::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
-  }
-}
-
-void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a,
-                                torch::Tensor const& b,
-                                torch::Tensor const& a_scales,
-                                torch::Tensor const& b_scales,
-                                torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-
-  if (azp) {
-    return cutlass_scaled_mm_sm80_epilogue<vllm::ScaledEpilogueBiasAzpToken>(
-        out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
-  } else {
-    return cutlass_scaled_mm_sm80_epilogue<vllm::ScaledEpilogueBiasAzp>(
-        out, a, b, a_scales, b_scales, azp_adj, bias);
+    return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogue>(out, a, b, a_scales,
+                                                           b_scales);
  }
 }

@ -130,17 +550,23 @@ template <template <typename, typename> typename Epilogue,
 void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a,
                                      torch::Tensor const& b,
                                      EpilogueArgs&&... epilogue_args) {
+  // Single tile configuration shared by every sm89 dtype combination below.
+  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
+  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
+
   if (a.dtype() == torch::kInt8) {
     TORCH_CHECK(b.dtype() == torch::kInt8);

     if (out.dtype() == torch::kBFloat16) {
-      return vllm::cutlass_gemm_sm89_int8_dispatch<int8_t, cutlass::bfloat16_t,
-                                                   Epilogue>(
+      return cutlass_gemm_caller<cutlass_2x_gemm<
+          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
+          Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
           out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     } else {
-      assert(out.dtype() == torch::kFloat16);
+      // TORCH_CHECK rather than assert: assert is compiled out under NDEBUG,
+      // which would let an unsupported output dtype reach the half_t kernel.
+      // Every other dtype check in this file already uses TORCH_CHECK.
+      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return vllm::cutlass_gemm_sm89_int8_dispatch<int8_t, cutlass::half_t,
-                                                   Epilogue>(
+      return cutlass_gemm_caller<cutlass_2x_gemm<
+          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
+          Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
           out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
     }
   } else {
@ -148,13 +574,17 @@ void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a,
    TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);

    if (out.dtype() == torch::kBFloat16) {
-      return vllm::cutlass_gemm_sm89_fp8_dispatch<
-          cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue>(
+      return cutlass_gemm_caller<
+          cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
+                          cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue,
+                          TileShape, WarpShape, InstructionShape, 5>>(
          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
    } else {
      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return vllm::cutlass_gemm_sm89_fp8_dispatch<cutlass::float_e4m3_t,
-                                                  cutlass::half_t, Epilogue>(
+      return cutlass_gemm_caller<
+          cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
+                          cutlass::float_e4m3_t, cutlass::half_t, Epilogue,
+                          TileShape, WarpShape, InstructionShape, 5>>(
          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
    }
  }
@ -170,29 +600,10 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
  if (bias) {
    TORCH_CHECK(bias->dtype() == out.dtype(),
                "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm89_epilogue<vllm::ScaledEpilogueBias>(
+    return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogueBias>(
        out, a, b, a_scales, b_scales, *bias);
  } else {
-    return cutlass_scaled_mm_sm89_epilogue<vllm::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
-  }
-}
-
-void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a,
-                                torch::Tensor const& b,
-                                torch::Tensor const& a_scales,
-                                torch::Tensor const& b_scales,
-                                torch::Tensor const& azp_adj,
-                                c10::optional<torch::Tensor> const& azp,
-                                c10::optional<torch::Tensor> const& bias) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-
-  if (azp) {
-    return cutlass_scaled_mm_sm89_epilogue<vllm::ScaledEpilogueBiasAzpToken>(
-        out, a, b, a_scales, b_scales, azp_adj, *azp, bias);
-  } else {
-    return cutlass_scaled_mm_sm89_epilogue<vllm::ScaledEpilogueBiasAzp>(
-        out, a, b, a_scales, b_scales, azp_adj, bias);
+    return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogue>(out, a, b, a_scales,
+                                                           b_scales);
  }
 }
--- a/Show More
+++ b/Show More