[Kernel] fix types used in aqlm and ggml kernels to support dynamo (#7596)

Author: bnellnm
Date: 2024-08-16 17:00:11 -04:00
Committed by: GitHub
Parent: 7759ae958f
Commit: 37fd47e780

7 changed files with 39 additions and 53 deletions
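
Background on the change: torch.compile (Dynamo) traces custom ops through their registered schemas, and PyTorch's C++ op-registration API only understands a fixed set of argument types. `int64_t` maps to the schema type `int` and `std::vector<int64_t>` maps to `int[]`; narrow integers such as `int8_t` have no schema equivalent, and passing the partition sizes as a CPU `torch::Tensor` forced `.item<int>()` round trips inside the kernel wrappers. Hence every affected signature below is widened to `int64_t` or rewritten to take `std::vector<int64_t>`. A minimal sketch of the registration pattern this enables (the op name and body here are hypothetical, not vLLM's actual binding code):

  #include <torch/all.h>
  #include <torch/library.h>

  #include <vector>

  // Hypothetical op: argument types must come from the schema-supported set.
  torch::Tensor scale_rows(const torch::Tensor& x,
                           const std::vector<int64_t>& sizes,  // schema: int[]
                           int64_t type) {                     // schema: int
    (void)sizes;
    (void)type;
    return x.clone();  // placeholder body
  }

  TORCH_LIBRARY(demo_ext, m) {
    // An int8_t or tensor-of-sizes parameter could not be expressed in this
    // schema string, which is what broke Dynamo tracing.
    m.def("scale_rows(Tensor x, int[] sizes, int type) -> Tensor");
    m.impl("scale_rows", &scale_rows);
  }
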


@@ -496,14 +496,14 @@ torch::Tensor code2x8_matmat(const torch::Tensor& input,
 }
 
 // Accumulate the partition sizes.
-int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes) {
+int4 accumulate_sizes(const std::vector<int64_t>& codebook_partition_sizes) {
   int4 cumulative_sizes;
   auto cumulative_size = &cumulative_sizes.x;
-  int i = 0;
+  size_t i = 0;
   int last = 0;
-  assert(codebook_partition_sizes.size(0) <= 4);
-  for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size) {
-    *cumulative_size = codebook_partition_sizes[i].item<int>() + last;
+  assert(codebook_partition_sizes.size() <= 4);
+  for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) {
+    *cumulative_size = codebook_partition_sizes[i] + last;
     last = *cumulative_size;
   }
   // fill in the rest with unreachable.
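
For readers skimming the hunk above: the function computes a running prefix sum over at most four partition sizes and writes it into consecutive fields of an `int4`; on the new `std::vector<int64_t>` argument this is plain indexing instead of a per-element `.item<int>()` device sync. A standalone sketch of the same logic, using a plain struct in place of CUDA's `int4`:

  #include <cassert>
  #include <cstdint>
  #include <vector>

  struct Sizes4 { int x, y, z, w; };  // stand-in for CUDA's int4

  // Mirror of accumulate_sizes: prefix sums over up to four entries,
  // written into consecutive fields via a walking pointer.
  Sizes4 prefix_sums(const std::vector<int64_t>& sizes) {
    Sizes4 out{};
    int* slot = &out.x;
    int last = 0;
    assert(sizes.size() <= 4);
    for (size_t i = 0; i < sizes.size(); ++i, ++slot) {
      *slot = static_cast<int>(sizes[i]) + last;
      last = *slot;
    }
    return out;
  }

  // prefix_sums({4096, 4096}) yields x = 4096, y = 8192; the real function
  // then fills the remaining fields with an unreachable sentinel.
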
@@ -519,12 +519,12 @@ int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                         const torch::Tensor& codebooks,
                         const torch::Tensor& scales,
-                        const torch::Tensor& codebook_partition_sizes,
+                        const std::vector<int64_t>& codebook_partition_sizes,
                         const std::optional<torch::Tensor>& bias) {
   int4 cumulative_sizes =
       vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
 
-  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size();
   int const entries = codebooks.size(1);
 
   if (nbooks == 1 && entries == (1 << 16)) {
@@ -541,13 +541,13 @@ torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
   return {};
 }
 
-torch::Tensor aqlm_dequant(const torch::Tensor& codes,
-                           const torch::Tensor& codebooks,
-                           const torch::Tensor& codebook_partition_sizes) {
+torch::Tensor aqlm_dequant(
+    const torch::Tensor& codes, const torch::Tensor& codebooks,
+    const std::vector<int64_t>& codebook_partition_sizes) {
   int4 cumulative_sizes =
       vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
 
-  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+  int const nbooks = codebooks.size(0) / codebook_partition_sizes.size();
   int const entries = codebooks.size(1);
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
@@ -557,7 +557,8 @@ torch::Tensor aqlm_dequant(const torch::Tensor& codes,
   auto in_features = codes.size(1) * 8;
   auto out_features = codes.size(0);
 
-  assert(out_features = codebook_partition_sizes.sum().item<int>());
+  assert(out_features == std::accumulate(codebook_partition_sizes.begin(),
+                                         codebook_partition_sizes.end(), 0));
 
   auto weights = torch::empty({out_features, in_features},
                               torch::TensorOptions()
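
Two things worth noting in the last hunk. First, the old assert used `=` rather than `==`, so in debug builds it assigned the sum to `out_features` instead of comparing against it (in release builds asserts compile out entirely); the rewrite fixes that latent bug as a side effect. Second, `std::accumulate` lives in `<numeric>`, which the file must include (not visible in this truncated view). A quick check of the replacement expression, assuming that include:

  #include <cstdint>
  #include <numeric>
  #include <vector>

  // Equivalent of the new assert's right-hand side: total output features
  // is the plain sum of the partition sizes, with no host/device round trip
  // like the old codebook_partition_sizes.sum().item<int>(). A wide init
  // value is used here so the accumulator cannot overflow int.
  int64_t total_features(const std::vector<int64_t>& partition_sizes) {
    return std::accumulate(partition_sizes.begin(), partition_sizes.end(),
                           int64_t{0});
  }
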


@@ -487,7 +487,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
   dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
 }
 
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(int type) {
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
   switch (type) {
     case 2:
       return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
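
Widening the dispatch parameter from `int` to `int64_t` is source-compatible with the existing switch: integer case labels are implicitly converted to the condition's type, so the long table of quantization kinds below this line is untouched. A compressed sketch of the pattern:

  #include <cstdint>

  // Case labels are converted to the switch condition's type (int64_t here),
  // so only the signature changes; the body stays as-is.
  const char* quant_name(int64_t type) {
    switch (type) {
      case 2:
        return "q4_0";  // the kind handled by dequantize_q4_0 above
      default:
        return "unsupported";
    }
  }
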


@@ -60,7 +60,7 @@ static void quantize_row_q8_1_cuda(const half* x, void* vy, const int kx,
 }
 
 torch::Tensor ggml_dequantize(torch::Tensor W,  // quant weight
-                              int8_t type, int64_t m, int64_t n) {
+                              int64_t type, int64_t m, int64_t n) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
   auto options =
       torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
@@ -73,7 +73,7 @@ torch::Tensor ggml_dequantize(torch::Tensor W,  // quant weight
 torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W,  // quant weight
                                   torch::Tensor X,  // input
-                                  int8_t type, int64_t row) {
+                                  int64_t type, int64_t row) {
   int col = X.sizes()[1];
   const int padded = (col + 512 - 1) / 512 * 512;
   const at::cuda::OptionalCUDAGuard device_guard(device_of(X));
@@ -172,7 +172,7 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W,  // quant weight
 torch::Tensor ggml_mul_mat_a8(torch::Tensor W,  // quant weight
                               torch::Tensor X,  // input
-                              int8_t type, int64_t row) {
+                              int64_t type, int64_t row) {
   int col = X.sizes()[1];
   int padded = (col + 512 - 1) / 512 * 512;
   int batch = X.sizes()[0];
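
Unrelated to the type change, the `(col + 512 - 1) / 512 * 512` context lines in these entry points round the input width up to the next multiple of 512 via integer division: for example, col = 1000 gives padded = 1024, while col = 1024 stays at 1024.
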
@@ -239,4 +239,4 @@ torch::Tensor ggml_mul_mat_a8(torch::Tensor W,  // quant weight
       break;
   }
   return Y;
-}
\ No newline at end of file
+}
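
Only three of the seven changed files appear in this truncated view; the rest of the diff is the matching declaration-side updates. As an illustration (the actual header is not shown here), the declarations must mirror the new definitions exactly, along these lines:

  #include <optional>
  #include <vector>

  #include <torch/all.h>

  // Hypothetical declaration-side mirror of the definitions above; any type
  // mismatch between declaration and definition would either fail to compile
  // or register a schema that Dynamo still cannot trace.
  torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                          const torch::Tensor& codebooks,
                          const torch::Tensor& scales,
                          const std::vector<int64_t>& codebook_partition_sizes,
                          const std::optional<torch::Tensor>& bias);

  torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
                                int64_t n);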