[Core] Support loading GGUF model (#5191)

Author: Isotr0py
Date: 2024-08-06 07:54:23 +08:00
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Committed by: GitHub
Parent: ef527be06c
Commit: 360bd67cf0

29 changed files with 4970 additions and 21 deletions


```diff
@@ -145,6 +145,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("awq_marlin_repack", &awq_marlin_repack);
   ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
 
+  // Dequantization for GGML.
+  ops.def("ggml_dequantize", &ggml_dequantize);
+  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
+
+  // mmvq kernel for GGML.
+  ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8);
+  ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
+
+  // mmq kernel for GGML.
+  ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8);
+  ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
+
   // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
   ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
   ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
```