Add memory analyzer & utomatically configure KV cache size (#6)

2023-03-11 23:23:14 -08:00
parent 1a7eb7da61
commit e9d3f2ff77
7 changed files with 216 additions and 34 deletions
--- a/cacheflow/models/init.py
+++ b/cacheflow/models/init.py
@ -1,10 +1,12 @@
 from cacheflow.models.input_metadata import InputMetadata
+from cacheflow.models.model_utils import get_memory_analyzer
 from cacheflow.models.model_utils import get_model
-from cacheflow.models.model_utils import set_seed
+from cacheflow.models.utils import set_seed


 __all__ = [
    'InputMetadata',
+    'get_memory_analyzer',
    'get_model',
-    'set_seed'
+    'set_seed',
 ]
--- a/cacheflow/models/memory_analyzer.py
+++ b/cacheflow/models/memory_analyzer.py
@ -0,0 +1,125 @@
+import torch
+from transformers import AutoConfig
+
+from cacheflow.models.utils import get_cpu_memory
+from cacheflow.models.utils import get_dtype_size
+from cacheflow.models.utils import get_gpu_memory
+
+_GiB = 1 << 30    
+
+
+class CacheFlowMemoryAnalyzer:
+
+    def get_max_num_gpu_blocks(
+        self,
+        max_num_batched_tokens: int,
+        memory_utilization: float,
+    ) -> int:
+        raise NotImplementedError()
+
+    def get_max_num_cpu_blocks(
+        self,
+        memory_utilization: float,
+    ) -> int:
+        raise NotImplementedError()
+
+
+class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
+
+    def __init__(
+        self,
+        model_name: str,
+        block_size: int,
+        dtype: torch.dtype,
+    ) -> None:
+        self.model_name = model_name
+        self.block_size = block_size
+        self.dtype = dtype
+
+        # TODO(woosuk): Support tensor parallelism.
+        config = AutoConfig.from_pretrained(model_name)
+        self.num_layers = config.num_hidden_layers
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = config.hidden_size // self.num_heads
+        self.ffn_size = config.ffn_dim
+        self.embedding_size = config.word_embed_proj_dim
+        self.vocab_size = config.vocab_size
+        self.max_position = config.max_position_embeddings
+
+    def _get_param_size(self) -> int:
+        # TODO(woosuk): Support tensor parallelism.
+        word_embedding = self.vocab_size * self.embedding_size
+        if self.embedding_size != self.vocab_size:
+            # Project in/out.
+            word_embedding += 2 * self.embedding_size * self.vocab_size
+        position_embedding = self.max_position * self.hidden_size
+
+        ln1 = 2 * self.hidden_size
+        q = self.hidden_size * self.hidden_size + self.hidden_size
+        k = self.hidden_size * self.hidden_size + self.hidden_size
+        v = self.hidden_size * self.hidden_size + self.hidden_size
+        out = self.hidden_size * self.hidden_size + self.hidden_size
+        mha = ln1 + q + k + v + out
+
+        ln2 = 2 * self.hidden_size
+        ffn1 = self.hidden_size * self.ffn_size + self.ffn_size
+        ffn2 = self.ffn_size * self.hidden_size + self.hidden_size
+        ffn = ln2 + ffn1 + ffn2
+
+        total = (word_embedding + position_embedding + 
+                 self.num_layers * (mha + ffn))
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * total
+
+    def _get_max_act_size(
+        self,
+        max_num_batched_tokens: int,
+    ) -> int:
+        # TODO(woosuk): Support tensor parallelism.
+        # NOTE: We approxmiately calculate the maximum activation size by
+        # 1) estimating the maximum activation tensor size during inference, and
+        # 2) multiplying it by 4.
+        # Here, we assume that FlashAttention is used and
+        # thus the attention maps are never materialized in GPU DRAM.
+        qkv = 3 * (max_num_batched_tokens * self.hidden_size)
+        ffn = max_num_batched_tokens * self.ffn_size
+        max_act = 4 * max(qkv, ffn)
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * max_act
+
+    def _get_workspace_size(self) -> int:
+        return 1 * _GiB
+
+    def _get_cache_block_size(self) -> int:
+        key_cache_block = self.block_size * self.num_heads * self.head_size
+        value_cache_block = self.block_size * self.num_heads * self.head_size
+        total = self.num_layers * (key_cache_block + value_cache_block)
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * total
+
+    def get_max_num_gpu_blocks(
+        self,
+        max_num_batched_tokens: int,
+        memory_utilization: float = 0.95,
+    ) -> int:
+        # NOTE(woosuk): This assumes that the machine has homogeneous GPUs.
+        gpu_memory = get_gpu_memory()
+        usable_memory = int(memory_utilization * gpu_memory)
+
+        param_size = self._get_param_size()
+        act_size = self._get_max_act_size(max_num_batched_tokens)
+        workspace_size = self._get_workspace_size()
+
+        max_cache_size = usable_memory - (param_size + act_size + workspace_size)
+        max_num_blocks = max_cache_size // self._get_cache_block_size()
+        return max_num_blocks
+
+    def get_max_num_cpu_blocks(
+        self,
+        memory_utilization: float = 0.25,
+    ) -> int:
+        cpu_memory = get_cpu_memory()
+        usable_memory = int(memory_utilization * cpu_memory)
+        max_num_blocks = usable_memory // self._get_cache_block_size()
+        return max_num_blocks
--- a/cacheflow/models/model_utils.py
+++ b/cacheflow/models/model_utils.py
@ -1,21 +1,20 @@
-import random
 from typing import Union

-import numpy as np
 import torch
 import torch.nn as nn

+from cacheflow.models.memory_analyzer import CacheFlowMemoryAnalyzer
+from cacheflow.models.memory_analyzer import OPTMemoryAnalyzer
 from cacheflow.models.opt import OPTForCausalLM
+from cacheflow.models.utils import get_torch_dtype

-MODEL_CLASSES = {
+
+_MODELS = {
    'opt': OPTForCausalLM,
 }

-STR_DTYPE_TO_TORCH_DTYPE = {
-    'half': torch.half,
-    'float': torch.float,
-    'float16': torch.float16,
-    'float32': torch.float32,
+_MEMORY_ANALYZERS = {
+    'opt': OPTMemoryAnalyzer,
 }


@ -23,20 +22,23 @@ def get_model(
    model_name: str,
    dtype: Union[torch.dtype, str],
 ) -> nn.Module:
-    if isinstance(dtype, str):
-        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
-    else:
-        torch_dtype = dtype
-    for model_class, hf_model in MODEL_CLASSES.items():
+    torch_dtype = get_torch_dtype(dtype)
+    for model_class, hf_model in _MODELS.items():
        if model_class in model_name:
-            model = hf_model.from_pretrained(model_name, torch_dtype=torch_dtype)
+            model = hf_model.from_pretrained(
+                model_name, torch_dtype=torch_dtype)
            return model.eval()
-    raise ValueError(f'Invalid model name: {model_name}')
+    raise ValueError(f'Unsupported model name: {model_name}')


-def set_seed(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
+def get_memory_analyzer(
+    model_name: str,
+    block_size: int,
+    dtype: Union[torch.dtype, str],
+) -> CacheFlowMemoryAnalyzer:
+    torch_dtype = get_torch_dtype(dtype)
+    for model_class, memory_analyzer in _MEMORY_ANALYZERS.items():
+        if model_class in model_name:
+            return memory_analyzer(
+                model_name, block_size, torch_dtype)
+    raise ValueError(f'Unsupported model name: {model_name}')
--- a/cacheflow/models/utils.py
+++ b/cacheflow/models/utils.py
@ -0,0 +1,43 @@
+from typing import Union
+
+import random
+
+import numpy as np
+import psutil
+import torch
+
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    'half': torch.half,
+    'float': torch.float,
+    'float16': torch.float16,
+    'float32': torch.float32,
+}
+
+
+def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
+    if isinstance(dtype, str):
+        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
+    else:
+        torch_dtype = dtype
+    return torch_dtype
+
+
+def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
+    torch_dtype = get_torch_dtype(dtype)
+    return torch.tensor([], dtype=torch_dtype).element_size()
+
+
+def set_seed(seed: int) -> None:
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_gpu_memory(gpu: int = 0) -> int:
+    return torch.cuda.get_device_properties(gpu).total_memory
+
+
+def get_cpu_memory() -> int:
+    return psutil.virtual_memory().total