Use runtime profiling to replace manual memory analyzers (#81)
@@ -1,11 +1,12 @@
 from cacheflow.model_executor.input_metadata import InputMetadata
-from cacheflow.model_executor.model_loader import get_model, get_memory_analyzer
-from cacheflow.model_executor.utils import set_random_seed
+from cacheflow.model_executor.model_loader import get_model
+from cacheflow.model_executor.utils import (set_random_seed,
+                                            get_cache_block_size)


 __all__ = [
     "InputMetadata",
+    "get_cache_block_size",
     "get_model",
-    "get_memory_analyzer",
     "set_random_seed",
 ]
@@ -11,6 +11,8 @@ from cacheflow import pos_encoding_ops
 from cacheflow.model_executor.input_metadata import InputMetadata


+_SUPPORTED_HEAD_SIZES = [32, 64, 80, 96, 128, 160, 192, 256]
+
 class GPTCacheFlowAttention(nn.Module):
     """GPT-style multi-head attention.

@@ -39,11 +41,19 @@ class GPTCacheFlowAttention(nn.Module):
     5. Output a flattened 1D tensor.
     """

-    def __init__(self, scale: float) -> None:
+    def __init__(self, num_heads: int, head_size: int, scale: float) -> None:
         super().__init__()
+        self.num_heads = num_heads
+        self.head_size = head_size
         self.scale = float(scale)
         self.attn_op = xops.fmha.cutlass.FwOp()

+        if self.head_size not in _SUPPORTED_HEAD_SIZES:
+            raise ValueError(f'head_size ({self.head_size}) is not supported by '
+                             'the single_query_cached_kv_attention kernel. '
+                             'Use one of the following head sizes: '
+                             f'{_SUPPORTED_HEAD_SIZES}.')
+
     def multi_query_kv_attention(
         self,
         output: torch.Tensor,  # [num_prompt_tokens, num_heads, head_size]
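Note: the head-size check now runs once at construction time, against the module-level _SUPPORTED_HEAD_SIZES, instead of on every call to single_query_cached_kv_attention. A minimal sketch of the new behavior (illustration only, not part of the diff; the head counts are arbitrary):

    # Supported head size: constructed without error.
    attn = GPTCacheFlowAttention(num_heads=12, head_size=64, scale=64 ** -0.5)

    # Unsupported head size: now fails at __init__ rather than at the first decode step.
    try:
        GPTCacheFlowAttention(num_heads=12, head_size=48, scale=48 ** -0.5)
    except ValueError as exc:
        print(exc)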
@@ -74,14 +84,6 @@ class GPTCacheFlowAttention(nn.Module):
         value_cache: torch.Tensor,  # [num_blocks, num_heads, head_size, block_size]
         input_metadata: InputMetadata,
     ) -> None:
-        head_size = value_cache.shape[2]
-        supported_head_sizes = [32, 64, 80, 96, 128, 160, 192, 256]
-        if head_size not in supported_head_sizes:
-            raise ValueError(f'head_size ({head_size}) is not supported by '
-                             'the single_query_cached_kv_attention kernel. '
-                             'Use one of the following head sizes: '
-                             f'{supported_head_sizes}.')
-
         block_size = value_cache.shape[3]
         attention_ops.single_query_cached_kv_attention(
             output,
@@ -100,8 +102,8 @@ class GPTCacheFlowAttention(nn.Module):
         query: torch.Tensor,  # [num_tokens, num_heads * head_size]
         key: torch.Tensor,  # [num_tokens, num_heads * head_size]
         value: torch.Tensor,  # [num_tokens, num_heads * head_size]
-        key_cache: torch.Tensor,  # [num_blocks, num_heads, head_size/x, block_size, x]
-        value_cache: torch.Tensor,  # [num_blocks, num_heads, head_size, block_size]
+        key_cache: Optional[torch.Tensor],  # [num_blocks, num_heads, head_size/x, block_size, x]
+        value_cache: Optional[torch.Tensor],  # [num_blocks, num_heads, head_size, block_size]
         input_metadata: InputMetadata,
         cache_event: Optional[torch.cuda.Event],
     ) -> torch.Tensor:  # [num_tokens, num_heads * head_size]
@@ -109,11 +111,9 @@ class GPTCacheFlowAttention(nn.Module):
         # tensor of shape [num_tokens, 3 * num_heads * head_size].

         # Reshape the query, key, and value tensors.
-        num_heads = value_cache.shape[1]
-        head_size = value_cache.shape[2]
-        query = query.view(-1, num_heads, head_size)
-        key = key.view(-1, num_heads, head_size)
-        value = value.view(-1, num_heads, head_size)
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_heads, self.head_size)
+        value = value.view(-1, self.num_heads, self.head_size)

         # Pre-allocate the output tensor.
         output = torch.empty_like(query)
@@ -134,8 +134,11 @@ class GPTCacheFlowAttention(nn.Module):
             cache_event.wait()

         # Reshape the keys and values and store them in the cache.
+        # When key_cache and value_cache are not provided, the new key
+        # and value vectors will not be cached.
         num_valid_tokens = input_metadata.num_valid_tokens
-        if num_valid_tokens > 0:
+        if (num_valid_tokens > 0 and key_cache is not None
+                and value_cache is not None):
             # The stride is 3 because the key and value are sliced from qkv.
             cache_ops.reshape_and_cache(
                 key[:num_valid_tokens],
@@ -146,6 +149,10 @@ class GPTCacheFlowAttention(nn.Module):
             )

         if input_metadata.num_generation_tokens > 0:
+            assert key_cache is not None and value_cache is not None, (
+                "key_cache and value_cache must be provided when "
+                "generating tokens."
+            )
             # Compute the attention op for generation tokens.
             self.single_query_cached_kv_attention(
                 output[num_prompt_tokens:num_valid_tokens],
@@ -156,7 +163,7 @@ class GPTCacheFlowAttention(nn.Module):

         # Reshape the output tensor.
         # NOTE(woosuk): The output tensor may include paddings.
-        return output.view(-1, num_heads * head_size)
+        return output.view(-1, self.num_heads * self.head_size)
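Note: with key_cache and value_cache now Optional, a prompt-only forward pass can run before any cache blocks exist (the new keys and values are simply not written back), while generation still asserts that both caches are present. A hedged sketch of such a cache-less call, reusing the argument names from the signature above; `attn`, the query/key/value tensors, and `metadata` (with num_generation_tokens == 0) are assumed to be prepared elsewhere:

    output = attn(query, key, value,
                  key_cache=None, value_cache=None,
                  input_metadata=metadata, cache_event=None)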

 class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention):
@@ -164,12 +171,14 @@ class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention):

     def __init__(
         self,
+        num_heads: int,
+        head_size: int,
         scale: float,
         rotary_dim: int,
         max_position: int = 8192,
         base: int = 10000,
     ) -> None:
-        super().__init__(scale)
+        super().__init__(num_heads, head_size, scale)

         # Create the cos and sin cache.
         inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2) / rotary_dim))
@@ -199,12 +208,11 @@ class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention):
     ) -> torch.Tensor:  # [num_tokens, num_heads * head_size]
         # Apply rotary embedding to the query and key before passing them
         # to the attention op.
-        head_size = value_cache.shape[2]
         pos_encoding_ops.rotary_embedding_neox(
             positions,
             query,
             key,
-            head_size,
+            self.head_size,
             self.cos_sin_cache,
         )
         return super().forward(
@@ -74,7 +74,7 @@ class Sampler(nn.Module):
         # Apply top-p and top-k truncation.
         top_ps, top_ks = _get_top_p_top_k(input_metadata, self.vocab_size)
         assert len(top_ps) == len(top_ks) == probs.shape[0]
-        if any(p < 1.0 for p in top_ps) or any(k != -1 for k in top_ks):
+        if any(p < 1.0 for p in top_ps) or any(k != self.vocab_size for k in top_ks):
             probs = _apply_top_p_top_k(probs, top_ps, top_ks)

         # Sample the next tokens.
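Note: the "top-k disabled" sentinel changes from -1 to self.vocab_size, so the guard above only fires when some sequence actually restricts its candidate set. Keeping every token when k equals the vocabulary size makes the sentinel a no-op by construction; a generic illustration (not the sampler's actual helper):

    import torch

    def top_k_keep_mask(logits: torch.Tensor, k: int) -> torch.Tensor:
        # With k == vocab_size the threshold is the smallest logit, so every
        # token is kept, i.e. top-k truncation is effectively disabled.
        threshold = torch.topk(logits, k).values[..., -1, None]
        return logits >= threshold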
@@ -1,370 +0,0 @@
-import torch
-from transformers import AutoConfig
-
-from cacheflow.logger import init_logger
-from cacheflow.model_executor.utils import get_dtype_size
-
-logger = init_logger(__name__)
-
-_GiB = 1 << 30
-
-
-class CacheFlowMemoryAnalyzer:
-
-    def get_max_num_gpu_blocks(
-        self,
-        max_num_batched_tokens: int,
-        memory_utilization: float,
-    ) -> int:
-        raise NotImplementedError()
-
-    def get_workspace_size(self) -> int:
-        return 1 * _GiB
-
-    def get_cache_block_size(self) -> int:
-        raise NotImplementedError()
-
-    def get_max_num_cpu_blocks(
-        self,
-        swap_space_gib: int,
-    ) -> int:
-        swap_space = swap_space_gib * _GiB
-        cpu_memory = self.cpu_memory
-        if swap_space > 0.8 * cpu_memory:
-            raise ValueError(f'The swap space ({swap_space_gib:.2f} GiB) '
-                             'takes more than 80% of the available memory '
-                             f'({cpu_memory / _GiB:.2f} GiB).'
-                             'Please check the swap space size.')
-        if swap_space > 0.5 * cpu_memory:
-            logger.info(f'WARNING: The swap space ({swap_space_gib:.2f} GiB) '
-                        'takes more than 50% of the available memory '
-                        f'({cpu_memory / _GiB:.2f} GiB).'
-                        'This may slow the system performance.')
-        max_num_blocks = swap_space // self.get_cache_block_size()
-        return max_num_blocks
-
-    def get_param_size(self) -> int:
-        raise NotImplementedError()
-
-    def get_max_act_size(self, max_num_batched_tokens: int) -> int:
-        raise NotImplementedError()
-
-    def get_cache_block_size(self) -> int:
-        key_cache_block = self.block_size * self.hidden_size // self.tensor_parallel_size
-        value_cache_block = key_cache_block
-        total = self.num_layers * (key_cache_block + value_cache_block)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def get_max_num_gpu_blocks(
-        self,
-        max_num_batched_tokens: int,
-        memory_utilization: float = 0.95,
-    ) -> int:
-        # NOTE(woosuk): This assumes that the machine has homogeneous GPUs.
-        usable_memory = int(memory_utilization * self.gpu_memory)
-
-        param_size = self.get_param_size()
-        act_size = self.get_max_act_size(max_num_batched_tokens)
-        workspace_size = self.get_workspace_size()
-
-        max_cache_size = usable_memory - (param_size + act_size + workspace_size)
-        if max_cache_size <= 0:
-            raise RuntimeError('Not enough GPU memory.')
-        max_num_blocks = max_cache_size // self.get_cache_block_size()
-        return max_num_blocks
-
-
-class GPT2MemoryAnalyzer(CacheFlowMemoryAnalyzer):
-
-    def __init__(
-        self,
-        model_name: str,
-        block_size: int,
-        dtype: torch.dtype,
-        gpu_memory: int,
-        cpu_memory: int,
-        tensor_parallel_size: int,
-    ) -> None:
-        self.model_name = model_name
-        self.block_size = block_size
-        self.dtype = dtype
-        self.gpu_memory = gpu_memory
-        self.cpu_memory = cpu_memory
-        self.tensor_parallel_size = tensor_parallel_size
-
-        config = AutoConfig.from_pretrained(model_name)
-        self.num_layers = config.num_hidden_layers
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_size = config.hidden_size // self.num_heads
-        self.ffn_size = config.n_inner if config.n_inner is not None else 4 * self.hidden_size
-        self.vocab_size = config.vocab_size
-        self.max_position = config.max_position_embeddings
-
-    def get_param_size(self) -> int:
-        word_embedding = self.vocab_size * self.hidden_size // self.tensor_parallel_size
-        position_embedding = self.max_position * self.hidden_size
-
-        ln1 = 2 * self.hidden_size
-        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        mha = ln1 + q + k + v + out
-
-        ln2 = 2 * self.hidden_size
-        ffn1 = self.hidden_size * self.ffn_size // self.tensor_parallel_size + self.ffn_size
-        ffn2 = self.ffn_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        ffn = ln2 + ffn1 + ffn2
-
-        total = (word_embedding + position_embedding +
-                 self.num_layers * (mha + ffn))
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def get_max_act_size(
-        self,
-        max_num_batched_tokens: int,
-    ) -> int:
-        # NOTE: We approxmiately calculate the maximum activation size by
-        # estimating
-        # 1) the maximum activation tensor size during inference
-        # 2) the residual tensor size during inference
-        # Here, we assume that FlashAttention is used and
-        # thus the attention maps are never materialized in GPU DRAM.
-        residual = max_num_batched_tokens * self.hidden_size
-        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
-        ffn = max_num_batched_tokens * self.ffn_size // self.tensor_parallel_size
-        # Double the activation size for input and output.
-        max_act = 2 * (max(qkv, ffn) + residual)
-        # Size of output logits.
-        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
-        max_act = max(max_act, output_logits)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * max_act
-
-
-class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
-
-    def __init__(
-        self,
-        model_name: str,
-        block_size: int,
-        dtype: torch.dtype,
-        gpu_memory: int,
-        cpu_memory: int,
-        tensor_parallel_size: int,
-    ) -> None:
-        self.model_name = model_name
-        self.block_size = block_size
-        self.dtype = dtype
-        self.gpu_memory = gpu_memory
-        self.cpu_memory = cpu_memory
-        self.tensor_parallel_size = tensor_parallel_size
-
-        config = AutoConfig.from_pretrained(model_name)
-        self.num_layers = config.num_hidden_layers
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_size = config.hidden_size // self.num_heads
-        self.ffn_size = config.ffn_dim
-        self.embedding_size = config.word_embed_proj_dim
-        self.vocab_size = config.vocab_size
-        self.max_position = config.max_position_embeddings
-
-    def get_param_size(self) -> int:
-        word_embedding = self.vocab_size * self.embedding_size // self.tensor_parallel_size
-        if self.embedding_size != self.hidden_size:
-            # Project in/out.
-            word_embedding += 2 * self.embedding_size * self.hidden_size
-        position_embedding = self.max_position * self.hidden_size
-
-        ln1 = 2 * self.hidden_size
-        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        mha = ln1 + q + k + v + out
-
-        ln2 = 2 * self.hidden_size
-        ffn1 = self.hidden_size * self.ffn_size // self.tensor_parallel_size + self.ffn_size
-        ffn2 = self.ffn_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        ffn = ln2 + ffn1 + ffn2
-
-        total = (word_embedding + position_embedding +
-                 self.num_layers * (mha + ffn))
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def get_max_act_size(
-        self,
-        max_num_batched_tokens: int,
-    ) -> int:
-        # NOTE: We approxmiately calculate the maximum activation size by
-        # estimating
-        # 1) the maximum activation tensor size during inference
-        # 2) the residual tensor size during inference
-        # Here, we assume that we use memory-efficient attention which
-        # does not materialize the attention maps in GPU DRAM.
-        residual = max_num_batched_tokens * self.hidden_size
-        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
-        ffn = max_num_batched_tokens * self.ffn_size // self.tensor_parallel_size
-        # Double the activation size for input and output.
-        max_act = 2 * (max(qkv, ffn) + residual)
-        # Size of output logits.
-        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
-        max_act = max(max_act, output_logits)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * max_act
-
-
-class LlamaMemoryAnalyzer(CacheFlowMemoryAnalyzer):
-
-    def __init__(
-        self,
-        model_name: str,
-        block_size: int,
-        dtype: torch.dtype,
-        gpu_memory: int,
-        cpu_memory: int,
-        tensor_parallel_size: int,
-    ) -> None:
-        self.model_name = model_name
-        self.block_size = block_size
-        self.dtype = dtype
-        self.gpu_memory = gpu_memory
-        self.cpu_memory = cpu_memory
-        self.tensor_parallel_size = tensor_parallel_size
-
-        config = AutoConfig.from_pretrained(model_name)
-        self.num_layers = config.num_hidden_layers
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_size = config.hidden_size // self.num_heads
-        self.ffn_size = config.intermediate_size
-        self.vocab_size = config.vocab_size
-        self.max_position = 8192
-
-    def get_param_size(self) -> int:
-        # NOTE: LLaMA does not tie the two embeddings.
-        word_embedding = self.vocab_size * self.hidden_size // self.tensor_parallel_size
-        lm_head = self.vocab_size * self.hidden_size // self.tensor_parallel_size
-
-        # NOTE: LLaMA does not have bias terms.
-        ln1 = self.hidden_size
-        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size
-        # Rotary embedding.
-        # TODO(woosuk): Share the rotary embedding between layers.
-        rot = self.max_position * self.head_size
-        mha = ln1 + q + k + v + out + rot
-
-        ln2 = self.hidden_size
-        gate = self.hidden_size * self.ffn_size // self.tensor_parallel_size
-        down = self.ffn_size * self.hidden_size // self.tensor_parallel_size
-        up = self.hidden_size * self.ffn_size // self.tensor_parallel_size
-        ffn = ln2 + gate + down + up
-
-        total = word_embedding + self.num_layers * (mha + ffn) + lm_head
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def get_max_act_size(
-        self,
-        max_num_batched_tokens: int,
-    ) -> int:
-        # NOTE: We approxmiately calculate the maximum activation size by
-        # estimating
-        # 1) the maximum activation tensor size during inference
-        # 2) the residual tensor size during inference
-        # Here, we assume that we use memory-efficient attention which
-        # does not materialize the attention maps in GPU DRAM.
-        residual = max_num_batched_tokens * self.hidden_size
-        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
-        ffn = 2 * (max_num_batched_tokens * self.ffn_size) // self.tensor_parallel_size
-        # Double the activation size for input and output.
-        max_act = 2 * (max(qkv, ffn) + residual)
-        # Size of output logits.
-        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
-        max_act = max(max_act, output_logits)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * max_act
-
-
-class GPTNeoXMemoryAnalyzer(CacheFlowMemoryAnalyzer):
-
-    def __init__(
-        self,
-        model_name: str,
-        block_size: int,
-        dtype: torch.dtype,
-        gpu_memory: int,
-        cpu_memory: int,
-        tensor_parallel_size: int,
-    ) -> None:
-        self.model_name = model_name
-        self.block_size = block_size
-        self.dtype = dtype
-        self.gpu_memory = gpu_memory
-        self.cpu_memory = cpu_memory
-        self.tensor_parallel_size = tensor_parallel_size
-
-        config = AutoConfig.from_pretrained(model_name)
-        self.num_layers = config.num_hidden_layers
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_size = config.hidden_size // self.num_heads
-        self.ffn_size = config.intermediate_size
-        self.vocab_size = config.vocab_size
-        self.max_position = 8192
-        self.tie_word_embeddings = config.tie_word_embeddings
-
-    def get_param_size(self) -> int:
-        word_embedding = self.vocab_size * self.hidden_size // self.tensor_parallel_size
-        if self.tie_word_embeddings:
-            lm_head = 0
-        else:
-            lm_head = self.vocab_size * self.hidden_size // self.tensor_parallel_size
-
-        ln1 = 2 * self.hidden_size
-        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        # Rotary embedding.
-        # TODO(woosuk): Share the rotary embedding between layers.
-        rot = self.max_position * self.head_size
-        mha = ln1 + q + k + v + out + rot
-
-        ln2 = 2 * self.hidden_size
-        ffn1 = self.hidden_size * self.ffn_size // self.tensor_parallel_size + self.ffn_size
-        ffn2 = self.ffn_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
-        ffn = ln2 + ffn1 + ffn2
-
-        total = word_embedding + self.num_layers * (mha + ffn) + lm_head
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * total
-
-    def get_max_act_size(
-        self,
-        max_num_batched_tokens: int,
-    ) -> int:
-        # NOTE: We approxmiately calculate the maximum activation size by
-        # estimating
-        # 1) the maximum activation tensor size during inference
-        # 2) the residual tensor size during inference
-        # Here, we assume that we use memory-efficient attention which
-        # does not materialize the attention maps in GPU DRAM.
-        residual = max_num_batched_tokens * self.hidden_size
-        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
-        ffn = 2 * (max_num_batched_tokens * self.ffn_size) // self.tensor_parallel_size
-        # Double the activation size for input and output.
-        max_act = 2 * (max(qkv, ffn) + residual)
-        # Size of output logits.
-        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
-        max_act = max(max_act, output_logits)
-        dtype_size = get_dtype_size(self.dtype)
-        return dtype_size * max_act
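Note: the whole analyzer module above is deleted; per-architecture parameter and activation formulas are no longer needed once peak memory is measured by actually running the model. The profiling code itself is not part of this diff, so the following is only a hypothetical sketch of the general idea (none of these names come from the commit):

    import torch

    def profiled_num_gpu_blocks(run_profile_forward, cache_block_bytes: int,
                                gpu_memory_utilization: float = 0.95) -> int:
        # Run one worst-case forward pass without KV caches and read the peak.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        run_profile_forward()
        peak = torch.cuda.max_memory_allocated()
        total = torch.cuda.get_device_properties(0).total_memory
        free_for_cache = int(total * gpu_memory_utilization) - peak
        return max(free_for_cache // cache_block_bytes, 0)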
@@ -5,9 +5,6 @@ import torch
 import torch.nn as nn
 from transformers import AutoConfig, PretrainedConfig

-from cacheflow.model_executor.memory_analyzer import (
-    CacheFlowMemoryAnalyzer, GPT2MemoryAnalyzer, GPTNeoXMemoryAnalyzer,
-    LlamaMemoryAnalyzer, OPTMemoryAnalyzer)
 from cacheflow.model_executor.models import (
     GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
 from cacheflow.model_executor.utils import get_torch_dtype
@@ -22,14 +19,6 @@ _MODEL_REGISTRY = {
     "OPTForCausalLM": OPTForCausalLM,
 }

-_MEMORY_ANALYZERS = {
-    "GPT2LMHeadModel": GPT2MemoryAnalyzer,
-    "GPTNeoXForCausalLM": GPTNeoXMemoryAnalyzer,
-    "LlamaForCausalLM": LlamaMemoryAnalyzer,
-    "OPTForCausalLM": OPTMemoryAnalyzer,
-}
-
-
 def _get_model_architecture(config: PretrainedConfig) -> nn.Module:
     architectures = getattr(config, "architectures", [])
     for arch in architectures:
@@ -41,17 +30,6 @@ def _get_model_architecture(config: PretrainedConfig) -> nn.Module:
     )


-def _get_memory_analyzer(config: PretrainedConfig) -> CacheFlowMemoryAnalyzer:
-    architectures = getattr(config, "architectures", [])
-    for arch in architectures:
-        if arch in _MEMORY_ANALYZERS:
-            return _MEMORY_ANALYZERS[arch]
-    raise ValueError(
-        f"Model architectures {architectures} are not supported for now. "
-        f"Supported architectures: {list(_MEMORY_ANALYZERS.keys())}"
-    )
-
-
 def _get_dtype(config: PretrainedConfig, dtype: str) -> torch.dtype:
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
@@ -100,18 +78,3 @@ def get_model(
     model = model.cuda()
     return model.eval(), torch_dtype

-
-def get_memory_analyzer(
-    model_name: str,
-    block_size: int,
-    dtype: str,
-    gpu_memory: int,
-    cpu_memory: int,
-    tensor_parallel_size: int = 1,
-) -> CacheFlowMemoryAnalyzer:
-    config = AutoConfig.from_pretrained(model_name)
-    torch_dtype = _get_dtype(config, dtype)
-    memory_analyzer = _get_memory_analyzer(config)
-    return memory_analyzer(
-        model_name, block_size, torch_dtype, gpu_memory, cpu_memory,
-        tensor_parallel_size)
@@ -58,7 +58,8 @@ class GPT2Attention(nn.Module):
         self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size, bias=True,
                                         input_is_parallel=True,
                                         perform_initialization=False)
-        self.attn = GPTCacheFlowAttention(scale=self.scale)
+        self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim,
+                                          scale=self.scale)

     def forward(
         self,
@@ -62,7 +62,8 @@ class GPTNeoXAttention(nn.Module):
         scaling = self.head_size ** -0.5
         rotary_dim = int(self.head_size * config.rotary_pct)
         assert rotary_dim % 2 == 0
-        self.attn = GPTNeoXCacheFlowAttention(scaling, rotary_dim)
+        self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_size,
+                                              scaling, rotary_dim)

     def forward(
         self,
@@ -104,7 +104,8 @@ class LlamaAttention(nn.Module):
             input_is_parallel=True,
             perform_initialization=False,
         )
-        self.attn = GPTNeoXCacheFlowAttention(self.scaling, self.head_dim)
+        self.attn = GPTNeoXCacheFlowAttention(self.num_heads, self.head_dim,
+                                              self.scaling, rotary_dim=self.head_dim)

     def forward(
         self,
@@ -74,7 +74,8 @@ class OPTAttention(nn.Module):
         self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
                                           input_is_parallel=True,
                                           perform_initialization=False)
-        self.attn = GPTCacheFlowAttention(scale=self.scaling)
+        self.attn = GPTCacheFlowAttention(self.num_heads, self.head_dim,
+                                          scale=self.scaling)

     def forward(
         self,
@@ -40,3 +40,15 @@ def set_random_seed(seed: int) -> None:

     if model_parallel_is_initialized():
         model_parallel_cuda_manual_seed(seed)
+
+
+def get_cache_block_size(block_size: int,
+                         num_heads: int,
+                         head_size: int,
+                         num_layers: int,
+                         dtype: str) -> int:
+    key_cache_block = block_size * num_heads * head_size
+    value_cache_block = key_cache_block
+    total = num_layers * (key_cache_block + value_cache_block)
+    dtype_size = get_dtype_size(dtype)
+    return dtype_size * total
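Note: get_cache_block_size returns the bytes needed for one KV-cache block across all layers: block_size tokens x num_heads x head_size elements, doubled for keys and values, times num_layers and the dtype width. A worked example with illustrative figures (the specific model shape is not taken from the diff):

    # Hypothetical 40-layer model, 40 heads, head_size 128, block_size 16, fp16 (2 bytes).
    key_cache_block = 16 * 40 * 128                      # 81,920 elements per layer
    value_cache_block = key_cache_block
    total = 40 * (key_cache_block + value_cache_block)   # 6,553,600 elements
    block_bytes = 2 * total                              # 13,107,200 bytes, about 12.5 MiB per block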