Refactor system architecture (#82)

2023-05-09 15:30:12 -07:00
parent 8917782af6
commit 7c041ab578
40 changed files with 194 additions and 446 deletions
--- a/cacheflow/model_executor/__init__.py
+++ b/cacheflow/model_executor/__init__.py
@ -0,0 +1,11 @@
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.model_loader import get_model, get_memory_analyzer
+from cacheflow.model_executor.utils import set_random_seed
+
+
+# Names re-exported as the public API of this package.
+__all__ = [
+    "InputMetadata",
+    "get_model",
+    "get_memory_analyzer",
+    "set_random_seed",
+]
--- a/cacheflow/model_executor/input_metadata.py
+++ b/cacheflow/model_executor/input_metadata.py
@ -0,0 +1,52 @@
+from typing import List, Dict, Tuple
+
+import torch
+from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
+
+from cacheflow.sampling_params import SamplingParams
+
+
+class InputMetadata:
+    """Per-batch metadata consumed by the attention and sampling layers.
+
+    The constructor stores its arguments unchanged and derives a few
+    counts from them:
+
+    * ``attn_bias``: a ``BlockDiagonalCausalMask`` built from ``prompt_lens``.
+    * ``num_prompts`` / ``num_prompt_tokens``: length and sum of
+      ``prompt_lens``.
+    * ``num_generation_tokens``: ``context_lens.shape[0]``.
+    * ``num_valid_tokens``: ``slot_mapping.shape[0]``.
+    * ``max_num_blocks_per_seq``: ``block_tables.shape[1]``, or 0 when
+      ``block_tables`` has no elements.
+    """
+
+    def __init__(
+        self,
+        seq_groups: List[Tuple[List[int], SamplingParams]],
+        seq_logprobs: Dict[int, float],                         # Seq id -> cumulative logprobs.
+        prompt_lens: List[int],
+        slot_mapping: torch.Tensor,
+        context_lens: torch.Tensor,
+        max_context_len: int,
+        block_tables: torch.Tensor,
+    ) -> None:
+        self.seq_groups = seq_groups
+        self.seq_logprobs = seq_logprobs
+        self.prompt_lens = prompt_lens
+        self.slot_mapping = slot_mapping
+        self.context_lens = context_lens
+        self.max_context_len = max_context_len
+        self.block_tables = block_tables
+
+        self.attn_bias = BlockDiagonalCausalMask.from_seqlens(prompt_lens)
+        self.num_prompts = len(prompt_lens)
+        self.num_prompt_tokens = sum(prompt_lens)
+        self.num_generation_tokens = context_lens.shape[0]
+        self.num_valid_tokens = slot_mapping.shape[0]
+        # An empty block table may not have a second dimension to read.
+        if block_tables.numel() > 0:
+            self.max_num_blocks_per_seq = block_tables.shape[1]
+        else:
+            self.max_num_blocks_per_seq = 0
+        # One block-table row and one context length per generation token.
+        assert block_tables.shape[0] == self.num_generation_tokens
+        assert context_lens.shape[0] == self.num_generation_tokens
+
+    def __repr__(self) -> str:
+        # A single closing parenthesis belongs at the very end; the
+        # previous version closed it after `max_context_len` and
+        # `block_tables` and left the final field unterminated.
+        return (f'InputMetadata('
+                f'num_valid_tokens={self.num_valid_tokens}, '
+                f'num_prompt_tokens={self.num_prompt_tokens}, '
+                f'num_prompts={self.num_prompts}, '
+                f'prompt_lens={self.prompt_lens}, '
+                f'num_generation_tokens={self.num_generation_tokens}, '
+                f'context_lens={self.context_lens}, '
+                f'max_context_len={self.max_context_len}, '
+                f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
+                f'block_tables={self.block_tables}, '
+                f'slot_mapping={self.slot_mapping})')
--- a/cacheflow/model_executor/layers/activation.py
+++ b/cacheflow/model_executor/layers/activation.py
@ -0,0 +1,20 @@
+import torch
+import torch.nn as nn
+
+from cacheflow import activation_ops
+
+
+class SiluAndMul(nn.Module):
+    """Thin module wrapper around the ``activation_ops.silu_and_mul`` kernel.
+
+    The input has ``2 * d`` columns and the output has ``d`` columns; the
+    kernel fills a freshly allocated output buffer.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        x: torch.Tensor,        # (num_tokens, 2 * d)
+    ) -> torch.Tensor:          # (num_tokens, d)
+        rows, doubled_width = x.shape[0], x.shape[1]
+        # Allocate the result with the same dtype/device as the input; the
+        # kernel writes into it rather than returning a new tensor.
+        result = torch.empty(
+            rows, doubled_width // 2, dtype=x.dtype, device=x.device)
+        activation_ops.silu_and_mul(result, x)
+        return result
--- a/cacheflow/model_executor/layers/attention.py
+++ b/cacheflow/model_executor/layers/attention.py
@ -0,0 +1,191 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from xformers import ops as xops
+
+from cacheflow import attention_ops
+from cacheflow import cache_ops
+from cacheflow import pos_encoding_ops
+from cacheflow.model_executor.input_metadata import InputMetadata
+
+
+class GPTCacheFlowAttention(nn.Module):
+    """Attention layer handling prompt and generation tokens in one batch.
+
+    Prompt tokens (the first ``num_prompt_tokens`` rows) are processed with
+    xformers' memory-efficient attention. Generation tokens (rows
+    ``num_prompt_tokens:num_valid_tokens``) are processed by the
+    ``attention_ops.single_query_cached_kv_attention`` kernel, which is
+    given the key/value caches and the block tables from ``InputMetadata``.
+    """
+
+    def __init__(self, scale: float) -> None:
+        super().__init__()
+        self.scale = float(scale)
+        self.attn_op = xops.fmha.cutlass.FwOp()
+
+    def multi_query_kv_attention(
+        self,
+        output: torch.Tensor,                   # [num_prompt_tokens, num_heads, head_size]
+        query: torch.Tensor,                    # [num_prompt_tokens, num_heads, head_size]
+        key: torch.Tensor,                      # [num_prompt_tokens, num_heads, head_size]
+        value: torch.Tensor,                    # [num_prompt_tokens, num_heads, head_size]
+        attn_bias: xops.AttentionBias,
+    ) -> torch.Tensor:
+        """Run prompt attention and copy the result into ``output``.
+
+        Returns ``output`` itself (the return annotation previously said
+        ``None`` although the tensor was returned).
+        """
+        # TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize.
+        out = xops.memory_efficient_attention_forward(
+            query.unsqueeze(0),
+            key.unsqueeze(0),
+            value.unsqueeze(0),
+            attn_bias=attn_bias,
+            p=0.0,
+            scale=self.scale,
+            op=self.attn_op,
+        )
+        # TODO(woosuk): Unnecessary copy. Optimize.
+        output.copy_(out.squeeze(0))
+        return output
+
+    def single_query_cached_kv_attention(
+        self,
+        output: torch.Tensor,           # [num_generation_tokens, num_heads, head_size]
+        query: torch.Tensor,            # [num_generation_tokens, num_heads, head_size]
+        key_cache: torch.Tensor,        # [num_blocks, num_heads, head_size/x, block_size, x]
+        value_cache: torch.Tensor,      # [num_blocks, num_heads, head_size, block_size]
+        input_metadata: InputMetadata,
+    ) -> None:
+        """Run the cached-KV attention kernel for generation tokens.
+
+        The head size and block size are read from ``value_cache``'s shape.
+
+        Raises:
+            ValueError: if the head size is not in the supported list.
+        """
+        head_size = value_cache.shape[2]
+        supported_head_sizes = [32, 64, 80, 96, 128, 160, 192, 256]
+        if head_size not in supported_head_sizes:
+            raise ValueError(f'head_size ({head_size}) is not supported by '
+                             'the single_query_cached_kv_attention kernel. '
+                             'Use one of the following head sizes: '
+                             f'{supported_head_sizes}.')
+
+        block_size = value_cache.shape[3]
+        attention_ops.single_query_cached_kv_attention(
+            output,
+            query,
+            key_cache,
+            value_cache,
+            self.scale,
+            input_metadata.block_tables,
+            input_metadata.context_lens,
+            block_size,
+            input_metadata.max_context_len,
+        )
+
+    def forward(
+        self,
+        query: torch.Tensor,                    # [num_tokens, num_heads * head_size]
+        key: torch.Tensor,                      # [num_tokens, num_heads * head_size]
+        value: torch.Tensor,                    # [num_tokens, num_heads * head_size]
+        key_cache: torch.Tensor,                # [num_blocks, num_heads, head_size/x, block_size, x]
+        value_cache: torch.Tensor,              # [num_blocks, num_heads, head_size, block_size]
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:                          # [num_tokens, num_heads * head_size]
+        """Compute attention for the whole batch.
+
+        Steps, in order: prompt attention, wait on ``cache_event`` (if
+        given), store the new keys/values in the cache, then generation
+        attention. The result is returned flattened back to
+        ``[num_tokens, num_heads * head_size]``.
+        """
+        # NOTE: The query, key, and value tensors must be sliced from a qkv
+        # tensor of shape [num_tokens, 3 * num_heads * head_size].
+
+        # Reshape the query, key, and value tensors.
+        # The head layout is taken from the value cache's shape.
+        num_heads = value_cache.shape[1]
+        head_size = value_cache.shape[2]
+        query = query.view(-1, num_heads, head_size)
+        key = key.view(-1, num_heads, head_size)
+        value = value.view(-1, num_heads, head_size)
+
+        # Pre-allocate the output tensor.
+        output = torch.empty_like(query)
+
+        # Compute the attention op for prompts.
+        num_prompt_tokens = input_metadata.num_prompt_tokens
+        if num_prompt_tokens > 0:
+            self.multi_query_kv_attention(
+                output[:num_prompt_tokens],
+                query[:num_prompt_tokens],
+                key[:num_prompt_tokens],
+                value[:num_prompt_tokens],
+                input_metadata.attn_bias,
+            )
+
+        # Wait until the cache op is done.
+        if cache_event is not None:
+            cache_event.wait()
+
+        # Reshape the keys and values and store them in the cache.
+        num_valid_tokens = input_metadata.num_valid_tokens
+        if num_valid_tokens > 0:
+            # The stride is 3 because the key and value are sliced from qkv.
+            cache_ops.reshape_and_cache(
+                key[:num_valid_tokens],
+                value[:num_valid_tokens],
+                key_cache,
+                value_cache,
+                input_metadata.slot_mapping,
+            )
+
+        if input_metadata.num_generation_tokens > 0:
+            # Compute the attention op for generation tokens.
+            self.single_query_cached_kv_attention(
+                output[num_prompt_tokens:num_valid_tokens],
+                query[num_prompt_tokens:num_valid_tokens],
+                key_cache,
+                value_cache,
+                input_metadata)
+
+        # Reshape the output tensor.
+        # NOTE(woosuk): The output tensor may include paddings.
+        return output.view(-1, num_heads * head_size)
+
+
+class GPTNeoXCacheFlowAttention(GPTCacheFlowAttention):
+    """Attention with GPT-NeoX style rotary embedding."""
+
+    def __init__(
+        self,
+        scale: float,
+        rotary_dim: int,
+        max_position: int = 8192,
+        base: int = 10000,
+    ) -> None:
+        super().__init__(scale)
+
+        # Build the table of rotation angles: one row per position, one
+        # column per pair of rotary dimensions.
+        exponents = torch.arange(0, rotary_dim, 2) / rotary_dim
+        inv_freq = 1.0 / (base ** exponents)
+        position_ids = torch.arange(max_position).float()
+        angles = torch.einsum('i,j -> ij', position_ids, inv_freq.float())
+
+        # Cosines occupy the first half of each row, sines the second.
+        cos_sin = torch.cat((angles.cos(), angles.sin()), dim=-1)
+
+        # FIXME(woosuk): This assumes that we configure the default dtype when
+        # initializing the model. Make it more robust.
+        cos_sin = cos_sin.to(torch.get_default_dtype())
+        # Embedding size: [max_position, rotary_dim]
+        self.register_buffer('cos_sin_cache', cos_sin, persistent=False)
+
+    def forward(
+        self,
+        positions: torch.LongTensor,            # [num_tokens]
+        query: torch.Tensor,                    # [num_tokens, num_heads * head_size]
+        key: torch.Tensor,                      # [num_tokens, num_heads * head_size]
+        value: torch.Tensor,                    # [num_tokens, num_heads * head_size]
+        key_cache: torch.Tensor,                # [num_blocks, num_heads, head_size/x, block_size, x]
+        value_cache: torch.Tensor,              # [num_blocks, num_heads, head_size, block_size]
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:                          # [num_tokens, num_heads * head_size]
+        # Rotary embedding is applied to query and key first. The kernel's
+        # return value is not used, so it presumably updates them in place
+        # (NOTE(review): confirm against the kernel).
+        pos_encoding_ops.rotary_embedding_neox(
+            positions,
+            query,
+            key,
+            value_cache.shape[2],   # head_size
+            self.cos_sin_cache,
+        )
+        # The remaining work is the regular attention path of the parent.
+        return super().forward(
+            query, key, value, key_cache, value_cache, input_metadata,
+            cache_event)
--- a/cacheflow/model_executor/layers/layernorm.py
+++ b/cacheflow/model_executor/layers/layernorm.py
@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from cacheflow import layernorm_ops
+
+
+class RMSNorm(nn.Module):
+    """Module wrapper around the ``layernorm_ops.rms_norm`` kernel.
+
+    Holds a learnable ``weight`` of length ``hidden_size`` (initialised to
+    ones) and the epsilon passed to the kernel.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # The kernel writes into a buffer shaped like the input.
+        normalized = torch.empty_like(x)
+        layernorm_ops.rms_norm(
+            normalized, x, self.weight.data, self.variance_epsilon)
+        return normalized
--- a/cacheflow/model_executor/layers/sampler.py
+++ b/cacheflow/model_executor/layers/sampler.py
@ -0,0 +1,291 @@
+from typing import Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    gather_from_tensor_model_parallel_region)
+from cacheflow.sampling_params import SamplingParams
+from cacheflow.sequence import SequenceOutputs
+
+
+class Sampler(nn.Module):
+    """Turns hidden states into sampled next tokens.
+
+    The pipeline is: select the hidden states used for sampling, project
+    them onto the vocabulary, apply temperature, compute probabilities
+    and log probabilities, apply top-p truncation, then sample.
+    """
+
+    def __init__(self, vocab_size: int) -> None:
+        super().__init__()
+        self.vocab_size = vocab_size
+
+    def forward(
+        self,
+        embedding: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> Dict[int, SequenceOutputs]:
+        # Keep only the rows that are sampled from.
+        hidden_states = _prune_hidden_states(hidden_states, input_metadata)
+
+        # Project onto the vocabulary and gather across tensor-parallel
+        # ranks, then drop any vocabulary padding columns.
+        logits = gather_from_tensor_model_parallel_region(
+            torch.matmul(hidden_states, embedding.t()))
+        logits = logits[:, :self.vocab_size]
+
+        # Temperature scaling is skipped when every temperature is 1.
+        temperatures = _get_temperatures(input_metadata)
+        assert len(temperatures) == logits.shape[0]
+        if any(temp != 1.0 for temp in temperatures):
+            temp_tensor = torch.tensor(
+                temperatures, dtype=logits.dtype, device=logits.device)
+            # In-place division avoids allocating another logits tensor.
+            logits.div_(temp_tensor.unsqueeze(dim=1))
+
+        # Probabilities and log probabilities are kept in float32. The
+        # log probabilities are taken before top-p truncation.
+        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
+        logprobs = torch.log(probs)
+
+        # Top-p truncation is skipped when no request asks for it.
+        top_ps = _get_top_ps(input_metadata)
+        assert len(top_ps) == probs.shape[0]
+        if any(top_p < 1.0 for top_p in top_ps):
+            top_p_tensor = torch.tensor(
+                top_ps, dtype=probs.dtype, device=probs.device)
+            probs = _apply_top_p(probs, top_p_tensor)
+
+        # Sample the next tokens.
+        return _sample(probs, logprobs, input_metadata)
+
+
+def _prune_hidden_states(
+    hidden_states: torch.Tensor,
+    input_metadata: InputMetadata,
+) -> torch.Tensor:
+    """Select the rows of ``hidden_states`` that are sampled from.
+
+    For each prompt that is its last token; after the prompts, every one
+    of the ``num_generation_tokens`` rows is kept.
+    """
+    selected_rows: List[int] = []
+    offset = 0
+    for prompt_len in input_metadata.prompt_lens:
+        # `offset` ends up one past the prompt's last token.
+        offset += prompt_len
+        selected_rows.append(offset - 1)
+    # Generation tokens follow the prompts contiguously.
+    selected_rows += range(
+        offset, offset + input_metadata.num_generation_tokens)
+    return hidden_states[selected_rows]
+
+
+def _get_temperatures(
+    input_metadata: InputMetadata,
+) -> List[float]:
+    """Return one temperature per logits row.
+
+    A prompt group contributes one entry; a generation group contributes
+    one entry per sequence id. A temperature of 0 is reported as 1.
+    """
+    temperatures: List[float] = []
+    num_prompts = input_metadata.num_prompts
+    for group_idx, (seq_ids, sampling_params) in enumerate(
+            input_metadata.seq_groups):
+        temperature = sampling_params.temperature
+        if temperature == 0.0:
+            # NOTE: Zero temperature means deterministic sampling
+            # (i.e., greedy sampling or beam search).
+            # Use 1 so that the later division is well defined.
+            temperature = 1.0
+
+        # Prompts occupy a single row; generation groups one row per seq.
+        num_rows = 1 if group_idx < num_prompts else len(seq_ids)
+        temperatures.extend([temperature] * num_rows)
+    return temperatures
+
+
+def _get_top_ps(
+    input_metadata: InputMetadata,
+) -> List[float]:
+    """Return one ``top_p`` value per probability row.
+
+    A prompt group contributes one entry; a generation group contributes
+    one entry per sequence id.
+    """
+    top_ps: List[float] = []
+    num_prompts = input_metadata.num_prompts
+    for group_idx, (seq_ids, sampling_params) in enumerate(
+            input_metadata.seq_groups):
+        # Prompts occupy a single row; generation groups one row per seq.
+        num_rows = 1 if group_idx < num_prompts else len(seq_ids)
+        top_ps.extend([sampling_params.top_p] * num_rows)
+    return top_ps
+
+
+def _apply_top_p(
+    probs: torch.Tensor,
+    p: torch.Tensor,
+) -> torch.Tensor:
+    """Zero out the tail of each row beyond its top-p mass and renormalize.
+
+    An entry is dropped when the probability mass of all strictly larger
+    entries (in sorted order) already exceeds that row's ``p``. The result
+    is returned in the original column order; ``probs`` is not modified.
+    """
+    # TODO(woosuk): Optimize.
+    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
+    # Mass that precedes each entry in descending order.
+    mass_before = sorted_probs.cumsum(dim=-1) - sorted_probs
+    sorted_probs.masked_fill_(mass_before > p.unsqueeze(dim=1), 0.0)
+    sorted_probs.div_(sorted_probs.sum(dim=-1, keepdim=True))
+    # Undo the sort: argsort of the permutation is its inverse.
+    inverse_perm = torch.argsort(sorted_idx, dim=-1)
+    return torch.gather(sorted_probs, dim=-1, index=inverse_perm)
+
+
+def _get_topk_logprobs(
+    logprobs: torch.Tensor,
+    num_logprobs: int,
+) -> Dict[int, float]:
+    """Map the ``num_logprobs`` most likely token ids to their logprobs.
+
+    Returns an empty dict when ``num_logprobs`` is 0.
+    """
+    if num_logprobs == 0:
+        return {}
+
+    top_values, top_ids = torch.topk(logprobs, num_logprobs)
+    if num_logprobs == 1:
+        # Single-element result: read the scalars directly.
+        return {top_ids.item(): top_values.item()}
+    return dict(zip(top_ids.tolist(), top_values.tolist()))
+
+
+def _sample_from_prompt(
+    prob: torch.Tensor,
+    sampling_params: SamplingParams,
+) -> List[int]:
+    """Pick the next token id(s) for one prompt from its probability row.
+
+    Beam search takes the ``n`` most likely tokens, zero temperature takes
+    the single most likely token, and otherwise ``n`` tokens are drawn
+    with replacement.
+    """
+    if sampling_params.use_beam_search:
+        # Beam search: the beam width is `n`.
+        _, top_ids = torch.topk(prob, sampling_params.n)
+        return top_ids.tolist()
+
+    if sampling_params.temperature == 0.0:
+        # Greedy sampling yields exactly one sequence.
+        assert sampling_params.n == 1
+        return [torch.argmax(prob).item()]
+
+    # Nucleus sampling: draw n tokens for the prompt.
+    drawn = torch.multinomial(
+        prob, num_samples=sampling_params.n, replacement=True)
+    return drawn.tolist()
+
+
+def _sample_from_generation_tokens(
+    seq_ids: List[int],
+    probs: torch.Tensor,
+    logprobs: torch.Tensor,
+    seq_logprobs: List[float],
+    sampling_params: SamplingParams,
+) -> Tuple[List[int], List[int]]:
+    """Pick the next token for every running sequence of one group.
+
+    Returns ``(parent_seq_ids, next_token_ids)``, both aligned with
+    ``seq_ids``: entry ``k`` says which sequence ``seq_ids[k]`` continues
+    from and which token it appends. Outside beam search each sequence is
+    its own parent.
+    """
+    # NOTE(woosuk): sampling_params.n can be greater than
+    # len(seq_ids) because some sequences in the group might have
+    # been already terminated.
+    if sampling_params.use_beam_search:
+        # Beam search.
+        # Add cumulative logprobs for the sequences in the group.
+        seq_logprobs = torch.tensor(
+            seq_logprobs, dtype=torch.float, device=logprobs.device)
+        logprobs = logprobs + seq_logprobs.unsqueeze(dim=1)
+
+        # Take the best `beam_width` (sequence, token) pairs over the
+        # flattened scores, then split each flat index into a row index
+        # (which sequence) and a column index (which token).
+        vocab_size = logprobs.size(-1)
+        beam_width = len(seq_ids)
+        _, topk_ids = torch.topk(logprobs.flatten(), beam_width)
+        topk_ids = topk_ids.tolist()
+        seq_idx = [i // vocab_size for i in topk_ids]
+        beam_seq_ids = [seq_ids[i] for i in seq_idx]
+        token_ids = [i % vocab_size for i in topk_ids]
+
+        # seq id -> (parent seq id, token id) chosen for that slot.
+        beam_outputs: Dict[int, Tuple[int, int]] = {}
+        # Extra candidates from sequences that already have a slot.
+        outstanding_beams: List[Tuple[int, int]] = []
+        # If a beam survives, continue with it.
+        for seq_id, token_id in zip(beam_seq_ids, token_ids):
+            if seq_id not in beam_outputs:
+                beam_outputs[seq_id] = (seq_id, token_id)
+            else:
+                outstanding_beams.append((seq_id, token_id))
+
+        # If a beam is discarded, fork another beam.
+        # pop() hands out the most recently queued candidate first.
+        for seq_id in seq_ids:
+            if seq_id not in beam_outputs:
+                beam_outputs[seq_id] = outstanding_beams.pop()
+        assert not outstanding_beams
+
+        parent_seq_ids = [beam_outputs[seq_id][0] for seq_id in seq_ids]
+        next_token_ids = [beam_outputs[seq_id][1] for seq_id in seq_ids]
+    elif sampling_params.temperature == 0.0:
+        # Greedy sampling.
+        assert len(seq_ids) == 1
+        next_token_id = torch.argmax(probs, dim=-1)
+        next_token_ids = [next_token_id.item()]
+        parent_seq_ids = seq_ids
+    else:
+        # Nucleus sampling.
+        # Sample 1 token for each sequence in the group.
+        next_token_ids = torch.multinomial(
+            probs, num_samples=1, replacement=True)
+        next_token_ids = next_token_ids.squeeze(dim=-1).tolist()
+        parent_seq_ids = seq_ids
+    return parent_seq_ids, next_token_ids
+
+
+def _sample(
+    probs: torch.Tensor,
+    logprobs: torch.Tensor,
+    input_metadata: InputMetadata,
+) -> Dict[int, SequenceOutputs]:
+    """Sample next tokens for every sequence group in the batch.
+
+    Rows of ``probs`` / ``logprobs`` are consumed in order: one row per
+    prompt group, then one row per sequence of each generation group.
+
+    Returns:
+        A dict mapping each sequence id to its ``SequenceOutputs``, which
+        carries the parent sequence id, the sampled token, and the top-k
+        logprobs plus the logprob of the sampled token.
+    """
+    seq_outputs: Dict[int, SequenceOutputs] = {}
+
+    # TODO(woosuk): Optimize.
+    # Cursor over the rows of `probs` / `logprobs`.
+    idx = 0
+    for group_idx, seq_group in enumerate(input_metadata.seq_groups):
+        seq_ids, sampling_params = seq_group
+        if group_idx < input_metadata.num_prompts:
+            # Generate the next tokens for a prompt input.
+            assert len(seq_ids) == sampling_params.n
+            prob = probs[idx]
+            logprob = logprobs[idx]
+            idx += 1
+
+            # Sample the next tokens.
+            next_token_ids = _sample_from_prompt(prob, sampling_params)
+            # Get top-k log probabilities for the next tokens.
+            prompt_topk = _get_topk_logprobs(
+                logprob, sampling_params.num_logprobs)
+
+            # Build the output. Every sequence of the group shares the
+            # prompt's row, so each is recorded as its own parent.
+            for seq_id, next_token_id in zip(seq_ids, next_token_ids):
+                output_logprobs = prompt_topk.copy()
+                output_logprobs[next_token_id] = logprob[next_token_id].item()
+                seq_outputs[seq_id] = SequenceOutputs(
+                    seq_id, seq_id, next_token_id, output_logprobs)
+        else:
+            # Generate the next tokens for generation tokens.
+            num_seqs = len(seq_ids)
+            prob = probs[idx:idx + num_seqs]
+            logprob = logprobs[idx:idx + num_seqs]
+            idx += num_seqs
+
+            # Sample the next tokens.
+            seq_logprobs = [
+                input_metadata.seq_logprobs[seq_id] for seq_id in seq_ids]
+            parent_seq_ids, next_token_ids = _sample_from_generation_tokens(
+                seq_ids, prob, logprob, seq_logprobs, sampling_params)
+
+            # Get top-k log probabilities per sequence, and remember which
+            # row of `logprob` belongs to each sequence id. Distinct loop
+            # names keep the outer group index from being shadowed, and
+            # the row map replaces a list.index() call per output.
+            row_of_seq: Dict[int, int] = {}
+            topk_by_seq: Dict[int, Dict[int, float]] = {}
+            for row, seq_id in enumerate(seq_ids):
+                row_of_seq.setdefault(seq_id, row)
+                topk_by_seq[seq_id] = _get_topk_logprobs(
+                    logprob[row], sampling_params.num_logprobs)
+
+            # Build the output. Logprobs are read from the parent's row
+            # because that is the distribution the token was drawn from.
+            for seq_id, parent_seq_id, next_token_id in zip(
+                seq_ids, parent_seq_ids, next_token_ids):
+                parent_row = row_of_seq[parent_seq_id]
+                output_logprobs = topk_by_seq[parent_seq_id].copy()
+                output_logprobs[next_token_id] = (
+                    logprob[parent_row, next_token_id].item())
+                seq_outputs[seq_id] = SequenceOutputs(
+                    seq_id,
+                    parent_seq_id,
+                    next_token_id,
+                    output_logprobs,
+                )
+
+    return seq_outputs
--- a/cacheflow/model_executor/memory_analyzer.py
+++ b/cacheflow/model_executor/memory_analyzer.py
@ -0,0 +1,371 @@
+import torch
+from transformers import AutoConfig
+
+from cacheflow.logger import init_logger
+from cacheflow.model_executor.utils import get_dtype_size
+
+
+# Module-level logger.
+logger = init_logger(__name__)
+
+# Number of bytes in one GiB (2**30).
+_GiB = 1 << 30
+
+
+class CacheFlowMemoryAnalyzer:
+    """Base class for estimating how many KV-cache blocks fit in memory.
+
+    Subclasses implement ``get_param_size`` and ``get_max_act_size``.
+    The methods here read these attributes, which this class does not
+    set itself: ``block_size``, ``hidden_size``, ``num_layers``,
+    ``tensor_parallel_size``, ``dtype``, ``gpu_memory`` and ``cpu_memory``.
+    """
+
+    def get_workspace_size(self) -> int:
+        """Return the fixed workspace reservation (1 GiB) in bytes."""
+        return 1 * _GiB
+
+    def get_max_num_cpu_blocks(
+        self,
+        swap_space_gib: int,
+    ) -> int:
+        """Return how many cache blocks fit in ``swap_space_gib`` GiB.
+
+        Raises:
+            ValueError: if the swap space exceeds 80% of ``cpu_memory``.
+        """
+        swap_space = swap_space_gib * _GiB
+        cpu_memory = self.cpu_memory
+        if swap_space > 0.8 * cpu_memory:
+            raise ValueError(f'The swap space ({swap_space_gib:.2f} GiB) '
+                             'takes more than 80% of the available memory '
+                             f'({cpu_memory / _GiB:.2f} GiB). '
+                             'Please check the swap space size.')
+        if swap_space > 0.5 * cpu_memory:
+            logger.info(f'WARNING: The swap space ({swap_space_gib:.2f} GiB) '
+                        'takes more than 50% of the available memory '
+                        f'({cpu_memory / _GiB:.2f} GiB). '
+                        'This may slow the system performance.')
+        max_num_blocks = swap_space // self.get_cache_block_size()
+        return max_num_blocks
+
+    def get_param_size(self) -> int:
+        """Return the model parameter size in bytes (subclass hook)."""
+        raise NotImplementedError()
+
+    def get_max_act_size(self, max_num_batched_tokens: int) -> int:
+        """Return the peak activation size in bytes (subclass hook)."""
+        raise NotImplementedError()
+
+    def get_cache_block_size(self) -> int:
+        """Return the size in bytes of one cache block across all layers.
+
+        A block holds ``block_size * hidden_size // tensor_parallel_size``
+        elements for keys and the same number for values, per layer.
+        """
+        key_cache_block = self.block_size * self.hidden_size // self.tensor_parallel_size
+        value_cache_block = key_cache_block
+        total = self.num_layers * (key_cache_block + value_cache_block)
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * total
+
+    def get_max_num_gpu_blocks(
+        self,
+        max_num_batched_tokens: int,
+        memory_utilization: float = 0.95,
+    ) -> int:
+        """Return how many cache blocks fit in the remaining GPU memory.
+
+        The budget is ``memory_utilization * gpu_memory`` minus the
+        parameter, activation and workspace sizes.
+
+        Raises:
+            RuntimeError: if nothing is left for the cache.
+        """
+        # NOTE(woosuk): This assumes that the machine has homogeneous GPUs.
+        usable_memory = int(memory_utilization * self.gpu_memory)
+
+        param_size = self.get_param_size()
+        act_size = self.get_max_act_size(max_num_batched_tokens)
+        workspace_size = self.get_workspace_size()
+
+        max_cache_size = usable_memory - (param_size + act_size + workspace_size)
+        if max_cache_size <= 0:
+            raise RuntimeError('Not enough GPU memory.')
+        max_num_blocks = max_cache_size // self.get_cache_block_size()
+        return max_num_blocks
+
+
+class GPT2MemoryAnalyzer(CacheFlowMemoryAnalyzer):
+    """Memory analyzer whose model dimensions come from a GPT-2 style config.
+
+    The config is loaded with ``AutoConfig.from_pretrained(model_name)``.
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        block_size: int,
+        dtype: torch.dtype,
+        gpu_memory: int,
+        cpu_memory: int,
+        tensor_parallel_size: int,
+    ) -> None:
+        # Caller-supplied settings.
+        self.model_name = model_name
+        self.block_size = block_size
+        self.dtype = dtype
+        self.gpu_memory = gpu_memory
+        self.cpu_memory = cpu_memory
+        self.tensor_parallel_size = tensor_parallel_size
+
+        # Model dimensions read from the pretrained config.
+        config = AutoConfig.from_pretrained(model_name)
+        self.num_layers = config.num_hidden_layers
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = config.hidden_size // self.num_heads
+        # Fall back to 4x the hidden size when the config leaves it unset.
+        inner_size = config.n_inner
+        if inner_size is None:
+            inner_size = 4 * self.hidden_size
+        self.ffn_size = inner_size
+        self.vocab_size = config.vocab_size
+        self.max_position = config.max_position_embeddings
+
+    def get_param_size(self) -> int:
+        """Return the estimated parameter size in bytes."""
+        hidden = self.hidden_size
+        ffn_dim = self.ffn_size
+        tp_size = self.tensor_parallel_size
+
+        # Only the word embedding is divided by the tensor-parallel size.
+        word_embedding = self.vocab_size * hidden // tp_size
+        position_embedding = self.max_position * hidden
+
+        # Each layer norm is counted as 2 * hidden elements.
+        layer_norm = 2 * hidden
+
+        # q, k, v and the output projection are counted identically:
+        # a sharded hidden x hidden matrix plus `hidden` extra elements.
+        attn_proj = hidden * hidden // tp_size + hidden
+        mha = layer_norm + 4 * attn_proj
+
+        ffn_up = hidden * ffn_dim // tp_size + ffn_dim
+        ffn_down = ffn_dim * hidden // tp_size + hidden
+        ffn = layer_norm + ffn_up + ffn_down
+
+        num_elements = (word_embedding + position_embedding +
+                        self.num_layers * (mha + ffn))
+        return get_dtype_size(self.dtype) * num_elements
+
+    def get_max_act_size(
+        self,
+        max_num_batched_tokens: int,
+    ) -> int:
+        """Return the estimated peak activation size in bytes."""
+        # NOTE: This is an approximation built from
+        # 1) the largest activation tensor during inference, and
+        # 2) the residual tensor during inference.
+        # It assumes FlashAttention is used, so the attention maps are
+        # never materialized in GPU DRAM.
+        num_tokens = max_num_batched_tokens
+        tp_size = self.tensor_parallel_size
+
+        residual = num_tokens * self.hidden_size
+        qkv = 3 * (num_tokens * self.hidden_size) // tp_size
+        ffn = num_tokens * self.ffn_size // tp_size
+        # Doubled to account for both input and output.
+        layer_act = 2 * (max(qkv, ffn) + residual)
+        # Size of output logits.
+        output_logits = 2 * (num_tokens * self.vocab_size)
+        return get_dtype_size(self.dtype) * max(layer_act, output_logits)
+
+
+class OPTMemoryAnalyzer(CacheFlowMemoryAnalyzer):
+    """Memory analyzer whose model dimensions come from an OPT style config.
+
+    The config is loaded with ``AutoConfig.from_pretrained(model_name)``.
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        block_size: int,
+        dtype: torch.dtype,
+        gpu_memory: int,
+        cpu_memory: int,
+        tensor_parallel_size: int,
+    ) -> None:
+        # Caller-supplied settings.
+        self.model_name = model_name
+        self.block_size = block_size
+        self.dtype = dtype
+        self.gpu_memory = gpu_memory
+        self.cpu_memory = cpu_memory
+        self.tensor_parallel_size = tensor_parallel_size
+
+        # Model dimensions read from the pretrained config.
+        config = AutoConfig.from_pretrained(model_name)
+        self.num_layers = config.num_hidden_layers
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = config.hidden_size // self.num_heads
+        self.ffn_size = config.ffn_dim
+        self.embedding_size = config.word_embed_proj_dim
+        self.vocab_size = config.vocab_size
+        self.max_position = config.max_position_embeddings
+
+    def get_param_size(self) -> int:
+        """Return the estimated parameter size in bytes."""
+        hidden = self.hidden_size
+        embed = self.embedding_size
+        ffn_dim = self.ffn_size
+        tp_size = self.tensor_parallel_size
+
+        word_embedding = self.vocab_size * embed // tp_size
+        if embed != hidden:
+            # Project in/out.
+            word_embedding += 2 * embed * hidden
+        position_embedding = self.max_position * hidden
+
+        # Each layer norm is counted as 2 * hidden elements.
+        layer_norm = 2 * hidden
+
+        # q, k, v and the output projection are counted identically:
+        # a sharded hidden x hidden matrix plus `hidden` extra elements.
+        attn_proj = hidden * hidden // tp_size + hidden
+        mha = layer_norm + 4 * attn_proj
+
+        ffn_up = hidden * ffn_dim // tp_size + ffn_dim
+        ffn_down = ffn_dim * hidden // tp_size + hidden
+        ffn = layer_norm + ffn_up + ffn_down
+
+        num_elements = (word_embedding + position_embedding +
+                        self.num_layers * (mha + ffn))
+        return get_dtype_size(self.dtype) * num_elements
+
+    def get_max_act_size(
+        self,
+        max_num_batched_tokens: int,
+    ) -> int:
+        """Return the estimated peak activation size in bytes."""
+        # NOTE: This is an approximation built from
+        # 1) the largest activation tensor during inference, and
+        # 2) the residual tensor during inference.
+        # It assumes memory-efficient attention, which does not
+        # materialize the attention maps in GPU DRAM.
+        num_tokens = max_num_batched_tokens
+        tp_size = self.tensor_parallel_size
+
+        residual = num_tokens * self.hidden_size
+        qkv = 3 * (num_tokens * self.hidden_size) // tp_size
+        ffn = num_tokens * self.ffn_size // tp_size
+        # Doubled to account for both input and output.
+        layer_act = 2 * (max(qkv, ffn) + residual)
+        # Size of output logits.
+        output_logits = 2 * (num_tokens * self.vocab_size)
+        return get_dtype_size(self.dtype) * max(layer_act, output_logits)
+
+
+class LlamaMemoryAnalyzer(CacheFlowMemoryAnalyzer):
+
+    def __init__(
+        self,
+        model_name: str,
+        block_size: int,
+        dtype: torch.dtype,
+        gpu_memory: int,
+        cpu_memory: int,
+        tensor_parallel_size: int,
+    ) -> None:
+        self.model_name = model_name
+        self.block_size = block_size
+        self.dtype = dtype
+        self.gpu_memory = gpu_memory
+        self.cpu_memory = cpu_memory
+        self.tensor_parallel_size = tensor_parallel_size
+
+        config = AutoConfig.from_pretrained(model_name)
+        self.num_layers = config.num_hidden_layers
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = config.hidden_size // self.num_heads
+        self.ffn_size = config.intermediate_size
+        self.vocab_size = config.vocab_size
+        self.max_position = 8192
+
+    def get_param_size(self) -> int:
+        # NOTE: LLaMA does not tie the two embeddings.
+        word_embedding = self.vocab_size * self.hidden_size // self.tensor_parallel_size
+        lm_head = self.vocab_size * self.hidden_size // self.tensor_parallel_size
+
+        # NOTE: LLaMA does not have bias terms.
+        ln1 = self.hidden_size
+        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size
+        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size
+        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size
+        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size
+        # Rotary embedding.
+        # TODO(woosuk): Share the rotary embedding between layers.
+        rot = self.max_position * self.head_size
+        mha = ln1 + q + k + v + out + rot
+
+        ln2 = self.hidden_size
+        gate = self.hidden_size * self.ffn_size // self.tensor_parallel_size
+        down = self.ffn_size * self.hidden_size // self.tensor_parallel_size
+        up = self.hidden_size * self.ffn_size // self.tensor_parallel_size
+        ffn = ln2 + gate + down + up
+
+        total = word_embedding + self.num_layers * (mha + ffn) + lm_head
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * total
+
+    def get_max_act_size(
+        self,
+        max_num_batched_tokens: int,
+    ) -> int:
+        # NOTE: We approxmiately calculate the maximum activation size by
+        # estimating
+        # 1) the maximum activation tensor size during inference
+        # 2) the residual tensor size during inference
+        # Here, we assume that we use memory-efficient attention which
+        # does not materialize the attention maps in GPU DRAM.
+        residual = max_num_batched_tokens * self.hidden_size
+        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
+        ffn = 2 * (max_num_batched_tokens * self.ffn_size) // self.tensor_parallel_size
+        # Double the activation size for input and output.
+        max_act = 2 * (max(qkv, ffn) + residual)
+        # Size of output logits.
+        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
+        max_act = max(max_act, output_logits)
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * max_act
+
+
+class GPTNeoXMemoryAnalyzer(CacheFlowMemoryAnalyzer):
+
+    def __init__(
+        self,
+        model_name: str,
+        block_size: int,
+        dtype: torch.dtype,
+        gpu_memory: int,
+        cpu_memory: int,
+        tensor_parallel_size: int,
+    ) -> None:
+        self.model_name = model_name
+        self.block_size = block_size
+        self.dtype = dtype
+        self.gpu_memory = gpu_memory
+        self.cpu_memory = cpu_memory
+        self.tensor_parallel_size = tensor_parallel_size
+
+        config = AutoConfig.from_pretrained(model_name)
+        self.num_layers = config.num_hidden_layers
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = config.hidden_size // self.num_heads
+        self.ffn_size = config.intermediate_size
+        self.vocab_size = config.vocab_size
+        self.max_position = 8192
+        self.tie_word_embeddings = config.tie_word_embeddings
+
+    def get_param_size(self) -> int:
+        word_embedding = self.vocab_size * self.hidden_size // self.tensor_parallel_size
+        if self.tie_word_embeddings:
+            lm_head = 0
+        else:
+            lm_head = self.vocab_size * self.hidden_size // self.tensor_parallel_size
+
+        ln1 = 2 * self.hidden_size
+        q = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        k = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        v = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        out = self.hidden_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        # Rotary embedding.
+        # TODO(woosuk): Share the rotary embedding between layers.
+        rot = self.max_position * self.head_size
+        mha = ln1 + q + k + v + out + rot
+
+        ln2 = 2 * self.hidden_size
+        ffn1 = self.hidden_size * self.ffn_size // self.tensor_parallel_size + self.ffn_size
+        ffn2 = self.ffn_size * self.hidden_size // self.tensor_parallel_size + self.hidden_size
+        ffn = ln2 + ffn1 + ffn2
+
+        total = word_embedding + self.num_layers * (mha + ffn) + lm_head
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * total
+
+    def get_max_act_size(
+        self,
+        max_num_batched_tokens: int,
+    ) -> int:
+        # NOTE: We approxmiately calculate the maximum activation size by
+        # estimating
+        # 1) the maximum activation tensor size during inference
+        # 2) the residual tensor size during inference
+        # Here, we assume that we use memory-efficient attention which
+        # does not materialize the attention maps in GPU DRAM.
+        residual = max_num_batched_tokens * self.hidden_size
+        qkv = 3 * (max_num_batched_tokens * self.hidden_size) // self.tensor_parallel_size
+        ffn = 2 * (max_num_batched_tokens * self.ffn_size) // self.tensor_parallel_size
+        # Double the activation size for input and output.
+        max_act = 2 * (max(qkv, ffn) + residual)
+        # Size of output logits.
+        output_logits = 2 * (max_num_batched_tokens * self.vocab_size)
+        max_act = max(max_act, output_logits)
+        dtype_size = get_dtype_size(self.dtype)
+        return dtype_size * max_act
--- a/cacheflow/model_executor/model_loader.py
+++ b/cacheflow/model_executor/model_loader.py
@ -0,0 +1,103 @@
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import AutoConfig
+from transformers import PretrainedConfig
+
+from cacheflow.model_executor.memory_analyzer import (
+    CacheFlowMemoryAnalyzer, GPT2MemoryAnalyzer, GPTNeoXMemoryAnalyzer,
+    LlamaMemoryAnalyzer, OPTMemoryAnalyzer)
+from cacheflow.model_executor.models import (
+    GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
+from cacheflow.model_executor.utils import get_torch_dtype
+from cacheflow.model_executor.weight_utils import initialize_dummy_weights
+
+
+# Maps a substring of the model name to the model class to instantiate.
+# get_model() scans this dict in insertion order and uses the first key
+# that is contained in the requested model name.
+_MODELS = {
+    'gpt2': GPT2LMHeadModel,
+    'llama': LlamaForCausalLM,
+    'opt': OPTForCausalLM,
+    'stablelm': GPTNeoXForCausalLM,
+    'pythia': GPTNeoXForCausalLM,
+    'dolly-v2': GPTNeoXForCausalLM,
+}
+
+# Maps a substring of the model name to the memory analyzer class.
+# get_memory_analyzer() performs the same first-match substring lookup.
+_MEMORY_ANALYZERS = {
+    'gpt2': GPT2MemoryAnalyzer,
+    'llama': LlamaMemoryAnalyzer,
+    'opt': OPTMemoryAnalyzer,
+    'stablelm': GPTNeoXMemoryAnalyzer,
+    'pythia': GPTNeoXMemoryAnalyzer,
+    'dolly-v2': GPTNeoXMemoryAnalyzer,
+}
+
+
+def _get_dtype(config: PretrainedConfig, dtype: str) -> torch.dtype:
+    # NOTE: getattr(config, 'torch_dtype', torch.float32) is not correct
+    # because config.torch_dtype can be None.
+    config_dtype = getattr(config, 'torch_dtype', None)
+    if config_dtype is None:
+        config_dtype = torch.float32
+    if dtype == 'default':
+        if config_dtype == torch.float32:
+            # Following the common practice, we use float16 for float32 models.
+            torch_dtype = torch.float16
+        else:
+            torch_dtype = config_dtype
+    else:
+        torch_dtype = get_torch_dtype(dtype)
+        if torch_dtype != config_dtype and config_dtype != torch.float32:
+            # TODO(woosuk): Allow using float16 for bfloat16 models and
+            # vice versa. Print a warning message and continue.
+            raise ValueError(
+                f'Cannot use {torch_dtype} for {config_dtype} model.')
+    return torch_dtype
+
+
+def get_model(
+    model_name: str,
+    dtype: str,
+    cache_dir: Optional[str],
+    use_dummy_weights: bool,
+    use_np_cache: bool,
+) -> nn.Module:
+    config = AutoConfig.from_pretrained(model_name)
+    torch_dtype = _get_dtype(config, dtype)
+    torch.set_default_dtype(torch_dtype)
+    for model_class_name, model_class in _MODELS.items():
+        if model_class_name in model_name:
+            if use_dummy_weights:
+                # Create a model instance.
+                # The weights will be initialized as empty tensors.
+                model = model_class(config)
+                model = model.cuda()
+                # NOTE(woosuk): For precise performance evaluation, we assign
+                # random values to the weights.
+                initialize_dummy_weights(model)
+            else:
+                # Create a model instance.
+                model = model_class(config)
+                # Load the weights from the cached or downloaded files.
+                model.load_weights(model_name, cache_dir, use_np_cache)
+                model = model.cuda()
+            return model.eval(), torch_dtype
+    raise ValueError(f'Unsupported model name: {model_name}')
+
+
+def get_memory_analyzer(
+    model_name: str,
+    block_size: int,
+    dtype: str,
+    gpu_memory: int,
+    cpu_memory: int,
+    tensor_parallel_size: int = 1,
+) -> CacheFlowMemoryAnalyzer:
+    config = AutoConfig.from_pretrained(model_name)
+    torch_dtype = _get_dtype(config, dtype)
+    for model_class, memory_analyzer in _MEMORY_ANALYZERS.items():
+        if model_class in model_name:
+            return memory_analyzer(
+                model_name, block_size, torch_dtype, gpu_memory, cpu_memory,
+                tensor_parallel_size)
+    raise ValueError(f'Unsupported model name: {model_name}')
--- a/cacheflow/model_executor/models/init.py
+++ b/cacheflow/model_executor/models/init.py
@ -0,0 +1,12 @@
+from cacheflow.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+from cacheflow.model_executor.models.gpt2 import GPT2LMHeadModel
+from cacheflow.model_executor.models.llama import LlamaForCausalLM
+from cacheflow.model_executor.models.opt import OPTForCausalLM
+
+
+# Model classes re-exported as the public API of this package.
+__all__ = [
+    "GPT2LMHeadModel",
+    "GPTNeoXForCausalLM",
+    "LlamaForCausalLM",
+    "OPTForCausalLM",
+]
--- a/cacheflow/model_executor/models/gpt2.py
+++ b/cacheflow/model_executor/models/gpt2.py
@ -0,0 +1,261 @@
+"""1D GPT-2 model compatible with HuggingFace weights."""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPT2Config
+
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from cacheflow.sequence import SequenceOutputs
+
+# Per-layer cache pair, unpacked by the attention layer as
+# (key_cache, value_cache).
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class GPT2Attention(nn.Module):
+
+    def __init__(self, config: GPT2Config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim ** -0.5
+
+        self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size, bias=True,
+                                           gather_output=False,
+                                           perform_initialization=False)
+        self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size, bias=True,
+                                        input_is_parallel=True,
+                                        perform_initialization=False)
+        self.attn = GPTCacheFlowAttention(scale=self.scale)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        key_cache, value_cache = kv_cache
+        attn_output = self.attn(
+            q, k, v, key_cache, value_cache, input_metadata, cache_event)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPT2MLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPT2Config,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(hidden_size, intermediate_size,
+                                         bias=True, gather_output=False,
+                                         perform_initialization=False)
+        self.c_proj = RowParallelLinear(intermediate_size, hidden_size,
+                                        bias=True, input_is_parallel=True,
+                                        perform_initialization=False)
+
+        act_fn = config.activation_function
+        if act_fn != "gelu_new":
+            raise ValueError(f"Unsupported activation: {act_fn}. "
+                             "GPT-2 only supports gelu_new for now.")
+        self.act = torch.nn.GELU(approximate="tanh")
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPT2Block(nn.Module):
+
+    def __init__(self, config: GPT2Config):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPT2Attention(config)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPT2MLP(inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPT2Model(nn.Module):
+
+    def __init__(self, config: GPT2Config):
+        super().__init__()
+        self.config = config
+        assert config.add_cross_attention == False
+        assert config.scale_attn_by_inverse_layer_idx == False
+        assert config.reorder_and_upcast_attn == False
+        self.embed_dim = config.hidden_size
+
+        # Optimization: While the vocab size of GPT-2 is 50257, we extend it
+        # to 50304 in order to make it divisible by 64.
+        # This improves performance since GPUs are faster if the dimension
+        # is divisible by 64. In addition, it allows us to shard the embedding
+        # layer across 2, 4, 8, or more GPUs.
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.wte = VocabParallelEmbedding(vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList(
+            [GPT2Block(config) for _ in range(config.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        position_ids: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.h[i]
+            hidden_states = layer(
+                hidden_states, kv_caches[i], input_metadata, cache_event)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPT2LMHeadModel(nn.Module):
+    """GPT-2 with a language-model head tied to the token embedding.
+
+    `forward` runs the transformer and passes the hidden states, together
+    with the tied embedding weight, to the sampler.
+    """
+
+    def __init__(self, config: GPT2Config):
+        super().__init__()
+        self.config = config
+        self.transformer = GPT2Model(config)
+        # The LM head reuses the token embedding weight (same tensor object).
+        # TODO(zhuohan): create a new weight after implementing pipeline
+        #                parallelism
+        self.lm_head_weight = self.transformer.wte.weight
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        """Run the transformer and return the sampler's output."""
+        hidden_states = self.transformer(
+            input_ids, positions, kv_caches, input_metadata, cache_events)
+        next_tokens = self.sampler(
+            self.lm_head_weight, hidden_states, input_metadata)
+        return next_tokens
+
+    # Name patterns handed to load_tensor_parallel_weights() to tell it
+    # which parameters are column-parallel and which are row-parallel.
+    _column_parallel_weights = ["wte.weight", "c_fc.weight", "c_fc.bias"]
+    _row_parallel_weights = ["c_proj.weight"]
+
+    def load_weights(self, model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        """Load HuggingFace GPT-2 weights into this (possibly sharded) model.
+
+        For each checkpoint tensor this skips the tied LM head and the
+        attention-mask buffers, transposes Conv1D weights, pads the token
+        embedding up to the padded vocab size, slices the fused QKV tensors
+        down to this rank's heads, and finally delegates the copy to
+        `load_tensor_parallel_weights`.
+        """
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+            model_name_or_path, cache_dir, use_np_cache):
+            if "lm_head.weight" in name:
+                # GPT-2 ties the weights of the embedding layer and the final
+                # linear layer.
+                continue
+            if ".attn.bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+            # Checkpoint names lack the "transformer." prefix that this
+            # module's state dict uses.
+            name = "transformer." + name
+
+            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+            # Because of this, we need to transpose the weights.
+            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+                if conv1d_weight_name not in name:
+                    continue
+                if not name.endswith(".weight"):
+                    continue
+                loaded_weight = loaded_weight.t()
+            param = state_dict[name]
+
+            if name == "transformer.wte.weight":
+                # Consider padding in the vocab size.
+                # param holds this rank's shard, so multiplying its row count
+                # by the world size gives the padded vocab size.
+                padded_vocab_size = param.shape[0] * tensor_model_parallel_world_size
+                num_extra_rows = padded_vocab_size - self.config.vocab_size
+                # NOTE(review): torch.empty leaves the padding rows
+                # uninitialized; presumably they are never read because the
+                # sampler is built with config.vocab_size -- confirm.
+                extra_rows = torch.empty(num_extra_rows, loaded_weight.shape[1])
+                extra_rows = extra_rows.to(loaded_weight)
+                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+
+            # For the fused QKV linear layer, manually shard the weights.
+            if "c_attn" in name:
+                # GPT-2's fused QKV has the shape of [3 * num_heads * head_size, hidden_size].
+                # When tensor parallelism is used, we shard the weights along the head dimension.
+                total_num_heads = self.config.num_attention_heads
+                hidden_size = self.config.hidden_size
+                head_size = hidden_size // total_num_heads
+                num_heads = total_num_heads // tensor_model_parallel_world_size
+                head_start = tensor_model_parallel_rank * num_heads
+                head_end = (tensor_model_parallel_rank + 1) * num_heads
+
+                if name.endswith(".weight"):
+                    # Expose the (q/k/v, head) axes, keep this rank's heads,
+                    # then flatten back to a 2-D matrix.
+                    loaded_weight = loaded_weight.view(3, total_num_heads, head_size, hidden_size)
+                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
+                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
+                elif name.endswith(".bias"):
+                    # Same head slicing for the 1-D bias.
+                    loaded_weight = loaded_weight.view(3, total_num_heads, head_size)
+                    loaded_weight = loaded_weight[:, head_start:head_end, :]
+                    loaded_weight = loaded_weight.reshape(-1)
+                else:
+                    raise ValueError(f"Unexpected parameter name {name}")
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
--- a/cacheflow/model_executor/models/gpt_neox.py
+++ b/cacheflow/model_executor/models/gpt_neox.py
@ -0,0 +1,231 @@
+"""1D GPT-NeoX model compatible with HuggingFace weights."""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPTNeoXConfig
+
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from cacheflow.sequence import SequenceOutputs
+
+# Per-layer cache pair, unpacked by the attention layer as
+# (k_cache, v_cache).
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class GPTNeoXAttention(nn.Module):
+
+    def __init__(self, config: GPTNeoXConfig):
+        super().__init__()
+        self.total_num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.total_num_heads
+
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+
+        self.query_key_value = ColumnParallelLinear(config.hidden_size,
+                                                    3 * config.hidden_size,
+                                                    gather_output=False,
+                                                    perform_initialization=False)
+        self.dense = RowParallelLinear(config.hidden_size, config.hidden_size,
+                                       input_is_parallel=True,
+                                       perform_initialization=False)
+
+        scaling = self.head_size ** -0.5
+        rotary_dim = int(self.head_size * config.rotary_pct)
+        assert rotary_dim % 2 == 0
+        self.attn = GPTNeoXCacheFlowAttention(scaling, rotary_dim)
+
+    def forward(
+        self,
+        position_ids: torch.LongTensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(
+            position_ids, q, k, v, k_cache, v_cache, input_metadata, cache_event)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class GPTNeoXMLP(nn.Module):
+    def __init__(self, config: GPTNeoXConfig):
+        super().__init__()
+        self.dense_h_to_4h = ColumnParallelLinear(config.hidden_size,
+                                                  config.intermediate_size,
+                                                  gather_output=False,
+                                                  perform_initialization=False)
+        self.dense_4h_to_h = RowParallelLinear(config.intermediate_size, config.hidden_size,
+                                               input_is_parallel=True,
+                                               perform_initialization=False)
+        if config.hidden_act != 'gelu':
+            raise ValueError(f'Unsupported activation: {config.hidden_act}. '
+                             'Only gelu is supported for now.')
+        self.act = torch.nn.GELU()
+
+    def forward(self, hidden_states):
+        hidden_states, _ = self.dense_h_to_4h(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.dense_4h_to_h(hidden_states)
+        return hidden_states
+
+
+class GPTNeoXLayer(nn.Module):
+
+    def __init__(self, config: GPTNeoXConfig):
+        super().__init__()
+        self.use_parallel_residual = config.use_parallel_residual
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = GPTNeoXAttention(config)
+        self.mlp = GPTNeoXMLP(config)
+
+    def forward(
+        self,
+        position_ids: torch.LongTensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output = self.attention(
+            position_ids=position_ids,
+            hidden_states=attn_input,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+
+        if self.use_parallel_residual:
+            # pseudocode:
+            # x = x + attn(ln1(x)) + mlp(ln2(x))
+            mlp_input = self.post_attention_layernorm(hidden_states)
+            mlp_output = self.mlp(mlp_input)
+            hidden_states = mlp_output + attn_output + hidden_states
+        else:
+            # pseudocode:
+            # x = x + attn(ln1(x))
+            # x = x + mlp(ln2(x))
+            attn_output = attn_output + hidden_states
+            mlp_input = self.post_attention_layernorm(attn_output)
+            mlp_output = self.mlp(mlp_input)
+            hidden_states = mlp_output + attn_output
+        return hidden_states
+
+
+class GPTNeoXModel(nn.Module):
+    def __init__(self, config: GPTNeoXConfig):
+        super().__init__()
+        self.config = config
+
+        self.embed_in = VocabParallelEmbedding(config.vocab_size, config.hidden_size,
+                                               perform_initialization=False)
+        self.layers = nn.ModuleList([GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        position_ids: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_in(input_ids)
+        for i in range(len(self.layers)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.layers[i]
+            hidden_states = layer(
+                position_ids,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.final_layer_norm(hidden_states)
+        return hidden_states
+
+
+class GPTNeoXForCausalLM(nn.Module):
+    """GPT-NeoX with a separate output embedding used as the LM head.
+
+    `forward` runs the backbone and passes the hidden states, together with
+    the `embed_out` weight, to the sampler.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.gpt_neox = GPTNeoXModel(config)
+        # Only the weight of this layer is used (see forward); the layer
+        # itself is never called in this class.
+        self.embed_out = ColumnParallelLinear(config.hidden_size, config.vocab_size,
+                                              bias=False, gather_output=False,
+                                              perform_initialization=False)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        """Run the backbone and return the sampler's output."""
+        hidden_states = self.gpt_neox(
+            input_ids, positions, kv_caches, input_metadata, cache_events)
+        next_tokens = self.sampler(
+            self.embed_out.weight, hidden_states, input_metadata)
+        return next_tokens
+
+    # Name patterns handed to load_tensor_parallel_weights() to tell it
+    # which parameters are column-parallel and which are row-parallel.
+    _column_parallel_weights = ["embed_in.weight", "embed_out.weight", "dense_h_to_4h.weight", "dense_h_to_4h.bias"]
+    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]
+
+    def load_weights(self, model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        """Load HuggingFace GPT-NeoX weights into this (possibly sharded) model.
+
+        Attention-mask and rotary-frequency buffers are skipped. Fused QKV
+        tensors are sliced to this rank's shard and converted from the
+        checkpoint's per-head [q, k, v] layout to the [all q, all k, all v]
+        layout this model expects. Every tensor is then copied via
+        `load_tensor_parallel_weights`.
+        """
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+        for name, loaded_weight in hf_model_weights_iterator(
+            model_name_or_path, cache_dir, use_np_cache):
+            # These checkpoint entries have no counterpart that is loaded
+            # here, so they are ignored.
+            if ("attention.bias" in name or "attention.masked_bias" in name
+                or "rotary_emb.inv_freq" in name):
+                continue
+            param = state_dict[name]
+            if "query_key_value" in name:
+                # NOTE(woosuk): GPT-NeoX's fused QKV has the shape of
+                # [num_heads * 3 * head_size, hidden_size], while the
+                # required shape is [3 * num_heads * head_size, hidden_size].
+                # Thus, we need weight conversion.
+                # First take this rank's contiguous slice of rows; param is
+                # already the per-rank shard, so its first dimension is the
+                # shard size.
+                shard_size = param.shape[0]
+                loaded_weight = loaded_weight[shard_size * tensor_model_parallel_rank
+                                              :shard_size * (tensor_model_parallel_rank + 1)]
+
+                num_heads = self.config.num_attention_heads
+                hidden_size = self.config.hidden_size
+                head_size = hidden_size // num_heads
+                if 'query_key_value.weight' in name:
+                    # [heads, 3, head_size, hidden] -> [3, heads, head_size,
+                    # hidden], then flatten back to a 2-D matrix.
+                    loaded_weight = loaded_weight.view(-1, 3, head_size, hidden_size)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
+                elif 'query_key_value.bias' in name:
+                    # Same reordering for the 1-D bias.
+                    loaded_weight = loaded_weight.view(-1, 3, head_size)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1)
+                else:
+                    raise ValueError(f"Unexpected weight name: {name}")
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
--- a/cacheflow/model_executor/models/llama.py
+++ b/cacheflow/model_executor/models/llama.py
@ -0,0 +1,267 @@
+"""1D LLaMA model compatible with HuggingFace weights."""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from cacheflow.sequence import SequenceOutputs
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.activation import SiluAndMul
+from cacheflow.model_executor.layers.layernorm import RMSNorm
+from cacheflow.model_executor.layers.attention import GPTNeoXCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from cacheflow.sequence import SequenceOutputs
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class LlamaMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+    ):
+        super().__init__()
+        self.gate_up_proj = ColumnParallelLinear(hidden_size, 2 * intermediate_size,
+                                                 bias=False, gather_output=False,
+                                                 perform_initialization=False)
+        self.down_proj = RowParallelLinear(intermediate_size, hidden_size,
+                                           bias=False, input_is_parallel=True,
+                                           perform_initialization=False)
+        if hidden_act != 'silu':
+            raise ValueError(f'Unsupported activation: {hidden_act}. '
+                             'Only silu is supported for now.')
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class LlamaAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = hidden_size // self.total_num_heads
+        self.scaling = self.head_dim ** -0.5
+
+        self.qkv_proj = ColumnParallelLinear(
+            hidden_size,
+            3 * self.total_num_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+        self.attn = GPTNeoXCacheFlowAttention(self.scaling, self.head_dim)
+
+    def forward(
+        self,
+        positions: torch.LongTensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(
+            positions, q, k, v, k_cache, v_cache, input_metadata, cache_event)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class LlamaDecoderLayer(nn.Module):
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = LlamaAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+        )
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.LongTensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class LlamaModel(nn.Module):
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size,
+                                                   perform_initialization=False)
+        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class LlamaForCausalLM(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.model = LlamaModel(config)
+        self.lm_head = ColumnParallelLinear(config.hidden_size,
+                                            config.vocab_size,
+                                            bias=False,
+                                            gather_output=False,
+                                            perform_initialization=False)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.model(
+            input_ids, positions, kv_caches, input_metadata, cache_events)
+        next_tokens = self.sampler(
+            self.lm_head.weight, hidden_states, input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = ["embed_tokens.weight", "lm_head.weight",
+                                "qkv_proj.weight", "gate_proj.weight",
+                                "up_proj.weight"]
+    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
+
+    def load_weights(self, model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+            model_name_or_path, cache_dir, use_np_cache):
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            is_attention_weight = False
+            for stride_id, att_weight_name in enumerate(["q_proj", "k_proj", "v_proj"]):
+                if att_weight_name not in name:
+                    continue
+                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
+                shard_size = param.shape[0] // 3
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank
+                    :shard_size * (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id
+                                         :shard_size * (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_attention_weight = True
+                break
+            if is_attention_weight:
+                continue
+
+            is_gate_up_weight = False
+            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
+                if weight_name not in name:
+                    continue
+                param = state_dict[name.replace(weight_name, "gate_up_proj")]
+                shard_size = param.shape[0] // 2
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank
+                    :shard_size * (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id
+                                         :shard_size * (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_gate_up_weight = True
+                break
+            if is_gate_up_weight:
+                continue
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
--- a/cacheflow/model_executor/models/opt.py
+++ b/cacheflow/model_executor/models/opt.py
@ -0,0 +1,291 @@
+"""1D OPT model compatible with HuggingFace weights."""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import OPTConfig
+
+from cacheflow.model_executor.input_metadata import InputMetadata
+from cacheflow.model_executor.layers.attention import GPTCacheFlowAttention
+from cacheflow.model_executor.layers.sampler import Sampler
+from cacheflow.model_executor.weight_utils import (hf_model_weights_iterator,
+                                                   load_tensor_parallel_weights)
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from cacheflow.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from cacheflow.sequence import SequenceOutputs
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class OPTLearnedPositionalEmbedding(nn.Embedding):
+
+    def __init__(self, num_embeddings: int, embedding_dim: int):
+        # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
+        # and adjust num_embeddings appropriately. Other models don't have this hack
+        self.offset = 2
+        super().__init__(num_embeddings + self.offset, embedding_dim)
+
+    def forward(self, positions: torch.LongTensor):
+        return super().forward(positions + self.offset)
+
+
+class OPTAttention(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        self.embed_dim = embed_dim
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        total_num_heads = num_heads
+        assert num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = embed_dim // total_num_heads
+        self.scaling = self.head_dim ** -0.5
+
+        self.qkv_proj = ColumnParallelLinear(embed_dim, 3 * embed_dim, bias=bias,
+                                             gather_output=False,
+                                             perform_initialization=False)
+        self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
+                                          input_is_parallel=True,
+                                          perform_initialization=False)
+        self.attn = GPTCacheFlowAttention(scale=self.scaling)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        key_cache, value_cache = kv_cache
+        attn_output = self.attn(
+            q, k, v, key_cache, value_cache, input_metadata, cache_event)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class OPTDecoderLayer(nn.Module):
+
+    def __init__(self, config: OPTConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.self_attn = OPTAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.num_attention_heads,
+            bias=config.enable_bias,
+        )
+        self.do_layer_norm_before = config.do_layer_norm_before
+        assert config.activation_function == 'relu'
+        self.activation_fn = nn.ReLU()
+
+        self.self_attn_layer_norm = nn.LayerNorm(
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
+        self.fc1 = ColumnParallelLinear(self.embed_dim, config.ffn_dim,
+                                        bias=config.enable_bias,
+                                        gather_output=False,
+                                        perform_initialization=False)
+        self.fc2 = RowParallelLinear(config.ffn_dim, self.embed_dim,
+                                     bias=config.enable_bias,
+                                     input_is_parallel=True,
+                                     perform_initialization=False)
+        self.final_layer_norm = nn.LayerNorm(
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
+        if self.do_layer_norm_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event)
+        hidden_states = residual + hidden_states
+        # 350m applies layer norm AFTER attention
+        if not self.do_layer_norm_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
+        if self.do_layer_norm_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+        # 350m applies layer norm AFTER attention
+        if not self.do_layer_norm_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+        return hidden_states
+
+
+class OPTDecoder(nn.Module):
+
+    def __init__(self, config: OPTConfig):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_position_embeddings
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+                                                   config.word_embed_proj_dim,
+                                                   perform_initialization=False)
+        # Positional embeddings are replicated (not sharded).
+        self.embed_positions = OPTLearnedPositionalEmbedding(
+            config.max_position_embeddings, config.hidden_size)
+
+        # Project out & in will be replicated if they exist.
+        if config.word_embed_proj_dim != config.hidden_size:
+            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
+        else:
+            self.project_out = None
+
+        if config.word_embed_proj_dim != config.hidden_size:
+            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
+        else:
+            self.project_in = None
+
+        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
+        # with checkpoints that have been fine-tuned before transformers v4.20.1
+        # see https://github.com/facebookresearch/metaseq/pull/164
+        if config.do_layer_norm_before and not config._remove_final_layer_norm:
+            self.final_layer_norm = nn.LayerNorm(
+                config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
+            )
+        else:
+            self.final_layer_norm = None
+
+        self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        inputs_embeds = self.embed_tokens(input_ids)
+        pos_embeds = self.embed_positions(positions)
+        if self.project_in is not None:
+            inputs_embeds = self.project_in(inputs_embeds)
+        hidden_states = inputs_embeds + pos_embeds
+
+        for i in range(len(self.layers)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.layers[i]
+            hidden_states = layer(
+                hidden_states, kv_caches[i], input_metadata, cache_event)
+
+        if self.final_layer_norm is not None:
+            hidden_states = self.final_layer_norm(hidden_states)
+        if self.project_out is not None:
+            hidden_states = self.project_out(hidden_states)
+        return hidden_states
+
+
+class OPTModel(nn.Module):
+
+    def __init__(self, config: OPTConfig):
+        super().__init__()
+        self.decoder = OPTDecoder(config)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        return self.decoder(
+            input_ids, positions, kv_caches, input_metadata, cache_events)
+
+
+class OPTForCausalLM(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.model = OPTModel(config)
+        # TODO(zhuohan): create a new weight after implementing pipeline
+        #                parallelism
+        self.lm_head_weight = self.model.decoder.embed_tokens.weight
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.LongTensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.model(
+            input_ids, positions, kv_caches, input_metadata, cache_events)
+        next_tokens = self.sampler(
+            self.lm_head_weight, hidden_states, input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = ["embed_tokens.weight", "fc1.weight", "fc1.bias"]
+    _row_parallel_weights = ["out_proj.weight", "fc2.weight"]
+
+    def load_weights(self, model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+            model_name_or_path, cache_dir, use_np_cache):
+            if "lm_head.weight" in name:
+                continue
+
+            if name.startswith("decoder."):
+                name = "model." + name
+
+            is_attention_weight = False
+            for stride_id, att_weight_name in enumerate(["q_proj", "k_proj", "v_proj"]):
+                if att_weight_name not in name:
+                    continue
+                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
+                shard_size = param.shape[0] // 3
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank
+                    :shard_size * (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id
+                                         :shard_size * (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_attention_weight = True
+                break
+            if is_attention_weight:
+                continue
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
--- a/cacheflow/model_executor/parallel_utils/README.md
+++ b/cacheflow/model_executor/parallel_utils/README.md
@ -0,0 +1 @@
+The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We keep only the code that is used for inference.
--- a/cacheflow/model_executor/parallel_utils/init.py
+++ b/cacheflow/model_executor/parallel_utils/init.py
@ -0,0 +1,12 @@
+import cacheflow.model_executor.parallel_utils.parallel_state
+import cacheflow.model_executor.parallel_utils.tensor_parallel
+import cacheflow.model_executor.parallel_utils.utils
+
+# Alias parallel_state as mpu, its legacy name
+# NOTE(review): the bare name `parallel_state` is not bound by the
+# `import a.b.c` statements themselves; this presumably relies on the file
+# being the package's `__init__.py`, where importing a submodule binds it in
+# the package namespace - confirm.
+mpu = parallel_state
+
+# Public submodules of this package. Note that `mpu` is not listed here.
+__all__ = [
+    "parallel_state",
+    "tensor_parallel",
+    "utils",
+]
--- a/cacheflow/model_executor/parallel_utils/parallel_state.py
+++ b/cacheflow/model_executor/parallel_utils/parallel_state.py
@ -0,0 +1,593 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+"""Model and data parallel groups."""
+
+import torch
+from typing import Optional
+
+from .utils import GlobalMemoryBuffer
+
+# Intra-layer model parallel group that the current rank belongs to.
+_TENSOR_MODEL_PARALLEL_GROUP = None
+# Inter-layer model parallel group that the current rank belongs to.
+_PIPELINE_MODEL_PARALLEL_GROUP = None
+# Model parallel group (both intra- and pipeline) that the current rank belongs to.
+_MODEL_PARALLEL_GROUP = None
+# Embedding group.
+_EMBEDDING_GROUP = None
+# Position embedding group.
+_POSITION_EMBEDDING_GROUP = None
+# Data parallel group that the current rank belongs to.
+_DATA_PARALLEL_GROUP = None
+
+# Virtual (interleaved) pipeline rank / world size and the pipeline split
+# rank. initialize_model_parallel() sets these only when the corresponding
+# optional arguments are given; otherwise they stay None.
+_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
+_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
+_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None
+
+# These values enable us to change the mpu sizes on the fly.
+_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
+_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
+_MPU_TENSOR_MODEL_PARALLEL_RANK = None
+_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
+
+# A list of ranks that have a copy of the embedding.
+_EMBEDDING_GLOBAL_RANKS = None
+
+# A list of ranks that have a copy of the position embedding.
+_POSITION_EMBEDDING_GLOBAL_RANKS = None
+
+# A list of global ranks for each pipeline group to ease calculation of the source
+# rank when broadcasting from the first or last pipeline stage.
+_PIPELINE_GLOBAL_RANKS = None
+
+# A list of global ranks for each data parallel group to ease calculation of the source
+# rank when broadcasting weights from src to all other data parallel ranks
+_DATA_PARALLEL_GLOBAL_RANKS = None
+
+# Memory buffers to avoid dynamic memory allocation
+_GLOBAL_MEMORY_BUFFER = None
+
+# GraphAllReduce instance created by initialize_all_reduce_launcher();
+# None until that function has been called.
+_ALL_REDUCE_LAUNCHER: Optional['GraphAllReduce'] = None
+
+def initialize_model_parallel(
+    tensor_model_parallel_size: int = 1,
+    pipeline_model_parallel_size: int = 1,
+    virtual_pipeline_model_parallel_size: Optional[int] = None,
+    pipeline_model_parallel_split_rank: Optional[int] = None,
+) -> None:
+    """
+    Initialize the data-, model-, tensor- and pipeline-parallel process
+    groups (plus the embedding groups) and record the ones that contain the
+    current rank in this module's globals.
+
+    Arguments:
+        tensor_model_parallel_size: number of GPUs used for tensor model parallelism.
+        pipeline_model_parallel_size: number of GPUs used for pipeline model parallelism.
+        virtual_pipeline_model_parallel_size: number of virtual stages (interleaved
+                                              pipeline).
+        pipeline_model_parallel_split_rank: for models with both encoder and decoder,
+                                            rank in pipeline with split point.
+
+    Raises:
+        RuntimeError: if the world size is not divisible by
+            tensor_model_parallel_size * pipeline_model_parallel_size, or if
+            virtual_pipeline_model_parallel_size is given while
+            pipeline_model_parallel_size is not greater than 2.
+        AssertionError: if torch.distributed is not initialized or any of
+            the groups has already been initialized.
+
+    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
+    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
+    the model pipeline. The present function will
+    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
+    and 8 data-parallel groups as:
+        8 data_parallel groups:
+            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
+        8 tensor model-parallel groups:
+            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
+        4 pipeline model-parallel groups:
+            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
+    Note that for efficiency, the caller should make sure adjacent ranks
+    are on the same DGX box. For example if we are using 2 DGX-1 boxes
+    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
+    ranks 8 to 15 belong to the second box.
+    """
+    # Get world size and rank. Ensure some consistencies.
+    assert torch.distributed.is_initialized()
+    world_size: int = torch.distributed.get_world_size()
+
+    if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
+        raise RuntimeError(
+            f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
+            f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
+        )
+
+    # Whatever is left of the world after tensor and pipeline parallelism
+    # is used for data parallelism.
+    data_parallel_size: int = world_size // (tensor_model_parallel_size *
+                                             pipeline_model_parallel_size)
+
+    num_tensor_model_parallel_groups: int  = world_size // tensor_model_parallel_size
+    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
+    # NOTE: num_data_parallel_groups is not used anywhere below.
+    num_data_parallel_groups: int = world_size // data_parallel_size
+
+    if virtual_pipeline_model_parallel_size is not None:
+        if not pipeline_model_parallel_size > 2:
+            raise RuntimeError("pipeline-model-parallel size should be greater than 2 with "
+                               "interleaved schedule")
+        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+        _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
+        _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
+
+    if pipeline_model_parallel_split_rank is not None:
+        global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
+        _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank
+
+    rank = torch.distributed.get_rank()
+
+    # NOTE: in every loop below, new_group() is called for every group, not
+    # only for the one containing this rank; only the matching group is
+    # stored. torch.distributed.new_group is documented as requiring all
+    # processes to enter the call, so do not guard it with `rank in ranks`
+    # or reorder the loops.
+
+    # Build the data-parallel groups.
+    global _DATA_PARALLEL_GROUP
+    global _DATA_PARALLEL_GLOBAL_RANKS
+    assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized'
+    all_data_parallel_group_ranks = []
+    for i in range(pipeline_model_parallel_size):
+        # Contiguous block of ranks belonging to pipeline stage i.
+        start_rank = i * num_pipeline_model_parallel_groups
+        end_rank = (i + 1) * num_pipeline_model_parallel_groups
+        for j in range(tensor_model_parallel_size):
+            # Within the stage, ranks with the same tensor-parallel offset j
+            # (stride tensor_model_parallel_size) form one data-parallel group.
+            ranks = range(start_rank + j, end_rank, tensor_model_parallel_size)
+            all_data_parallel_group_ranks.append(list(ranks))
+            group = torch.distributed.new_group(ranks)
+            if rank in ranks:
+                _DATA_PARALLEL_GROUP = group
+                _DATA_PARALLEL_GLOBAL_RANKS = ranks
+
+    # Build the model-parallel groups.
+    global _MODEL_PARALLEL_GROUP
+    assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
+    for i in range(data_parallel_size):
+        # The i-th member of every data-parallel group together form one
+        # model-parallel group.
+        ranks = [data_parallel_group_ranks[i]
+                 for data_parallel_group_ranks in all_data_parallel_group_ranks]
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _MODEL_PARALLEL_GROUP = group
+
+    # Build the tensor model-parallel groups.
+    global _TENSOR_MODEL_PARALLEL_GROUP
+    assert _TENSOR_MODEL_PARALLEL_GROUP is None, \
+        'tensor model parallel group is already initialized'
+    for i in range(num_tensor_model_parallel_groups):
+        # Consecutive ranks form a tensor model-parallel group.
+        ranks = range(i * tensor_model_parallel_size,
+                      (i + 1) * tensor_model_parallel_size)
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _TENSOR_MODEL_PARALLEL_GROUP = group
+
+    # Build the pipeline model-parallel groups and embedding groups
+    # (first and last rank in each pipeline model-parallel group).
+    global _PIPELINE_MODEL_PARALLEL_GROUP
+    global _PIPELINE_GLOBAL_RANKS
+    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, \
+        'pipeline model parallel group is already initialized'
+    global _EMBEDDING_GROUP
+    global _EMBEDDING_GLOBAL_RANKS
+    assert _EMBEDDING_GROUP is None, 'embedding group is already initialized'
+    global _POSITION_EMBEDDING_GROUP
+    global _POSITION_EMBEDDING_GLOBAL_RANKS
+    assert _POSITION_EMBEDDING_GROUP is None, \
+        'position embedding group is already initialized'
+    for i in range(num_pipeline_model_parallel_groups):
+        # Ranks spaced num_pipeline_model_parallel_groups apart form one
+        # pipeline group.
+        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _PIPELINE_MODEL_PARALLEL_GROUP = group
+            _PIPELINE_GLOBAL_RANKS = ranks
+        # Setup embedding group (to exchange gradients between
+        # first and last stages).
+        if len(ranks) > 1:
+            embedding_ranks = [ranks[0], ranks[-1]]
+            position_embedding_ranks = [ranks[0]]
+            # With a split rank, the stage at the split point is added to
+            # both embedding rank lists unless it is already in them.
+            if pipeline_model_parallel_split_rank is not None:
+                if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
+                    embedding_ranks = [ranks[0],
+                                       ranks[pipeline_model_parallel_split_rank],
+                                       ranks[-1]]
+                if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
+                    position_embedding_ranks = [ranks[0],
+                                       ranks[pipeline_model_parallel_split_rank]]
+        else:
+            # Single-stage pipeline: the only rank holds both embeddings.
+            embedding_ranks = ranks
+            position_embedding_ranks = ranks
+
+        # The group handle is stored only on ranks that are members of the
+        # embedding group, while the rank list is stored on every rank of
+        # the pipeline group.
+        group = torch.distributed.new_group(embedding_ranks)
+        if rank in embedding_ranks:
+            _EMBEDDING_GROUP = group
+        if rank in ranks:
+            _EMBEDDING_GLOBAL_RANKS = embedding_ranks
+
+        group = torch.distributed.new_group(position_embedding_ranks)
+        if rank in position_embedding_ranks:
+            _POSITION_EMBEDDING_GROUP = group
+        if rank in ranks:
+            _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks
+
+    # Initialize global memory buffer
+    # This isn't really "parallel state" but there isn't another good place to
+    # put this. If we end up with a more generic initialization of megatron-core
+    # we could stick it there
+    _set_global_memory_buffer()
+
+
+def initialize_all_reduce_launcher(
+    max_num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    disable_graph: bool = False,
+) -> None:
+    """Create the module-wide all-reduce launcher.
+
+    Builds a `GraphAllReduce` from the given arguments and stores it in
+    `_ALL_REDUCE_LAUNCHER`, replacing whatever was stored there before.
+
+    Args:
+        max_num_tokens: first dimension of the launcher's buffer.
+        hidden_size: second dimension of the launcher's buffer.
+        dtype: element type of the launcher's buffer.
+        disable_graph: forwarded to `GraphAllReduce`; when True no CUDA
+            graphs are captured.
+    """
+    global _ALL_REDUCE_LAUNCHER
+    launcher = GraphAllReduce(max_num_tokens, hidden_size, dtype,
+                              disable_graph)
+    _ALL_REDUCE_LAUNCHER = launcher
+
+def model_parallel_is_initialized():
+    """Report whether the tensor, pipeline and data parallel groups exist.
+
+    Returns:
+        True only when none of the three module-level group handles is
+        None, False otherwise.
+    """
+    required_groups = (
+        _TENSOR_MODEL_PARALLEL_GROUP,
+        _PIPELINE_MODEL_PARALLEL_GROUP,
+        _DATA_PARALLEL_GROUP,
+    )
+    return all(group is not None for group in required_groups)
+
+
+def get_model_parallel_group():
+    """Return the model parallel group stored for the calling rank.
+
+    Fails an assertion when the group has not been set yet.
+    """
+    group = _MODEL_PARALLEL_GROUP
+    assert group is not None, 'model parallel group is not initialized'
+    return group
+
+
+def get_tensor_model_parallel_group():
+    """Return the tensor model parallel group stored for the calling rank.
+
+    Fails an assertion when the group has not been set yet.
+    """
+    group = _TENSOR_MODEL_PARALLEL_GROUP
+    assert group is not None, \
+        'intra_layer_model parallel group is not initialized'
+    return group
+
+
+def get_pipeline_model_parallel_group():
+    """Return the pipeline model parallel group stored for the calling rank.
+
+    Fails an assertion when the group has not been set yet.
+    """
+    group = _PIPELINE_MODEL_PARALLEL_GROUP
+    assert group is not None, \
+        'pipeline_model parallel group is not initialized'
+    return group
+
+
+def get_data_parallel_group():
+    """Return the data parallel group stored for the calling rank.
+
+    Fails an assertion when the group has not been set yet.
+    """
+    group = _DATA_PARALLEL_GROUP
+    assert group is not None, 'data parallel group is not initialized'
+    return group
+
+
+def get_embedding_group():
+    """Return the embedding group stored for the calling rank.
+
+    Fails an assertion when the group has not been set yet.
+    """
+    group = _EMBEDDING_GROUP
+    assert group is not None, 'embedding group is not initialized'
+    return group
+
+
+def get_position_embedding_group():
+    """Return the position embedding group stored for the calling rank.
+
+    Fails an assertion when the group has not been set yet.
+    """
+    group = _POSITION_EMBEDDING_GROUP
+    assert group is not None, \
+        'position embedding group is not initialized'
+    return group
+
+
+def set_tensor_model_parallel_world_size(world_size):
+    """Override the tensor model parallel world size.
+
+    Once set to a non-None value, `get_tensor_model_parallel_world_size`
+    returns it instead of querying torch.distributed.
+    """
+    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
+    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
+
+
+def set_pipeline_model_parallel_world_size(world_size):
+    """Override the pipeline model parallel world size.
+
+    Once set to a non-None value, `get_pipeline_model_parallel_world_size`
+    returns it instead of querying torch.distributed.
+    """
+    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
+
+
+def get_tensor_model_parallel_world_size():
+    """Return the size of the tensor model parallel group.
+
+    A value installed through `set_tensor_model_parallel_world_size` takes
+    precedence; otherwise the size is queried from torch.distributed for
+    the caller's tensor model parallel group.
+    """
+    override = _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
+    if override is None:
+        tp_group = get_tensor_model_parallel_group()
+        return torch.distributed.get_world_size(group=tp_group)
+    return override
+
+
+def get_pipeline_model_parallel_world_size():
+    """Return the size of the pipeline model parallel group.
+
+    A value installed through `set_pipeline_model_parallel_world_size`
+    takes precedence; otherwise the size is queried from torch.distributed
+    for the caller's pipeline model parallel group.
+    """
+    override = _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    if override is None:
+        pp_group = get_pipeline_model_parallel_group()
+        return torch.distributed.get_world_size(group=pp_group)
+    return override
+
+
+def set_tensor_model_parallel_rank(rank):
+    """Override the tensor model parallel rank.
+
+    Once set to a non-None value, `get_tensor_model_parallel_rank` returns
+    it instead of querying torch.distributed.
+    """
+    global _MPU_TENSOR_MODEL_PARALLEL_RANK
+    _MPU_TENSOR_MODEL_PARALLEL_RANK = rank
+
+
+def set_pipeline_model_parallel_rank(rank):
+    """Override the pipeline model parallel rank.
+
+    Once set to a non-None value, `get_pipeline_model_parallel_rank`
+    returns it instead of querying torch.distributed.
+    """
+    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
+    _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
+
+
+def set_pipeline_model_parallel_split_rank(rank):
+    """Store `rank` as the pipeline model parallel split rank override.
+
+    NOTE(review): this writes `_MPU_PIPELINE_MODEL_PARALLEL_SPLIT_RANK`,
+    while `is_pipeline_stage_before_split` / `is_pipeline_stage_after_split`
+    read `_PIPELINE_MODEL_PARALLEL_SPLIT_RANK` -- confirm which variable is
+    meant to drive those predicates.
+    """
+    global _MPU_PIPELINE_MODEL_PARALLEL_SPLIT_RANK
+    _MPU_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank
+
+
+def get_tensor_model_parallel_rank():
+    """Return the caller's rank inside its tensor model parallel group.
+
+    A value installed through `set_tensor_model_parallel_rank` takes
+    precedence; otherwise the rank is queried from torch.distributed.
+    """
+    override = _MPU_TENSOR_MODEL_PARALLEL_RANK
+    if override is None:
+        tp_group = get_tensor_model_parallel_group()
+        return torch.distributed.get_rank(group=tp_group)
+    return override
+
+
+def get_pipeline_model_parallel_rank():
+    """Return the caller's rank inside its pipeline model parallel group.
+
+    A value installed through `set_pipeline_model_parallel_rank` takes
+    precedence; otherwise the rank is queried from torch.distributed.
+    """
+    override = _MPU_PIPELINE_MODEL_PARALLEL_RANK
+    if override is None:
+        pp_group = get_pipeline_model_parallel_group()
+        return torch.distributed.get_rank(group=pp_group)
+    return override
+
+
+
+def is_pipeline_first_stage(ignore_virtual=False):
+    """Tell whether the caller is on the first pipeline model-parallel stage.
+
+    Args:
+        ignore_virtual: when False and a virtual pipeline world size is
+            set, the virtual pipeline rank must also be 0.
+
+    Returns:
+        True if the pipeline model parallel rank is 0 (and the virtual
+        pipeline check, when applied, passes); False otherwise.
+    """
+    virtual_check_passes = (
+        ignore_virtual
+        or get_virtual_pipeline_model_parallel_world_size() is None
+        or get_virtual_pipeline_model_parallel_rank() == 0)
+    if not virtual_check_passes:
+        return False
+    return get_pipeline_model_parallel_rank() == 0
+
+
+def is_pipeline_last_stage(ignore_virtual=False):
+    """Tell whether the caller is on the last pipeline model-parallel stage.
+
+    Args:
+        ignore_virtual: when False and a virtual pipeline world size is
+            set, the virtual pipeline rank must also be the last one.
+
+    Returns:
+        True if the pipeline model parallel rank equals the pipeline world
+        size minus one (and the virtual pipeline check, when applied,
+        passes); False otherwise.
+    """
+    if not ignore_virtual:
+        num_virtual_stages = get_virtual_pipeline_model_parallel_world_size()
+        if num_virtual_stages is not None:
+            last_virtual_rank = num_virtual_stages - 1
+            if get_virtual_pipeline_model_parallel_rank() != last_virtual_rank:
+                return False
+    stage = get_pipeline_model_parallel_rank()
+    last_stage = get_pipeline_model_parallel_world_size() - 1
+    return stage == last_stage
+
+
+def is_rank_in_embedding_group(ignore_virtual=False):
+    """Tell whether the calling global rank counts as an embedding rank.
+
+    Args:
+        ignore_virtual: when True, plain membership in
+            `_EMBEDDING_GLOBAL_RANKS` decides the answer. When False, the
+            first listed rank additionally has to be on the first pipeline
+            stage and the last listed rank on the last pipeline stage
+            (both checked with the virtual pipeline rank taken into
+            account); any other listed rank always qualifies.
+
+    Returns:
+        True or False as described above.
+    """
+    my_rank = torch.distributed.get_rank()
+    members = _EMBEDDING_GLOBAL_RANKS
+    if my_rank not in members:
+        return False
+    if ignore_virtual:
+        return True
+    if my_rank == members[0]:
+        return is_pipeline_first_stage(ignore_virtual=False)
+    if my_rank == members[-1]:
+        return is_pipeline_last_stage(ignore_virtual=False)
+    return True
+
+
+def is_rank_in_position_embedding_group():
+    """Tell whether the calling global rank is listed in
+    `_POSITION_EMBEDDING_GLOBAL_RANKS`."""
+    return torch.distributed.get_rank() in _POSITION_EMBEDDING_GLOBAL_RANKS
+
+
+def is_pipeline_stage_before_split(rank=None):
+    """Tell whether a pipeline stage lies before the split rank, i.e. runs
+    the encoder block of a model with both encoder and decoder.
+
+    Args:
+        rank: pipeline rank to test; defaults to the caller's own pipeline
+            model parallel rank.
+
+    Returns:
+        True when the pipeline world size is 1, when no split rank is set,
+        or when `rank` is strictly below the split rank; False otherwise.
+    """
+    if get_pipeline_model_parallel_world_size() == 1:
+        return True
+    if rank is None:
+        rank = get_pipeline_model_parallel_rank()
+    split_rank = _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
+    return split_rank is None or rank < split_rank
+
+
+def is_pipeline_stage_after_split(rank=None):
+    """Tell whether a pipeline stage lies at or after the split rank, i.e.
+    runs the decoder block of a model with both encoder and decoder.
+
+    Args:
+        rank: pipeline rank to test; defaults to the caller's own pipeline
+            model parallel rank.
+
+    Returns:
+        True when the pipeline world size is 1, when no split rank is set,
+        or when `rank` is greater than or equal to the split rank; False
+        otherwise.
+    """
+    if get_pipeline_model_parallel_world_size() == 1:
+        return True
+    if rank is None:
+        rank = get_pipeline_model_parallel_rank()
+    split_rank = _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
+    return split_rank is None or rank >= split_rank
+
+
+def is_pipeline_stage_at_split():
+    """Tell whether the caller sits directly in front of the split.
+
+    True when the caller's own stage is before the split (encoder side,
+    per `is_pipeline_stage_before_split`) and the following stage is after
+    the split (decoder side, per `is_pipeline_stage_after_split`), for a
+    model with both encoder and decoder.
+    """
+    stage = get_pipeline_model_parallel_rank()
+    if not is_pipeline_stage_before_split(stage):
+        return False
+    return is_pipeline_stage_after_split(stage + 1)
+
+
+def get_virtual_pipeline_model_parallel_rank():
+    """Return the stored virtual pipeline-parallel rank (may be None)."""
+    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+
+
+def set_virtual_pipeline_model_parallel_rank(rank):
+    """Store `rank` as the virtual pipeline-parallel rank.
+
+    The value is what `get_virtual_pipeline_model_parallel_rank` returns
+    afterwards.
+    """
+    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
+
+
+def get_virtual_pipeline_model_parallel_world_size():
+    """Return the stored virtual pipeline-parallel world size (may be None)."""
+    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+
+
+def get_tensor_model_parallel_src_rank():
+    """Return the global rank of the first member of the caller's tensor
+    model parallel group.
+
+    The caller's global rank is rounded down to the nearest multiple of
+    the tensor model parallel world size.
+    """
+    global_rank = torch.distributed.get_rank()
+    tp_size = get_tensor_model_parallel_world_size()
+    # Subtracting the remainder rounds down to a multiple of tp_size.
+    return global_rank - global_rank % tp_size
+
+
+def get_data_parallel_src_rank():
+    """Return the first global rank recorded for the caller's data
+    parallel group.
+
+    Fails an assertion when `_DATA_PARALLEL_GLOBAL_RANKS` is not set.
+    """
+    dp_ranks = _DATA_PARALLEL_GLOBAL_RANKS
+    assert dp_ranks is not None, "Data parallel group is not initialized"
+    return dp_ranks[0]
+
+
+def get_pipeline_model_parallel_first_rank():
+    """Return the first global rank recorded for the caller's pipeline
+    model parallel group.
+
+    Fails an assertion when `_PIPELINE_GLOBAL_RANKS` is not set.
+    """
+    pp_ranks = _PIPELINE_GLOBAL_RANKS
+    assert pp_ranks is not None, "Pipeline parallel group is not initialized"
+    return pp_ranks[0]
+
+
+def get_pipeline_model_parallel_last_rank():
+    """Return the global rank of the last process in the caller's pipeline.
+
+    The index used is the pipeline model parallel world size minus one.
+    Fails an assertion when `_PIPELINE_GLOBAL_RANKS` is not set.
+    """
+    pp_ranks = _PIPELINE_GLOBAL_RANKS
+    assert pp_ranks is not None, "Pipeline parallel group is not initialized"
+    return pp_ranks[get_pipeline_model_parallel_world_size() - 1]
+
+def get_pipeline_model_parallel_next_rank():
+    """Return the global rank that follows the caller in the pipeline.
+
+    The lookup wraps around: the stage after the last one is the first.
+    Fails an assertion when `_PIPELINE_GLOBAL_RANKS` is not set.
+    """
+    pp_ranks = _PIPELINE_GLOBAL_RANKS
+    assert pp_ranks is not None, "Pipeline parallel group is not initialized"
+    my_stage = get_pipeline_model_parallel_rank()
+    num_stages = get_pipeline_model_parallel_world_size()
+    next_stage = (my_stage + 1) % num_stages
+    return pp_ranks[next_stage]
+
+
+def get_pipeline_model_parallel_prev_rank():
+    """Return the global rank that precedes the caller in the pipeline.
+
+    The lookup wraps around: the stage before the first one is the last.
+    Fails an assertion when `_PIPELINE_GLOBAL_RANKS` is not set.
+    """
+    pp_ranks = _PIPELINE_GLOBAL_RANKS
+    assert pp_ranks is not None, "Pipeline parallel group is not initialized"
+    my_stage = get_pipeline_model_parallel_rank()
+    num_stages = get_pipeline_model_parallel_world_size()
+    prev_stage = (my_stage - 1) % num_stages
+    return pp_ranks[prev_stage]
+
+
+def get_data_parallel_world_size():
+    """Return the number of ranks in the caller's data parallel group, as
+    reported by torch.distributed."""
+    dp_group = get_data_parallel_group()
+    return torch.distributed.get_world_size(group=dp_group)
+
+
+def get_data_parallel_rank():
+    """Return the caller's rank inside its data parallel group, as
+    reported by torch.distributed."""
+    dp_group = get_data_parallel_group()
+    return torch.distributed.get_rank(group=dp_group)
+
+def _set_global_memory_buffer():
+    """Create the module-wide `GlobalMemoryBuffer`.
+
+    Fails an assertion when a buffer object is already stored.
+    """
+    global _GLOBAL_MEMORY_BUFFER
+    already_set = _GLOBAL_MEMORY_BUFFER is not None
+    assert not already_set, 'global memory buffer is already initialized'
+    _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
+
+def get_global_memory_buffer():
+    """Return the module-wide GlobalMemoryBuffer object.
+
+    Fails an assertion when no buffer object has been stored yet.
+    """
+    memory_buffer = _GLOBAL_MEMORY_BUFFER
+    assert memory_buffer is not None, 'global memory buffer is not initialized'
+    return memory_buffer
+
+def get_all_reduce_launcher() -> 'GraphAllReduce':
+    """Return the launcher stored by `initialize_all_reduce_launcher`.
+
+    Fails an assertion when no launcher has been stored yet.
+    """
+    launcher = _ALL_REDUCE_LAUNCHER
+    assert launcher is not None, 'all reduce launcher is not initialized'
+    return launcher
+
+def destroy_model_parallel():
+    """Reset all module-level parallel state to None.
+
+    Besides the process-group handles, the rank/world-size overrides and
+    the global memory buffer, this also clears the cached global-rank
+    lists, the split-rank values and the all-reduce launcher. Otherwise
+    helpers such as `get_data_parallel_src_rank`,
+    `get_pipeline_model_parallel_first_rank` or `get_all_reduce_launcher`
+    would keep returning values tied to the groups that were just dropped.
+
+    Only the references held by this module are dropped; nothing is
+    called on torch.distributed here.
+    """
+    # Process-group handles.
+    global _MODEL_PARALLEL_GROUP
+    _MODEL_PARALLEL_GROUP = None
+    global _TENSOR_MODEL_PARALLEL_GROUP
+    _TENSOR_MODEL_PARALLEL_GROUP = None
+    global _PIPELINE_MODEL_PARALLEL_GROUP
+    _PIPELINE_MODEL_PARALLEL_GROUP = None
+    global _DATA_PARALLEL_GROUP
+    _DATA_PARALLEL_GROUP = None
+    global _EMBEDDING_GROUP
+    _EMBEDDING_GROUP = None
+    global _POSITION_EMBEDDING_GROUP
+    _POSITION_EMBEDDING_GROUP = None
+
+    # Global-rank lists recorded alongside the groups.
+    global _DATA_PARALLEL_GLOBAL_RANKS
+    _DATA_PARALLEL_GLOBAL_RANKS = None
+    global _PIPELINE_GLOBAL_RANKS
+    _PIPELINE_GLOBAL_RANKS = None
+    global _EMBEDDING_GLOBAL_RANKS
+    _EMBEDDING_GLOBAL_RANKS = None
+    global _POSITION_EMBEDDING_GLOBAL_RANKS
+    _POSITION_EMBEDDING_GLOBAL_RANKS = None
+
+    # Virtual pipeline state.
+    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
+    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
+    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
+
+    # Split-rank values: the one read by the is_pipeline_stage_*_split
+    # predicates and the one written by the setter.
+    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
+    _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None
+    global _MPU_PIPELINE_MODEL_PARALLEL_SPLIT_RANK
+    _MPU_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None
+
+    # Explicit rank / world-size overrides.
+    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
+    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
+    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
+    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
+    global _MPU_TENSOR_MODEL_PARALLEL_RANK
+    _MPU_TENSOR_MODEL_PARALLEL_RANK = None
+    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
+    _MPU_PIPELINE_MODEL_PARALLEL_RANK = None
+
+    # Helper objects that hold references to the groups above.
+    global _GLOBAL_MEMORY_BUFFER
+    _GLOBAL_MEMORY_BUFFER = None
+    global _ALL_REDUCE_LAUNCHER
+    _ALL_REDUCE_LAUNCHER = None
+
+
+class GraphAllReduce:
+    """In-place all-reduce over the tensor model parallel group, optionally
+    replayed from pre-captured CUDA graphs.
+
+    A buffer of shape (max_num_tokens, hidden_size) is allocated on the
+    CUDA device. Unless `disable_graph` is set, one CUDA graph is captured
+    per token count in `range(8, max_num_tokens + 1, 8)`; each graph
+    all-reduces the first `num_tokens` rows of that buffer.
+
+    When the tensor model parallel world size is 1 no buffer or graph is
+    created and `launch` returns its input unchanged.
+    """
+
+    def __init__(
+        self,
+        max_num_tokens: int,
+        hidden_size: int,
+        dtype: torch.dtype,
+        disable_graph: bool = False,
+    ) -> None:
+        """Allocate the buffer and capture the graphs.
+
+        Args:
+            max_num_tokens: number of rows of the buffer and upper bound
+                of the token counts for which graphs are captured.
+            hidden_size: number of columns of the buffer.
+            dtype: element type of the buffer.
+            disable_graph: when True, skip graph capture; `launch` then
+                always issues a regular all-reduce.
+        """
+        self.max_num_tokens = max_num_tokens
+        self.hidden_size = hidden_size
+        self.disable_graph = disable_graph
+
+        # Always define these so the object is fully formed even when the
+        # constructor returns early for a single-rank group.
+        self.group = None
+        self.buffer = None
+        self.graphs = {}
+
+        self.tp_world_size = get_tensor_model_parallel_world_size()
+        if self.tp_world_size == 1:
+            return
+
+        self.group = get_tensor_model_parallel_group()
+        self.buffer = torch.empty(
+            size=(max_num_tokens, hidden_size),
+            dtype=dtype,
+            device='cuda',
+        )
+
+        # Build graphs for different number of tokens.
+        if not self.disable_graph:
+            for num_tokens in range(8, max_num_tokens + 1, 8):
+                self.graphs[num_tokens] = self._build_graph(num_tokens)
+
+    def _build_graph(self, num_tokens: int) -> torch.cuda.CUDAGraph:
+        """Capture a CUDA graph that all-reduces `self.buffer[:num_tokens]`."""
+        # Warm up.
+        torch.distributed.all_reduce(self.buffer[:num_tokens], group=self.group)
+        torch.cuda.synchronize()
+
+        # Build graph.
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            torch.distributed.all_reduce(
+                self.buffer[:num_tokens], group=self.group)
+        torch.cuda.synchronize()
+        return graph
+
+    def launch(self, x: torch.Tensor) -> torch.Tensor:
+        """All-reduce `x` in place across the tensor model parallel group.
+
+        Args:
+            x: tensor to reduce; its first dimension is the token count.
+
+        Returns:
+            `x` itself.
+        """
+        # NOTE: when a captured graph is replayed, x must be a slice of
+        # self.buffer, because the graph operates on self.buffer only.
+        if self.tp_world_size == 1:
+            # All-reduce over a single rank is the identity.
+            return x
+
+        num_tokens = x.shape[0]
+        graph = None if self.disable_graph else self.graphs.get(num_tokens)
+        if graph is None:
+            # Graphs disabled, or none captured for this token count:
+            # fall back to a regular all-reduce instead of raising KeyError.
+            torch.distributed.all_reduce(x, group=self.group)
+        else:
+            graph.replay()
+        return x
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/init.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/init.py
@ -0,0 +1,55 @@
+from .layers import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+    VocabParallelEmbedding,
+    set_tensor_model_parallel_attributes,
+    set_defaults_if_not_set_tensor_model_parallel_attributes,
+    copy_tensor_model_parallel_attributes,
+    param_is_not_tensor_parallel_duplicate,
+)
+
+from .mappings import (
+    copy_to_tensor_model_parallel_region,
+    gather_from_tensor_model_parallel_region,
+    gather_from_sequence_parallel_region,
+    scatter_to_tensor_model_parallel_region,
+    scatter_to_sequence_parallel_region,
+)
+
+from .random import (
+    checkpoint,
+    get_cuda_rng_tracker,
+    model_parallel_cuda_manual_seed,
+)
+
+from .utils import (
+    split_tensor_along_last_dim,
+    split_tensor_into_1d_equal_chunks,
+    gather_split_1d_tensor,
+)
+
+# Names re-exported from the submodules imported above.
+__all__ = [
+    # layers.py
+    "ColumnParallelLinear",
+    "RowParallelLinear",
+    "VocabParallelEmbedding",
+    "set_tensor_model_parallel_attributes",
+    "set_defaults_if_not_set_tensor_model_parallel_attributes",
+    "copy_tensor_model_parallel_attributes",
+    "param_is_not_tensor_parallel_duplicate",
+    # mappings.py
+    "copy_to_tensor_model_parallel_region",
+    "gather_from_tensor_model_parallel_region",
+    "gather_from_sequence_parallel_region",
+    # NOTE(review): the entry below is commented out, so this name is not
+    # exported from the package even though layers.py imports it from
+    # .mappings -- confirm whether it should be public.
+#    "reduce_from_tensor_model_parallel_region",
+    "scatter_to_tensor_model_parallel_region",
+    "scatter_to_sequence_parallel_region",
+    # random.py
+    "checkpoint",
+    "get_cuda_rng_tracker",
+    "model_parallel_cuda_manual_seed",
+    # utils.py
+    "split_tensor_along_last_dim",
+    "split_tensor_into_1d_equal_chunks",
+    "gather_split_1d_tensor",
+]
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/layers.py
@ -0,0 +1,446 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+
+import torch
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch.nn.parameter import Parameter
+
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_all_reduce_launcher,
+)
+from .mappings import (
+    copy_to_tensor_model_parallel_region,
+    gather_from_tensor_model_parallel_region,
+    reduce_from_tensor_model_parallel_region,
+    scatter_to_tensor_model_parallel_region,
+)
+
+from .random import get_cuda_rng_tracker
+from .utils import (
+    divide,
+    VocabUtility,
+)
+
+# Attribute names attached to tensors by the helpers below, with the values
+# used by set_defaults_if_not_set_tensor_model_parallel_attributes when an
+# attribute is missing.
+_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
+                                      'partition_dim': -1,
+                                      'partition_stride': 1}
+
+def param_is_not_tensor_parallel_duplicate(param):
+    """Tell whether `param` is not a duplicate across tensor-parallel ranks.
+
+    Truthy when `param` carries a truthy `tensor_model_parallel` attribute,
+    or, failing that, when the caller is tensor model parallel rank 0.
+    """
+    is_partitioned = getattr(param, 'tensor_model_parallel', False)
+    return is_partitioned or (get_tensor_model_parallel_rank() == 0)
+
+
+def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
+    """Attach the three tensor-parallel attributes to `tensor`.
+
+    Sets `tensor_model_parallel`, `partition_dim` and `partition_stride`.
+    Fails an assertion if any attribute named in
+    `_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS` is already present on `tensor`.
+    """
+    # The attributes must not have been set before.
+    for name in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
+        assert not hasattr(tensor, name)
+    new_values = {
+        'tensor_model_parallel': is_parallel,
+        'partition_dim': dim,
+        'partition_stride': stride,
+    }
+    for name, value in new_values.items():
+        setattr(tensor, name, value)
+
+
+def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
+    """Fill in missing tensor-parallel attributes on `tensor`.
+
+    Every attribute named in `_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS` that
+    `tensor` lacks is set to its default; existing ones are left alone.
+    """
+    for name, default in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS.items():
+        if not hasattr(tensor, name):
+            setattr(tensor, name, default)
+
+
+def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
+    """Copy tensor-parallel attributes from one tensor to another.
+
+    Each attribute named in `_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS` that is
+    present on `source_tensor` is set on `destination_tensor`; attributes
+    the source lacks are skipped.
+    """
+    for name in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
+        if hasattr(source_tensor, name):
+            setattr(destination_tensor, name, getattr(source_tensor, name))
+
+
+def _initialize_affine_weight_gpu(weight, init_method,
+                                  partition_dim, stride=1):
+    """Tag `weight` as tensor-parallel and initialize it in place.
+
+    Args:
+        weight: the partition to initialize.
+        init_method: callable applied to `weight`.
+        partition_dim: stored on `weight` as `partition_dim`.
+        stride: stored on `weight` as `partition_stride`.
+    """
+    set_tensor_model_parallel_attributes(weight, True, partition_dim, stride)
+
+    # Run the initializer inside the forked state of the CUDA RNG tracker.
+    rng_tracker = get_cuda_rng_tracker()
+    with rng_tracker.fork():
+        init_method(weight)
+
+
+def _initialize_affine_weight_cpu(weight, output_size, input_size,
+                                  per_partition_size, partition_dim,
+                                  init_method, stride=1,
+                                  return_master_weight=False,
+                                  *, params_dtype=None):
+    """Initialize one tensor-parallel partition from a full master weight.
+
+    The full (output_size, input_size) weight is built and initialized on
+    every process in float32, cast to `params_dtype` (torch default dtype
+    when None), split along `partition_dim`, and the chunks belonging to
+    this tensor-parallel rank are concatenated into `weight`.
+
+    Args:
+        weight: destination partition; also tagged as tensor-parallel.
+        output_size: first dimension of the master weight.
+        input_size: second dimension of the master weight.
+        per_partition_size: size of one partition along `partition_dim`.
+        partition_dim: dimension along which the master weight is split.
+        init_method: callable applied to the master weight.
+        stride: number of chunks each partition is made of.
+        return_master_weight: when True, return the master weight.
+        params_dtype: dtype the master weight is cast to before splitting.
+
+    Returns:
+        The master weight if `return_master_weight` is True, else None.
+    """
+    set_tensor_model_parallel_attributes(weight, True, partition_dim, stride)
+
+    target_dtype = (torch.get_default_dtype()
+                    if params_dtype is None else params_dtype)
+
+    # Build and initialize the full weight in float32, then cast.
+    full_weight = torch.empty(output_size, input_size,
+                              dtype=torch.float,
+                              requires_grad=False)
+    init_method(full_weight)
+    full_weight = full_weight.to(dtype=target_dtype)
+
+    # Cut into stride-sized chunks and keep every world_size-th chunk,
+    # starting at this rank.
+    chunk_size = divide(per_partition_size, stride)
+    chunks = torch.split(full_weight, chunk_size, dim=partition_dim)
+    tp_rank = get_tensor_model_parallel_rank()
+    tp_size = get_tensor_model_parallel_world_size()
+    local_chunks = chunks[tp_rank::tp_size]
+
+    with torch.no_grad():
+        torch.cat(local_chunks, dim=partition_dim, out=weight)
+    return full_weight if return_master_weight else None
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding layer whose table is split along the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding; the defaults of that layer are kept.
+
+    Arguments:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+
+    Keyword Arguments:
+        init_method: method to initialize weights.
+        params_dtype: dtype of the weight (torch default dtype when None).
+        use_cpu_initialization: allocate the weight without a CUDA device
+            and initialize it from a full master weight.
+        perform_initialization: when False the weight is left
+            uninitialized.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, *,
+                 init_method=init.xavier_normal_,
+                 params_dtype: torch.dtype=None,
+                 use_cpu_initialization: bool=False,
+                 perform_initialization: bool=True):
+        super().__init__()
+        # Remember the full (unpartitioned) dimensions.
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        # Fixed values passed straight through to F.embedding.
+        self.padding_idx = None
+        self.max_norm = None
+        self.norm_type = 2.
+        self.scale_grad_by_freq = False
+        self.sparse = False
+        self._weight = None
+        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
+        # Vocabulary range [start, end) owned by this tensor-parallel rank.
+        vocab_range = VocabUtility.vocab_range_from_global_vocab_size(
+            self.num_embeddings, get_tensor_model_parallel_rank(),
+            self.tensor_model_parallel_size)
+        self.vocab_start_index, self.vocab_end_index = vocab_range
+        self.num_embeddings_per_partition = (
+            self.vocab_end_index - self.vocab_start_index)
+
+        # Allocate the local slice of the table and initialize it.
+        local_shape = (self.num_embeddings_per_partition, self.embedding_dim)
+        if use_cpu_initialization:
+            self.weight = Parameter(torch.empty(
+                *local_shape, dtype=params_dtype))
+            if perform_initialization:
+                _initialize_affine_weight_cpu(
+                    self.weight, self.num_embeddings, self.embedding_dim,
+                    self.num_embeddings_per_partition, 0, init_method,
+                    params_dtype=params_dtype)
+        else:
+            self.weight = Parameter(torch.empty(
+                *local_shape,
+                device=torch.cuda.current_device(), dtype=params_dtype))
+            if perform_initialization:
+                _initialize_affine_weight_gpu(self.weight, init_method,
+                                              partition_dim=0, stride=1)
+
+    def forward(self, input_):
+        """Look up embeddings for `input_` and reduce across partitions.
+
+        Ids outside this rank's vocabulary range are looked up at local
+        index 0 and their rows zeroed before the result is passed through
+        `reduce_from_tensor_model_parallel_region`.
+        """
+        is_partitioned = self.tensor_model_parallel_size > 1
+        if is_partitioned:
+            # Ids that belong to another rank's slice of the vocabulary.
+            out_of_range = (input_ < self.vocab_start_index) | \
+                           (input_ >= self.vocab_end_index)
+            # Shift to local indices; out-of-range ids get a dummy index.
+            local_ids = input_.clone() - self.vocab_start_index
+            local_ids[out_of_range] = 0
+        else:
+            local_ids = input_
+        # Get the embeddings.
+        output_parallel = F.embedding(local_ids, self.weight,
+                                      self.padding_idx, self.max_norm,
+                                      self.norm_type, self.scale_grad_by_freq,
+                                      self.sparse)
+        # Zero the rows that were looked up with the dummy index.
+        if is_partitioned:
+            output_parallel[out_of_range, :] = 0.0
+        # Reduce across all the model parallel GPUs.
+        return reduce_from_tensor_model_parallel_region(output_parallel)
+
+
+class ColumnParallelLinear(torch.nn.Module):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+
+    Keyword Arguments
+        bias: If true, add bias
+        gather_output: If true, call all-gather on output and make Y available
+                       to all GPUs, otherwise, every GPU will have its output
+                       which is Y_i = XA_i
+        init_method: method to initialize weights. Note that bias is always set
+                     to zero.
+        stride: For the strided linear layers.
+        keep_master_weight_for_test: This was added for testing and should be
+                                     set to False. It returns the master weights
+                                     used for initialization.
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other elementwise operations. We
+                       skip adding bias but instead return it.
+        params_dtype: dtype of weight and bias (torch default dtype when None).
+        use_cpu_initialization: allocate the parameters without a CUDA device
+                                and initialize the weight from a full master
+                                weight.
+        perform_initialization: when False the weight is left uninitialized.
+    """
+
+    def __init__(self, input_size, output_size, *,
+                 bias=True, gather_output=True,
+                 init_method=init.xavier_normal_, stride=1,
+                 keep_master_weight_for_test=False,
+                 skip_bias_add=False,
+                 params_dtype=None,
+                 use_cpu_initialization=False,
+                 perform_initialization=True,
+                 ):
+        super(ColumnParallelLinear, self).__init__()
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.gather_output = gather_output
+        # Divide the weight matrix along the last dimension.
+        world_size = get_tensor_model_parallel_world_size()
+        self.output_size_per_partition = divide(output_size, world_size)
+        self.skip_bias_add = skip_bias_add
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        # Parameters.
+        # Note: torch.nn.functional.linear performs XA^T + b and as a result
+        # we allocate the transpose.
+        # Initialize weight.
+        if use_cpu_initialization:
+            self.weight = Parameter(torch.empty(self.output_size_per_partition,
+                                                self.input_size,
+                                                dtype=params_dtype))
+            if perform_initialization:
+                # Pass params_dtype so the master weight is cast to the
+                # dtype of self.weight before being concatenated into it
+                # (same as VocabParallelEmbedding does).
+                self.master_weight = _initialize_affine_weight_cpu(
+                    self.weight, self.output_size, self.input_size,
+                    self.output_size_per_partition, 0, init_method,
+                    stride=stride,
+                    return_master_weight=keep_master_weight_for_test,
+                    params_dtype=params_dtype)
+        else:
+            self.weight = Parameter(torch.empty(
+                self.output_size_per_partition, self.input_size,
+                device=torch.cuda.current_device(), dtype=params_dtype))
+            if perform_initialization:
+                _initialize_affine_weight_gpu(self.weight, init_method,
+                                              partition_dim=0, stride=stride)
+
+        if bias:
+            if use_cpu_initialization:
+                self.bias = Parameter(torch.empty(
+                    self.output_size_per_partition, dtype=params_dtype))
+            else:
+                self.bias = Parameter(torch.empty(
+                    self.output_size_per_partition,
+                    device=torch.cuda.current_device(),
+                    dtype=params_dtype))
+            set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        else:
+            self.register_parameter('bias', None)
+
+
+    def forward(self, input_):
+        """Forward of ColumnParallelLinear
+
+        Args:
+            input_: tensor whose last dimension is `input_size`.
+
+        Returns:
+            - output: the gathered result when `gather_output` is set,
+              otherwise this partition's result.
+            - bias: the bias parameter when `skip_bias_add` is set (it is
+              then not added to the output), otherwise None.
+        """
+        # With skip_bias_add the bias is handed back to the caller instead
+        # of being applied here.
+        bias = self.bias if not self.skip_bias_add else None
+
+        input_parallel = copy_to_tensor_model_parallel_region(input_)
+        # Matrix multiply.
+        output_parallel = F.linear(input_parallel, self.weight, bias)
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = gather_from_tensor_model_parallel_region(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+
+class RowParallelLinear(torch.nn.Module):
+    """Linear layer with row parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its first dimension and X along its second dimension as:
+               -   -
+              | A_1 |
+              | .   |
+          A = | .   |        X = [X_1, ..., X_p]
+              | .   |
+              | A_p |
+               -   -
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+
+    Keyword Arguments:
+        bias: If true, add bias. Note that bias is not parallelized.
+        input_is_parallel: If true, we assume that the input is already
+                           split across the GPUs and we do not split
+                           again.
+        init_method: method to initialize weights. Note that bias is always set
+                     to zero.
+        stride: For the strided linear layers.
+        keep_master_weight_for_test: This was added for testing and should be
+                                     set to False. It returns the master weights
+                                     used for initialization.
+        skip_bias_add: This was added to enable performance optimization where bias
+                       can be fused with other elementwise operations. We skip
+                       adding bias but instead return it.
+        params_dtype: dtype of the weight and bias. When None,
+                      torch.get_default_dtype() is used.
+        use_cpu_initialization: If true, parameters are allocated without an
+                                explicit device and the weight is initialized
+                                through _initialize_affine_weight_cpu. Otherwise
+                                they are allocated on the current CUDA device.
+        perform_initialization: If false, the weight is left as allocated by
+                                torch.empty (init_method is not applied).
+    """
+
+    def __init__(self, input_size, output_size, *,
+                 bias=True, input_is_parallel=False,
+                 init_method=init.xavier_normal_, stride=1,
+                 keep_master_weight_for_test=False,
+                 skip_bias_add=False,
+                 params_dtype=None,
+                 use_cpu_initialization=False,
+                 perform_initialization=True,
+                 ):
+        super(RowParallelLinear, self).__init__()
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.input_is_parallel = input_is_parallel
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        # Divide the weight matrix along the last dimension.
+        world_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = divide(input_size, world_size)
+        self.skip_bias_add = skip_bias_add
+
+        # Parameters.
+        # Note: torch.nn.functional.linear performs XA^T + b and as a result
+        # we allocate the transpose.
+        # Initialize weight.
+        if use_cpu_initialization:
+            self.weight = Parameter(torch.empty(self.output_size,
+                                                self.input_size_per_partition,
+                                                dtype=params_dtype))
+            if perform_initialization:
+                self.master_weight = _initialize_affine_weight_cpu(
+                    self.weight, self.output_size, self.input_size,
+                    self.input_size_per_partition, 1, init_method,
+                    stride=stride, return_master_weight=keep_master_weight_for_test,
+                    params_dtype=params_dtype)
+        else:
+            self.weight = Parameter(torch.empty(
+                self.output_size, self.input_size_per_partition,
+                device=torch.cuda.current_device(), dtype=params_dtype))
+            if perform_initialization:
+                _initialize_affine_weight_gpu(self.weight, init_method,
+                                              partition_dim=1, stride=stride)
+        if bias:
+            if use_cpu_initialization:
+                self.bias = Parameter(torch.empty(self.output_size,
+                                                  dtype=params_dtype))
+            else:
+                self.bias = Parameter(torch.empty(
+                    self.output_size, device=torch.cuda.current_device(),
+                    dtype=params_dtype))
+
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        else:
+            self.register_parameter('bias', None)
+        # Kept only so that existing readers of `weight_t` keep working.
+        # forward() no longer uses it: a view taken here is bound to the
+        # storage the weight has right now and goes stale if that storage is
+        # later replaced (e.g. by Module.to()/.cuda()/.half()).
+        self.weight_t = self.weight.t()
+
+    def forward(self, input_):
+        """Forward of RowParallelLinear
+
+        Args:
+            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
+                (as stated upstream). The multi-GPU path sizes its output
+                buffer by the first dimension of the (possibly scattered)
+                input.
+
+        Returns:
+            - output
+            - bias
+        """
+        # Set up backprop all-reduce.
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            input_parallel = scatter_to_tensor_model_parallel_region(input_)
+        if get_tensor_model_parallel_world_size() == 1:
+            # Matrix multiply.
+            output_ = F.linear(input_parallel, self.weight)
+        else:
+            # Matrix multiply.
+            all_reduce_launcher = get_all_reduce_launcher()
+            num_tokens = input_parallel.shape[0]
+            output_buffer = all_reduce_launcher.buffer[:num_tokens]
+            # Transpose at call time. Tensor.t() only creates a view, and doing
+            # it here guarantees the view refers to the weight's current
+            # storage rather than whatever it was at construction time.
+            torch.matmul(input_parallel, self.weight.t(), out=output_buffer)
+            # All-reduce across all the partitions.
+            output_ = all_reduce_launcher.launch(output_buffer)
+
+        if not self.skip_bias_add:
+            output = output_ + self.bias if self.bias is not None else output_
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.bias
+        return output, output_bias
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/mappings.py
@ -0,0 +1,279 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tensor_model_parallel_group,
+)
+from .utils import split_tensor_along_last_dim
+
+
+def _reduce(input_):
+    """All-reduce ``input_`` across the tensor model parallel group.
+
+    The same tensor object that was passed in is returned.
+    """
+    # Communication is only needed when more than one GPU participates.
+    if get_tensor_model_parallel_world_size() != 1:
+        torch.distributed.all_reduce(
+            input_, group=get_tensor_model_parallel_group())
+    return input_
+
+
+def _split_along_last_dim(input_):
+    """Return this rank's slice of ``input_`` along its last dimension.
+
+    With a tensor parallel world size of 1 the input is returned as-is.
+    """
+    num_ranks = get_tensor_model_parallel_world_size()
+    if num_ranks == 1:
+        # Single GPU: nothing to split.
+        return input_
+
+    chunks = split_tensor_along_last_dim(input_, num_ranks)
+    my_rank = get_tensor_model_parallel_rank()
+    # torch.split does not create contiguous tensors by default, so make
+    # the slice we keep contiguous.
+    return chunks[my_rank].contiguous()
+
+
+def _split_along_first_dim(input_):
+    """Return this rank's slice of ``input_`` along its first dimension.
+
+    With a tensor parallel world size of 1 the input is returned as-is.
+    The first dimension must be divisible by the world size.
+    """
+    num_ranks = get_tensor_model_parallel_world_size()
+    if num_ranks == 1:
+        # Single GPU: nothing to split.
+        return input_
+
+    total_rows = input_.size()[0]
+    rows_per_rank, leftover = divmod(total_rows, num_ranks)
+    assert leftover == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+
+    start = get_tensor_model_parallel_rank() * rows_per_rank
+    stop = start + rows_per_rank
+    return input_[start:stop].contiguous()
+
+
+def _gather_along_last_dim(input_):
+    """Gather tensors from all ranks and concatenate along the last dimension.
+
+    With a tensor parallel world size of 1 the input is returned as-is.
+    """
+    num_ranks = get_tensor_model_parallel_world_size()
+    if num_ranks == 1:
+        # Single GPU: nothing to gather.
+        return input_
+
+    concat_dim = input_.dim() - 1
+    my_rank = get_tensor_model_parallel_rank()
+
+    # One receive slot per rank; this rank's slot is the input itself.
+    gathered = [torch.empty_like(input_) for _ in range(num_ranks)]
+    gathered[my_rank] = input_
+    torch.distributed.all_gather(
+        gathered, input_, group=get_tensor_model_parallel_group())
+
+    # Note: torch.cat already creates a contiguous tensor.
+    return torch.cat(gathered, dim=concat_dim).contiguous()
+
+
+def _gather_along_first_dim(input_):
+    """Gather tensors from all ranks and concatenate along the first dimension.
+
+    With a tensor parallel world size of 1 the input is returned as-is.
+    """
+    num_ranks = get_tensor_model_parallel_world_size()
+    if num_ranks == 1:
+        # Single GPU: nothing to gather.
+        return input_
+
+    # The result has num_ranks times as many rows as the local tensor.
+    out_shape = list(input_.size())
+    out_shape[0] *= num_ranks
+
+    gathered = torch.empty(out_shape, dtype=input_.dtype,
+                           device=torch.cuda.current_device())
+    torch.distributed._all_gather_base(
+        gathered, input_.contiguous(),
+        group=get_tensor_model_parallel_group())
+    return gathered
+
+def _reduce_scatter_along_first_dim(input_):
+    """Reduce-scatter ``input_`` across the tensor model parallel group.
+
+    With a world size of 1 the input is returned as-is. Otherwise the
+    result has ``input_.size(0) // world_size`` rows, and the first
+    dimension must be divisible by the world size.
+    """
+    num_ranks = get_tensor_model_parallel_world_size()
+    if num_ranks == 1:
+        # Single GPU: nothing to reduce or scatter.
+        return input_
+
+    out_shape = list(input_.size())
+    assert out_shape[0] % num_ranks == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    out_shape[0] //= num_ranks
+
+    scattered = torch.empty(out_shape, dtype=input_.dtype,
+                            device=torch.cuda.current_device())
+    torch.distributed._reduce_scatter_base(
+        scattered, input_.contiguous(),
+        group=get_tensor_model_parallel_group())
+    return scattered
+
+
+class _CopyToModelParallelRegion(torch.autograd.Function):
+    """Pass the input to the model parallel region.
+
+    Forward is the identity; backward all-reduces the incoming gradient
+    through `_reduce`.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return input_
+
+    @staticmethod
+    def forward(ctx, input_):
+        # Identity: the input is passed through unchanged.
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Sum the gradients contributed by every tensor parallel rank.
+        return _reduce(grad_output)
+
+
+class _ReduceFromModelParallelRegion(torch.autograd.Function):
+    """All-reduce the input from the model parallel region.
+
+    Forward all-reduces through `_reduce`; backward is the identity.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Identity: the gradient is passed through unchanged.
+        return grad_output
+
+
+class _ScatterToModelParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the chunk corresponding to the rank.
+
+    Forward splits along the last dimension; backward gathers the
+    gradient along the last dimension.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_last_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_last_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_last_dim(grad_output)
+
+
+class _GatherFromModelParallelRegion(torch.autograd.Function):
+    """Gather the input from model parallel region and concatenate.
+
+    Forward gathers along the last dimension; backward splits the
+    gradient along the last dimension.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _gather_along_last_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _gather_along_last_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split_along_last_dim(grad_output)
+
+
+class _ScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the chunk corresponding to the rank.
+
+    Forward splits along the first dimension; backward gathers the
+    gradient along the first dimension.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_first_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
+
+
+class _GatherFromSequenceParallelRegion(torch.autograd.Function):
+    """Gather the input from sequence parallel region and concatenate.
+
+    Forward gathers along the first dimension. Backward either
+    reduce-scatters or splits the gradient along the first dimension,
+    depending on the `tensor_parallel_output_grad` flag given to forward.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_, tensor_parallel_output_grad=True):
+        return _gather_along_first_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_, tensor_parallel_output_grad=True):
+        # Remember the flag so backward can pick the matching gradient op.
+        ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
+        return _gather_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        tensor_parallel_output_grad = ctx.tensor_parallel_output_grad
+
+        # If the computation graph after the gather operation is
+        # in the tensor parallel mode, output gradients need to be
+        # reduce-scattered, whereas if the computation is duplicated,
+        # output gradients need to be scattered.
+        # The trailing None is the gradient slot for the non-tensor
+        # `tensor_parallel_output_grad` argument of forward.
+        if tensor_parallel_output_grad:
+            return _reduce_scatter_along_first_dim(grad_output), None
+        else:
+            return _split_along_first_dim(grad_output), None
+
+
+class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Reduce scatter the input from the model parallel region.
+
+    Forward reduce-scatters along the first dimension; backward gathers
+    the gradient along the first dimension.
+    """
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce_scatter_along_first_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce_scatter_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
+
+
+# -----------------
+# Helper functions.
+# -----------------
+
+def copy_to_tensor_model_parallel_region(input_):
+    """Apply `_CopyToModelParallelRegion` to `input_` and return the result."""
+    return _CopyToModelParallelRegion.apply(input_)
+
+
+def reduce_from_tensor_model_parallel_region(input_):
+    """Apply `_ReduceFromModelParallelRegion` to `input_` and return the result."""
+    return _ReduceFromModelParallelRegion.apply(input_)
+
+
+def scatter_to_tensor_model_parallel_region(input_):
+    """Apply `_ScatterToModelParallelRegion` to `input_` and return the result."""
+    return _ScatterToModelParallelRegion.apply(input_)
+
+
+def gather_from_tensor_model_parallel_region(input_):
+    """Apply `_GatherFromModelParallelRegion` to `input_` and return the result."""
+    return _GatherFromModelParallelRegion.apply(input_)
+
+
+def scatter_to_sequence_parallel_region(input_):
+    """Apply `_ScatterToSequenceParallelRegion` to `input_` and return the result."""
+    return _ScatterToSequenceParallelRegion.apply(input_)
+
+
+def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
+    """Apply `_GatherFromSequenceParallelRegion` to `input_`.
+
+    `tensor_parallel_output_grad` is forwarded unchanged; it selects how
+    the gradient is handled in the backward pass.
+    """
+    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
+
+
+def reduce_scatter_to_sequence_parallel_region(input_):
+    """Apply `_ReduceScatterToSequenceParallelRegion` to `input_` and return the result."""
+    return _ReduceScatterToSequenceParallelRegion.apply(input_)
+
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/random.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/random.py
@ -0,0 +1,253 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+import contextlib
+
+import torch
+from torch import _C
+from torch.cuda import _lazy_call, device as device_ctx_manager
+from torch.utils.checkpoint import detach_variable
+
+from cacheflow.model_executor.parallel_utils.parallel_state import (
+    get_data_parallel_rank,
+    get_tensor_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+
+from .utils import (
+    split_tensor_into_1d_equal_chunks,
+    gather_split_1d_tensor,
+)
+
+from cacheflow.model_executor.parallel_utils.utils import safely_set_viewless_tensor_data
+
+# Default name for the model parallel rng tracker. It is the default `name`
+# of CudaRNGStatesTracker.fork and the name registered by
+# model_parallel_cuda_manual_seed.
+_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
+
+
+def _set_cuda_rng_state(new_state, device=-1):
+    """Sets the random number generator state of the current GPU.
+
+    Arguments:
+        new_state (torch.ByteTensor): The desired state
+        device: target device. -1 (the default), a string, or an int are
+            accepted by the newer-PyTorch branch; any other value is used
+            as given.
+    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
+    with a single change: the input state is not cloned. Cloning caused
+    major performance issues for +4 GPU cases.
+    """
+    # The state is applied by a callback handed to torch.cuda._lazy_call
+    # below; which callback is built depends on the PyTorch API available.
+    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
+        # older PyTorch
+        def cb():
+            with device_ctx_manager(device):
+                _C._cuda_setRNGState(new_state)
+    else:
+        # newer PyTorch
+        # Normalize `device` to a torch.device before building the callback.
+        if device == -1:
+            device = torch.device('cuda')
+        elif isinstance(device, str):
+            device = torch.device(device)
+        elif isinstance(device, int):
+            device = torch.device('cuda', device)
+
+        def cb():
+            idx = device.index
+            # A device without an explicit index means "the current device".
+            if idx is None:
+                idx = torch.cuda.current_device()
+            default_generator = torch.cuda.default_generators[idx]
+            default_generator.set_state(new_state)
+
+    _lazy_call(cb)
+
+
+
+class CudaRNGStatesTracker:
+    """Tracker for the cuda RNG states.
+
+    Using the `add` method, a cuda rng state is initialized based on
+    the input `seed` and is assigned to `name`. Later, by forking the
+    rng state, we can perform operations and return to our starting
+    cuda state.
+    """
+
+    def __init__(self):
+        # Map from a string name to the cuda rng state.
+        self.states_ = {}
+        # Seeds are just for book keeping and ensure no seed is set twice.
+        self.seeds_ = set()
+
+    def reset(self):
+        """Set to the initial state (no tracker)."""
+        self.states_ = {}
+        self.seeds_ = set()
+
+    def get_states(self):
+        """Get rng states. Copy the dictionary so we have direct
+        pointers to the states, not just a pointer to the dictionary.
+
+        The copy is shallow: the returned dict is new, the state objects
+        it maps to are the tracked ones.
+        """
+        return dict(self.states_)
+
+    def set_states(self, states):
+        """Set the rng states. For efficiency purposes, we do not check
+        the size of seed for compatibility."""
+        self.states_ = states
+
+    def add(self, name, seed):
+        """Track the rng state.
+
+        Seeds the CUDA generator with `seed`, stores the resulting state
+        under `name`, then restores the generator state it started from.
+
+        Raises:
+            Exception: if `seed` or `name` has already been registered.
+                Nothing is recorded in that case.
+        """
+        # Validate both arguments before touching any bookkeeping, so a
+        # rejected call does not leave `seed` marked as used.
+        if seed in self.seeds_:
+            raise Exception('seed {} already exists'.format(seed))
+        if name in self.states_:
+            raise Exception('cuda rng state {} already exists'.format(name))
+        self.seeds_.add(seed)
+        # Get the current rng state.
+        orig_rng_state = torch.cuda.get_rng_state()
+        # Set the new state and store it.
+        torch.cuda.manual_seed(seed)
+        self.states_[name] = torch.cuda.get_rng_state()
+        # Reset rng state to what it was.
+        _set_cuda_rng_state(orig_rng_state)
+
+    @contextlib.contextmanager
+    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
+        """Fork the cuda rng state, perform operations, and exit with
+        the original state.
+
+        Raises:
+            Exception: if no state has been added under `name`.
+        """
+        # Check if we have added the state
+        if name not in self.states_:
+            raise Exception('cuda rng state {} is not added'.format(name))
+        # Store current rng state.
+        orig_cuda_rng_state = torch.cuda.get_rng_state()
+        # Set rng state to the desired one
+        _set_cuda_rng_state(self.states_[name])
+        # Do the stuff we wanted to do.
+        try:
+            yield
+        finally:
+            # Update the current rng state for later use.
+            self.states_[name] = torch.cuda.get_rng_state()
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state)
+
+
+# RNG tracker object: the single module-level tracker instance returned by
+# get_cuda_rng_tracker().
+_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
+
+
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker (the module-level CudaRNGStatesTracker instance)."""
+    return _CUDA_RNG_STATE_TRACKER
+
+
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is replacement for that
+    function.
+    Two sets of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+                       set of model parallel GPUs but different across
+                       different model parallel groups. This is used for
+                       example for dropout in the non-tensor-model-parallel regions.
+        tensor-model-parallel state: This state is different among a set of model
+                              parallel GPUs, but the same across data parallel
+                              groups. This is used for example for dropout in
+                              model parallel regions.
+    """
+    # The tensor-parallel seed is the base seed shifted by a fixed offset
+    # (2718 is just for fun and any POSITIVE value will work) plus this
+    # rank's index within the tensor model parallel group.
+    tp_seed = (seed + 2718) + get_tensor_model_parallel_rank()
+
+    tracker = _CUDA_RNG_STATE_TRACKER
+    tracker.reset()
+    # Default state: data parallel gets the original seed.
+    torch.cuda.manual_seed(seed)
+    # Model parallel state, registered under the default tracker name.
+    tracker.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tp_seed)
+
+
+class CheckpointFunction(torch.autograd.Function):
+    """This function is adapted from torch.utils.checkpoint with
+       two main changes:
+           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+           2) the states in the model parallel tracker are also properly
+              tracked/set/reset.
+    """
+    @staticmethod
+    def forward(ctx, run_function, distribute_saved_activations, *args):
+        """Run `run_function(*args)` without building a graph.
+
+        The CPU, CUDA and tracker RNG states are captured first so that
+        backward can replay the same computation. When
+        `distribute_saved_activations` is true, `args[0]` is replaced by
+        this rank's 1D chunk of its data before the inputs are saved.
+        """
+        ctx.run_function = run_function
+        ctx.distribute_saved_activations \
+            = distribute_saved_activations
+
+        # Copy the rng states.
+        ctx.fwd_cpu_rng_state = torch.get_rng_state()
+        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
+        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+        with torch.no_grad():
+            outputs = run_function(*args)
+
+        # Divide hidden states across model parallel group and only keep
+        # the chunk corresponding to the current rank.
+        if distribute_saved_activations:
+            # Remember the full shape so backward can restore it.
+            ctx.input_0_shape = args[0].data.shape
+            safely_set_viewless_tensor_data(
+                args[0],
+                split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True))
+
+        # Store everything.
+        ctx.save_for_backward(*args)
+
+        return outputs
+
+    @staticmethod
+    def backward(ctx, *args):
+        """Recompute the forward pass with gradients enabled and backpropagate.
+
+        Returns None for the two non-tensor forward arguments, followed
+        by one gradient entry per saved input.
+        """
+        if not torch.autograd._is_checkpoint_valid():
+            raise RuntimeError("Checkpointing is not compatible with .grad(), "
+                               "please use .backward() if possible")
+        inputs = ctx.saved_tensors
+        if ctx.distribute_saved_activations:
+            # Reassemble the first input from the chunks held by each rank.
+            safely_set_viewless_tensor_data(
+                inputs[0],
+                gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
+
+        # Store the current states.
+        bwd_cpu_rng_state = torch.get_rng_state()
+        bwd_cuda_rng_state = torch.cuda.get_rng_state()
+        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+        # Set the states to what it used to be before the forward pass.
+        torch.set_rng_state(ctx.fwd_cpu_rng_state)
+        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
+
+        # Compute the forward pass.
+        detached_inputs = detach_variable(inputs)
+        with torch.enable_grad():
+            outputs = ctx.run_function(*detached_inputs)
+
+        # Set the states back to what it was at the start of this function.
+        torch.set_rng_state(bwd_cpu_rng_state)
+        _set_cuda_rng_state(bwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
+
+        if isinstance(outputs, torch.Tensor):
+            outputs = (outputs,)
+        torch.autograd.backward(outputs, args)
+        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
+                      for inp in detached_inputs)
+        # The two leading Nones correspond to `run_function` and
+        # `distribute_saved_activations`.
+        return (None, None) + grads
+
+
+def checkpoint(function, distribute_saved_activations, *args):
+    """Checkpoint a model or part of the model.
+    This has been directly copied from torch.utils.checkpoint.
+
+    All arguments are forwarded unchanged to CheckpointFunction.apply.
+    """
+    return CheckpointFunction.apply(function,
+                                    distribute_saved_activations, *args)
--- a/cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py
+++ b/cacheflow/model_executor/parallel_utils/tensor_parallel/utils.py
@ -0,0 +1,108 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+from typing import List, Sequence
+
+from cacheflow.model_executor.parallel_utils.utils import divide
+from cacheflow.model_executor.parallel_utils import parallel_state
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> List[torch.Tensor]:
+    """ Split a tensor into equal chunks along its last dimension.
+
+        Arguments:
+            tensor: input tensor.
+            num_partitions: number of chunks to produce; the size of the
+                            last dimension must be divisible by it.
+            contiguous_split_chunks: If True, make each chunk contiguous
+                                     in memory.
+
+        Returns:
+            The sequence of chunks, in order.
+    """
+    split_dim = tensor.dim() - 1
+    chunk_size = divide(tensor.size()[split_dim], num_partitions)
+    chunks = torch.split(tensor, chunk_size, dim=split_dim)
+    if not contiguous_split_chunks:
+        return chunks
+    # Note: torch.split does not create contiguous tensors by default.
+    return tuple(chunk.contiguous() for chunk in chunks)
+
+def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
+    """ Break a tensor into equal 1D chunks across tensor parallel ranks.
+
+        The tensor is flattened and this rank's portion is returned.
+
+        Arguments:
+            tensor: The tensor to split
+
+        Keyword Arguments:
+            new_buffer (bool): If True, the portion is copied into a newly
+                               allocated tensor on the current CUDA device.
+                               If False, a view into the existing tensor
+                               is returned. Default is False
+
+    """
+    world_size = parallel_state.get_tensor_model_parallel_world_size()
+    chunk_numel = torch.numel(tensor) // world_size
+    begin = chunk_numel * parallel_state.get_tensor_model_parallel_rank()
+    end = begin + chunk_numel
+
+    if not new_buffer:
+        return tensor.view(-1)[begin:end]
+
+    data = torch.empty(chunk_numel, dtype=tensor.dtype,
+                       device=torch.cuda.current_device(),
+                       requires_grad=False)
+    data.copy_(tensor.view(-1)[begin:end])
+    return data
+
+
+def gather_split_1d_tensor(tensor):
+    """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from
+        all tensor model parallel ranks.
+
+        Returns a new 1D tensor holding the gathered data.
+
+        Arguments:
+            tensor: A Tensor or view of this rank's portion of the data.
+    """
+    world_size = parallel_state.get_tensor_model_parallel_world_size()
+    total_numel = torch.numel(tensor) * world_size
+    gathered = torch.empty(total_numel, dtype=tensor.dtype,
+                           device=torch.cuda.current_device(),
+                           requires_grad=False)
+    # TODO: This API is experimental in pytorch (as of Feb 2022) and
+    # this might break in future pytorch releases. We chose this API
+    # as opposed to torch.distributed.all_gather for efficiency reasons.
+    # This API calls directly NCCL all-gather versus the former does
+    # internal copies and can potentially cause slow down.
+    group = parallel_state.get_tensor_model_parallel_group()
+    torch.distributed._all_gather_base(gathered, tensor, group=group)
+    return gathered
+
+
+class VocabUtility:
+    """ Split the vocabulary into `world_size` chunks and return the first
+        and last index of the vocabulary belonging to the `rank`
+        partition. Note that the indices form the half-open range
+        [first, last).
+
+    """
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size: int, rank, world_size: int
+    ) -> Sequence[int]:
+        """Return (first, last) for `rank` given the per-partition size."""
+        first = rank * per_partition_vocab_size
+        return first, first + per_partition_vocab_size
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
+        """Return (first, last) for `rank` given the global vocabulary size.
+
+        `global_vocab_size` must be divisible by `world_size`.
+        """
+        partition_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            partition_size, rank, world_size
+        )
--- a/cacheflow/model_executor/parallel_utils/utils.py
+++ b/cacheflow/model_executor/parallel_utils/utils.py
@ -0,0 +1,120 @@
+"""Utility functions used throughout Megatron core"""
+from functools import reduce
+import operator
+
+import torch
+
+from cacheflow.model_executor.parallel_utils import parallel_state
+
+
+def ensure_divisibility(numerator, denominator):
+    """Assert that `numerator` is an exact multiple of `denominator`."""
+    remainder = numerator % denominator
+    assert remainder == 0, "{} is not divisible by {}".format(
+        numerator, denominator
+    )
+
+
+def divide(numerator, denominator):
+    """Return `numerator // denominator` after checking, via
+    ensure_divisibility, that the division is exact."""
+    ensure_divisibility(numerator, denominator)
+    quotient = numerator // denominator
+    return quotient
+
+
+class GlobalMemoryBuffer:
+    """Global buffer to avoid dynamic memory allocations.
+    Caller should ensure that buffers of the same name
+    are not used concurrently."""
+
+    def __init__(self):
+        # Maps (name, dtype) to a flat tensor that is reused across calls.
+        self.buffer = {}
+
+    def get_tensor(self, tensor_shape, dtype, name):
+        """Return a tensor of `tensor_shape` backed by the named buffer.
+
+        The buffer for (name, dtype) is (re)allocated on the current CUDA
+        device only when it does not exist yet or is too small.
+        """
+        numel = reduce(operator.mul, tensor_shape, 1)
+        key = (name, dtype)
+        cached = self.buffer.get(key, None)
+        if cached is None or cached.numel() < numel:
+            cached = torch.empty(numel,
+                                 dtype=dtype,
+                                 device=torch.cuda.current_device(),
+                                 requires_grad=False)
+            self.buffer[key] = cached
+
+        return cached[0:numel].view(*tensor_shape)
+
+def _kernel_make_viewless_tensor(inp, requires_grad):
+    '''Make a viewless tensor.
+
+    View tensors have the undesirable side-effect of retaining a reference
+    to the originally-viewed tensor, even after manually setting the '.data'
+    field. This method creates a new tensor that links to the old tensor's
+    data, without linking the viewed tensor, referenced via the '._base'
+    field.
+    '''
+    # Start from a throwaway one-element tensor with matching dtype/device.
+    viewless = torch.empty((1,), dtype=inp.dtype, device=inp.device,
+                           requires_grad=requires_grad)
+    # Re-point it at inp's data.
+    viewless.data = inp.data
+    return viewless
+
+class MakeViewlessTensor(torch.autograd.Function):
+    '''
+    Autograd function to make a viewless tensor.
+
+    This function should be used in cases where the computation graph needs
+    to be propagated, but we only want a viewless tensor (e.g.,
+    ParallelTransformer's hidden_states). Call this function by passing
+    'keep_graph = True' to 'make_viewless_tensor()'.
+    '''
+    @staticmethod
+    def forward(ctx, inp, requires_grad):
+        return _kernel_make_viewless_tensor(inp, requires_grad)
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Gradient passes through unchanged; the None is the gradient slot
+        # for the non-tensor `requires_grad` argument.
+        return grad_output, None
+
+def make_viewless_tensor(inp, requires_grad, keep_graph):
+    '''
+    Entry-point for creating viewless tensors.
+
+    This method should be used, rather than calling 'MakeViewlessTensor'
+    or '_kernel_make_viewless_tensor' directly. It returns 'inp' itself
+    when it is not a view; otherwise 'keep_graph' selects between the
+    autograd function and the plain helper.
+    '''
+
+    # Not a view: nothing to do.
+    if inp._base is None:
+        return inp
+
+    if not keep_graph:
+        # Plain helper, no autograd node recorded.
+        return _kernel_make_viewless_tensor(inp, requires_grad)
+    # Autograd-aware path so the graph is propagated.
+    return MakeViewlessTensor.apply(inp, requires_grad)
+
+def assert_viewless_tensor(tensor, extra_msg = None):
+    '''Assert that a tensor is not a view (i.e., its '._base' field is
+    not set).
+
+    A list is checked element by element, with 'extra_msg' applied to
+    every element. Anything that is neither a list nor a torch.Tensor is
+    accepted without a check. The argument is always returned unchanged.
+    '''
+    if isinstance(tensor, list):
+        # Plain loop: we only want the checks, not a list of results.
+        for t in tensor:
+            assert_viewless_tensor(t, extra_msg = extra_msg)
+        return tensor
+    if not isinstance(tensor, torch.Tensor):
+        return tensor
+    assert tensor._base is None, (
+        "Ensure tensor._base is None before setting tensor.data or storing "
+        "tensor to memory buffer. Otherwise, a memory leak will occur (and "
+        "likely accumulate over iterations). %s"
+    ) % extra_msg
+    return tensor
+
+def safely_set_viewless_tensor_data(tensor, new_data_tensor):
+    '''Safely set tensor's '.data' field.
+
+    The tensor is first checked with assert_viewless_tensor (i.e. its
+    '._base' must not be set); the check fails with an assertion error
+    that reports the shapes involved.
+    '''
+    base_shape = "--" if tensor._base is None else tensor._base.shape
+    shape_note = (
+        "FYI, tensor._base has shape %s, and new_data_tensor has shape %s."
+        % (base_shape, new_data_tensor.shape)
+    )
+    assert_viewless_tensor(tensor, extra_msg = shape_note)
+    tensor.data = new_data_tensor
--- a/cacheflow/model_executor/utils.py
+++ b/cacheflow/model_executor/utils.py
@ -0,0 +1,41 @@
+import random
+from typing import Union
+
+import numpy as np
+import torch
+
+from cacheflow.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized
+from cacheflow.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
+
+
+# Lookup table from lower-case dtype names to torch dtypes; used by
+# `get_torch_dtype` to resolve string arguments.
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    'half': torch.half,
+    'float': torch.float,
+    'float16': torch.float16,
+    'float32': torch.float32,
+    'bfloat16': torch.bfloat16,
+}
+
+
+def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
+    """Resolve `dtype` to a `torch.dtype`.
+
+    A string is lower-cased and looked up in `_STR_DTYPE_TO_TORCH_DTYPE`
+    (an unknown name raises `KeyError`); any other value is returned as is.
+    """
+    if not isinstance(dtype, str):
+        return dtype
+    return _STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
+
+
+def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
+    """Return the size in bytes of one element of `dtype`.
+
+    `dtype` may be a `torch.dtype` or a name understood by `get_torch_dtype`.
+    """
+    # An empty tensor is enough to query the per-element byte width.
+    probe = torch.tensor([], dtype=get_torch_dtype(dtype))
+    return probe.element_size()
+
+
+def set_random_seed(seed: int) -> None:
+    """Seed the Python, NumPy and torch random number generators with `seed`.
+
+    CUDA generators are seeded as well when CUDA is available, and the
+    model-parallel CUDA seed is set when model parallelism is initialized.
+    """
+    # Python, NumPy and torch generators, in that order.
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(seed)
+
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+    if model_parallel_is_initialized():
+        model_parallel_cuda_manual_seed(seed)
--- a/cacheflow/model_executor/weight_utils.py
+++ b/cacheflow/model_executor/weight_utils.py
@ -0,0 +1,110 @@
+import filelock
+import glob
+import json
+import os
+from typing import Iterator, List, Optional, Tuple
+
+from huggingface_hub import snapshot_download
+import numpy as np
+import torch
+from tqdm.auto import tqdm
+
+
+class Disabledtqdm(tqdm):
+    """A `tqdm` subclass whose progress bar is always disabled.
+
+    It is handed to `snapshot_download` as `tqdm_class` so that weight
+    downloads do not print progress bars.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Override the keyword instead of passing `disable=True` as an extra
+        # argument: a caller that already supplies `disable` would otherwise
+        # hit "got multiple values for keyword argument 'disable'".
+        kwargs["disable"] = True
+        super().__init__(*args, **kwargs)
+
+
+def hf_model_weights_iterator(
+    model_name_or_path: str,
+    cache_dir: Optional[str] = None,
+    use_np_cache: bool = False,
+) -> Iterator[Tuple[str, torch.Tensor]]:
+    """Yield `(name, tensor)` pairs for every weight in a model's `*.bin` files.
+
+    Args:
+        model_name_or_path: A local directory containing `*.bin` files, or
+            otherwise a name handed to `snapshot_download`.
+        cache_dir: Cache directory passed to `snapshot_download`. The lock
+            file is also placed here ("/tmp" is used when this is None).
+        use_np_cache: If True, convert the weights once into per-tensor
+            NumPy files under `<model folder>/np` and load from those.
+
+    Yields:
+        Tuples of weight name and tensor, loaded on the CPU.
+    """
+    # Prepare file lock directory to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    lock_dir = cache_dir if cache_dir is not None else "/tmp"
+    # The lock file cannot be created inside a directory that does not exist
+    # yet (e.g. a fresh `cache_dir`), so make sure it is there first.
+    os.makedirs(lock_dir, exist_ok=True)
+    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
+
+    # Download model weights from huggingface.
+    is_local = os.path.isdir(model_name_or_path)
+    if not is_local:
+        with lock:
+            hf_folder = snapshot_download(model_name_or_path,
+                                          allow_patterns="*.bin",
+                                          cache_dir=cache_dir,
+                                          tqdm_class=Disabledtqdm)
+    else:
+        hf_folder = model_name_or_path
+
+    hf_bin_files = glob.glob(os.path.join(hf_folder, "*.bin"))
+
+    # NOTE: `torch.load` unpickles the checkpoint files below; only load
+    # weights that come from a trusted source.
+    if use_np_cache:
+        # Convert the model weights from torch tensors to numpy arrays for
+        # faster loading.
+        np_folder = os.path.join(hf_folder, 'np')
+        os.makedirs(np_folder, exist_ok=True)
+        weight_names_file = os.path.join(np_folder, 'weight_names.json')
+        with lock:
+            # The names file is written last, so its presence marks a
+            # finished conversion; only one process performs the conversion.
+            if not os.path.exists(weight_names_file):
+                weight_names = []
+                for bin_file in hf_bin_files:
+                    state = torch.load(bin_file, map_location="cpu")
+                    for name, param in state.items():
+                        param_path = os.path.join(np_folder, name)
+                        with open(param_path, "wb") as f:
+                            np.save(f, param.cpu().detach().numpy())
+                        weight_names.append(name)
+                with open(weight_names_file, 'w') as f:
+                    json.dump(weight_names, f)
+
+        with open(weight_names_file, 'r') as f:
+            weight_names = json.load(f)
+
+        for name in weight_names:
+            param_path = os.path.join(np_folder, name)
+            with open(param_path, "rb") as f:
+                param = np.load(f)
+            yield name, torch.from_numpy(param)
+    else:
+        for bin_file in hf_bin_files:
+            state = torch.load(bin_file, map_location="cpu")
+            for name, param in state.items():
+                yield name, param
+
+
+def load_tensor_parallel_weights(
+    param: torch.Tensor,
+    loaded_weight: torch.Tensor,
+    param_name: str,
+    column_parallel_weight_names: List[str],
+    row_parallel_weight_names: List[str],
+    tensor_model_parallel_rank: int,
+) -> None:
+    """Copy this rank's shard of `loaded_weight` into `param`.
+
+    If `param_name` contains any entry of `column_parallel_weight_names`,
+    the loaded weight is sliced along dim 0 in chunks of `param.shape[0]`.
+    If it contains any entry of `row_parallel_weight_names`, it is sliced
+    along dim 1 in chunks of `param.shape[1]`. The chunk index is
+    `tensor_model_parallel_rank`. After slicing, the shapes must match
+    (checked with `assert`) and the data is copied into `param` in place.
+    """
+    rank = tensor_model_parallel_rank
+
+    # Column-parallel weights are split along the first dimension.
+    if any(key in param_name for key in column_parallel_weight_names):
+        rows = param.shape[0]
+        loaded_weight = loaded_weight[rows * rank:rows * (rank + 1)]
+
+    # Row-parallel weights are split along the second dimension.
+    if any(key in param_name for key in row_parallel_weight_names):
+        cols = param.shape[1]
+        loaded_weight = loaded_weight[:, cols * rank:cols * (rank + 1)]
+
+    assert param.shape == loaded_weight.shape
+    param.data.copy_(loaded_weight)
+
+
+def initialize_dummy_weights(
+    model: torch.nn.Module,
+    low: float = -1e-3,
+    high: float = 1e-3,
+) -> None:
+    """Overwrite every tensor in `model.state_dict()` with uniform random
+    values drawn between `low` and `high`, in place."""
+    tensors = model.state_dict().values()
+    for tensor in tensors:
+        tensor.data.uniform_(low, high)
				`@ -0,0 +1 @@`
				`The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We keep only the code that is used for inference.`