Support various block sizes & Change default block size to 16 (#38)
This commit is contained in:
@@ -15,9 +15,6 @@ class BlockAllocator:
|
||||
block_size: int,
|
||||
num_blocks: int,
|
||||
) -> None:
|
||||
-if block_size not in [8, 16, 32]:
|
||||
-raise ValueError(f'Unsupported block size: {block_size}'
|
||||
-'The block size must be one of {8, 16, 32}.')
|
||||
self.device = device
|
||||
self.block_size = block_size
|
||||
self.num_blocks = num_blocks
|
||||
|
||||
@@ -125,7 +125,8 @@ class Scheduler:
|
||||
|
||||
# Swap in the sequence groups in the SWAPPED state if possible.
|
||||
self.swapped = self.policy.sort_by_priority(now, self.swapped)
|
||||
-while self.swapped:
|
||||
+# FCFS
|
||||
+while self.swapped and not blocks_to_swap_out:
|
||||
seq_group = self.swapped[0]
|
||||
# If the sequence group has been preempted in this step, stop.
|
||||
if seq_group in preempted:
|
||||
|
||||
@@ -180,9 +180,9 @@ def add_server_arguments(parser: argparse.ArgumentParser):
|
||||
parser.add_argument('--pipeline-parallel-size', '-pp', type=int, default=1, help='number of pipeline stages')
|
||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1, help='number of tensor parallel replicas')
|
||||
# KV cache arguments
|
||||
-parser.add_argument('--block-size', type=int, default=8, choices=[8, 16, 32], help='token block size')
|
||||
+parser.add_argument('--block-size', type=int, default=16, choices=[1, 2, 4, 8, 16, 32, 64, 128, 256], help='token block size')
|
||||
# NOTE(woosuk): If FlashAttention is used, the float data type is not supported.
|
||||
-parser.add_argument('--dtype', type=str, default='half', choices=['half', 'float'], help='data type')
|
||||
+parser.add_argument('--dtype', type=str, default='half', choices=['half'], help='data type')
|
||||
# TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
|
||||
parser.add_argument('--seed', type=int, default=0, help='random seed')
|
||||
parser.add_argument('--swap-space', type=int, default=20, help='CPU swap space size (GiB) per GPU')
|
||||
|
||||
Reference in New Issue
Block a user