[BugFix][Frontend] Use LoRA tokenizer in OpenAI APIs (#6227)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
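
In brief, as pinned down by the test changes below: when a request names a LoRA adapter registered via --lora-modules, the OpenAI-compatible frontend now tokenizes and detokenizes with that adapter's tokenizer rather than the base model's, so tokens the adapter adds to the vocabulary round-trip correctly. A minimal sketch of the fixed behavior, assuming a local server started with the LoRA flags from the fixtures below (the URL and adapter name are illustrative):

    import openai

    # Assumes a vLLM OpenAI-compatible server started with --enable-lora and
    # --lora-modules zephyr-lora2=<adapter dir with added tokens>.
    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    # IDs 32000-32002 exist only in the adapter's tokenizer (see fixture below).
    completion = client.completions.create(
        model="zephyr-lora2",
        prompt=[0, 0, 32000, 32001, 32002],
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    # With the fix, the echoed prompt is decoded by the LoRA tokenizer:
    print(completion.choices[0].text)  # begins "<unk><unk>vllm1vllm2vllm3"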
tests/entrypoints/openai/test_chat.py
@@ -7,11 +7,11 @@ import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 import torch
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
 
 from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,12 +21,7 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
 
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -38,7 +33,7 @@ def server(zephyr_lora_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
tests/entrypoints/openai/test_completion.py
@@ -1,6 +1,8 @@
 # imports for guided decoding tests
 import json
 import re
+import shutil
+from tempfile import TemporaryDirectory
 from typing import List
 
 import jsonschema
@@ -9,6 +11,7 @@ import pytest
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
+from transformers import AutoTokenizer
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -30,13 +33,29 @@ def zephyr_lora_files():
     return snapshot_download(repo_id=LORA_NAME)
 
 
+@pytest.fixture(scope="module")
+def zephyr_lora_added_tokens_files(zephyr_lora_files):
+    tmp_dir = TemporaryDirectory()
+    tmp_model_dir = f"{tmp_dir.name}/zephyr"
+    shutil.copytree(zephyr_lora_files, tmp_model_dir)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    # Copy tokenizer to adapter and add some unique tokens
+    # 32000, 32001, 32002
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
+                                 special_tokens=True)
+    assert added == 3
+    tokenizer.save_pretrained(tmp_model_dir)
+    yield tmp_model_dir
+    tmp_dir.cleanup()
+
+
 @pytest.fixture(scope="module")
 def zephyr_pa_files():
     return snapshot_download(repo_id=PA_NAME)
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files, zephyr_pa_files):
+def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
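
For context on the IDs in this fixture: AutoTokenizer.add_tokens appends new entries after the existing vocabulary, and Zephyr's base vocab has 32000 entries, so the three tokens land at IDs 32000-32002 (as the comment in the fixture notes). A standalone sketch:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
    tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True)
    print(tokenizer.convert_tokens_to_ids(["vllm1", "vllm2", "vllm3"]))
    # [32000, 32001, 32002] -- appended right after the 32000-entry base vocab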
@@ -50,7 +69,7 @@ def server(zephyr_lora_files, zephyr_pa_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -111,6 +130,34 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
     assert len(completion.choices[0].text) >= 1
 
 
+@pytest.mark.asyncio
+async def test_added_lora_tokens(client: openai.AsyncOpenAI):
+    # test using token IDs
+    completion = await client.completions.create(
+        model="zephyr-lora2",
+        prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Added tokens should appear in tokenized prompt
+    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
+
+
+@pytest.mark.asyncio
+async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Added tokens should not appear in tokenized prompt
+    assert "vllm" not in completion.choices[0].text
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     # first test base model, then test loras, then test prompt adapters
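
These two tests are the core regression check: the same token IDs decode to the added tokens only under the adapter's tokenizer. The <unk><unk> prefix comes from token ID 0, which is <unk> in this Llama-style vocabulary, while IDs 32000-32002 simply do not exist in the base tokenizer. A quick illustration of the base-model side (sketch):

    from transformers import AutoTokenizer

    base = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
    print(base.convert_ids_to_tokens(0))  # '<unk>'
    print(len(base))  # 32000, so IDs 32000-32002 are out of range here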
tests/entrypoints/openai/test_serving_chat.py
@@ -38,5 +38,4 @@ async def _async_serving_chat_init():
 
 def test_async_serving_chat_init():
     serving_completion = asyncio.run(_async_serving_chat_init())
-    assert serving_completion.tokenizer is not None
-    assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE
+    assert serving_completion.chat_template == CHAT_TEMPLATE
tests/entrypoints/openai/test_tokenization.py
@@ -5,13 +5,15 @@ import requests
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
 @pytest.fixture(scope="module")
-def server():
+def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -21,12 +23,25 @@ def server():
         "--enforce-eager",
         "--max-num-seqs",
         "128",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
+        "--max-lora-rank",
+        "64",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
+@pytest.fixture(scope="module")
+def tokenizer_name(model_name: str,
+                   zephyr_lora_added_tokens_files: str):  # noqa: F811
+    return zephyr_lora_added_tokens_files if (
+        model_name == "zephyr-lora2") else model_name
+
+
 @pytest.fixture(scope="module")
 def client(server):
     return server.get_async_client()
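
If the indirect=["tokenizer_name"] parametrization used below is unfamiliar: indirect routes a parametrized value through the same-named fixture instead of passing it straight to the test function, which is what lets tokenizer_name be remapped to the adapter directory for the zephyr-lora2 case. A minimal self-contained example of the mechanism, with hypothetical names and the canonical request.param form:

    import pytest

    @pytest.fixture
    def greeting(request):
        # Receives the parametrized value and may transform it first.
        return request.param.upper()

    @pytest.mark.parametrize("greeting", ["hi", "hello"], indirect=["greeting"])
    def test_greeting(greeting):
        assert greeting.isupper()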
@@ -34,16 +49,18 @@ def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
 async def test_tokenize_completions(client: openai.AsyncOpenAI,
-                                    model_name: str):
+                                    model_name: str, tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
 
     for add_special in [False, True]:
-        prompt = "This is a test prompt."
+        prompt = "vllm1 This is a test prompt."
         tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
 
         response = requests.post(base_url + "/tokenize",
@@ -63,12 +80,15 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
-async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
+async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
+                             tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
 
     for add_generation in [False, True]:
         for add_special in [False, True]:
@@ -80,7 +100,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
                 "content": "Nice to meet you!"
             }, {
                 "role": "user",
-                "content": "Can I ask a question?"
+                "content": "Can I ask a question? vllm1"
             }]
 
             prompt = tokenizer.apply_chat_template(
@@ -108,16 +128,20 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
-async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
+async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
+                          tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
 
-    prompt = "This is a test prompt."
+    prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)
 
+    print(f"CALLING {base_url} FOR {model_name}")
     response = requests.post(base_url + "/detokenize",
                              json={
                                  "model": model_name,
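
A sketch of the round trip these tests exercise, assuming the server from the fixture above on localhost:8000. The /tokenize and /detokenize routes live at the server root, which is why the tests strip the trailing /v1 from the client's base URL; the tokens/prompt response fields are my reading of vLLM's tokenizer API:

    import requests

    base_url = "http://localhost:8000"

    r = requests.post(base_url + "/tokenize",
                      json={"model": "zephyr-lora2",
                            "prompt": "This is a test prompt. vllm1"})
    tokens = r.json()["tokens"]  # includes the adapter-only ID for "vllm1"

    r = requests.post(base_url + "/detokenize",
                      json={"model": "zephyr-lora2", "tokens": tokens})
    print(r.json()["prompt"])  # round-trips with the LoRA tokenizer applied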