[Doc] Update reasoning with stream example to use OpenAI library (#14077)
Signed-off-by: liuyanyi <wolfsonliu@163.com>
@@ -78,7 +78,55 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
 }
 ```

-Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
+The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does allow extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in each chunk. For example:
+
+```python
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+stream = client.chat.completions.create(model=model,
+                                        messages=messages,
+                                        stream=True)
+
+print("client: Start streaming chat completions...")
+printed_reasoning_content = False
+printed_content = False
+
+for chunk in stream:
+    reasoning_content = None
+    content = None
+    # Check whether this delta carries reasoning_content or content
+    if hasattr(chunk.choices[0].delta, "reasoning_content"):
+        reasoning_content = chunk.choices[0].delta.reasoning_content
+    elif hasattr(chunk.choices[0].delta, "content"):
+        content = chunk.choices[0].delta.content
+
+    if reasoning_content is not None:
+        if not printed_reasoning_content:
+            printed_reasoning_content = True
+            print("reasoning_content:", end="", flush=True)
+        print(reasoning_content, end="", flush=True)
+    elif content is not None:
+        if not printed_content:
+            printed_content = True
+            print("\ncontent:", end="", flush=True)
+        # Extract and print the content
+        print(content, end="", flush=True)
+```
+
+Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the full [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).

 ## Structured output
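An alternative to the repeated `hasattr` checks is `getattr` with a default, which collapses the existence test and the read into one step. A minimal sketch of the same streaming loop, assuming the `stream` object created in the example above:

```python
# Minimal variant of the streaming loop: getattr with a None default
# avoids a separate hasattr check on each delta. Assumes `stream` was
# created with client.chat.completions.create(..., stream=True) as above.
for chunk in stream:
    delta = chunk.choices[0].delta
    reasoning_content = getattr(delta, "reasoning_content", None)
    content = getattr(delta, "content", None)
    if reasoning_content is not None:
        print(reasoning_content, end="", flush=True)
    elif content is not None:
        print(content, end="", flush=True)
```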
@@ -19,73 +19,50 @@ in real-time as they are generated by the model. This is useful for scenarios
 where you want to display chat completions to the user as they are generated
 by the model.

-Here we do not use the OpenAI Python client library, because it does not support
-`reasoning_content` fields in the response.
+Remember to check whether content and reasoning_content exist in the
+`ChatCompletionChunk`; content may not exist, leading to errors if you access it.
 """

-import json
-
-import requests
+from openai import OpenAI
 # Modify OpenAI's API key and API base to use vLLM's API server.
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"

-models = requests.get(
-    f"{openai_api_base}/models",
-    headers={
-        "Authorization": f"Bearer {openai_api_key}"
-    },
-).json()
-model = models["data"][0]["id"]
-
-# Streaming chat completions
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-
-response = requests.post(
-    f"{openai_api_base}/chat/completions",
-    headers={"Authorization": f"Bearer {openai_api_key}"},
-    json={
-        "model": model,
-        "messages": messages,
-        "stream": True
-    },
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
 )

+models = client.models.list()
+model = models.data[0].id
+
+messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+stream = client.chat.completions.create(model=model,
+                                        messages=messages,
+                                        stream=True)
+
 print("client: Start streaming chat completions...")
 printed_reasoning_content = False
 printed_content = False
-# Make the streaming request
-if response.status_code == 200:
-    # Process the streaming response
-    for line in response.iter_lines():
-        if line:  # Filter out keep-alive new lines
-            # Decode the line and parse the JSON
-            decoded_line = line.decode("utf-8")
-            if decoded_line.startswith("data:"):
-                data = decoded_line[5:].strip()  # Remove "data:" prefix
-                if data == "[DONE]":  # End of stream
-                    print("\nclient: Stream completed.")
-                    break
-                try:
-                    # Parse the JSON data
-                    chunk = json.loads(data)
-                    reasoning_content = chunk["choices"][0]["delta"].get(
-                        "reasoning_content", "")
-                    content = chunk["choices"][0]["delta"].get("content", "")
-
-                    if reasoning_content:
-                        if not printed_reasoning_content:
-                            printed_reasoning_content = True
-                            print("reasoning_content:", end="", flush=True)
-                        print(reasoning_content, end="", flush=True)
-                    elif content:
-                        if not printed_content:
-                            printed_content = True
-                            print("\ncontent:", end="", flush=True)
-                        # Extract and print the content
-                        print(content, end="", flush=True)
-                except json.JSONDecodeError:
-                    print("Error decoding JSON:", decoded_line)
-else:
-    print(f"Error: {response.status_code} - {response.text}")
+
+for chunk in stream:
+    reasoning_content = None
+    content = None
+    # Check whether this delta carries reasoning_content or content
+    if hasattr(chunk.choices[0].delta, "reasoning_content"):
+        reasoning_content = chunk.choices[0].delta.reasoning_content
+    elif hasattr(chunk.choices[0].delta, "content"):
+        content = chunk.choices[0].delta.content
+
+    if reasoning_content is not None:
+        if not printed_reasoning_content:
+            printed_reasoning_content = True
+            print("reasoning_content:", end="", flush=True)
+        print(reasoning_content, end="", flush=True)
+    elif content is not None:
+        if not printed_content:
+            printed_content = True
+            print("\ncontent:", end="", flush=True)
+        # Extract and print the content
+        print(content, end="", flush=True)
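The same guarded-access pattern applies outside of streaming, where vLLM attaches the reasoning text to the returned message object. A minimal sketch, assuming the `client`, `model`, and `messages` set up in the example above and a server running a reasoning model:

```python
# Non-streaming sketch: reasoning_content is a vLLM extension on the message,
# so guard the access the same way as in the streaming loop above.
response = client.chat.completions.create(model=model, messages=messages)
message = response.choices[0].message
reasoning = getattr(message, "reasoning_content", None)
if reasoning is not None:
    print("reasoning_content:", reasoning)
print("content:", message.content)
```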