Simplify (and fix) passing of guided decoding backend options (#17008)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-04-29 20:02:23 +01:00
committed by GitHub
parent 2fa2a50bf9
commit a6977dbd15
17 changed files with 309 additions and 217 deletions

View File

@@ -112,8 +112,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
"alan.turing@enigma.com\n")
try:
-    # The no-fallback option forces vLLM to use xgrammar, so when it fails
-    # you get a 400 with the reason why
+    # The guided_decoding_disable_fallback option forces vLLM to use
+    # xgrammar, so when it fails you get a 400 with the reason why
completion = client.chat.completions.create(
model=model,
messages=[{
@@ -123,7 +123,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
extra_body={
"guided_regex": r"\w+@\w+\.com\n",
"stop": ["\n"],
"guided_decoding_backend": "xgrammar:no-fallback"
"guided_decoding_backend": "xgrammar",
"guided_decoding_disable_fallback": True,
},
)
return completion.choices[0].message.content