From 697ef765ee91d1a47b49ae7e43951cfd116b6052 Mon Sep 17 00:00:00 2001
From: Aaron Pham
Date: Mon, 14 Jul 2025 03:58:35 -0400
Subject: [PATCH] [Refactor][V1] Move outlines utils for V1 imports (#20878)

Signed-off-by: Aaron Pham
---
 vllm/v1/structured_output/backend_outlines.py |   9 +-
 vllm/v1/structured_output/utils.py            | 200 +++++++++++++++++-
 2 files changed, 204 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py
index e1e4ea431d..572e498448 100644
--- a/vllm/v1/structured_output/backend_outlines.py
+++ b/vllm/v1/structured_output/backend_outlines.py
@@ -13,13 +13,14 @@ from typing import TYPE_CHECKING
 import torch
 from regex import escape as regex_escape
 
-from vllm.model_executor.guided_decoding.outlines_logits_processors import (
-    OutlinesVocabulary, get_cache, get_vocabulary)
 from vllm.sampling_params import SamplingParams
 from vllm.utils import LazyLoader
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                      StructuredOutputGrammar,
                                                      StructuredOutputOptions)
+from vllm.v1.structured_output.utils import (OutlinesVocabulary,
+                                             get_outlines_cache,
+                                             get_outlines_vocabulary)
 
 if TYPE_CHECKING:
     import outlines_core as oc
@@ -47,8 +48,8 @@ else:
 class OutlinesBackend(StructuredOutputBackend):
 
     def __post_init__(self):
-        self.vocabulary = get_vocabulary(self.tokenizer)
-        self.cache = get_cache()
+        self.vocabulary = get_outlines_vocabulary(self.tokenizer)
+        self.cache = get_outlines_cache()
 
     def _compile_index(self, regex_string: str,
                        vocabulary: OutlinesVocabulary) -> oc.Index:
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index 7adee7237b..95319831d5 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -3,7 +3,205 @@
 
 from __future__ import annotations
 
+import hashlib
+import importlib.metadata
+import os
+from typing import TYPE_CHECKING
+
 import regex as re
+from cachetools import LRUCache
+from diskcache import Cache
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+from vllm.utils import LazyLoader
+
+if TYPE_CHECKING:
+    import outlines_core as oc
+    import transformers.file_utils as file_utils
+    import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2
+
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    oc = LazyLoader("oc", globals(), "outlines_core")
+    file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils")
+    tokenization_gpt2 = LazyLoader(
+        "tokenization_gpt2",
+        globals(),
+        "transformers.models.gpt2.tokenization_gpt2",
+    )
+
+logger = init_logger(__name__)
+
+CACHE = None
+
+
+class OutlinesVocabulary:
+    """
+    Wrapper class for `outlines_core.Vocabulary`,
+    which allows us to store a hash with the vocabulary.
+    """
+
+    def __init__(self, vocabulary: oc.Vocabulary) -> None:
+        # Actual vocabulary object
+        self.inner = vocabulary
+        # Use a stable sha256-based hash rather than the built-in hash(),
+        # which is salted per process, because we use it as a cache key.
+        hex_str = hashlib.sha256(
+            vocabulary.__repr__().encode('utf-8')).hexdigest()
+        hash_int = int(hex_str, 16)
+        self._hash = hash_int
+
+
+def get_outlines_cache_path() -> str:
+    """Get the directory path used to cache previously-computed indexes."""
+    outlines_cache_dir = os.getenv("OUTLINES_CACHE_DIR")
+    xdg_cache_home = os.getenv("XDG_CACHE_HOME")
+    home_dir = os.path.expanduser("~")
+
+    if outlines_cache_dir:
+        # OUTLINES_CACHE_DIR takes precedence
+        return outlines_cache_dir
+    elif xdg_cache_home:
+        return os.path.join(xdg_cache_home, ".cache", "outlines")
+    # If homedir is "/", we may be inside a container, and thus writing to
+    # root would be problematic, so we fall back to using a tempfile.
+    # Also validate that the path exists, since os.path.expanduser does
+    # not guarantee existence.
+    elif os.path.isdir(home_dir) and home_dir != "/":
+        # Default Unix fallback: ~/.cache/outlines
+        return os.path.join(home_dir, ".cache", "outlines")
+    else:
+        import tempfile
+
+        # home_dir may be / inside a docker container without existing user
+        tempdir = tempfile.gettempdir()
+        return os.path.join(tempdir, ".cache", "outlines")
+
+
+def get_outlines_cache():
+    """Get the Cache instance to be used for index caching"""
+
+    cache_dir = get_outlines_cache_path()
+    if envs.VLLM_V1_USE_OUTLINES_CACHE:
+        logger.warning("Enabling outlines cache. This is an unbounded on-disk "
+                       "cache. It may consume a lot of disk space and should "
+                       "not be used with untrusted clients.")
+        cache = Cache(cache_dir, eviction_policy="none", cull_limit=0)
+        outlines_version = importlib.metadata.version("outlines_core")
+
+        cached_version = cache.get('__version__', None)
+        if cached_version != outlines_version:
+            cache.clear()
+        cache.set('__version__', outlines_version)
+        return cache
+    else:
+        return LRUCache(maxsize=128)
+
+
+re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$")
+re_replacement_seq = re.compile(r"^.{0,6}�+.{0,6}$")
+
+
+def _reduced_vocabulary(
+    tokenizer: AnyTokenizer,
+    eos_token_id: int,
+) -> dict[bytes, list[int]]:
+    """Create a map from vocabulary tokens to lists of equivalent token ids.
+
+    Returns:
+        A Dict of token string -> equivalent token ids
+    """
+
+    unicode_to_bytes = {
+        v: k
+        for k, v in tokenization_gpt2.bytes_to_unicode().items()
+    }
+
+    def convert_token_to_string(token: str) -> str:
+
+        string = tokenizer.convert_tokens_to_string([token])
+
+        # A hack to handle missing spaces in HF's Llama tokenizers
+        if (type(token) is str
+                and token.startswith(file_utils.SPIECE_UNDERLINE)
+                or token == "<0x20>"):
+            return " " + string
+
+        return string
+
+    vocabulary: dict[bytes, list[int]] = {}
+    empty_token_ids: list[int] = []
+    for token, token_idx in tokenizer.get_vocab().items():
+        if token in tokenizer.all_special_tokens:  # type: ignore
+            continue
+
+        token_str = convert_token_to_string(token)
+        if token_str:
+            if isinstance(token, (bytes, bytearray)):
+                # For BPE tokenizers where tokens are stored as bytes.
+
+                # safe to ignore since token_str is of type (bytearray, bytes)
+                # by this point.
+                token_bytes = bytes(token_str)  # type: ignore[arg-type]
+
+            elif "\ufffd" in token_str and not re_replacement_seq.match(
+                    token_str):
+                # Handle tokens with invalid UTF-8 sequences.
+                if re_llama_byte_token.match(token):
+                    # Llama-like tokenizers use <0xXX> for incomplete sequences.
+                    token_bytes = bytes([int(token[3:5], 16)])
+                else:
+                    # GPT2 tokenizers: map each byte back using unicode_to_bytes
+                    byte_vals = [unicode_to_bytes.get(c) for c in token]
+                    if None in byte_vals:
+                        raise RuntimeError(
+                            f"Cannot convert token `{token}`"
+                            f" ({token_idx}) to bytes: {token_str}")
+                    # safe to ignore, since if None in byte_vals,
+                    # an error is thrown.
+                    token_bytes = bytes(byte_vals)  # type: ignore[arg-type]
+            else:
+                token_bytes = token_str.encode('utf-8')
+
+            if token_idx != eos_token_id:
+                vocabulary.setdefault(token_bytes, []).append(token_idx)
+        else:
+            empty_token_ids.append(token_idx)
+
+    return vocabulary
+
+
+def get_outlines_vocabulary(tokenizer: AnyTokenizer) -> oc.Vocabulary:
+    """Get the `Vocabulary` object for a given tokenizer.
+    """
+    if hasattr(tokenizer, "_outlines_vocabulary"):
+        return tokenizer._outlines_vocabulary  # type: ignore
+
+    try:
+        if hasattr(
+                tokenizer,
+                "eos_token_id",
+        ) and tokenizer.eos_token_id is not None:
+            eos_token_id = tokenizer.eos_token_id
+        else:
+            raise ValueError(
+                f"Error during structured outputs setup for outlines: Tokenizer ({type(tokenizer)}) has no `eos_token_id` property, but `eos_token_id` is required for structured outputs to work properly."  # noqa: E501
+            )
+
+        reduced_vocab = _reduced_vocabulary(
+            tokenizer,
+            eos_token_id  # type: ignore
+        )
+        vocabulary = OutlinesVocabulary(
+            oc.Vocabulary(eos_token_id, reduced_vocab))
+        tokenizer._outlines_vocabulary = vocabulary  # type: ignore
+
+        return vocabulary
+    except AttributeError as e:
+        raise ValueError(f"Cannot get the vocabulary of the tokenizer "
+                         f"({type(tokenizer)}). The tokenizer should have a "
+                         "get_vocab method.") from e
 
 
 def grammar_is_likely_lark(grammar_str: str) -> bool:
@@ -77,7 +275,7 @@ def convert_lark_to_ebnf(grammar_str: str) -> str:
             raise ValueError(
                 f"Mismatched quotes in {rule_name} on line {line_num}")
 
-    def extract_references(text: str) -> set:
+    def extract_references(text: str) -> set[str]:
         """Extract rule references from text."""
         # Remove quoted strings and special characters
         text = re.sub(r'"[^"]*"', '', text)
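
Usage sketch, for reference only and not part of the applied diff: the snippet
below exercises the relocated helpers the same way OutlinesBackend.__post_init__
does after this change. It assumes a Hugging Face tokenizer loaded via
transformers; AutoTokenizer and the "gpt2" model name are placeholders.

    # Illustrative only: call the helpers moved into
    # vllm/v1/structured_output/utils.py by this patch.
    from transformers import AutoTokenizer

    from vllm.v1.structured_output.utils import (get_outlines_cache,
                                                 get_outlines_vocabulary)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model

    # Builds the reduced vocabulary once and memoizes it on the tokenizer
    # object as `_outlines_vocabulary`; later calls return the cached wrapper.
    vocabulary = get_outlines_vocabulary(tokenizer)

    # Returns an on-disk diskcache.Cache when VLLM_V1_USE_OUTLINES_CACHE is
    # set, otherwise an in-memory cachetools.LRUCache(maxsize=128).
    cache = get_outlines_cache()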