
Commit 5244b7f

russellb, lochuynh1412, mmoskal
committed
Add Guidance backend to V0 structured output
This commit is based on PR #10217, updated to be compatible with `main`.

Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Loc Huynh <lohuynh@microsoft.com>
Co-authored-by: Michal Moskal <michal@moskal.me>
1 parent b0746fa commit 5244b7f
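With this backend in place, a request can opt into guidance-based structured output by name. The sketch below is illustrative only: the model name and JSON schema are placeholders, and it assumes the offline `LLM` entry point forwards `guided_decoding_backend` to the engine config; the `GuidedDecodingParams` fields themselves match the ones read by the code added in this commit.

```python
# Hedged usage sketch (not part of the diff): select the new backend by name.
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

# Assumption: guided_decoding_backend is forwarded to the engine config.
llm = LLM(model="HuggingFaceH4/zephyr-7b-beta",
          guided_decoding_backend="guidance")

params = SamplingParams(guided_decoding=GuidedDecodingParams(
    json=schema, backend="guidance"))
outputs = llm.generate(["Answer in JSON: what is 2 + 2?"], params)
print(outputs[0].outputs[0].text)
```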

File tree

8 files changed, +841 -7 lines changed


benchmarks/benchmark_serving_structured_output.py (+1 -1)

@@ -992,7 +992,7 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         "--structured-output-backend",
         type=str,
-        choices=["outlines", "lm-format-enforcer", "xgrammar", "json-unique"],
+        choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
         default="xgrammar",
         help="Backend to use for structured outputs")
requirements/common.txt (+2 -1)

@@ -17,6 +17,7 @@ pillow # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0 # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.11, < 0.11
+llguidance>=0.6.15
 outlines == 0.1.11
 lark == 1.2.2
 xgrammar == 0.1.11; platform_machine == "x86_64"
@@ -37,4 +38,4 @@ depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
-scipy # Required for phi-4-multimodal-instruct
+scipy # Required for phi-4-multimodal-instruct

tests/model_executor/test_guided_processors.py (+3 -1)

@@ -15,7 +15,9 @@
 from vllm.sampling_params import GuidedDecodingParams
 
 MODEL_NAME = 'HuggingFaceH4/zephyr-7b-beta'
-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+GUIDED_DECODING_BACKENDS = [
+    "outlines", "lm-format-enforcer", "xgrammar", "guidance"
+]
 GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT = ["outlines", "xgrammar"]
 REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 
vllm/config.py (+3 -1)

@@ -2751,7 +2751,9 @@ def compute_hash(self) -> str:
         return hash_str
 
     def __post_init__(self):
-        valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar']
+        valid_guided_backends = [
+            'outlines', 'lm-format-enforcer', 'xgrammar', 'guidance'
+        ]
 
         backend = GuidedDecodingParams(
             backend=self.guided_decoding_backend).backend_name

vllm/model_executor/guided_decoding/__init__.py (+14 -3)

@@ -130,10 +130,15 @@ async def get_guided_decoding_logits_processor(
             get_local_xgrammar_guided_decoding_logits_processor)
         return get_local_xgrammar_guided_decoding_logits_processor(
             guided_params, tokenizer, model_config, reasoner)
-
+    if guided_params.backend_name == 'guidance':
+        from vllm.model_executor.guided_decoding.guidance_decoding import (
+            get_local_guidance_guided_decoding_logits_processor)
+        return get_local_guidance_guided_decoding_logits_processor(
+            guided_params, tokenizer)
     raise ValueError(
         f"Unknown guided decoding backend '{guided_params.backend}'. "
-        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'")
+        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar', 'guidance'"
+    )
 
 
 def get_local_guided_decoding_logits_processor(
@@ -163,7 +168,13 @@ def get_local_guided_decoding_logits_processor(
             get_local_xgrammar_guided_decoding_logits_processor)
         return get_local_xgrammar_guided_decoding_logits_processor(
             guided_params, tokenizer, model_config, reasoner)
+    if guided_params.backend_name == 'guidance':
+        from vllm.model_executor.guided_decoding.guidance_decoding import (
+            get_local_guidance_guided_decoding_logits_processor)
+        return get_local_guidance_guided_decoding_logits_processor(
+            guided_params, tokenizer)
 
     raise ValueError(
         f"Unknown guided decoding backend '{guided_params.backend}'. "
-        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'")
+        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar', 'guidance'"
+    )
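The dispatch added above only needs the request's `GuidedDecodingParams` and the model's tokenizer. A hedged sketch of exercising the guidance path directly (the tokenizer load and regex are illustrative; the helper's `(guided_params, tokenizer)` signature comes from the diff itself):

```python
# Hedged sketch (not part of the diff): call the new dispatch target directly.
from transformers import AutoTokenizer

from vllm.model_executor.guided_decoding.guidance_decoding import (
    get_local_guidance_guided_decoding_logits_processor)
from vllm.sampling_params import GuidedDecodingParams

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
guided_params = GuidedDecodingParams(regex=r"(yes|no)", backend="guidance")

# Returns a GuidanceLogitsProcessor, or None when no guide was supplied.
processor = get_local_guidance_guided_decoding_logits_processor(
    guided_params, tokenizer)
```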
vllm/model_executor/guided_decoding/guidance_decoding.py (new file, +53)

@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+from enum import Enum
+from re import escape as regex_escape
+from typing import Union
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.model_executor.guided_decoding.guidance_logits_processors import (
+    GuidanceLogitsProcessor)
+from vllm.sampling_params import GuidedDecodingParams
+
+
+class GuidedDecodingMode(Enum):
+    JSON = "json"
+    REGEX = "regex"
+    CHOICE = "choice"
+    GRAMMAR = "grammar"
+
+
+def get_local_guidance_guided_decoding_logits_processor(
+    guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
+) -> Union[GuidanceLogitsProcessor, None]:
+    """
+    Given an OpenAI-compatible request, check for guided decoding parameters
+    and get the necessary logits processor for the given guide.
+    We cache logit processors by (guide, tokenizer), and on cache hit
+    we make a shallow copy to reuse the same underlying FSM.
+    """
+    guide = None
+    mode = None
+
+    if guided_params.json:
+        guide = guided_params.json
+        mode = GuidedDecodingMode.JSON.value
+    elif guided_params.regex:
+        guide = guided_params.regex
+        mode = GuidedDecodingMode.REGEX.value
+    elif guided_params.choice:
+        # choice just uses regex
+        choices = (regex_escape(str(choice))
+                   for choice in guided_params.choice)
+        choices_regex = "(" + "|".join(choices) + ")"
+        guide = choices_regex
+        mode = GuidedDecodingMode.CHOICE.value
+    elif guided_params.grammar:
+        guide = guided_params.grammar
+        mode = GuidedDecodingMode.GRAMMAR.value
+
+    if not guide or not mode:
+        return None
+
+    return GuidanceLogitsProcessor(mode, guide, tokenizer,
+                                   guided_params.whitespace_pattern)
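The `choice` branch above reduces a list of choices to a regex before handing it to the same `GuidanceLogitsProcessor` path as plain regex guides. A small standalone illustration of that conversion (the choice list is a made-up example):

```python
# Each choice is regex-escaped, then the alternatives are joined in a group,
# mirroring the choice branch of the function above.
from re import escape as regex_escape

choices = ["yes", "no", "n/a"]
choices_regex = "(" + "|".join(regex_escape(str(c)) for c in choices) + ")"
print(choices_regex)  # -> (yes|no|n/a)
```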
vllm/model_executor/guided_decoding/guidance_logits_processors.py (new file, +156)

@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: Apache-2.0
+import json
+import os
+from typing import Any, List, Type, Union
+
+import llguidance  # type: ignore[import-untyped]
+import llguidance.hf
+import numpy as np
+import torch
+from pydantic import BaseModel
+from transformers import PreTrainedTokenizerBase
+
+from vllm.model_executor.guided_decoding.guidance_utils import (
+    LLInterpreterResponse)
+
+
+class GuidanceLogitsProcessor:
+    """Base Guidance Logits Processor"""
+
+    cached_tokenizers: dict[str, Any] = {}
+
+    def __init__(
+        self,
+        mode: str,
+        guide: Union[dict, Type[BaseModel], str],
+        tokenizer: PreTrainedTokenizerBase,
+        whitespace_pattern: Union[str, None] = None,
+    ) -> None:
+        """Base Guidance Logits Processor
+
+        Args:
+            mode (str)
+                guided generation mode.
+                Must be one of "json", "regex", "choice", "grammar"
+            guide (Union[dict, Type[BaseModel], str])
+                guide for guided generation
+            tokenizer (PreTrainedTokenizerBase)
+                model's tokenizer
+            whitespace_pattern (Union[str, None], optional)
+                Json-string to indicate pattern to use \
+                    for JSON syntactic whitespace
+                Example: '{"whitespace_flexible":true}'
+        """
+        self.mode = mode
+        self.guide = guide
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer.name_or_path
+        self.whitespace_pattern = whitespace_pattern
+
+        self.is_stopped = False
+        self.pending_ff_tokens: list[int] = []
+        self.new_sampling = False
+        self.initialized = False
+
+    def _initialize(self):
+        if self.initialized:
+            return
+
+        if self.mode.lower() == "json":
+            if isinstance(self.guide, dict):
+                schema = json.dumps(self.guide)
+            elif isinstance(self.guide, BaseModel):
+                schema = json.dumps(self.guide.model_json_schema())
+            else:
+                schema = str(self.guide)
+
+            whitespaces_config = {}
+            if isinstance(self.whitespace_pattern, str):
+                whitespaces_config = json.loads(self.whitespace_pattern)
+
+            whitespace_flexible = whitespaces_config.get(
+                "whitespace_flexible", False)
+            compiler = llguidance.JsonCompiler(
+                whitespace_flexible=whitespace_flexible)
+            self.serialized_grammar = compiler.compile(schema)
+        elif self.mode.lower() in ["regex", "choice"]:
+            compiler = llguidance.RegexCompiler()
+            self.serialized_grammar = compiler.compile(regex=self.guide)
+        elif self.mode.lower() == "grammar":
+            serialized_grammar = self.guide
+            if isinstance(self.guide, dict):
+                serialized_grammar = json.dumps(self.guide)
+            self.serialized_grammar = serialized_grammar
+
+        ll_tokenizer = self.cached_tokenizers.get(self.tokenizer.name_or_path,
+                                                  None)
+        if ll_tokenizer is None:
+            ll_tokenizer = llguidance.hf.from_tokenizer(self.tokenizer, None)
+            self.cached_tokenizers[self.tokenizer.name_or_path] = ll_tokenizer
+        self.ll_tokenizer = ll_tokenizer
+        self.ll_interpreter = llguidance.LLInterpreter(
+            self.ll_tokenizer,
+            self.serialized_grammar,
+            enable_backtrack=False,
+            enable_ff_tokens=False,
+            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
+        )
+
+        self.initialized = True
+
+    def __call__(
+        self,
+        input_ids: List[int],
+        scores: torch.Tensor,
+    ) -> torch.Tensor:
+        # we initialize the guidance model here
+        # to avoid pickling ll_tokenizer and ll_interpreter
+        self._initialize()
+
+        if self.is_stopped:
+            return scores
+
+        if self.new_sampling and len(input_ids) > 0:
+            backtrack, ff_tokens = self.ll_interpreter.commit_token(
+                input_ids[-1])
+            if len(ff_tokens) > 0 and backtrack == 0:
+                # first token is last generated token
+                ff_tokens = ff_tokens[1:]
+            self.pending_ff_tokens.extend(ff_tokens)
+            self.new_sampling = False
+
+        if len(self.pending_ff_tokens) > 0:
+            # if we have pending fast-forward tokens,
+            # just return them immediately
+            ff_token = self.pending_ff_tokens.pop(0)
+            scores.add_(-scores)
+            scores[ff_token] = 200.0
+            return scores
+
+        mask, resp = self.ll_interpreter.compute_mask()
+        r = LLInterpreterResponse.model_validate_json(resp)
+
+        if r.stop:
+            mask = np.zeros(scores.shape[-1], dtype=np.uint8)
+            if self.ll_tokenizer.eos_token is not None:
+                mask[self.ll_tokenizer.eos_token] = 200
+            self.is_stopped = True
+        elif mask is None:
+            # NOTE: mask should not be None unless r.stop is True
+            # However, we are handling this case just in case
+            # llguidance allows free-style generation
+            mask = np.zeros(scores.shape[-1], dtype=np.uint8)
+        else:
+            mask = np.frombuffer(mask, dtype=np.uint8)
+
+        # Force all invalid tokens to have 0 value
+        scores.add_(-torch.min(scores))
+        zero_indices = np.where(mask == 0)[0]
+        scores[zero_indices] = 0.0
+        non_zero_indices = np.nonzero(mask)[0]
+        scores[non_zero_indices] += 200.0
+        # set special tokens not in vocab to 0
+        scores[mask.shape[0]:] = 0.0
+        self.new_sampling = True
+
+        return scores
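To see what the masking arithmetic in `__call__` does to a logits vector, here is a tiny self-contained sketch (toy logits and mask, not part of the diff): scores are shifted so the minimum becomes zero, grammar-forbidden tokens are zeroed, and allowed tokens get a large additive boost (200.0) so they dominate sampling.

```python
# Toy illustration of the mask application in GuidanceLogitsProcessor.__call__.
import numpy as np
import torch

scores = torch.tensor([1.5, -2.0, 0.3, 4.0, -1.0])  # toy logits, vocab size 5
mask = np.array([0, 1, 0, 1, 0], dtype=np.uint8)     # 1 = allowed by the grammar

scores.add_(-torch.min(scores))        # shift so the minimum becomes 0
scores[np.where(mask == 0)[0]] = 0.0   # forbidden tokens drop to 0
scores[np.nonzero(mask)[0]] += 200.0   # allowed tokens get a large boost
print(scores)  # tensor([0., 200., 0., 206., 0.]) -> only indices 1 and 3 survive
```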
