[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-23 13:24:23 +08:00
parent 493c275352
commit f17aec0d63
50 changed files with 3455 additions and 3180 deletions
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -22,31 +22,33 @@ server.

 Here is a sample of `LLM` class usage:

-```python
-from vllm import LLM, SamplingParams
+??? Code

-# Define a list of input prompts
-prompts = [
-    "Hello, my name is",
-    "The capital of France is",
-    "The largest ocean is",
-]
+    ```python
+    from vllm import LLM, SamplingParams

-# Define sampling parameters
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    # Define a list of input prompts
+    prompts = [
+        "Hello, my name is",
+        "The capital of France is",
+        "The largest ocean is",
+    ]

-# Initialize the LLM engine with the OPT-125M model
-llm = LLM(model="facebook/opt-125m")
+    # Define sampling parameters
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-# Generate outputs for the input prompts
-outputs = llm.generate(prompts, sampling_params)
+    # Initialize the LLM engine with the OPT-125M model
+    llm = LLM(model="facebook/opt-125m")

-# Print the generated outputs
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
+    # Generate outputs for the input prompts
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the generated outputs
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```

 More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.

@@ -178,32 +180,34 @@ vision-language model.

    To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:

-    ```python
-    class MyOldModel(nn.Module):
-        def __init__(
-            self,
-            config,
-            cache_config: Optional[CacheConfig] = None,
-            quant_config: Optional[QuantizationConfig] = None,
-            lora_config: Optional[LoRAConfig] = None,
-            prefix: str = "",
-        ) -> None:
-            ...
+    ??? Code

-    from vllm.config import VllmConfig
-    class MyNewModel(MyOldModel):
-        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-            config = vllm_config.model_config.hf_config
-            cache_config = vllm_config.cache_config
-            quant_config = vllm_config.quant_config
-            lora_config = vllm_config.lora_config
-            super().__init__(config, cache_config, quant_config, lora_config, prefix)
+        ```python
+        class MyOldModel(nn.Module):
+            def __init__(
+                self,
+                config,
+                cache_config: Optional[CacheConfig] = None,
+                quant_config: Optional[QuantizationConfig] = None,
+                lora_config: Optional[LoRAConfig] = None,
+                prefix: str = "",
+            ) -> None:
+                ...

-    if __version__ >= "0.6.4":
-        MyModel = MyNewModel
-    else:
-        MyModel = MyOldModel
-    ```
+        from vllm.config import VllmConfig
+        class MyNewModel(MyOldModel):
+            def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+                config = vllm_config.model_config.hf_config
+                cache_config = vllm_config.cache_config
+                quant_config = vllm_config.quant_config
+                lora_config = vllm_config.lora_config
+                super().__init__(config, cache_config, quant_config, lora_config, prefix)
+
+        if __version__ >= "0.6.4":
+            MyModel = MyNewModel
+        else:
+            MyModel = MyOldModel
+        ```

    This way, the model can work with both old and new versions of vLLM.

--- a/docs/design/kernel/paged_attention.md
+++ b/docs/design/kernel/paged_attention.md
@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
 all results for output have been calculated but are just stored in
 different thread register memory.

-```cpp
-float* out_smem = reinterpret_cast<float*>(shared_mem);
-for (int i = NUM_WARPS; i > 1; i /= 2) {
-    // Upper warps write to shared memory.
-    ...
-    float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-        ...
-        dst[row_idx] = accs[i];
-    }
+??? Code

-    // Lower warps update the output.
-    const float* src = &out_smem[warp_idx * HEAD_SIZE];
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    ```cpp
+    float* out_smem = reinterpret_cast<float*>(shared_mem);
+    for (int i = NUM_WARPS; i > 1; i /= 2) {
+        // Upper warps write to shared memory.
        ...
-        accs[i] += src[row_idx];
-    }
+        float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+            ...
+            dst[row_idx] = accs[i];
+        }

-    // Write out the accs.
-}
-```
+        // Lower warps update the output.
+        const float* src = &out_smem[warp_idx * HEAD_SIZE];
+        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+            ...
+            accs[i] += src[row_idx];
+        }
+
+        // Write out the accs.
+    }
+    ```

 ## Output

--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (

 vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:

-```python
-# inside `setup.py` file
-from setuptools import setup
+??? Code

-setup(name='vllm_add_dummy_model',
-      version='0.1',
-      packages=['vllm_add_dummy_model'],
-      entry_points={
-          'vllm.general_plugins':
-          ["register_dummy_model = vllm_add_dummy_model:register"]
-      })
+    ```python
+    # inside `setup.py` file
+    from setuptools import setup

-# inside `vllm_add_dummy_model.py` file
-def register():
-    from vllm import ModelRegistry
+    setup(name='vllm_add_dummy_model',
+        version='0.1',
+        packages=['vllm_add_dummy_model'],
+        entry_points={
+            'vllm.general_plugins':
+            ["register_dummy_model = vllm_add_dummy_model:register"]
+        })

-    if "MyLlava" not in ModelRegistry.get_supported_archs():
-        ModelRegistry.register_model(
-            "MyLlava",
-            "vllm_add_dummy_model.my_llava:MyLlava",
-        )
-```
+    # inside `vllm_add_dummy_model.py` file
+    def register():
+        from vllm import ModelRegistry
+
+        if "MyLlava" not in ModelRegistry.get_supported_archs():
+            ModelRegistry.register_model(
+                "MyLlava",
+                "vllm_add_dummy_model.my_llava:MyLlava",
+            )
+    ```

 For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).