Merge branch 'master' into alexis/update_nodes_categories

Update TripoSplat categories
2026-06-19 13:07:23 +08:00 · 2026-06-17 20:55:15 +08:00 · 2026-06-17 09:23:51 +08:00
17 changed files with 81 additions and 979 deletions
--- a/README.md
+++ b/README.md
@ -140,7 +140,7 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang
   - Commits outside of the stable release tags may be very unstable and break many custom nodes.
   - Serves as the foundation for the desktop release

-2. **[Comfy Desktop](https://github.com/Comfy-Org/Comfy-Desktop)**
+2. **[ComfyUI Desktop](https://github.com/Comfy-Org/Comfy-Desktop)**
   - Builds a new release using the latest stable core version

 3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
--- a/comfy/ldm/boogu/model.py
+++ b/comfy/ldm/boogu/model.py
@ -1,321 +0,0 @@
-# Boogu-Image-0.1 transformer
-# Architecture is an OmniGen2 derivative (see comfy/ldm/omnigen/omnigen2.py) with an
-# added dual-stream ("double_stream") stage before the single-stream layers, conditioned
-# by a Qwen3-VL multimodal LLM. Reuses the OmniGen2/Lumina building blocks and the Flux
-# RoPE core, the only new component is the double-stream block + the hybrid forward order.
-
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-from einops import rearrange
-
-import comfy.ldm.common_dit
-import comfy.ldm.omnigen.omnigen2
-from comfy.ldm.modules.attention import optimized_attention_masked
-from comfy.ldm.omnigen.omnigen2 import (
-    OmniGen2RotaryPosEmbed,
-    Lumina2CombinedTimestepCaptionEmbedding,
-    LuminaRMSNormZero,
-    LuminaLayerNormContinuous,
-    LuminaFeedForward,
-    Attention,
-    OmniGen2TransformerBlock,
-    apply_rotary_emb,
-)
-
-class BooguDoubleStreamProcessor(nn.Module):
-    # Joint attention over [instruct ; img] with separate per-stream q/k/v and output projections.
-    def __init__(self, dim, head_dim, heads, kv_heads, dtype=None, device=None, operations=None):
-        super().__init__()
-        query_dim = head_dim * heads
-        kv_dim = head_dim * kv_heads
-
-        self.img_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
-        self.img_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
-        self.img_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
-
-        self.instruct_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
-        self.instruct_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
-        self.instruct_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
-
-        self.instruct_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
-        self.img_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
-
-    def forward(self, attn, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
-        batch_size = img_hidden_states.shape[0]
-        L_instruct = instruct_hidden_states.shape[1]
-
-        img_q = self.img_to_q(img_hidden_states)
-        img_k = self.img_to_k(img_hidden_states)
-        img_v = self.img_to_v(img_hidden_states)
-
-        instruct_q = self.instruct_to_q(instruct_hidden_states)
-        instruct_k = self.instruct_to_k(instruct_hidden_states)
-        instruct_v = self.instruct_to_v(instruct_hidden_states)
-
-        # Concatenate instruction first, then image (matches reference processor order).
-        query = torch.cat([instruct_q, img_q], dim=1)
-        key = torch.cat([instruct_k, img_k], dim=1)
-        value = torch.cat([instruct_v, img_v], dim=1)
-
-        query = query.view(batch_size, -1, attn.heads, attn.dim_head)
-        key = key.view(batch_size, -1, attn.kv_heads, attn.dim_head)
-        value = value.view(batch_size, -1, attn.kv_heads, attn.dim_head)
-
-        query = attn.norm_q(query)
-        key = attn.norm_k(key)
-
-        if rotary_emb is not None:
-            query = apply_rotary_emb(query, rotary_emb)
-            key = apply_rotary_emb(key, rotary_emb)
-
-        query = query.transpose(1, 2)
-        key = key.transpose(1, 2)
-        value = value.transpose(1, 2)
-
-        if attn.kv_heads < attn.heads:
-            key = key.repeat_interleave(attn.heads // attn.kv_heads, dim=1)
-            value = value.repeat_interleave(attn.heads // attn.kv_heads, dim=1)
-
-        hidden_states = optimized_attention_masked(query, key, value, attn.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options)
-
-        # Split back to instruction/image, apply per-stream output projections, recombine.
-        instruct_hidden_states = self.instruct_out(hidden_states[:, :L_instruct])
-        img_hidden_states = self.img_out(hidden_states[:, L_instruct:])
-        hidden_states = torch.cat([instruct_hidden_states, img_hidden_states], dim=1)
-
-        hidden_states = attn.to_out[0](hidden_states)
-        return hidden_states
-
-
-class BooguJointAttention(nn.Module):
-    # Holds the shared q/k RMSNorm + final output projection
-    def __init__(self, dim, head_dim, heads, kv_heads, eps=1e-5, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.heads = heads
-        self.kv_heads = kv_heads
-        self.dim_head = head_dim
-        self.scale = head_dim ** -0.5
-
-        self.norm_q = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device)
-        self.norm_k = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device)
-        self.to_out = nn.Sequential(
-            operations.Linear(heads * head_dim, dim, bias=False, dtype=dtype, device=device),
-            nn.Dropout(0.0),
-        )
-        self.processor = BooguDoubleStreamProcessor(dim, head_dim, heads, kv_heads, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
-        return self.processor(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask, transformer_options=transformer_options)
-
-
-class BooguDoubleStreamBlock(nn.Module):
-    # Dual-stream block: joint attention over [instruct ; img] + image self-attention, each stream with its own modulation/MLP.
-    def __init__(self, dim, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=None, device=None, operations=None):
-        super().__init__()
-        head_dim = dim // num_attention_heads
-
-        self.img_instruct_attn = BooguJointAttention(dim, head_dim, num_attention_heads, num_kv_heads, eps=1e-5, dtype=dtype, device=device, operations=operations)
-        self.img_self_attn = Attention(
-            query_dim=dim, dim_head=head_dim, heads=num_attention_heads, kv_heads=num_kv_heads,
-            eps=1e-5, bias=False, dtype=dtype, device=device, operations=operations,
-        )
-
-        self.img_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations)
-        self.instruct_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations)
-
-        self.img_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
-        self.img_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
-        self.img_norm3 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
-        self.instruct_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
-        self.instruct_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
-
-        self.img_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
-        self.img_self_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
-        self.img_ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
-        self.img_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
-
-        self.instruct_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
-        self.instruct_ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
-        self.instruct_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
-
-    def forward(self, img_hidden_states, instruct_hidden_states, joint_rotary_emb, img_rotary_emb, temb, joint_attention_mask=None, img_attention_mask=None, transformer_options={}):
-        L_instruct = instruct_hidden_states.shape[1]
-
-        img_norm1_out, img_gate_msa, img_scale_mlp, img_gate_mlp = self.img_norm1(img_hidden_states, temb)
-        img_norm2_out, img_shift_mlp, _, _ = self.img_norm2(img_hidden_states, temb)
-        img_norm3_out, img_gate_self, _, _ = self.img_norm3(img_hidden_states, temb)
-
-        instruct_norm1_out, instruct_gate_msa, instruct_scale_mlp, instruct_gate_mlp = self.instruct_norm1(instruct_hidden_states, temb)
-        instruct_norm2_out, instruct_shift_mlp, _, _ = self.instruct_norm2(instruct_hidden_states, temb)
-
-        joint_attn_out = self.img_instruct_attn(img_norm1_out, instruct_norm1_out, joint_rotary_emb, joint_attention_mask, transformer_options=transformer_options)
-        instruct_attn_out = joint_attn_out[:, :L_instruct]
-        img_attn_out = joint_attn_out[:, L_instruct:]
-
-        img_self_attn_out = self.img_self_attn(img_norm3_out, img_norm3_out, img_attention_mask, img_rotary_emb, transformer_options=transformer_options)
-
-        img_hidden_states = img_hidden_states + img_gate_msa.unsqueeze(1).tanh() * self.img_attn_norm(img_attn_out)
-        img_hidden_states = img_hidden_states + img_gate_self.unsqueeze(1).tanh() * self.img_self_attn_norm(img_self_attn_out)
-        img_mlp_input = (1 + img_scale_mlp.unsqueeze(1)) * img_norm2_out + img_shift_mlp.unsqueeze(1)
-        img_mlp_out = self.img_feed_forward(self.img_ffn_norm1(img_mlp_input))
-        img_hidden_states = img_hidden_states + img_gate_mlp.unsqueeze(1).tanh() * self.img_ffn_norm2(img_mlp_out)
-
-        instruct_hidden_states = instruct_hidden_states + instruct_gate_msa.unsqueeze(1).tanh() * self.instruct_attn_norm(instruct_attn_out)
-        instruct_mlp_input = (1 + instruct_scale_mlp.unsqueeze(1)) * instruct_norm2_out + instruct_shift_mlp.unsqueeze(1)
-        instruct_mlp_out = self.instruct_feed_forward(self.instruct_ffn_norm1(instruct_mlp_input))
-        instruct_hidden_states = instruct_hidden_states + instruct_gate_mlp.unsqueeze(1).tanh() * self.instruct_ffn_norm2(instruct_mlp_out)
-
-        return img_hidden_states, instruct_hidden_states
-
-
-class BooguTransformer2DModel(nn.Module):
-    def __init__(
-        self,
-        patch_size: int = 2,
-        in_channels: int = 16,
-        out_channels: Optional[int] = None,
-        hidden_size: int = 3360,
-        num_layers: int = 32,
-        num_double_stream_layers: int = 8,
-        num_refiner_layers: int = 2,
-        num_attention_heads: int = 28,
-        num_kv_heads: int = 7,
-        multiple_of: int = 256,
-        ffn_dim_multiplier: Optional[float] = None,
-        norm_eps: float = 1e-5,
-        axes_dim_rope: Tuple[int, int, int] = (40, 40, 40),
-        axes_lens: Tuple[int, int, int] = (2048, 1664, 1664),
-        instruction_feat_dim: int = 4096,
-        timestep_scale: float = 1000.0,
-        image_model=None,
-        device=None, dtype=None, operations=None,
-    ):
-        super().__init__()
-
-        self.patch_size = patch_size
-        self.out_channels = out_channels or in_channels
-        self.hidden_size = hidden_size
-        self.dtype = dtype
-
-        self.rope_embedder = OmniGen2RotaryPosEmbed(
-            theta=10000,
-            axes_dim=axes_dim_rope,
-            axes_lens=axes_lens,
-            patch_size=patch_size,
-        )
-
-        self.x_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
-        self.ref_image_patch_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
-
-        self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
-            hidden_size=hidden_size,
-            text_feat_dim=instruction_feat_dim,
-            norm_eps=norm_eps,
-            timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
-        )
-
-        self.noise_refiner = nn.ModuleList([
-            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
-            for _ in range(num_refiner_layers)
-        ])
-
-        self.ref_image_refiner = nn.ModuleList([
-            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
-            for _ in range(num_refiner_layers)
-        ])
-
-        self.context_refiner = nn.ModuleList([
-            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=False, dtype=dtype, device=device, operations=operations)
-            for _ in range(num_refiner_layers)
-        ])
-
-        self.double_stream_layers = nn.ModuleList([
-            BooguDoubleStreamBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=dtype, device=device, operations=operations)
-            for _ in range(num_double_stream_layers)
-        ])
-
-        self.single_stream_layers = nn.ModuleList([
-            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
-            for _ in range(num_layers)
-        ])
-
-        self.norm_out = LuminaLayerNormContinuous(
-            embedding_dim=hidden_size,
-            conditioning_embedding_dim=min(hidden_size, 1024),
-            elementwise_affine=False,
-            eps=1e-6,
-            out_dim=patch_size * patch_size * self.out_channels, dtype=dtype, device=device, operations=operations
-        )
-
-        self.image_index_embedding = nn.Parameter(torch.empty(5, hidden_size, device=device, dtype=dtype))
-
-    # Patchify/refine helpers are identical to OmniGen2; reuse via bound methods.
-    flat_and_pad_to_seq = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.flat_and_pad_to_seq
-    img_patch_embed_and_refine = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.img_patch_embed_and_refine
-
-    def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs):
-        B, C, H, W = x.shape
-        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
-        _, _, H_padded, W_padded = hidden_states.shape
-        timestep = 1.0 - timesteps
-        text_hidden_states = context
-        text_attention_mask = attention_mask
-        ref_image_hidden_states = ref_latents
-        device = hidden_states.device
-
-        temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
-
-        (
-            hidden_states, ref_image_hidden_states,
-            img_mask, ref_img_mask,
-            l_effective_ref_img_len, l_effective_img_len,
-            ref_img_sizes, img_sizes,
-        ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
-
-        (
-            context_rotary_emb, ref_img_rotary_emb, noise_rotary_emb,
-            rotary_emb, encoder_seq_lengths, seq_lengths,
-        ) = self.rope_embedder(
-            hidden_states.shape[0], text_hidden_states.shape[1], [num_tokens] * text_hidden_states.shape[0],
-            l_effective_ref_img_len, l_effective_img_len,
-            ref_img_sizes, img_sizes, device,
-        )
-
-        for layer in self.context_refiner:
-            text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb, transformer_options=transformer_options)
-
-        img_len = hidden_states.shape[1]
-        combined_img_hidden_states = self.img_patch_embed_and_refine(
-            hidden_states, ref_image_hidden_states,
-            img_mask, ref_img_mask,
-            noise_rotary_emb, ref_img_rotary_emb,
-            l_effective_ref_img_len, l_effective_img_len,
-            temb,
-            transformer_options=transformer_options,
-        )
-
-        # Double-stream stage: the image self-attention only sees the [ref ; noise] tokens,
-        # which sit after the instruction tokens in the joint rope.
-        L_instruct = text_hidden_states.shape[1]
-        combined_img_rotary_emb = rotary_emb[:, L_instruct:]
-        for layer in self.double_stream_layers:
-            combined_img_hidden_states, text_hidden_states = layer(
-                combined_img_hidden_states, text_hidden_states,
-                rotary_emb, combined_img_rotary_emb, temb,
-                joint_attention_mask=None, img_attention_mask=None,
-                transformer_options=transformer_options,
-            )
-
-        hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1)
-
-        for layer in self.single_stream_layers:
-            hidden_states = layer(hidden_states, None, rotary_emb, temb, transformer_options=transformer_options)
-
-        hidden_states = self.norm_out(hidden_states, temb)
-
-        p = self.patch_size
-        output = rearrange(hidden_states[:, -img_len:], 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)', h=H_padded // p, w=W_padded // p, p1=p, p2=p)[:, :, :H, :W]
-
-        return -output
--- a/comfy/ldm/omnigen/omnigen2.py
+++ b/comfy/ldm/omnigen/omnigen2.py
@ -22,7 +22,7 @@ def apply_rotary_emb(x, freqs_cis):


 def swiglu(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-    return F.silu(x, inplace=True).mul_(y)
+    return F.silu(x) * y


 class TimestepEmbedding(nn.Module):
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@ -1665,7 +1665,7 @@ class SCAILWanModel(WanModel):

        # embeddings
        x = self.patch_embedding(x.float()).to(x.dtype)
-        if ref_mask_latents is not None:  # SCAIL-2 additive mask stream (one identity mask frame per reference, then video)
+        if ref_mask_latents is not None:  # SCAIL-2 additive mask stream
            x = x + self.patch_embedding_mask(ref_mask_latents.float()).to(x.dtype)
        grid_sizes = x.shape[2:]
        transformer_options["grid_sizes"] = grid_sizes
@ -1728,25 +1728,22 @@ class SCAILWanModel(WanModel):

    # ref_mask_flag is a scalar bool (CONDConstant, SCAIL-2 only). False => replacement mode,
    # which places ref/pose via H/W rope shifts instead of the animation-mode temporal offset.
-    # reference_latent may stack several frames: the last is the primary reference adjacent to the video, the earlier frames are additional references.
    def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, pose_latents=None, reference_latent=None, ref_mask_flag=None, transformer_options={}):
-        ref_t_patches = 0
-        if reference_latent is not None:
-            ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
-
        if ref_mask_flag is not None and not bool(ref_mask_flag):
            REF_ROPE_H = 120.0
            POSE_ROPE_W = 120.0

+            ref_t_patches = 0
+            if reference_latent is not None:
+                ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
            main_t_patches = t - ref_t_patches
-            video_t_start = max(ref_t_patches - 1, 0)

            parts = []
            if ref_t_patches > 0:
                ref_tf = {"rope_options": {"shift_y": REF_ROPE_H, "shift_x": 0.0, "scale_y": 1.0, "scale_x": 1.0}}
                parts.append(super().rope_encode(ref_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=ref_tf))
            if main_t_patches > 0:
-                parts.append(super().rope_encode(main_t_patches, h, w, t_start=video_t_start, device=device, dtype=dtype, transformer_options=transformer_options))
+                parts.append(super().rope_encode(main_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=transformer_options))

            if pose_latents is not None:
                F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
@ -1755,7 +1752,7 @@ class SCAILWanModel(WanModel):
                h_shift = (h_scale - 1) / 2
                w_shift = (w_scale - 1) / 2
                pose_tf = {"rope_options": {"shift_y": h_shift, "shift_x": POSE_ROPE_W + w_shift, "scale_y": h_scale, "scale_x": w_scale}}
-                parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=video_t_start, device=device, dtype=dtype, transformer_options=pose_tf))
+                parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=0, device=device, dtype=dtype, transformer_options=pose_tf))

            return torch.cat(parts, dim=1)

@ -1764,6 +1761,10 @@ class SCAILWanModel(WanModel):
        if pose_latents is None:
            return main_freqs

+        ref_t_patches = 0
+        if reference_latent is not None:
+            ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
+
        F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]

        # if pose is at half resolution, scale_y/scale_x=2 stretches the position range to cover the same RoPE extent as the main frames
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -54,7 +54,6 @@ import comfy.ldm.pixeldit.model
 import comfy.ldm.pixeldit.pid
 import comfy.ldm.ace.model
 import comfy.ldm.omnigen.omnigen2
-import comfy.ldm.boogu.model
 import comfy.ldm.qwen_image.model
 import comfy.ldm.ideogram4.model
 import comfy.ldm.kandinsky5.model
@ -1748,14 +1747,10 @@ class WAN21_SCAIL(WAN21):

        reference_latents = kwargs.get("reference_latents", None)
        if reference_latents is not None:
-            # SCAIL-2 multi-reference: reference_latents[0] is the primary ref, [1:] are additional
-            # references. Stack as [additional..., primary] so the primary stays adjacent to the video.
-            ordered = list(reference_latents[1:]) + list(reference_latents[:1])
-            stacked = []
-            for lat in ordered:
-                lat = self.process_latent_in(lat)
-                stacked.append(torch.cat([lat, torch.ones_like(lat[:, :4])], dim=1))
-            out['reference_latent'] = comfy.conds.CONDRegular(torch.cat(stacked, dim=2))
+            ref_latent = self.process_latent_in(reference_latents[-1])
+            ref_mask = torch.ones_like(ref_latent[:, :4])
+            ref_latent = torch.cat([ref_latent, ref_mask], dim=1)
+            out['reference_latent'] = comfy.conds.CONDRegular(ref_latent)

        pose_latents = kwargs.get("pose_video_latent", None)
        if pose_latents is not None:
@ -1797,7 +1792,6 @@ class WAN21_SCAIL2(WAN21_SCAIL):
        if driving_mask_28ch is not None:
            out['sam_latents'] = comfy.conds.CONDRegular(driving_mask_28ch.movedim(1, 2).contiguous())

-        # ref_mask_28ch holds one identity mask per stacked reference frame (additional refs first, then the primary ref), followed by zeros over the video frames.
        ref_mask_28ch = kwargs.get("ref_mask_28ch", None)
        if ref_mask_28ch is not None:
            out['ref_mask_latents'] = comfy.conds.CONDRegular(ref_mask_28ch.movedim(1, 2).contiguous())
@ -1825,11 +1819,10 @@ class WAN21_SCAIL2(WAN21_SCAIL):
            # Return sliced view omitting retain_index_list
            return comfy.context_windows.slice_cond(cond_value, window, x_in, device, temporal_dim=2, temporal_offset=0)
        if cond_key == "ref_mask_latents" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
-            # The ref mask is N leading ref frames padded with frames of zeros, so just grab the first frames for all windows
+            # The ref mask is a single reference frame followed by frames of zeros, so every window simply takes the leading frames
            full_ref_mask = cond_value.cond
            video_frame_count = x_in.shape[2]
-            ref_frame_count = full_ref_mask.shape[2] - video_frame_count
-            if ref_frame_count < 1:
+            if full_ref_mask.shape[2] != video_frame_count + 1:
                return None
            window_length = len(window.index_list)

@ -1838,7 +1831,7 @@ class WAN21_SCAIL2(WAN21_SCAIL):
            if anchor_index is not None and anchor_index >= 0:
                window_length += 1

-            window_ref_mask = full_ref_mask[:, :, :window_length + ref_frame_count].to(device)
+            window_ref_mask = full_ref_mask[:, :, :window_length + 1].to(device)
            return cond_value._copy_with(window_ref_mask)

        return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list)
@ -2104,11 +2097,6 @@ class Omnigen2(BaseModel):
            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
        return out

-class Boogu(Omnigen2):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super(Omnigen2, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.boogu.model.BooguTransformer2DModel)
-        self.memory_usage_factor_conds = ("ref_latents",)
-
 class QwenImage(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel)
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -761,16 +761,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):

        return dit_config

-    if '{}double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight'.format(key_prefix) in state_dict_keys:  # Boogu-Image (OmniGen2 derivative + dual-stream stage)
-        dit_config = {}
-        dit_config["image_model"] = "boogu"
-        dit_config["hidden_size"] = state_dict['{}x_embedder.weight'.format(key_prefix)].shape[0]
-        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}single_stream_layers.'.format(key_prefix) + '{}.')
-        dit_config["num_double_stream_layers"] = count_blocks(state_dict_keys, '{}double_stream_layers.'.format(key_prefix) + '{}.')
-        dit_config["num_refiner_layers"] = count_blocks(state_dict_keys, '{}noise_refiner.'.format(key_prefix) + '{}.')
-        dit_config["instruction_feat_dim"] = state_dict['{}time_caption_embed.caption_embedder.0.weight'.format(key_prefix)].shape[0]
-        return dit_config
-
    if '{}time_caption_embed.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys:  # Omnigen2
        dit_config = {}
        dit_config["image_model"] = "omnigen2"
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -68,7 +68,6 @@ import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
 import comfy.text_encoders.qwen35
 import comfy.text_encoders.qwen3vl
-import comfy.text_encoders.boogu
 import comfy.text_encoders.ernie
 import comfy.text_encoders.gemma4
 import comfy.text_encoders.cogvideo
@ -1302,7 +1301,6 @@ class CLIPType(Enum):
    LENS = 28
    PIXELDIT = 29
    IDEOGRAM4 = 30
-    BOOGU = 31



@ -1624,14 +1622,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
                clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
-            elif clip_type == CLIPType.BOOGU and te_model == TEModel.QWEN3VL_8B:  # Boogu-Image: full Qwen3-VL-8B, last hidden state, no-think template.
-                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
-                clip_target.clip = comfy.text_encoders.boogu.te(**llama_detect(clip_data))
-                clip_target.tokenizer = comfy.text_encoders.boogu.BooguTokenizer
-            elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2):  # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused.
-                klein_model_type = "qwen3_8b" if te_model == TEModel.QWEN3VL_8B else "qwen3_4b"
-                clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type=klein_model_type)
-                clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B if te_model == TEModel.QWEN3VL_8B else comfy.text_encoders.flux.KleinTokenizer
            else:
                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
                qwen3vl_type = {TEModel.QWEN3VL_4B: "qwen3vl_4b", TEModel.QWEN3VL_8B: "qwen3vl_8b"}[te_model]
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -25,7 +25,6 @@ import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image
 import comfy.text_encoders.ideogram4
-import comfy.text_encoders.boogu
 import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
@ -1759,27 +1758,6 @@ class Omnigen2(supported_models_base.BASE):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))

-class Boogu(Omnigen2):
-    unet_config = {
-        "image_model": "boogu",
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 3.16,
-    }
-
-    memory_usage_factor = 2.15
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Boogu(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3vl_8b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.boogu.BooguTokenizer, comfy.text_encoders.boogu.te(**hunyuan_detect))
-
 class Ideogram4(supported_models_base.BASE):
    unet_config = {
        "image_model": "ideogram4",
@ -2322,7 +2300,6 @@ models = [
    ACEStep,
    ACEStep15,
    Omnigen2,
-    Boogu,
    QwenImage,
    Ideogram4,
    Flux2,
--- a/comfy/text_encoders/boogu.py
+++ b/comfy/text_encoders/boogu.py
@@ -1,58 +0,0 @@
-"""Boogu-Image text encoder: full Qwen3-VL-8B, last hidden state (4096-dim).
-
-Boogu uses the final hidden state of Qwen3-VL as the per-token instruction feature
-(num_instruction_feature_layers=1, reduce_type=mean -> just the last layer).
-The model itself is the standard Qwen3-VL TE, only the chat template differs
-(a fixed system prompt and no <think> block).
-"""
-
-import comfy.text_encoders.qwen3vl
-from comfy import sd1_clip
-
-
-# System prompts from the reference pipeline (pipeline_boogu.py).
-# T2I (non-empty instruction, no image) uses the helpful-assistant prompt
-# everything else (the CFG negative / "drop" condition, and any image case) uses the TI2I "describe" prompt.
-BOOGU_T2I_SYSTEM = "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows."
-BOOGU_DROP_SYSTEM = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
-
-
-class BooguTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
-    def __init__(self, embedding_directory=None, tokenizer_data={}):
-        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, model_type="qwen3vl_8b")
-        # apply_chat_template without add_generation_prompt
-        self.llama_template = "<|im_start|>system\n" + BOOGU_T2I_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"
-        self.llama_template_images = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n"
-        # Reference SYSTEM_PROMPT_DROP: used for the empty negative/uncond instruction.
-        self.llama_template_drop = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"
-
-    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
-        if llama_template is None and len(images) == 0 and text.strip() == "":
-            llama_template = self.llama_template_drop
-        # Boogu conditions on the no-think template; thinking=True drops the empty <think> block qwen3vl adds by default.
-        return super().tokenize_with_weights(text, return_word_ids=return_word_ids, llama_template=llama_template, images=images, prevent_empty_text=prevent_empty_text, thinking=thinking, **kwargs)
-
-
-class BooguQwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
-    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
-        super().__init__(device=device, dtype=dtype, attention_mask=attention_mask, model_options=model_options, model_type=model_type)
-        # apply the final RMSNorm to the tapped last layer
-        self.layer_norm_hidden_state = True
-
-
-class BooguTEModel(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None, model_options={}):
-        clip_model = lambda **kw: BooguQwen3VLClipModel(**kw, model_type="qwen3vl_8b")
-        super().__init__(device=device, dtype=dtype, name="qwen3vl_8b", clip_model=clip_model, model_options=model_options)
-
-
-def te(dtype_llama=None, llama_quantization_metadata=None):
-    class BooguTEModel_(BooguTEModel):
-        def __init__(self, device="cpu", dtype=None, model_options={}):
-            if dtype_llama is not None:
-                dtype = dtype_llama
-            if llama_quantization_metadata is not None:
-                model_options = model_options.copy()
-                model_options["quantization_metadata"] = llama_quantization_metadata
-            super().__init__(device=device, dtype=dtype, model_options=model_options)
-    return BooguTEModel_
--- a/comfy_api/feature_flags.py
+++ b/comfy_api/feature_flags.py
@@ -25,11 +25,6 @@ CLI_FEATURE_FLAG_REGISTRY: dict[str, FeatureFlagInfo] = {
        "default": False,
        "description": "Show the sign-in button in the frontend even when not signed in",
    },
-    "enable_telemetry": {
-        "type": "bool",
-        "default": False,
-        "description": "Signal the frontend that telemetry collection is enabled",
-    },
 }


--- a/comfy_api_nodes/apis/kling.py
+++ b/comfy_api_nodes/apis/kling.py
@@ -149,59 +149,3 @@ class MotionControlRequest(BaseModel):
    character_orientation: str = Field(...)
    mode: str = Field(..., description="'pro' or 'std'")
    model_name: str = Field(...)
-
-
-class Kling3TurboSettings(BaseModel):
-    resolution: str = Field("720p", description="'720p' or '1080p'")
-    aspect_ratio: str | None = Field(None, description="'16:9'/'9:16'/'1:1'; text-to-video only")
-    duration: int = Field(5, description="3-15 second")
-
-
-class Kling3TurboText2VideoRequest(BaseModel):
-    prompt: str = Field(..., description="<=3072 chars; may use multi-shot 'shot n, m, words; ...'")
-    settings: Kling3TurboSettings | None = Field(None)
-
-
-class Kling3TurboContent(BaseModel):
-    type: str = Field(..., description="'prompt' or 'first_frame'")
-    text: str | None = Field(None, description="for type=prompt; <=2500 chars")
-    url: str | None = Field(None, description="for type=first_frame")
-
-
-class Kling3TurboImage2VideoRequest(BaseModel):
-    contents: list[Kling3TurboContent] = Field(..., description="prompt + first_frame materials")
-    settings: Kling3TurboSettings | None = Field(None)
-
-
-class Kling3TurboCreateData(BaseModel):
-    id: str | None = Field(None, description="Task ID")
-    status: str | None = Field(None)
-    message: str | None = Field(None)
-
-
-class Kling3TurboCreateResponse(BaseModel):
-    code: int | None = Field(None)
-    message: str | None = Field(None)
-    request_id: str | None = Field(None)
-    data: Kling3TurboCreateData | None = Field(None)
-
-
-class Kling3TurboOutput(BaseModel):
-    type: str | None = Field(None, description="'video', 'image', 'audio', ...")
-    id: str | None = Field(None)
-    url: str | None = Field(None)
-    duration: str | None = Field(None)
-
-
-class Kling3TurboTaskData(BaseModel):
-    id: str | None = Field(None)
-    status: str | None = Field(None, description="submitted | processing | succeeded | failed")
-    message: str | None = Field(None)
-    outputs: list[Kling3TurboOutput] | None = Field(None)
-
-
-class Kling3TurboQueryResponse(BaseModel):
-    code: int | None = Field(None)
-    message: str | None = Field(None)
-    request_id: str | None = Field(None)
-    data: list[Kling3TurboTaskData] | None = Field(None)
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@@ -60,12 +60,6 @@ from comfy_api_nodes.apis.kling import (
    OmniProImageRequest,
    OmniProReferences2VideoRequest,
    OmniProText2VideoRequest,
-    Kling3TurboSettings,
-    Kling3TurboText2VideoRequest,
-    Kling3TurboContent,
-    Kling3TurboImage2VideoRequest,
-    Kling3TurboCreateResponse,
-    Kling3TurboQueryResponse,
    TaskStatusResponse,
    TextToVideoWithAudioRequest,
 )
@@ -2853,67 +2847,6 @@ class MotionControl(IO.ComfyNode):
        return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))


-def build_turbo_shot_prompt(multi_prompt: list[MultiPromptEntry]) -> str:
-    """Render storyboard entries into the Turbo multi-shot prompt 'shot n, m, words; ...'."""
-    return "; ".join(f"shot {i}, {int(e.duration)}, {e.prompt}" for i, e in enumerate(multi_prompt, 1)) + ";"
-
-
-def _turbo_video_url(response: Kling3TurboQueryResponse) -> str:
-    """Extract the result video URL from a /tasks response (data[].outputs[] where type == 'video')."""
-    task = response.data[0] if response.data else None
-    if task and task.outputs:
-        for output in task.outputs:
-            if output.type == "video" and output.url:
-                return output.url
-    raise RuntimeError(f"Kling 3.0 Turbo task finished without a video output: {response.model_dump()}")
-
-
-async def execute_kling_turbo(
-    cls: type[IO.ComfyNode],
-    *,
-    prompt: str,
-    resolution: str,
-    aspect_ratio: str,
-    duration: int,
-    start_frame: torch.Tensor | None,
-) -> IO.NodeOutput:
-    """Create + poll a Kling 3.0 Turbo task. Image-to-video when start_frame is given, else text-to-video."""
-    if start_frame is not None:
-        validate_image_dimensions(start_frame, min_width=300, min_height=300)
-        validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
-        contents = [Kling3TurboContent(type="first_frame", url=tensor_to_base64_string(start_frame))]
-        if prompt:
-            contents.insert(0, Kling3TurboContent(type="prompt", text=prompt))
-        create = await sync_op(
-            cls,
-            ApiEndpoint(path="/proxy/kling/image-to-video/kling-3.0-turbo", method="POST"),
-            response_model=Kling3TurboCreateResponse,
-            data=Kling3TurboImage2VideoRequest(
-                contents=contents,
-                settings=Kling3TurboSettings(resolution=resolution, duration=duration),  # i2v: no aspect_ratio
-            ),
-        )
-    else:
-        create = await sync_op(
-            cls,
-            ApiEndpoint(path="/proxy/kling/text-to-video/kling-3.0-turbo", method="POST"),
-            response_model=Kling3TurboCreateResponse,
-            data=Kling3TurboText2VideoRequest(
-                prompt=prompt,
-                settings=Kling3TurboSettings(resolution=resolution, aspect_ratio=aspect_ratio, duration=duration),
-            ),
-        )
-    if not (create.data and create.data.id):
-        raise RuntimeError(f"Kling 3.0 Turbo create failed. Code: {create.code}, Message: {create.message}")
-    final_response = await poll_op(
-        cls,
-        ApiEndpoint(path="/proxy/kling/tasks", query_params={"task_ids": create.data.id}),
-        response_model=Kling3TurboQueryResponse,
-        status_extractor=lambda r: (r.data[0].status if r.data else None),
-    )
-    return IO.NodeOutput(await download_url_to_video_output(_turbo_video_url(final_response)))
-
-
 class KlingVideoNode(IO.ComfyNode):

    @classmethod
@@ -2951,11 +2884,7 @@ class KlingVideoNode(IO.ComfyNode):
                    ],
                    tooltip="Generate a series of video segments with individual prompts and durations.",
                ),
-                IO.Boolean.Input(
-                    "generate_audio",
-                    default=True,
-                    tooltip="'kling-3.0-turbo' always generates native audio, so the audio toggle is ignored.",
-                ),
+                IO.Boolean.Input("generate_audio", default=True),
                IO.DynamicCombo.Input(
                    "model",
                    options=[
@@ -2970,17 +2899,6 @@ class KlingVideoNode(IO.ComfyNode):
                                ),
                            ],
                        ),
-                        IO.DynamicCombo.Option(
-                            "kling-3.0-turbo",
-                            [
-                                IO.Combo.Input("resolution", options=["1080p", "720p"], default="720p"),
-                                IO.Combo.Input(
-                                    "aspect_ratio",
-                                    options=["16:9", "9:16", "1:1"],
-                                    tooltip="Ignored in image-to-video mode.",
-                                ),
-                            ],
-                        ),
                    ],
                    tooltip="Model and generation settings.",
                ),
@@ -3012,7 +2930,6 @@ class KlingVideoNode(IO.ComfyNode):
            price_badge=IO.PriceBadge(
                depends_on=IO.PriceBadgeDepends(
                    widgets=[
-                        "model",
                        "model.resolution",
                        "generate_audio",
                        "multi_shot",
@@ -3027,7 +2944,14 @@ class KlingVideoNode(IO.ComfyNode):
                ),
                expr="""
                (
+                  $rates := {
+                    "4k": {"off": 0.42, "on": 0.42},
+                    "1080p": {"off": 0.112, "on": 0.168},
+                    "720p": {"off": 0.084, "on": 0.126}
+                  };
                  $res := $lookup(widgets, "model.resolution");
+                  $audio := widgets.generate_audio ? "on" : "off";
+                  $rate := $lookup($lookup($rates, $res), $audio);
                  $ms := widgets.multi_shot;
                  $isSb := $ms != "disabled";
                  $n := $isSb ? $number($substring($ms, 0, 1)) : 0;
@@ -3038,18 +2962,7 @@ class KlingVideoNode(IO.ComfyNode):
                  $d5 := $n >= 5 ? $lookup(widgets, "multi_shot.storyboard_5_duration") : 0;
                  $d6 := $n >= 6 ? $lookup(widgets, "multi_shot.storyboard_6_duration") : 0;
                  $dur := $isSb ? $d1 + $d2 + $d3 + $d4 + $d5 + $d6 : $lookup(widgets, "multi_shot.duration");
-                  widgets.model = "kling-3.0-turbo"
-                    ? {"type":"usd","usd": ($res = "1080p" ? 0.14 : 0.112) * $dur}
-                    : (
-                        $rates := {
-                          "4k": {"off": 0.42, "on": 0.42},
-                          "1080p": {"off": 0.112, "on": 0.168},
-                          "720p": {"off": 0.084, "on": 0.126}
-                        };
-                        $audio := widgets.generate_audio ? "on" : "off";
-                        $rate := $lookup($lookup($rates, $res), $audio);
-                        {"type":"usd","usd": $rate * $dur}
-                      )
+                  {"type":"usd","usd": $rate * $dur}
                )
                """,
            ),
@@ -3102,17 +3015,6 @@ class KlingVideoNode(IO.ComfyNode):
            duration = multi_shot["duration"]
            validate_string(multi_shot["prompt"], min_length=1, max_length=2500)

-        if model["model"] == "kling-3.0-turbo":
-            turbo_prompt = build_turbo_shot_prompt(multi_prompt_list) if custom_multi_shot else multi_shot["prompt"]
-            return await execute_kling_turbo(
-                cls,
-                prompt=turbo_prompt,
-                resolution=model["resolution"],
-                aspect_ratio=model["aspect_ratio"],
-                duration=duration,
-                start_frame=start_frame,
-            )
-
        if start_frame is not None:
            validate_image_dimensions(start_frame, min_width=300, min_height=300)
            validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
--- a/comfy_extras/nodes_boogu.py
+++ b/comfy_extras/nodes_boogu.py
@@ -1,97 +0,0 @@
-import math
-
-import node_helpers
-import comfy.utils
-from typing_extensions import override
-from comfy_api.latest import ComfyExtension, io
-
-
-class TextEncodeBooguEdit(io.ComfyNode):
-    """Boogu-Image Edit conditioning.
-
-    The edit image is used twice, matching the reference pipeline:
-      - Qwen3-VL vision tokens (instruction understanding) -> positive only
-      - VAE reference latent (image identity)              -> positive and negative
-    The ref latent is in both conds so it cancels under CFG (identity preserved);
-    the vision tokens are only in the positive so CFG amplifies the instruction.
-    The tokenizer selects the right system prompt automatically (image -> TI2I,
-    empty negative -> DROP), so no template plumbing is needed here.
-    """
-
-    @classmethod
-    def define_schema(cls):
-        return io.Schema(
-            node_id="TextEncodeBooguEdit",
-            category="model/conditioning/boogu",
-            inputs=[
-                io.Clip.Input("clip"),
-                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
-                io.String.Input("negative_prompt", multiline=True, dynamic_prompts=True, advanced=True),
-                io.Vae.Input("vae"),
-                io.Autogrow.Input(
-                    "images",
-                    template=io.Autogrow.TemplateNames(
-                        io.Image.Input("image"),
-                        names=[f"image_{i}" for i in range(1, 17)],
-                        min=0,
-                    ),
-                    tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.",
-                ),
-            ],
-            outputs=[
-                io.Conditioning.Output(display_name="positive"),
-                io.Conditioning.Output(display_name="negative"),
-            ],
-        )
-
-    @classmethod
-    def execute(cls, clip, prompt, negative_prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput:
-        ref_latents = []
-        images_vl = []
-
-        images = images or {}
-        for name in sorted(images, key=lambda n: int(n.rsplit("_", 1)[-1])):
-            image = images[name]
-            if image is None:
-                continue
-            samples = image.movedim(-1, 1)
-
-            # Vision tower input: the reference caps the VLM image at 384x384
-            # (max_vlm_input_pil_pixels in pipeline_boogu.py).
-            total = int(384 * 384)
-            scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
-            width = round(samples.shape[3] * scale_by)
-            height = round(samples.shape[2] * scale_by)
-            s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
-            images_vl.append(s.movedim(1, -1)[:, :, :, :3])
-
-            # Reference latent: align to 16 px (VAE /8 * patch_size 2).
-            if vae is not None:
-                total = int(1024 * 1024)
-                scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
-                width = round(samples.shape[3] * scale_by / 16.0) * 16
-                height = round(samples.shape[2] * scale_by / 16.0) * 16
-                s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
-                ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3]))
-
-        # positive: instruction + vision tokens; negative: empty (no vision). Ref latent on both.
-        positive = clip.encode_from_tokens_scheduled(clip.tokenize(prompt, images=images_vl))
-        negative = clip.encode_from_tokens_scheduled(clip.tokenize(negative_prompt))
-
-        if len(ref_latents) > 0:
-            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
-            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)
-
-        return io.NodeOutput(positive, negative)
-
-
-class BooguExtension(ComfyExtension):
-    @override
-    async def get_node_list(self) -> list[type[io.ComfyNode]]:
-        return [
-            TextEncodeBooguEdit,
-        ]
-
-
-async def comfy_entrypoint() -> BooguExtension:
-    return BooguExtension()
--- a/comfy_extras/nodes_dataset.py
+++ b/comfy_extras/nodes_dataset.py
@@ -1583,7 +1583,7 @@ class LoadTrainingDataset(io.ComfyNode):
            shard_path = os.path.join(dataset_dir, shard_file)

            with open(shard_path, "rb") as f:
-                shard_data = torch.load(f, weights_only=True)
+                shard_data = torch.load(f)

            all_latents.extend(shard_data["latents"])
            all_conditioning.extend(shard_data["conditioning"])
--- a/comfy_extras/nodes_scail.py
+++ b/comfy_extras/nodes_scail.py
@@ -34,20 +34,14 @@ def _unpack(track_data):
    return unpack_masks(packed)


-def _first_appearance_cx_area(masks_bool):
-    """Per object: first frame it appears in, plus centroid-x and area in that frame."""
-    m = masks_bool.float()
-    T, H, W = m.shape[0], m.shape[-2], m.shape[-1]
-    grid_x = torch.arange(W, device=m.device, dtype=m.dtype).view(1, 1, 1, W)
-    area_t = m.sum(dim=(-1, -2))
-    cx_t = (m * grid_x).sum(dim=(-1, -2)) / area_t.clamp(min=1)
-    present = area_t > 0
-    frame_idx = torch.arange(T, device=m.device).unsqueeze(1)
-    first_t = torch.where(present, frame_idx, T).amin(dim=0)
-    sel = first_t.clamp(max=T - 1).unsqueeze(0)
-    cx = cx_t.gather(0, sel).squeeze(0)
-    area = area_t.gather(0, sel).squeeze(0)
-    return first_t.tolist(), (cx / W).tolist(), (area / (H * W)).tolist()
+def _first_frame_cx_area(masks_bool):
+    first = masks_bool[0].float()
+    H, W = first.shape[-2], first.shape[-1]
+    n_pixels = H * W
+    grid_x = torch.arange(W, device=first.device, dtype=first.dtype).view(1, W)
+    area = first.sum(dim=(-1, -2)).clamp_(min=1)
+    cx = (first * grid_x).sum(dim=(-1, -2)) / area
+    return (cx / W).tolist(), (area / n_pixels).tolist()


 def _subset_track_data(track_data, obj_indices):
@@ -87,26 +81,12 @@ def _render_colored_masks(track_data, background="black"):
        masks_full.view(T * N_obj, 1, Hm, Wm), size=(H, W), mode="nearest"
    ).view(T, N_obj, H, W) > 0.5
    any_mask = masks_full.any(dim=1)
-    color_overlay = colors[masks_full.to(torch.uint8).argmax(dim=1)]
+    obj_idx_map = masks_full.to(torch.uint8).argmax(dim=1)
+    color_overlay = colors[obj_idx_map]
    bg_tensor = torch.tensor(bg_rgb, device=device, dtype=color_overlay.dtype).view(1, 1, 1, 3)
    return torch.where(any_mask.unsqueeze(-1), color_overlay, bg_tensor.expand_as(color_overlay))


-def _render_mask_as_identity(mask, background="black"):
-    """Plain comfy MASK (B,H,W) or (H,W) -> (B,H,W,3) rendered as a single identity (palette[0])
-    on the given background. A batch is treated as multiple views of that one subject."""
-    device = comfy.model_management.intermediate_device()
-    dtype = comfy.model_management.intermediate_dtype()
-    if mask.ndim == 2:
-        mask = mask.unsqueeze(0)
-    mask = mask.to(device=device, dtype=dtype)
-    B, H, W = mask.shape
-    bg_rgb = (1.0, 1.0, 1.0) if background.startswith("white") else (0.0, 0.0, 0.0)
-    color = torch.tensor(DEFAULT_PALETTE[0], device=device, dtype=dtype).view(1, 1, 1, 3)
-    bg = torch.tensor(bg_rgb, device=device, dtype=dtype).view(1, 1, 1, 3)
-    return torch.where((mask > 0.5).unsqueeze(-1), color.expand(B, H, W, 3), bg.expand(B, H, W, 3))
-
-
 def _extract_mask_to_28ch(rgb_video):
    """Colored RGB mask (T, H, W, 3) in [0, 1] -> SCAIL-2 28-channel binary latent
    (1, T_lat, 28, H_lat, W_lat). 7 per-color binary channels (white/r/g/b/y/m/c)
@@ -158,8 +138,8 @@ class WanSCAILToVideo(io.ComfyNode):
                io.Float.Input("pose_strength", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="Strength of the pose latent."),
                io.Float.Input("pose_start", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Start step of the pose conditioning."),
                io.Float.Input("pose_end", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="End step of the pose conditioning."),
-                io.Image.Input("reference_image", optional=True, tooltip="Reference image. The first image is the primary reference (composite all identities onto it). SCAIL-2: extra batch images are used as additional views (back view, close-up, occluded background), each needing a matching reference_image_mask in that identity's color."),
-                io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask, batch matching reference_image (first = primary reference mask, rest = identity masks for the additional reference_image)."),
+                io.Image.Input("reference_image", optional=True, tooltip="Reference image, for multiple references composite all on single image."),
+                io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask at the same resolution as reference_image."),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True, tooltip="CLIP vision features for conditioning. Model is trained with stretch resize to aspect ratio."),
                io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="Cumulative output frame this chunk begins at. Wire from the previous chunk's video_frame_offset output."),
                io.Int.Input("previous_frame_count", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4, tooltip="Tail frames of previous_frames to anchor. SCAIL-2 trained at 5 (81-frame chunks, 76-frame step)."),
@@ -191,21 +171,19 @@ class WanSCAILToVideo(io.ComfyNode):
            video_frame_offset -= prev_trimmed.shape[0]
            video_frame_offset = max(0, video_frame_offset)

+        ref_latent = None
        if reference_image is not None:
-            ref_imgs = comfy.utils.common_upscale(reference_image.movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
-            n_ref = ref_imgs.shape[0]
-            # SCAIL-2 multi-reference: the first image is the primary ref, the rest are additional references.
-
-            # Replacement Mode: composite each ref on black bg using its mask as alpha matte
+            reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
+            # Replacement Mode: composite ref on black bg using reference_image_mask as alpha matte
            if replacement_mode and reference_image_mask is not None:
-                rm = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
-                rm = rm[[min(i, rm.shape[0] - 1) for i in range(n_ref)]]
-                is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(ref_imgs.dtype)
-                ref_imgs = ref_imgs * is_char
-            # encode each ref individually so each stays a single latent frame (a batched encode would be treated as a video)
-            ref_latents = [vae.encode(ref_imgs[i:i + 1, :, :, :3]) for i in range(n_ref)]
-            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
-            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)
+                rm = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
+                is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(reference_image.dtype)
+                reference_image = reference_image * is_char
+            ref_latent = vae.encode(reference_image[:, :, :, :3])
+
+        if ref_latent is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
+            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
@@ -243,16 +221,11 @@ class WanSCAILToVideo(io.ComfyNode):
            positive = node_helpers.conditioning_set_values(positive, {"driving_mask_28ch": driving_mask_28ch})
            negative = node_helpers.conditioning_set_values(negative, {"driving_mask_28ch": driving_mask_28ch})

-        # The ref mask binds reference frames to identities, so it only applies when there's a reference image.
-        if reference_image_mask is not None and reference_image is not None:
-            ref_mask_hw = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
-            n_masks = ref_mask_hw.shape[0]
-            n_ref = reference_image.shape[0]
-
-            add_masks = [_extract_mask_to_28ch(ref_mask_hw[min(i, n_masks - 1)][None]) for i in range(1, n_ref)]
-            ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw[:1])
+        if reference_image_mask is not None:
+            ref_mask_hw = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
+            ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw)
            zeros = torch.zeros((1, latent.shape[2], 28, ref_mask_1f.shape[-2], ref_mask_1f.shape[-1]), device=ref_mask_1f.device, dtype=ref_mask_1f.dtype)
-            ref_mask_28ch = torch.cat(add_masks + [ref_mask_1f, zeros], dim=1)
+            ref_mask_28ch = torch.cat([ref_mask_1f, zeros], dim=1)
            positive = node_helpers.conditioning_set_values(positive, {"ref_mask_28ch": ref_mask_28ch})
            negative = node_helpers.conditioning_set_values(negative, {"ref_mask_28ch": ref_mask_28ch})

@@ -271,9 +244,12 @@ class WanSCAILToVideo(io.ComfyNode):


 class SCAIL2ColoredMask(io.ComfyNode):
-    """Render SAM3 tracks for the driving pose video and reference image(s) into the
-    colored masks WanSCAILToVideo consumes. Shared `sort_by` keeps each identity on the
-    same color across both outputs.
+    """Render SAM3 tracks for the driving pose video and (optionally) the reference
+    image into the two colored masks WanSCAILToVideo consumes. Shared `sort_by`
+    across both outputs guarantees identity K maps to the same color on both
+    sides, for multi-person workflow consistency.
+    reference_image_mask is always rendered black-bg (model convention)
+    pose_video_mask bg follows replacement_mode: black = Animation Mode, white = Replacement Mode
    """

    @classmethod
@@ -284,12 +260,10 @@ class SCAIL2ColoredMask(io.ComfyNode):
            category="model/conditioning/wan/scail",
            inputs=[
                SAM3TrackData.Input("driving_track_data", tooltip="SAM3 track of the driving pose video. Will be rendered into the pose_video_mask output."),
-                io.MultiType.Input("ref_track_data", [SAM3TrackData, io.Mask], optional=True, display_name="reference_masks",
-                                   tooltip="SAM3 track of the reference image(s) (one identity per object, colored in batch order), or a plain MASK of the reference subject (rendered as a single identity)."),
-                io.String.Input("object_indices", default="",
-                                tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
+                SAM3TrackData.Input("ref_track_data", optional=True, tooltip="SAM3 track of the reference image."),
+                io.String.Input("object_indices", default="", tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
                io.Combo.Input("sort_by", options=["none", "left_to_right", "area"], default="left_to_right",
-                               tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). Objects that appear in earlier frames always come first; within a frame, left_to_right = leftmost object (by centroid at first appearance) gets the first color, area = biggest object (by mask area at first appearance) gets the first color; none = keep SAM3's order."),
+                    tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). left_to_right = leftmost object (by first-frame centroid) gets the first color; area = biggest object (by first-frame mask area) gets the first color; none = keep SAM3's order."),
                io.Boolean.Input("replacement_mode", default=False,
                    tooltip="False = Animation Mode (pose_video_mask has black background, reference_image_mask has white background). "
                    "True = Replacement Mode (pose_video_mask has white background, reference_image_mask has black background)."),
@ -306,11 +280,11 @@ class SCAIL2ColoredMask(io.ComfyNode):
        def _prep(td):
            masks_bool = _unpack(td)
            if sort_by != "none" and masks_bool is not None:
-                first_t, cx, area = _first_appearance_cx_area(masks_bool)
+                cx, area = _first_frame_cx_area(masks_bool)
                if sort_by == "left_to_right":
-                    order = sorted(range(len(cx)), key=lambda i: (first_t[i], cx[i]))
+                    order = sorted(range(len(cx)), key=lambda i: cx[i])
                else:  # "area"
-                    order = sorted(range(len(area)), key=lambda i: (first_t[i], -area[i]))
+                    order = sorted(range(len(area)), key=lambda i: -area[i])
                td = _subset_track_data(td, order)
            if object_indices.strip():
                indices = [int(i.strip()) for i in object_indices.split(",") if i.strip().isdigit()]
@ -326,10 +300,8 @@ class SCAIL2ColoredMask(io.ComfyNode):
        ref_bg = "black" if replacement_mode else "white"

        if ref_track_data is not None:
-            if isinstance(ref_track_data, torch.Tensor):  # plain comfy MASK
-                reference_image_mask = _render_mask_as_identity(ref_track_data, ref_bg)
-            else:
-                reference_image_mask = _render_colored_masks(_prep(ref_track_data), ref_bg)
+            ref = _prep(ref_track_data)
+            reference_image_mask = _render_colored_masks(ref, ref_bg)
        else:
            H, W = drv["orig_size"]
            fill_value = 1.0 if ref_bg == "white" else 0.0
--- a/nodes.py
+++ b/nodes.py
@ -20,6 +20,8 @@ from PIL.PngImagePlugin import PngInfo
 import numpy as np
 import safetensors.torch

+sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy"))
+
 import comfy.diffusers_load
 import comfy.samplers
 import comfy.sample
@ -967,7 +969,7 @@ class CLIPLoader:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4", "boogu"], ),
+                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),
@ -2297,9 +2299,6 @@ async def init_external_custom_nodes():
    Returns:
        None
    """
-    # TODO: remove at some point when custom nodes don't break.
-    sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy"))
-
    base_node_names = set(NODE_CLASS_MAPPINGS.keys())
    node_paths = folder_paths.get_folder_paths("custom_nodes")
    node_import_times = []
@ -2426,7 +2425,6 @@ async def init_builtin_extra_nodes():
        "nodes_tcfg.py",
        "nodes_context_windows.py",
        "nodes_qwen.py",
-        "nodes_boogu.py",
        "nodes_chroma_radiance.py",
        "nodes_pid.py",
        "nodes_model_patch.py",
--- a/openapi.yaml
+++ b/openapi.yaml
@ -673,35 +673,6 @@ components:
                - created_at
                - updated_at
            type: object
-        JobsCancelRequest:
-            additionalProperties: false
-            description: Request to cancel multiple jobs by ID.
-            properties:
-                job_ids:
-                    description: Job identifiers (UUIDs) to cancel.
-                    items:
-                        format: uuid
-                        type: string
-                    maxItems: 100
-                    minItems: 1
-                    type: array
-            required:
-                - job_ids
-            type: object
-        JobsCancelResponse:
-            description: Response for POST /api/jobs/cancel.
-            properties:
-                cancelled:
-                    description: |
-                        Job IDs for which a cancel event was successfully dispatched by this
-                        call. Jobs already in a terminal or cancelling state are idempotently
-                        skipped and will not appear here.
-                    items:
-                        type: string
-                    type: array
-            required:
-                - cancelled
-            type: object
        JobsListResponse:
            description: Paginated list of jobs for the authenticated user.
            properties:
@ -1035,7 +1006,7 @@ components:
                    description: If true, clear all pending jobs from the queue
                    type: boolean
                delete:
-                    description: Array of job IDs to cancel; pending and running jobs transition to cancelled
+                    description: Array of PENDING job IDs to cancel
                    items:
                        type: string
                    type: array
@ -1851,83 +1822,6 @@ paths:
            summary: Update asset metadata
            tags:
                - file
-    /api/assets/{id}/content:
-        get:
-            description: |
-                Returns the binary content of an asset by ID.
-
-                The contract is the same across runtimes — "GET this path and you
-                receive the asset's bytes" — but the mechanism differs:
-                - **Local ComfyUI** streams the bytes directly (`200`,
-                  `application/octet-stream`).
-                - **Cloud** does not proxy large files; it responds `302` with a
-                  `Location` redirect to a short-lived signed storage URL. Clients that
-                  follow redirects (browsers, `fetch`/XHR, `<img>`/`<video>`) receive
-                  the bytes transparently.
-
-                Prefer this over the filename-addressed `/api/view` when you have an
-                asset ID.
-            operationId: getAssetContent
-            parameters:
-                - description: Asset ID
-                  in: path
-                  name: id
-                  required: true
-                  schema:
-                    type: string
-                - description: |
-                    Content-Disposition for the response: `attachment` (download) or
-                    `inline` (render in browser). Defaults to `attachment`.
-                  in: query
-                  name: disposition
-                  schema:
-                    default: attachment
-                    enum:
-                        - inline
-                        - attachment
-                    type: string
-            responses:
-                "200":
-                    content:
-                        application/octet-stream:
-                            schema:
-                                format: binary
-                                type: string
-                    description: Asset content stream (local runtime streams the bytes directly)
-                "302":
-                    description: Redirect to a signed storage URL (cloud runtime)
-                    headers:
-                        Cache-Control:
-                            description: Private caching directive scoped to the signed URL lifetime
-                            schema:
-                                type: string
-                        Location:
-                            description: Short-lived signed URL to the asset content in storage
-                            schema:
-                                type: string
-                        Vary:
-                            description: Partitions any cached redirect by auth credentials so a private redirect is not reused across users
-                            schema:
-                                type: string
-                "404":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: Asset not found
-                "500":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: Internal server error
-            security:
-                - ApiKeyAuth: []
-                - BearerAuth: []
-                - CookieAuth: []
-            summary: Get asset content
-            tags:
-                - file
    /api/assets/{id}/tags:
        delete:
            description: Removes one or more tags from an existing asset
@ -2781,20 +2675,14 @@ paths:
            summary: Get internationalisation translation strings
    /api/interrupt:
        post:
-            deprecated: true
            description: |
-                Deprecated. Prefer the jobs-namespace cancel endpoints:
-                POST /api/jobs/{job_id}/cancel for a single job, or
-                POST /api/jobs/cancel to cancel jobs by ID.
-
-                Cancels the first active job for the authenticated user (the currently
-                running job if there is one, otherwise the next pending job). Takes no
-                body and cannot target a specific job — use the jobs-namespace endpoints
-                for that.
+                Cancel all currently RUNNING jobs for the authenticated user.
+                This will interrupt any job that is currently in 'in_progress' status.
+                Note: This endpoint only affects running jobs. To cancel pending jobs, use /api/queue.
            operationId: interruptJob
            responses:
                "200":
-                    description: Success - first active job cancelled, or no active job found
+                    description: Success - Job interrupted or no running job found
                "401":
                    content:
                        application/json:
@ -2807,7 +2695,7 @@ paths:
                            schema:
                                $ref: '#/components/schemas/ErrorResponse'
                    description: Internal server error
-            summary: Interrupt the first active job
+            summary: Interrupt currently running jobs
            tags:
                - queue
    /api/job/{job_id}/status:
@ -3066,64 +2954,6 @@ paths:
            summary: Cancel a job
            tags:
                - workflow
-    /api/jobs/cancel:
-        post:
-            description: |
-                Cancel one or more jobs for the authenticated user in a single request.
-
-                State-agnostic: cancels both pending and running jobs (both transition to
-                the cancelled state via the same mechanism as the single-job endpoint).
-
-                Idempotent per job: a job already in a terminal or cancelling state is a
-                no-op and simply will not appear in the returned `cancelled` list.
-
-                Fail-fast on unknown IDs: if any provided job ID does not exist for this
-                user, the request returns 404 and no jobs are cancelled. This surfaces
-                bad IDs to the caller rather than silently dropping them.
-
-                This is the canonical batch-cancel endpoint. The delete operation on
-                POST /api/queue is deprecated in favour of this.
-            operationId: cancelJobs
-            requestBody:
-                content:
-                    application/json:
-                        schema:
-                            $ref: '#/components/schemas/JobsCancelRequest'
-                required: true
-            responses:
-                "200":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/JobsCancelResponse'
-                    description: Success - cancel requests dispatched (or jobs were already terminal)
-                "400":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: Bad Request - job_ids is missing, empty, exceeds the maximum count, or contains an invalid UUID
-                "401":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: Unauthorized - Authentication required
-                "404":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: One or more job IDs not found for this user (no jobs cancelled)
-                "500":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: Internal server error - cancellation failed
-            summary: Cancel multiple jobs
-            tags:
-                - workflow
    /api/node_replacements:
        get:
            description: |
@ -3274,18 +3104,9 @@ paths:
            tags:
                - queue
        post:
-            deprecated: true
            description: |
-                Deprecated. Prefer the jobs-namespace cancel endpoints:
-                POST /api/jobs/cancel for cancelling jobs by ID, and
-                POST /api/jobs/{job_id}/cancel for a single job.
-
-                Cancel specific jobs by ID (the `delete` field) or clear all pending
-                jobs in the queue (the `clear` field). Despite the `delete` naming, this
-                does not delete anything — listed jobs transition to the cancelled state,
-                and `delete` cancels both pending and running jobs (not pending-only as
-                previously documented). Job-by-ID cancellation is superseded by
-                POST /api/jobs/cancel; `clear` has no jobs-namespace replacement yet.
+                Cancel specific PENDING jobs by ID or clear all pending jobs in the queue.
+                Note: This endpoint only affects pending jobs. To cancel running jobs, use /api/interrupt.
            operationId: manageQueue
            requestBody:
                content:
Author	SHA1	Message	Date
Alexis Rolland	4a72cd3a55	Merge branch 'master' into alexis/update_nodes_categories	2026-06-17 20:55:15 +08:00
Alexis Rolland	67fee80da2	Update TripoSplat categories	2026-06-17 09:23:51 +08:00