chore(openapi): sync shared API contract from cloud@d10ff72

2026-06-18 12:37:58 +08:00 · 2026-06-17 03:57:23 +00:00
7 changed files with 247 additions and 142 deletions
--- a/.github/workflows/ci-cursor-review.yml
+++ b/.github/workflows/ci-cursor-review.yml
@ -1,37 +0,0 @@
-name: CI - Cursor Review
-
-# Thin caller for the shared reusable cursor-review workflow in
-# Comfy-Org/github-workflows. The review logic (panel matrix, judge
-# consolidation, prompts, extract/post/notify scripts) lives there as the
-# single source of truth, so this repo only carries the repo-specific diff
-# excludes.
-
-on:
-  pull_request:
-    types: [labeled, unlabeled]
-
-concurrency:
-  group: cursor-review-pr-${{ github.event.pull_request.number }}-${{ github.event.label.name }}
-  cancel-in-progress: true
-
-jobs:
-  cursor-review:
-    permissions:
-      contents: read
-      pull-requests: write
-    # SHA-pinned per zizmor `unpinned-uses: hash-pin`. Bump this SHA to pick up
-    # upstream changes; keep `workflows_ref` matching so prompts/scripts load
-    # from the same commit as the workflow definition.
-    uses: Comfy-Org/github-workflows/.github/workflows/cursor-review.yml@047ca48febe3a6647608ed2e0c4331b491cb9d6a # github-workflows#9
-    with:
-      workflows_ref: 047ca48febe3a6647608ed2e0c4331b491cb9d6a
-      diff_excludes: >-
-        :!**/.claude/**
-        :!**/dist/**
-        :!**/vendor/**
-        :!**/*.generated.*
-        :!**/*.min.js
-        :!**/*.min.css
-    secrets:
-      CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
-      SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@ -1665,7 +1665,7 @@ class SCAILWanModel(WanModel):

        # embeddings
        x = self.patch_embedding(x.float()).to(x.dtype)
-        if ref_mask_latents is not None:  # SCAIL-2 additive mask stream (one identity mask frame per reference, then video)
+        if ref_mask_latents is not None:  # SCAIL-2 additive mask stream
            x = x + self.patch_embedding_mask(ref_mask_latents.float()).to(x.dtype)
        grid_sizes = x.shape[2:]
        transformer_options["grid_sizes"] = grid_sizes
@ -1728,25 +1728,22 @@ class SCAILWanModel(WanModel):

    # ref_mask_flag is a scalar bool (CONDConstant, SCAIL-2 only). False => replacement mode,
    # which places ref/pose via H/W rope shifts instead of the animation-mode temporal offset.
-    # reference_latent may stack several frames: the last is the primary reference adjacent to the video, the earlier frames are additional references.
    def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, pose_latents=None, reference_latent=None, ref_mask_flag=None, transformer_options={}):
-        ref_t_patches = 0
-        if reference_latent is not None:
-            ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
-
        if ref_mask_flag is not None and not bool(ref_mask_flag):
            REF_ROPE_H = 120.0
            POSE_ROPE_W = 120.0

+            ref_t_patches = 0
+            if reference_latent is not None:
+                ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
            main_t_patches = t - ref_t_patches
-            video_t_start = max(ref_t_patches - 1, 0)

            parts = []
            if ref_t_patches > 0:
                ref_tf = {"rope_options": {"shift_y": REF_ROPE_H, "shift_x": 0.0, "scale_y": 1.0, "scale_x": 1.0}}
                parts.append(super().rope_encode(ref_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=ref_tf))
            if main_t_patches > 0:
-                parts.append(super().rope_encode(main_t_patches, h, w, t_start=video_t_start, device=device, dtype=dtype, transformer_options=transformer_options))
+                parts.append(super().rope_encode(main_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=transformer_options))

            if pose_latents is not None:
                F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
@ -1755,7 +1752,7 @@ class SCAILWanModel(WanModel):
                h_shift = (h_scale - 1) / 2
                w_shift = (w_scale - 1) / 2
                pose_tf = {"rope_options": {"shift_y": h_shift, "shift_x": POSE_ROPE_W + w_shift, "scale_y": h_scale, "scale_x": w_scale}}
-                parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=video_t_start, device=device, dtype=dtype, transformer_options=pose_tf))
+                parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=0, device=device, dtype=dtype, transformer_options=pose_tf))

            return torch.cat(parts, dim=1)

@ -1764,6 +1761,10 @@ class SCAILWanModel(WanModel):
        if pose_latents is None:
            return main_freqs

+        ref_t_patches = 0
+        if reference_latent is not None:
+            ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
+
        F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]

        # if pose is at half resolution, scale_y/scale_x=2 stretches the position range to cover the same RoPE extent as the main frames
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -1747,14 +1747,10 @@ class WAN21_SCAIL(WAN21):

        reference_latents = kwargs.get("reference_latents", None)
        if reference_latents is not None:
-            # SCAIL-2 multi-reference: reference_latents[0] is the primary ref, [1:] are additional
-            # references. Stack as [additional..., primary] so the primary stays adjacent to the video.
-            ordered = list(reference_latents[1:]) + list(reference_latents[:1])
-            stacked = []
-            for lat in ordered:
-                lat = self.process_latent_in(lat)
-                stacked.append(torch.cat([lat, torch.ones_like(lat[:, :4])], dim=1))
-            out['reference_latent'] = comfy.conds.CONDRegular(torch.cat(stacked, dim=2))
+            ref_latent = self.process_latent_in(reference_latents[-1])
+            ref_mask = torch.ones_like(ref_latent[:, :4])
+            ref_latent = torch.cat([ref_latent, ref_mask], dim=1)
+            out['reference_latent'] = comfy.conds.CONDRegular(ref_latent)

        pose_latents = kwargs.get("pose_video_latent", None)
        if pose_latents is not None:
@ -1796,7 +1792,6 @@ class WAN21_SCAIL2(WAN21_SCAIL):
        if driving_mask_28ch is not None:
            out['sam_latents'] = comfy.conds.CONDRegular(driving_mask_28ch.movedim(1, 2).contiguous())

-        # ref_mask_28ch holds one identity mask per stacked reference frame (additional refs first, then the primary ref), followed by zeros over the video frames.
        ref_mask_28ch = kwargs.get("ref_mask_28ch", None)
        if ref_mask_28ch is not None:
            out['ref_mask_latents'] = comfy.conds.CONDRegular(ref_mask_28ch.movedim(1, 2).contiguous())
@ -1824,11 +1819,10 @@ class WAN21_SCAIL2(WAN21_SCAIL):
            # Return sliced view omitting retain_index_list
            return comfy.context_windows.slice_cond(cond_value, window, x_in, device, temporal_dim=2, temporal_offset=0)
        if cond_key == "ref_mask_latents" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
-            # The ref mask is N leading ref frames padded with frames of zeros, so just grab the first frames for all windows
+            # The ref mask is one leading reference frame followed by all-zero video frames, so every window takes the same leading slice
            full_ref_mask = cond_value.cond
            video_frame_count = x_in.shape[2]
-            ref_frame_count = full_ref_mask.shape[2] - video_frame_count
-            if ref_frame_count < 1:
+            if full_ref_mask.shape[2] != video_frame_count + 1:
                return None
            window_length = len(window.index_list)

@ -1837,7 +1831,7 @@ class WAN21_SCAIL2(WAN21_SCAIL):
            if anchor_index is not None and anchor_index >= 0:
                window_length += 1

-            window_ref_mask = full_ref_mask[:, :, :window_length + ref_frame_count].to(device)
+            window_ref_mask = full_ref_mask[:, :, :window_length + 1].to(device)
            return cond_value._copy_with(window_ref_mask)

        return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list)
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -1622,10 +1622,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
                clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
-            elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2):  # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused.
-                klein_model_type = "qwen3_8b" if te_model == TEModel.QWEN3VL_8B else "qwen3_4b"
-                clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type=klein_model_type)
-                clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B if te_model == TEModel.QWEN3VL_8B else comfy.text_encoders.flux.KleinTokenizer
            else:
                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
                qwen3vl_type = {TEModel.QWEN3VL_4B: "qwen3vl_4b", TEModel.QWEN3VL_8B: "qwen3vl_8b"}[te_model]
--- a/comfy_extras/nodes_scail.py
+++ b/comfy_extras/nodes_scail.py
@ -34,20 +34,14 @@ def _unpack(track_data):
    return unpack_masks(packed)


-def _first_appearance_cx_area(masks_bool):
-    """Per object: first frame it appears in, plus centroid-x and area in that frame."""
-    m = masks_bool.float()
-    T, H, W = m.shape[0], m.shape[-2], m.shape[-1]
-    grid_x = torch.arange(W, device=m.device, dtype=m.dtype).view(1, 1, 1, W)
-    area_t = m.sum(dim=(-1, -2))
-    cx_t = (m * grid_x).sum(dim=(-1, -2)) / area_t.clamp(min=1)
-    present = area_t > 0
-    frame_idx = torch.arange(T, device=m.device).unsqueeze(1)
-    first_t = torch.where(present, frame_idx, T).amin(dim=0)
-    sel = first_t.clamp(max=T - 1).unsqueeze(0)
-    cx = cx_t.gather(0, sel).squeeze(0)
-    area = area_t.gather(0, sel).squeeze(0)
-    return first_t.tolist(), (cx / W).tolist(), (area / (H * W)).tolist()
+def _first_frame_cx_area(masks_bool):
+    first = masks_bool[0].float()
+    H, W = first.shape[-2], first.shape[-1]
+    n_pixels = H * W
+    grid_x = torch.arange(W, device=first.device, dtype=first.dtype).view(1, W)
+    area = first.sum(dim=(-1, -2)).clamp_(min=1)
+    cx = (first * grid_x).sum(dim=(-1, -2)) / area
+    return (cx / W).tolist(), (area / n_pixels).tolist()


 def _subset_track_data(track_data, obj_indices):
@ -87,26 +81,12 @@ def _render_colored_masks(track_data, background="black"):
        masks_full.view(T * N_obj, 1, Hm, Wm), size=(H, W), mode="nearest"
    ).view(T, N_obj, H, W) > 0.5
    any_mask = masks_full.any(dim=1)
-    color_overlay = colors[masks_full.to(torch.uint8).argmax(dim=1)]
+    obj_idx_map = masks_full.to(torch.uint8).argmax(dim=1)
+    color_overlay = colors[obj_idx_map]
    bg_tensor = torch.tensor(bg_rgb, device=device, dtype=color_overlay.dtype).view(1, 1, 1, 3)
    return torch.where(any_mask.unsqueeze(-1), color_overlay, bg_tensor.expand_as(color_overlay))


-def _render_mask_as_identity(mask, background="black"):
-    """Plain comfy MASK (B,H,W) or (H,W) -> (B,H,W,3) rendered as a single identity (palette[0])
-    on the given background. A batch is treated as multiple views of that one subject."""
-    device = comfy.model_management.intermediate_device()
-    dtype = comfy.model_management.intermediate_dtype()
-    if mask.ndim == 2:
-        mask = mask.unsqueeze(0)
-    mask = mask.to(device=device, dtype=dtype)
-    B, H, W = mask.shape
-    bg_rgb = (1.0, 1.0, 1.0) if background.startswith("white") else (0.0, 0.0, 0.0)
-    color = torch.tensor(DEFAULT_PALETTE[0], device=device, dtype=dtype).view(1, 1, 1, 3)
-    bg = torch.tensor(bg_rgb, device=device, dtype=dtype).view(1, 1, 1, 3)
-    return torch.where((mask > 0.5).unsqueeze(-1), color.expand(B, H, W, 3), bg.expand(B, H, W, 3))
-
-
 def _extract_mask_to_28ch(rgb_video):
    """Colored RGB mask (T, H, W, 3) in [0, 1] -> SCAIL-2 28-channel binary latent
    (1, T_lat, 28, H_lat, W_lat). 7 per-color binary channels (white/r/g/b/y/m/c)
@ -158,8 +138,8 @@ class WanSCAILToVideo(io.ComfyNode):
                io.Float.Input("pose_strength", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="Strength of the pose latent."),
                io.Float.Input("pose_start", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Start step of the pose conditioning."),
                io.Float.Input("pose_end", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="End step of the pose conditioning."),
-                io.Image.Input("reference_image", optional=True, tooltip="Reference image. The first image is the primary reference (composite all identities onto it). SCAIL-2: extra batch images are used as additional views (back view, close-up, occluded background), each needing a matching reference_image_mask in that identity's color."),
-                io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask, batch matching reference_image (first = primary reference mask, rest = identity masks for the additional reference_image)."),
+                io.Image.Input("reference_image", optional=True, tooltip="Reference image, for multiple references composite all on single image."),
+                io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask at the same resolution as reference_image."),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True, tooltip="CLIP vision features for conditioning. Model is trained with stretch resize to aspect ratio."),
                io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="Cumulative output frame this chunk begins at. Wire from the previous chunk's video_frame_offset output."),
                io.Int.Input("previous_frame_count", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4, tooltip="Tail frames of previous_frames to anchor. SCAIL-2 trained at 5 (81-frame chunks, 76-frame step)."),
@ -191,21 +171,19 @@ class WanSCAILToVideo(io.ComfyNode):
            video_frame_offset -= prev_trimmed.shape[0]
            video_frame_offset = max(0, video_frame_offset)

+        ref_latent = None
        if reference_image is not None:
-            ref_imgs = comfy.utils.common_upscale(reference_image.movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
-            n_ref = ref_imgs.shape[0]
-            # SCAIL-2 multi-reference: the first image is the primary ref, the rest are additional references.
-
-            # Replacement Mode: composite each ref on black bg using its mask as alpha matte
+            reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
+            # Replacement Mode: composite ref on black bg using reference_image_mask as alpha matte
            if replacement_mode and reference_image_mask is not None:
-                rm = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
-                rm = rm[[min(i, rm.shape[0] - 1) for i in range(n_ref)]]
-                is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(ref_imgs.dtype)
-                ref_imgs = ref_imgs * is_char
-            # encode each ref individually so each stays a single latent frame (a batched encode would be treated as a video)
-            ref_latents = [vae.encode(ref_imgs[i:i + 1, :, :, :3]) for i in range(n_ref)]
-            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
-            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)
+                rm = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
+                is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(reference_image.dtype)
+                reference_image = reference_image * is_char
+            ref_latent = vae.encode(reference_image[:, :, :, :3])
+
+        if ref_latent is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
+            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)

        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
@ -243,16 +221,11 @@ class WanSCAILToVideo(io.ComfyNode):
            positive = node_helpers.conditioning_set_values(positive, {"driving_mask_28ch": driving_mask_28ch})
            negative = node_helpers.conditioning_set_values(negative, {"driving_mask_28ch": driving_mask_28ch})

-        # The ref mask binds reference frames to identities, so it only applies when there's a reference image.
-        if reference_image_mask is not None and reference_image is not None:
-            ref_mask_hw = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
-            n_masks = ref_mask_hw.shape[0]
-            n_ref = reference_image.shape[0]
-
-            add_masks = [_extract_mask_to_28ch(ref_mask_hw[min(i, n_masks - 1)][None]) for i in range(1, n_ref)]
-            ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw[:1])
+        if reference_image_mask is not None:
+            ref_mask_hw = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
+            ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw)
            zeros = torch.zeros((1, latent.shape[2], 28, ref_mask_1f.shape[-2], ref_mask_1f.shape[-1]), device=ref_mask_1f.device, dtype=ref_mask_1f.dtype)
-            ref_mask_28ch = torch.cat(add_masks + [ref_mask_1f, zeros], dim=1)
+            ref_mask_28ch = torch.cat([ref_mask_1f, zeros], dim=1)
            positive = node_helpers.conditioning_set_values(positive, {"ref_mask_28ch": ref_mask_28ch})
            negative = node_helpers.conditioning_set_values(negative, {"ref_mask_28ch": ref_mask_28ch})

@ -271,9 +244,12 @@ class WanSCAILToVideo(io.ComfyNode):


 class SCAIL2ColoredMask(io.ComfyNode):
-    """Render SAM3 tracks for the driving pose video and reference image(s) into the
-    colored masks WanSCAILToVideo consumes. Shared `sort_by` keeps each identity on the
-    same color across both outputs.
+    """Render SAM3 tracks for the driving pose video and (optionally) the reference
+    image into the two colored masks WanSCAILToVideo consumes. Shared `sort_by`
+    across both outputs guarantees identity K maps to the same color on both
+    sides, for multi-person workflow consistency.
+    reference_image_mask bg follows replacement_mode: white = Animation Mode, black = Replacement Mode
+    pose_video_mask bg follows replacement_mode: black = Animation Mode, white = Replacement Mode
    """

    @classmethod
@ -284,12 +260,10 @@ class SCAIL2ColoredMask(io.ComfyNode):
            category="model/conditioning/wan/scail",
            inputs=[
                SAM3TrackData.Input("driving_track_data", tooltip="SAM3 track of the driving pose video. Will be rendered into the pose_video_mask output."),
-                io.MultiType.Input("ref_track_data", [SAM3TrackData, io.Mask], optional=True, display_name="reference_masks",
-                                   tooltip="SAM3 track of the reference image(s) (one identity per object, colored in batch order), or a plain MASK of the reference subject (rendered as a single identity)."),
-                io.String.Input("object_indices", default="",
-                                tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
+                SAM3TrackData.Input("ref_track_data", optional=True, tooltip="SAM3 track of the reference image."),
+                io.String.Input("object_indices", default="", tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
                io.Combo.Input("sort_by", options=["none", "left_to_right", "area"], default="left_to_right",
-                               tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). Objects that appear in earlier frames always come first; within a frame, left_to_right = leftmost object (by centroid at first appearance) gets the first color, area = biggest object (by mask area at first appearance) gets the first color; none = keep SAM3's order."),
+                    tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). left_to_right = leftmost object (by first-frame centroid) gets the first color; area = biggest object (by first-frame mask area) gets the first color; none = keep SAM3's order."),
                io.Boolean.Input("replacement_mode", default=False,
                    tooltip="False = Animation Mode (pose_video_mask has black background, reference_image_mask has white background). "
                    "True = Replacement Mode (pose_video_mask has white background, reference_image_mask has black background)."),
@ -306,11 +280,11 @@ class SCAIL2ColoredMask(io.ComfyNode):
        def _prep(td):
            masks_bool = _unpack(td)
            if sort_by != "none" and masks_bool is not None:
-                first_t, cx, area = _first_appearance_cx_area(masks_bool)
+                cx, area = _first_frame_cx_area(masks_bool)
                if sort_by == "left_to_right":
-                    order = sorted(range(len(cx)), key=lambda i: (first_t[i], cx[i]))
+                    order = sorted(range(len(cx)), key=lambda i: cx[i])
                else:  # "area"
-                    order = sorted(range(len(area)), key=lambda i: (first_t[i], -area[i]))
+                    order = sorted(range(len(area)), key=lambda i: -area[i])
                td = _subset_track_data(td, order)
            if object_indices.strip():
                indices = [int(i.strip()) for i in object_indices.split(",") if i.strip().isdigit()]
@ -326,10 +300,8 @@ class SCAIL2ColoredMask(io.ComfyNode):
        ref_bg = "black" if replacement_mode else "white"

        if ref_track_data is not None:
-            if isinstance(ref_track_data, torch.Tensor):  # plain comfy MASK
-                reference_image_mask = _render_mask_as_identity(ref_track_data, ref_bg)
-            else:
-                reference_image_mask = _render_colored_masks(_prep(ref_track_data), ref_bg)
+            ref = _prep(ref_track_data)
+            reference_image_mask = _render_colored_masks(ref, ref_bg)
        else:
            H, W = drv["orig_size"]
            fill_value = 1.0 if ref_bg == "white" else 0.0
--- a/comfy_extras/nodes_triposplat.py
+++ b/comfy_extras/nodes_triposplat.py
@ -65,7 +65,7 @@ class TripoSplatPreprocessImage(IO.ComfyNode):
        return IO.Schema(
            node_id="TripoSplatPreprocessImage",
            display_name="TripoSplat Preprocess Image",
-            category="model/conditioning/triposplat",
+            category="3d/conditioning",
            description="Crop center each image to a square canvas on a black background and add padding.",
            inputs=[
                IO.Image.Input("image"),
@ -95,7 +95,7 @@ class TripoSplatConditioning(IO.ComfyNode):
        return IO.Schema(
            node_id="TripoSplatConditioning",
            display_name="TripoSplat Conditioning",
-            category="model/conditioning/triposplat",
+            category="3d/conditioning",
            description="Encode the image with DINOv3 and the Flux2 VAE into TripoSplat positive/negative "
                        "conditioning, and create the fixed size noise target (latent + camera) for the KSampler",
            inputs=[
--- a/openapi.yaml
+++ b/openapi.yaml
@ -673,6 +673,35 @@ components:
                - created_at
                - updated_at
            type: object
+        JobsCancelRequest:
+            additionalProperties: false
+            description: Request to cancel multiple jobs by ID.
+            properties:
+                job_ids:
+                    description: Job identifiers (UUIDs) to cancel.
+                    items:
+                        format: uuid
+                        type: string
+                    maxItems: 100
+                    minItems: 1
+                    type: array
+            required:
+                - job_ids
+            type: object
+        JobsCancelResponse:
+            description: Response for POST /api/jobs/cancel.
+            properties:
+                cancelled:
+                    description: |
+                        Job IDs for which a cancel event was successfully dispatched by this
+                        call. Jobs already in a terminal or cancelling state are idempotently
+                        skipped and will not appear here.
+                    items:
+                        type: string
+                    type: array
+            required:
+                - cancelled
+            type: object
        JobsListResponse:
            description: Paginated list of jobs for the authenticated user.
            properties:
@ -1006,7 +1035,7 @@ components:
                    description: If true, clear all pending jobs from the queue
                    type: boolean
                delete:
-                    description: Array of PENDING job IDs to cancel
+                    description: Array of job IDs to cancel; pending and running jobs transition to cancelled
                    items:
                        type: string
                    type: array
@ -1822,6 +1851,83 @@ paths:
            summary: Update asset metadata
            tags:
                - file
+    /api/assets/{id}/content:
+        get:
+            description: |
+                Returns the binary content of an asset by ID.
+
+                The contract is the same across runtimes — "GET this path and you
+                receive the asset's bytes" — but the mechanism differs:
+                - **Local ComfyUI** streams the bytes directly (`200`,
+                  `application/octet-stream`).
+                - **Cloud** does not proxy large files; it responds `302` with a
+                  `Location` redirect to a short-lived signed storage URL. Clients that
+                  follow redirects (browsers, `fetch`/XHR, `<img>`/`<video>`) receive
+                  the bytes transparently.
+
+                Prefer this over the filename-addressed `/api/view` when you have an
+                asset ID.
+            operationId: getAssetContent
+            parameters:
+                - description: Asset ID
+                  in: path
+                  name: id
+                  required: true
+                  schema:
+                    type: string
+                - description: |
+                    Content-Disposition for the response: `attachment` (download) or
+                    `inline` (render in browser). Defaults to `attachment`.
+                  in: query
+                  name: disposition
+                  schema:
+                    default: attachment
+                    enum:
+                        - inline
+                        - attachment
+                    type: string
+            responses:
+                "200":
+                    content:
+                        application/octet-stream:
+                            schema:
+                                format: binary
+                                type: string
+                    description: Asset content stream (local runtime streams the bytes directly)
+                "302":
+                    description: Redirect to a signed storage URL (cloud runtime)
+                    headers:
+                        Cache-Control:
+                            description: Private caching directive scoped to the signed URL lifetime
+                            schema:
+                                type: string
+                        Location:
+                            description: Short-lived signed URL to the asset content in storage
+                            schema:
+                                type: string
+                        Vary:
+                            description: Partitions any cached redirect by auth credentials so a private redirect is not reused across users
+                            schema:
+                                type: string
+                "404":
+                    content:
+                        application/json:
+                            schema:
+                                $ref: '#/components/schemas/ErrorResponse'
+                    description: Asset not found
+                "500":
+                    content:
+                        application/json:
+                            schema:
+                                $ref: '#/components/schemas/ErrorResponse'
+                    description: Internal server error
+            security:
+                - ApiKeyAuth: []
+                - BearerAuth: []
+                - CookieAuth: []
+            summary: Get asset content
+            tags:
+                - file
    /api/assets/{id}/tags:
        delete:
            description: Removes one or more tags from an existing asset
@ -2675,14 +2781,20 @@ paths:
            summary: Get internationalisation translation strings
    /api/interrupt:
        post:
+            deprecated: true
            description: |
-                Cancel all currently RUNNING jobs for the authenticated user.
-                This will interrupt any job that is currently in 'in_progress' status.
-                Note: This endpoint only affects running jobs. To cancel pending jobs, use /api/queue.
+                Deprecated. Prefer the jobs-namespace cancel endpoints:
+                POST /api/jobs/{job_id}/cancel for a single job, or
+                POST /api/jobs/cancel to cancel jobs by ID.
+
+                Cancels the first active job for the authenticated user (the currently
+                running job if there is one, otherwise the next pending job). Takes no
+                body and cannot target a specific job — use the jobs-namespace endpoints
+                for that.
            operationId: interruptJob
            responses:
                "200":
-                    description: Success - Job interrupted or no running job found
+                    description: Success - first active job cancelled, or no active job found
                "401":
                    content:
                        application/json:
@ -2695,7 +2807,7 @@ paths:
                            schema:
                                $ref: '#/components/schemas/ErrorResponse'
                    description: Internal server error
-            summary: Interrupt currently running jobs
+            summary: Interrupt the first active job
            tags:
                - queue
    /api/job/{job_id}/status:
@ -2954,6 +3066,64 @@ paths:
            summary: Cancel a job
            tags:
                - workflow
+    /api/jobs/cancel:
+        post:
+            description: |
+                Cancel one or more jobs for the authenticated user in a single request.
+
+                State-agnostic: cancels both pending and running jobs (both transition to
+                the cancelled state via the same mechanism as the single-job endpoint).
+
+                Idempotent per job: cancelling a job already in a terminal or cancelling
+                state is a no-op, and that job simply will not appear in the returned `cancelled` list.
+
+                Fail-fast on unknown IDs: if any provided job ID does not exist for this
+                user, the request returns 404 and no jobs are cancelled. This surfaces
+                bad IDs to the caller rather than silently dropping them.
+
+                This is the canonical batch-cancel endpoint. The `delete` field of
+                POST /api/queue is deprecated in favour of this endpoint.
+            operationId: cancelJobs
+            requestBody:
+                content:
+                    application/json:
+                        schema:
+                            $ref: '#/components/schemas/JobsCancelRequest'
+                required: true
+            responses:
+                "200":
+                    content:
+                        application/json:
+                            schema:
+                                $ref: '#/components/schemas/JobsCancelResponse'
+                    description: Success - cancel requests dispatched (or jobs were already terminal)
+                "400":
+                    content:
+                        application/json:
+                            schema:
+                                $ref: '#/components/schemas/ErrorResponse'
+                    description: Bad Request - job_ids is missing, empty, exceeds the maximum count, or contains an invalid UUID
+                "401":
+                    content:
+                        application/json:
+                            schema:
+                                $ref: '#/components/schemas/ErrorResponse'
+                    description: Unauthorized - Authentication required
+                "404":
+                    content:
+                        application/json:
+                            schema:
+                                $ref: '#/components/schemas/ErrorResponse'
+                    description: One or more job IDs not found for this user (no jobs cancelled)
+                "500":
+                    content:
+                        application/json:
+                            schema:
+                                $ref: '#/components/schemas/ErrorResponse'
+                    description: Internal server error - cancellation failed
+            summary: Cancel multiple jobs
+            tags:
+                - workflow
    /api/node_replacements:
        get:
            description: |
@ -3104,9 +3274,18 @@ paths:
            tags:
                - queue
        post:
+            deprecated: true
            description: |
-                Cancel specific PENDING jobs by ID or clear all pending jobs in the queue.
-                Note: This endpoint only affects pending jobs. To cancel running jobs, use /api/interrupt.
+                Deprecated. Prefer the jobs-namespace cancel endpoints:
+                POST /api/jobs/cancel for cancelling jobs by ID, and
+                POST /api/jobs/{job_id}/cancel for a single job.
+
+                Cancel specific jobs by ID (the `delete` field) or clear all pending
+                jobs in the queue (the `clear` field). Despite the `delete` naming, this
+                does not delete anything — listed jobs transition to the cancelled state,
+                and `delete` cancels both pending and running jobs (not pending-only as
+                previously documented). Job-by-ID cancellation is superseded by
+                POST /api/jobs/cancel; `clear` has no jobs-namespace replacement yet.
            operationId: manageQueue
            requestBody:
                content: