feat: Add VideoSlice node with lazy operations on VideoInput

- Add VideoOp base class and SliceOp in _input/video_types.py - Add sliced() method to VideoInput that returns a copy with operation appended - Each subclass applies operations in get_components() and get_frame_count() - After materialization, VideoFromFile delegates to internal VideoFromComponents - Add VideoSlice node that uses video.sliced(start_frame, frame_count) - Add tests for SliceOp, sliced() behavior, and materialization
qwen_image: propagate attention mask. (#11966 )
2026-01-29 16:26:29 +08:00 · 2026-01-23 20:52:15 -08:00 · 2026-01-22 20:02:31 -05:00 · 2026-01-22 18:20:48 -05:00 · 2026-01-22 16:54:18 -05:00 · 2026-01-22 09:46:56 -08:00
20 changed files with 460 additions and 225 deletions
--- a/app/node_replace_manager.py
+++ b/app/node_replace_manager.py
@ -1,23 +0,0 @@
-from __future__ import annotations
-
-from aiohttp import web
-
-from typing import TYPE_CHECKING
-if TYPE_CHECKING:
-    from comfy_api.latest._node_replace import NodeReplace
-
-REGISTERED_NODE_REPLACEMENTS: dict[str, list[NodeReplace]] = {}
-
-def register_node_replacement(node_replace: NodeReplace):
-    REGISTERED_NODE_REPLACEMENTS.setdefault(node_replace.old_node_id, []).append(node_replace)
-
-def registered_as_dict():
-    return {
-        k: [v.as_dict() for v in v_list] for k, v_list in REGISTERED_NODE_REPLACEMENTS.items()
-    }
-
-class NodeReplaceManager:
-    def add_routes(self, routes):
-        @routes.get("/node_replacements")
-        async def get_node_replacements(request):
-            return web.json_response(registered_as_dict())
--- a/comfy/ldm/lightricks/vae/causal_conv3d.py
+++ b/comfy/ldm/lightricks/vae/causal_conv3d.py
@ -1,11 +1,11 @@
 from typing import Tuple, Union

+import threading
 import torch
 import torch.nn as nn
 import comfy.ops
 ops = comfy.ops.disable_weight_init

-
 class CausalConv3d(nn.Module):
    def __init__(
        self,
@ -42,23 +42,34 @@ class CausalConv3d(nn.Module):
            padding_mode=spatial_padding_mode,
            groups=groups,
        )
+        self.temporal_cache_state={}

    def forward(self, x, causal: bool = True):
-        if causal:
-            first_frame_pad = x[:, :, :1, :, :].repeat(
-                (1, 1, self.time_kernel_size - 1, 1, 1)
-            )
-            x = torch.concatenate((first_frame_pad, x), dim=2)
-        else:
-            first_frame_pad = x[:, :, :1, :, :].repeat(
-                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
-            )
-            last_frame_pad = x[:, :, -1:, :, :].repeat(
-                (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
-            )
-            x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
-        x = self.conv(x)
-        return x
+        tid = threading.get_ident()
+
+        cached, is_end = self.temporal_cache_state.get(tid, (None, False))
+        if cached is None:
+            padding_length = self.time_kernel_size - 1
+            if not causal:
+                padding_length = padding_length // 2
+            if x.shape[2] == 0:
+                return x
+            cached = x[:, :, :1, :, :].repeat((1, 1, padding_length, 1, 1))
+        pieces = [ cached, x ]
+        if is_end and not causal:
+            pieces.append(x[:, :, -1:, :, :].repeat((1, 1, (self.time_kernel_size - 1) // 2, 1, 1)))
+
+        needs_caching = not is_end
+        if needs_caching and x.shape[2] >= self.time_kernel_size - 1:
+            needs_caching = False
+            self.temporal_cache_state[tid] = (x[:, :, -(self.time_kernel_size - 1):, :, :], False)
+
+        x = torch.cat(pieces, dim=2)
+
+        if needs_caching:
+            self.temporal_cache_state[tid] = (x[:, :, -(self.time_kernel_size - 1):, :, :], False)
+
+        return self.conv(x) if x.shape[2] >= self.time_kernel_size else x[:, :, :0, :, :]

    @property
    def weight(self):
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@ -1,4 +1,5 @@
 from __future__ import annotations
+import threading
 import torch
 from torch import nn
 from functools import partial
@ -6,12 +7,35 @@ import math
 from einops import rearrange
 from typing import List, Optional, Tuple, Union
 from .conv_nd_factory import make_conv_nd, make_linear_nd
+from .causal_conv3d import CausalConv3d
 from .pixel_norm import PixelNorm
 from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
 import comfy.ops
+from comfy.ldm.modules.diffusionmodules.model import torch_cat_if_needed

 ops = comfy.ops.disable_weight_init

+def mark_conv3d_ended(module):
+    tid = threading.get_ident()
+    for _, m in module.named_modules():
+        if isinstance(m, CausalConv3d):
+            current = m.temporal_cache_state.get(tid, (None, False))
+            m.temporal_cache_state[tid] = (current[0], True)
+
+def split2(tensor, split_point, dim=2):
+    return torch.split(tensor, [split_point, tensor.shape[dim] - split_point], dim=dim)
+
+def add_exchange_cache(dest, cache_in, new_input, dim=2):
+    if dest is not None:
+        if cache_in is not None:
+            cache_to_dest = min(dest.shape[dim], cache_in.shape[dim])
+            lead_in_dest, dest = split2(dest, cache_to_dest, dim=dim)
+            lead_in_source, cache_in = split2(cache_in, cache_to_dest, dim=dim)
+            lead_in_dest.add_(lead_in_source)
+        body, new_input = split2(new_input, dest.shape[dim], dim)
+        dest.add_(body)
+    return torch_cat_if_needed([cache_in, new_input], dim=dim)
+
 class Encoder(nn.Module):
    r"""
    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
@ -205,7 +229,7 @@ class Encoder(nn.Module):

        self.gradient_checkpointing = False

-    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+    def forward_orig(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        r"""The forward method of the `Encoder` class."""

        sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
@ -254,6 +278,22 @@ class Encoder(nn.Module):

        return sample

+    def forward(self, *args, **kwargs):
+        #No encoder support so just flag the end so it doesnt use the cache.
+        mark_conv3d_ended(self)
+        try:
+            return self.forward_orig(*args, **kwargs)
+        finally:
+            tid = threading.get_ident()
+            for _, module in self.named_modules():
+                # ComfyUI doesn't thread this kind of stuff today, but just in case
+                # we key on the thread to make it thread safe.
+                tid = threading.get_ident()
+                if hasattr(module, "temporal_cache_state"):
+                    module.temporal_cache_state.pop(tid, None)
+
+
+MAX_CHUNK_SIZE=(128 * 1024 ** 2)

 class Decoder(nn.Module):
    r"""
@ -341,18 +381,6 @@ class Decoder(nn.Module):
                    timestep_conditioning=timestep_conditioning,
                    spatial_padding_mode=spatial_padding_mode,
                )
-            elif block_name == "attn_res_x":
-                block = UNetMidBlock3D(
-                    dims=dims,
-                    in_channels=input_channel,
-                    num_layers=block_params["num_layers"],
-                    resnet_groups=norm_num_groups,
-                    norm_layer=norm_layer,
-                    inject_noise=block_params.get("inject_noise", False),
-                    timestep_conditioning=timestep_conditioning,
-                    attention_head_dim=block_params["attention_head_dim"],
-                    spatial_padding_mode=spatial_padding_mode,
-                )
            elif block_name == "res_x_y":
                output_channel = output_channel // block_params.get("multiplier", 2)
                block = ResnetBlock3D(
@ -428,8 +456,9 @@ class Decoder(nn.Module):
            )
            self.last_scale_shift_table = nn.Parameter(torch.empty(2, output_channel))

+
    # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
-    def forward(
+    def forward_orig(
        self,
        sample: torch.FloatTensor,
        timestep: Optional[torch.Tensor] = None,
@ -437,6 +466,7 @@ class Decoder(nn.Module):
        r"""The forward method of the `Decoder` class."""
        batch_size = sample.shape[0]

+        mark_conv3d_ended(self.conv_in)
        sample = self.conv_in(sample, causal=self.causal)

        checkpoint_fn = (
@ -445,24 +475,12 @@ class Decoder(nn.Module):
            else lambda x: x
        )

-        scaled_timestep = None
+        timestep_shift_scale = None
        if self.timestep_conditioning:
            assert (
                timestep is not None
            ), "should pass timestep with timestep_conditioning=True"
            scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device)
-
-        for up_block in self.up_blocks:
-            if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
-                sample = checkpoint_fn(up_block)(
-                    sample, causal=self.causal, timestep=scaled_timestep
-                )
-            else:
-                sample = checkpoint_fn(up_block)(sample, causal=self.causal)
-
-        sample = self.conv_norm_out(sample)
-
-        if self.timestep_conditioning:
            embedded_timestep = self.last_time_embedder(
                timestep=scaled_timestep.flatten(),
                resolution=None,
@ -483,16 +501,62 @@ class Decoder(nn.Module):
                embedded_timestep.shape[-2],
                embedded_timestep.shape[-1],
            )
-            shift, scale = ada_values.unbind(dim=1)
-            sample = sample * (1 + scale) + shift
+            timestep_shift_scale = ada_values.unbind(dim=1)

-        sample = self.conv_act(sample)
-        sample = self.conv_out(sample, causal=self.causal)
+        output = []
+
+        def run_up(idx, sample, ended):
+            if idx >= len(self.up_blocks):
+                sample = self.conv_norm_out(sample)
+                if timestep_shift_scale is not None:
+                    shift, scale = timestep_shift_scale
+                    sample = sample * (1 + scale) + shift
+                sample = self.conv_act(sample)
+                if ended:
+                    mark_conv3d_ended(self.conv_out)
+                sample = self.conv_out(sample, causal=self.causal)
+                if sample is not None and sample.shape[2] > 0:
+                    output.append(sample)
+                return
+
+            up_block = self.up_blocks[idx]
+            if (ended):
+                mark_conv3d_ended(up_block)
+            if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
+                sample = checkpoint_fn(up_block)(
+                    sample, causal=self.causal, timestep=scaled_timestep
+                )
+            else:
+                sample = checkpoint_fn(up_block)(sample, causal=self.causal)
+
+            if sample is None or sample.shape[2] == 0:
+                return
+
+            total_bytes = sample.numel() * sample.element_size()
+            num_chunks = (total_bytes + MAX_CHUNK_SIZE - 1) // MAX_CHUNK_SIZE
+            samples = torch.chunk(sample, chunks=num_chunks, dim=2)
+
+            for chunk_idx, sample1 in enumerate(samples):
+                run_up(idx + 1, sample1, ended and chunk_idx == len(samples) - 1)
+
+        run_up(0, sample, True)
+        sample = torch.cat(output, dim=2)

        sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)

        return sample

+    def forward(self, *args, **kwargs):
+        try:
+            return self.forward_orig(*args, **kwargs)
+        finally:
+            for _, module in self.named_modules():
+                #ComfyUI doesn't thread this kind of stuff today, but just incase
+                #we key on the thread to make it thread safe.
+                tid = threading.get_ident()
+                if hasattr(module, "temporal_cache_state"):
+                    module.temporal_cache_state.pop(tid, None)
+

 class UNetMidBlock3D(nn.Module):
    """
@ -663,8 +727,22 @@ class DepthToSpaceUpsample(nn.Module):
        )
        self.residual = residual
        self.out_channels_reduction_factor = out_channels_reduction_factor
+        self.temporal_cache_state = {}

    def forward(self, x, causal: bool = True, timestep: Optional[torch.Tensor] = None):
+        tid = threading.get_ident()
+        cached, drop_first_conv, drop_first_res = self.temporal_cache_state.get(tid, (None, True, True))
+        y = self.conv(x, causal=causal)
+        y = rearrange(
+            y,
+            "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
+            p1=self.stride[0],
+            p2=self.stride[1],
+            p3=self.stride[2],
+        )
+        if self.stride[0] == 2 and y.shape[2] > 0 and drop_first_conv:
+            y = y[:, :, 1:, :, :]
+            drop_first_conv = False
        if self.residual:
            # Reshape and duplicate the input to match the output shape
            x_in = rearrange(
@ -676,21 +754,20 @@ class DepthToSpaceUpsample(nn.Module):
            )
            num_repeat = math.prod(self.stride) // self.out_channels_reduction_factor
            x_in = x_in.repeat(1, num_repeat, 1, 1, 1)
-            if self.stride[0] == 2:
+            if self.stride[0] == 2 and x_in.shape[2] > 0 and drop_first_res:
                x_in = x_in[:, :, 1:, :, :]
-        x = self.conv(x, causal=causal)
-        x = rearrange(
-            x,
-            "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
-            p1=self.stride[0],
-            p2=self.stride[1],
-            p3=self.stride[2],
-        )
-        if self.stride[0] == 2:
-            x = x[:, :, 1:, :, :]
-        if self.residual:
-            x = x + x_in
-        return x
+                drop_first_res = False
+
+            if y.shape[2] == 0:
+                y = None
+
+            cached = add_exchange_cache(y, cached, x_in, dim=2)
+            self.temporal_cache_state[tid] = (cached, drop_first_conv, drop_first_res)
+
+        else:
+            self.temporal_cache_state[tid] = (None, drop_first_conv, False)
+
+        return y

 class LayerNorm(nn.Module):
    def __init__(self, dim, eps, elementwise_affine=True) -> None:
@ -807,6 +884,8 @@ class ResnetBlock3D(nn.Module):
                torch.randn(4, in_channels) / in_channels**0.5
            )

+        self.temporal_cache_state={}
+
    def _feed_spatial_noise(
        self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
    ) -> torch.FloatTensor:
@ -880,9 +959,12 @@ class ResnetBlock3D(nn.Module):

        input_tensor = self.conv_shortcut(input_tensor)

-        output_tensor = input_tensor + hidden_states
+        tid = threading.get_ident()
+        cached = self.temporal_cache_state.get(tid, None)
+        cached = add_exchange_cache(hidden_states, cached, input_tensor, dim=2)
+        self.temporal_cache_state[tid] = cached

-        return output_tensor
+        return hidden_states


 def patchify(x, patch_size_hw, patch_size_t=1):
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@ -14,10 +14,13 @@ if model_management.xformers_enabled_vae():
    import xformers.ops

 def torch_cat_if_needed(xl, dim):
+    xl = [x for x in xl if x is not None and x.shape[dim] > 0]
    if len(xl) > 1:
        return torch.cat(xl, dim)
-    else:
+    elif len(xl) == 1:
        return xl[0]
+    else:
+        return None

 def get_timestep_embedding(timesteps, embedding_dim):
    """
--- a/comfy/ldm/qwen_image/model.py
+++ b/comfy/ldm/qwen_image/model.py
@ -170,8 +170,14 @@ class Attention(nn.Module):
        joint_query = apply_rope1(joint_query, image_rotary_emb)
        joint_key = apply_rope1(joint_key, image_rotary_emb)

+        if encoder_hidden_states_mask is not None:
+            attn_mask = torch.zeros((batch_size, 1, seq_txt + seq_img), dtype=hidden_states.dtype, device=hidden_states.device)
+            attn_mask[:, 0, :seq_txt] = encoder_hidden_states_mask
+        else:
+            attn_mask = None
+
        joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads,
-                                                         attention_mask, transformer_options=transformer_options,
+                                                         attn_mask, transformer_options=transformer_options,
                                                         skip_reshape=True)

        txt_attn_output = joint_hidden_states[:, :seq_txt, :]
@ -430,6 +436,9 @@ class QwenImageTransformer2DModel(nn.Module):
        encoder_hidden_states = context
        encoder_hidden_states_mask = attention_mask

+        if encoder_hidden_states_mask is not None and not torch.is_floating_point(encoder_hidden_states_mask):
+            encoder_hidden_states_mask = (encoder_hidden_states_mask - 1).to(x.dtype) * torch.finfo(x.dtype).max
+
        hidden_states, img_ids, orig_shape = self.process_img(x)
        num_embeds = hidden_states.shape[1]

--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -1578,6 +1578,9 @@ class QwenImage(BaseModel):

    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -771,10 +771,24 @@ class Flux2(Flux):
        return out

    def clip_target(self, state_dict={}):
-        return None # TODO
        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
+        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref))
+        if len(detect) > 0:
+            detect["model_type"] = "qwen3_4b"
+            return supported_models_base.ClipTarget(comfy.text_encoders.flux.KleinTokenizer, comfy.text_encoders.flux.klein_te(**detect))
+
+        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_8b.transformer.".format(pref))
+        if len(detect) > 0:
+            detect["model_type"] = "qwen3_8b"
+            return supported_models_base.ClipTarget(comfy.text_encoders.flux.KleinTokenizer8B, comfy.text_encoders.flux.klein_te(**detect))
+
+        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}mistral3_24b.transformer.".format(pref))
+        if len(detect) > 0:
+            if "{}mistral3_24b.transformer.model.layers.39.post_attention_layernorm.weight".format(pref) not in state_dict:
+                detect["pruned"] = True
+            return supported_models_base.ClipTarget(comfy.text_encoders.flux.Flux2Tokenizer, comfy.text_encoders.flux.flux2_te(**detect))
+
+        return None

 class GenmoMochi(supported_models_base.BASE):
    unet_config = {
--- a/comfy/text_encoders/hunyuan_video.py
+++ b/comfy/text_encoders/hunyuan_video.py
@ -10,9 +10,11 @@ import comfy.utils

 def llama_detect(state_dict, prefix=""):
    out = {}
-    t5_key = "{}model.norm.weight".format(prefix)
-    if t5_key in state_dict:
-        out["dtype_llama"] = state_dict[t5_key].dtype
+    norm_keys = ["{}model.norm.weight".format(prefix), "{}model.layers.0.input_layernorm.weight".format(prefix)]
+    for norm_key in norm_keys:
+        if norm_key in state_dict:
+            out["dtype_llama"] = state_dict[norm_key].dtype
+            break

    quant = comfy.utils.detect_layer_quantization(state_dict, prefix)
    if quant is not None:
--- a/comfy_api/latest/init.py
+++ b/comfy_api/latest/init.py
@ -10,7 +10,6 @@ from ._input_impl import VideoFromFile, VideoFromComponents
 from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
 from . import _io_public as io
 from . import _ui_public as ui
-from . import _node_replace_public as node_replace
 from comfy_execution.utils import get_executing_context
 from comfy_execution.progress import get_progress_state, PreviewImageTuple
 from PIL import Image
@ -131,5 +130,4 @@ __all__ = [
    "IO",
    "ui",
    "UI",
-    "node_replace",
 ]
--- a/comfy_api/latest/_input/init.py
+++ b/comfy_api/latest/_input/init.py
@ -1,10 +1,12 @@
 from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput
-from .video_types import VideoInput
+from .video_types import VideoInput, VideoOp, SliceOp

 __all__ = [
    "ImageInput",
    "AudioInput",
    "VideoInput",
+    "VideoOp",
+    "SliceOp",
    "MaskInput",
    "LatentInput",
 ]
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@ -1,11 +1,48 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from fractions import Fraction
 from typing import Optional, Union, IO
+import copy
 import io
 import av
 from .._util import VideoContainer, VideoCodec, VideoComponents

+
+class VideoOp(ABC):
+    """Base class for lazy video operations."""
+
+    @abstractmethod
+    def apply(self, components: VideoComponents) -> VideoComponents:
+        pass
+
+    @abstractmethod
+    def compute_frame_count(self, input_frame_count: int) -> int:
+        pass
+
+
+@dataclass(frozen=True)
+class SliceOp(VideoOp):
+    """Extract a range of frames from the video."""
+    start_frame: int
+    frame_count: int
+
+    def apply(self, components: VideoComponents) -> VideoComponents:
+        total = components.images.shape[0]
+        start = max(0, min(self.start_frame, total))
+        end = min(start + self.frame_count, total)
+        return VideoComponents(
+            images=components.images[start:end],
+            audio=components.audio,
+            frame_rate=components.frame_rate,
+            metadata=getattr(components, 'metadata', None),
+        )
+
+    def compute_frame_count(self, input_frame_count: int) -> int:
+        start = max(0, min(self.start_frame, input_frame_count))
+        return min(self.frame_count, input_frame_count - start)
+
+
 class VideoInput(ABC):
    """
    Abstract base class for video input types.
@ -21,6 +58,12 @@ class VideoInput(ABC):
        """
        pass

+    def sliced(self, start_frame: int, frame_count: int) -> "VideoInput":
+        """Return a copy of this video with a slice operation appended."""
+        new = copy.copy(self)
+        new._operations = getattr(self, '_operations', []) + [SliceOp(start_frame, frame_count)]
+        return new
+
    @abstractmethod
    def save_to(
        self,
--- a/comfy_api/latest/_input_impl/init.py
+++ b/comfy_api/latest/_input_impl/init.py
@ -1,7 +1,8 @@
 from .video_types import VideoFromFile, VideoFromComponents
+from .._input import SliceOp

 __all__ = [
-    # Implementations
    "VideoFromFile",
    "VideoFromComponents",
+    "SliceOp",
 ]
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@ -3,7 +3,7 @@ from av.container import InputContainer
 from av.subtitles.stream import SubtitleStream
 from fractions import Fraction
 from typing import Optional
-from .._input import AudioInput, VideoInput
+from .._input import AudioInput, VideoInput, VideoOp
 import av
 import io
 import json
@ -63,6 +63,8 @@ class VideoFromFile(VideoInput):
        containing the file contents.
        """
        self.__file = file
+        self._operations: list[VideoOp] = []
+        self.__materialized: Optional[VideoFromComponents] = None

    def get_stream_source(self) -> str | io.BytesIO:
        """
@ -161,6 +163,10 @@ class VideoFromFile(VideoInput):

            if frame_count == 0:
                raise ValueError(f"Could not determine frame count for file '{self.__file}'")
+
+            # Apply operations to get final frame count
+            for op in self._operations:
+                frame_count = op.compute_frame_count(frame_count)
            return frame_count

    def get_frame_rate(self) -> Fraction:
@ -239,10 +245,18 @@ class VideoFromFile(VideoInput):
        return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)

    def get_components(self) -> VideoComponents:
+        if self.__materialized is not None:
+            return self.__materialized.get_components()
+
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
        with av.open(self.__file, mode='r') as container:
-            return self.get_components_internal(container)
+            components = self.get_components_internal(container)
+            for op in self._operations:
+                components = op.apply(components)
+            self.__materialized = VideoFromComponents(components)
+            self._operations = []
+            return components
        raise ValueError(f"No video stream found in file '{self.__file}'")

    def save_to(
@ -317,14 +331,27 @@ class VideoFromComponents(VideoInput):

    def __init__(self, components: VideoComponents):
        self.__components = components
+        self._operations: list[VideoOp] = []

    def get_components(self) -> VideoComponents:
+        if self._operations:
+            components = self.__components
+            for op in self._operations:
+                components = op.apply(components)
+            self.__components = components
+            self._operations = []
        return VideoComponents(
            images=self.__components.images,
            audio=self.__components.audio,
            frame_rate=self.__components.frame_rate
        )

+    def get_frame_count(self) -> int:
+        count = int(self.__components.images.shape[0])
+        for op in self._operations:
+            count = op.compute_frame_count(count)
+        return count
+
    def save_to(
        self,
        path: str,
@ -332,6 +359,9 @@ class VideoFromComponents(VideoInput):
        codec: VideoCodec = VideoCodec.AUTO,
        metadata: Optional[dict] = None
    ):
+        # Materialize ops before saving
+        components = self.get_components()
+
        if format != VideoContainer.AUTO and format != VideoContainer.MP4:
            raise ValueError("Only MP4 format is supported for now")
        if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
@ -345,22 +375,22 @@ class VideoFromComponents(VideoInput):
                for key, value in metadata.items():
                    output.metadata[key] = json.dumps(value)

-            frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000)
+            frame_rate = Fraction(round(components.frame_rate * 1000), 1000)
            # Create a video stream
            video_stream = output.add_stream('h264', rate=frame_rate)
-            video_stream.width = self.__components.images.shape[2]
-            video_stream.height = self.__components.images.shape[1]
+            video_stream.width = components.images.shape[2]
+            video_stream.height = components.images.shape[1]
            video_stream.pix_fmt = 'yuv420p'

            # Create an audio stream
            audio_sample_rate = 1
            audio_stream: Optional[av.AudioStream] = None
-            if self.__components.audio:
-                audio_sample_rate = int(self.__components.audio['sample_rate'])
+            if components.audio:
+                audio_sample_rate = int(components.audio['sample_rate'])
                audio_stream = output.add_stream('aac', rate=audio_sample_rate)

            # Encode video
-            for i, frame in enumerate(self.__components.images):
+            for i, frame in enumerate(components.images):
                img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3)
                frame = av.VideoFrame.from_ndarray(img, format='rgb24')
                frame = frame.reformat(format='yuv420p')  # Convert to YUV420P as required by h264
@ -371,9 +401,9 @@ class VideoFromComponents(VideoInput):
            packet = video_stream.encode(None)
            output.mux(packet)

-            if audio_stream and self.__components.audio:
-                waveform = self.__components.audio['waveform']
-                waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
+            if audio_stream and components.audio:
+                waveform = components.audio['waveform']
+                waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * components.images.shape[0])]
                frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().cpu().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo')
                frame.sample_rate = audio_sample_rate
                frame.pts = 0
--- a/comfy_api/latest/_node_replace.py
+++ b/comfy_api/latest/_node_replace.py
@ -1,109 +0,0 @@
-from __future__ import annotations
-
-from typing import Any
-import app.node_replace_manager
-
-def register_node_replacement(node_replace: NodeReplace):
-    """
-    Register node replacement.
-    """
-    app.node_replace_manager.register_node_replacement(node_replace)
-
-
-class NodeReplace:
-    """
-    Defines a possible node replacement, mapping inputs and outputs of the old node to the new node.
-
-    Also supports assigning specific values to the input widgets of the new node.
-    """
-    def __init__(self,
-        new_node_id: str,
-        old_node_id: str,
-        old_widget_ids: list[str] | None=None,
-        input_mapping: list[InputMap] | None=None,
-        output_mapping: list[OutputMap] | None=None,
-    ):
-        self.new_node_id = new_node_id
-        self.old_node_id = old_node_id
-        self.old_widget_ids = old_widget_ids
-        self.input_mapping = input_mapping
-        self.output_mapping = output_mapping
-
-    def as_dict(self):
-        """
-        Create serializable representation of the node replacement.
-        """
-        return {
-            "new_node_id": self.new_node_id,
-            "old_node_id": self.old_node_id,
-            "old_widget_ids": self.old_widget_ids,
-            "input_mapping": [m.as_dict() for m in self.input_mapping] if self.input_mapping else None,
-            "output_mapping": [m.as_dict() for m in self.output_mapping] if self.output_mapping else None,
-        }
-
-
-class InputMap:
-    """
-    Map inputs of node replacement.
-
-    Use InputMap.OldId or InputMap.SetValue for mapping purposes.
-    """
-    class _Assign:
-        def __init__(self, assign_type: str):
-            self.assign_type = assign_type
-
-        def as_dict(self):
-            return {
-                "assign_type": self.assign_type,
-            }
-
-    class OldId(_Assign):
-        """
-        Connect the input of the old node with given id to new node when replacing.
-        """
-        def __init__(self, old_id: str):
-            super().__init__("old_id")
-            self.old_id = old_id
-
-        def as_dict(self):
-            return super().as_dict() | {
-                "old_id": self.old_id,
-            }
-
-    class SetValue(_Assign):
-        """
-        Use the given value for the input of the new node when replacing; assumes input is a widget.
-        """
-        def __init__(self, value: Any):
-            super().__init__("set_value")
-            self.value = value
-
-        def as_dict(self):
-            return super().as_dict() | {
-                "value": self.value,
-            }
-
-    def __init__(self, new_id: str, assign: OldId | SetValue):
-        self.new_id = new_id
-        self.assign = assign
-
-    def as_dict(self):
-        return {
-            "new_id": self.new_id,
-            "assign": self.assign.as_dict(),
-        }
-
-
-class OutputMap:
-    """
-    Map outputs of node replacement via indexes, as that's how outputs are stored.
-    """
-    def __init__(self, new_idx: int, old_idx: int):
-        self.new_idx = new_idx
-        self.old_idx = old_idx
-
-    def as_dict(self):
-        return {
-            "new_idx": self.new_idx,
-            "old_idx": self.old_idx,
-        }
--- a/comfy_api/latest/_node_replace_public.py
+++ b/comfy_api/latest/_node_replace_public.py
@ -1 +0,0 @@
-from ._node_replace import *  # noqa: F403
--- a/comfy_api/v0_0_2/init.py
+++ b/comfy_api/v0_0_2/init.py
@ -6,7 +6,7 @@ from comfy_api.latest import (
 )
 from typing import Type, TYPE_CHECKING
 from comfy_api.internal.async_to_sync import create_sync_class
-from comfy_api.latest import io, ui, IO, UI, ComfyExtension, node_replace  #noqa: F401
+from comfy_api.latest import io, ui, IO, UI, ComfyExtension  #noqa: F401


 class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest):
@ -46,5 +46,4 @@ __all__ = [
    "IO",
    "ui",
    "UI",
-    "node_replace",
 ]
--- a/comfy_extras/nodes_load_3d.py
+++ b/comfy_extras/nodes_load_3d.py
@ -24,7 +24,7 @@ class Load3D(IO.ComfyNode):
        files = [
            normalize_path(str(file_path.relative_to(base_path)))
            for file_path in input_path.rglob("*")
-            if file_path.suffix.lower() in {'.gltf', '.glb', '.obj', '.fbx', '.stl'}
+            if file_path.suffix.lower() in {'.gltf', '.glb', '.obj', '.fbx', '.stl', '.spz', '.splat', '.ply', '.ksplat'}
        ]
        return IO.Schema(
            node_id="Load3D",
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@ -159,6 +159,29 @@ class GetVideoComponents(io.ComfyNode):
        return io.NodeOutput(components.images, components.audio, float(components.frame_rate))


+class VideoSlice(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="VideoSlice",
+            display_name="Video Slice",
+            category="image/video",
+            description="Extract a range of frames from a video.",
+            inputs=[
+                io.Video.Input("video", tooltip="The video to slice."),
+                io.Int.Input("start_frame", default=0, min=0, tooltip="The frame index to start from (0-indexed)."),
+                io.Int.Input("frame_count", default=1, min=1, tooltip="Number of frames to extract."),
+            ],
+            outputs=[
+                io.Video.Output(tooltip="The sliced video."),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, video: Input.Video, start_frame: int, frame_count: int) -> io.NodeOutput:
+        return io.NodeOutput(video.sliced(start_frame, frame_count))
+
+
 class LoadVideo(io.ComfyNode):
    @classmethod
    def define_schema(cls):
@ -206,6 +229,7 @@ class VideoExtension(ComfyExtension):
            SaveVideo,
            CreateVideo,
            GetVideoComponents,
+            VideoSlice,
            LoadVideo,
        ]

--- a/server.py
+++ b/server.py
@ -40,7 +40,6 @@ from app.user_manager import UserManager
 from app.model_manager import ModelFileManager
 from app.custom_node_manager import CustomNodeManager
 from app.subgraph_manager import SubgraphManager
-from app.node_replace_manager import NodeReplaceManager
 from typing import Optional, Union
 from api_server.routes.internal.internal_routes import InternalRoutes
 from protocol import BinaryEventTypes
@ -205,7 +204,6 @@ class PromptServer():
        self.model_file_manager = ModelFileManager()
        self.custom_node_manager = CustomNodeManager()
        self.subgraph_manager = SubgraphManager()
-        self.node_replace_manager = NodeReplaceManager()
        self.internal_routes = InternalRoutes(self)
        self.supports = ["custom_nodes_from_web"]
        self.prompt_queue = execution.PromptQueue(self)
@ -994,7 +992,6 @@ class PromptServer():
        self.model_file_manager.add_routes(self.routes)
        self.custom_node_manager.add_routes(self.routes, self.app, nodes.LOADED_MODULE_DIRS.items())
        self.subgraph_manager.add_routes(self.routes, nodes.LOADED_MODULE_DIRS.items())
-        self.node_replace_manager.add_routes(self.routes)
        self.app.add_subapp('/internal', self.internal_routes.get_app())

        # Prefix every route with /api for easier matching for delegation.
--- a/tests-unit/comfy_api_test/video_slice_test.py
+++ b/tests-unit/comfy_api_test/video_slice_test.py
@ -0,0 +1,150 @@
+import pytest
+import torch
+import tempfile
+import os
+import av
+from fractions import Fraction
+from comfy_api.input_impl.video_types import (
+    VideoFromFile,
+    VideoFromComponents,
+    SliceOp,
+)
+from comfy_api.util.video_types import VideoComponents
+
+
+def create_test_video(width=4, height=4, frames=10, fps=30):
+    """Helper to create a temporary video file."""
+    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+    with av.open(tmp.name, mode="w") as container:
+        stream = container.add_stream("h264", rate=fps)
+        stream.width = width
+        stream.height = height
+        stream.pix_fmt = "yuv420p"
+
+        for i in range(frames):
+            frame_data = torch.ones(height, width, 3, dtype=torch.uint8) * (i * 25)
+            frame = av.VideoFrame.from_ndarray(frame_data.numpy(), format="rgb24")
+            frame = frame.reformat(format="yuv420p")
+            packet = stream.encode(frame)
+            container.mux(packet)
+
+        packet = stream.encode(None)
+        container.mux(packet)
+
+    return tmp.name
+
+
+@pytest.fixture
+def video_file_10_frames():
+    file_path = create_test_video(frames=10)
+    yield file_path
+    os.unlink(file_path)
+
+
+@pytest.fixture
+def video_components_10_frames():
+    images = torch.rand(10, 4, 4, 3)
+    return VideoComponents(images=images, frame_rate=Fraction(30))
+
+
+class TestSliceOp:
+    def test_apply_slices_correctly(self, video_components_10_frames):
+        op = SliceOp(start_frame=2, frame_count=3)
+        result = op.apply(video_components_10_frames)
+
+        assert result.images.shape[0] == 3
+        assert torch.equal(result.images, video_components_10_frames.images[2:5])
+
+    def test_compute_frame_count(self):
+        op = SliceOp(start_frame=2, frame_count=5)
+        assert op.compute_frame_count(10) == 5
+
+    def test_compute_frame_count_clamps(self):
+        op = SliceOp(start_frame=8, frame_count=5)
+        assert op.compute_frame_count(10) == 2
+
+
+class TestVideoSliced:
+    def test_sliced_returns_new_instance(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(2, 3)
+
+        assert video is not sliced
+        assert len(video._operations) == 0
+        assert len(sliced._operations) == 1
+
+    def test_get_components_applies_operations(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(2, 3)
+
+        components = sliced.get_components()
+
+        assert components.images.shape[0] == 3
+        assert torch.equal(components.images, video_components_10_frames.images[2:5])
+
+    def test_get_frame_count(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(2, 3)
+
+        assert sliced.get_frame_count() == 3
+
+    def test_get_duration(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(0, 3)
+
+        assert sliced.get_duration() == pytest.approx(0.1)
+
+    def test_chained_slices_compose(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(2, 6).sliced(1, 3)
+
+        components = sliced.get_components()
+
+        assert components.images.shape[0] == 3
+        assert torch.equal(components.images, video_components_10_frames.images[3:6])
+
+    def test_operations_list_is_immutable(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced1 = video.sliced(0, 5)
+        sliced2 = sliced1.sliced(1, 2)
+
+        assert len(video._operations) == 0
+        assert len(sliced1._operations) == 1
+        assert len(sliced2._operations) == 2
+
+    def test_from_file(self, video_file_10_frames):
+        video = VideoFromFile(video_file_10_frames)
+        sliced = video.sliced(2, 3)
+
+        components = sliced.get_components()
+
+        assert components.images.shape[0] == 3
+        assert sliced.get_frame_count() == 3
+
+    def test_save_sliced_video(self, video_components_10_frames, tmp_path):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(2, 3)
+
+        output_path = str(tmp_path / "sliced_output.mp4")
+        sliced.save_to(output_path)
+
+        saved_video = VideoFromFile(output_path)
+        assert saved_video.get_frame_count() == 3
+
+    def test_materialization_clears_ops(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(2, 3)
+
+        assert len(sliced._operations) == 1
+        sliced.get_components()
+        assert len(sliced._operations) == 0
+
+    def test_second_get_components_uses_cache(self, video_components_10_frames):
+        video = VideoFromComponents(video_components_10_frames)
+        sliced = video.sliced(2, 3)
+
+        first = sliced.get_components()
+        second = sliced.get_components()
+
+        assert first.images.shape == second.images.shape
+        assert torch.equal(first.images, second.images)
Author	SHA1	Message	Date
bymyself	e4f3d335dc	feat: Add VideoSlice node with lazy operations on VideoInput - Add VideoOp base class and SliceOp in _input/video_types.py - Add sliced() method to VideoInput that returns a copy with operation appended - Each subclass applies operations in get_components() and get_frame_count() - After materialization, VideoFromFile delegates to internal VideoFromComponents - Add VideoSlice node that uses video.sliced(start_frame, frame_count) - Add tests for SliceOp, sliced() behavior, and materialization	2026-01-23 20:52:15 -08:00
Omri Marom	d7f3241bf6	qwen_image: propagate attention mask. (#11966 )	2026-01-22 20:02:31 -05:00
comfyanonymous	09a2e67151	Support loading flux 2 klein checkpoints saved with SaveCheckpoint. (#12033 )	2026-01-22 18:20:48 -05:00
rattus	0fd1b78736	Reduce LTX2 VAE VRAM consumption (#12028 ) * causal_video_ae: Remove attention ResNet This attention_head_dim argument does not exist on this constructor so this is dead code. Remove as generic attention mid VAE conflicts with temporal roll. * ltx-vae: consoldate causal/non-causal code paths * ltx-vae: add cache rolling adder * ltx-vae: use cached adder for resnet * ltx-vae: Implement rolling VAE Implement a temporal rolling VAE for the LTX2 VAE. Usually when doing temporal rolling VAEs you can just chunk on time relying on causality and cache behind you as you go. The LTX VAE is however non-causal. So go whole hog and implement per layer run ahead and backpressure between the decoder layers using recursive state beween the layers. Operations are ammended with temporal_cache_state{} which they can use to hold any state then need for partial execution. Convolutions cache their inputs behind the up to N-1 frames, and skip connections need to cache the mismatch between convolution input and output that happens due to missing future (non-causal) input. Each call to run_up() processes a layer accross a range on input that may or may not be complete. It goes depth first to process as much as possible to try and digest frames to the final output ASAP. If layers run out of input due to convolution losses, they simply return without action effectively applying back-pressure to the earlier layers. As the earlier layers do more work and caller deeper, the partial states are reconciled and output continues to digest depth first as much as possible. Chunking is done using a size quota rather than a fixed frame length and any layer can initiate chunking, and multiple layers can chunk at different granulatiries. This remove the old limitation of always having to process 1 latent frame to entirety and having to hold 8 full decoded frames as the VRAM peak.	2026-01-22 16:54:18 -05:00
Terry Jia	8490eedadf	add ply & 3dgs format in 3d node (#11474 )	2026-01-22 09:46:56 -08:00