Compare commits

..

56 Commits

SHA1 Message Date
295b49c165 Doing some experimentation 2025-09-02 22:19:12 -07:00
a40c5ae341 Support predict_ratio changing with timesteps 2025-09-02 15:23:28 -07:00
953b906f63 Implement Sortblock for single cond usage 2025-09-02 00:45:59 -07:00
d4a8752c8c some exploration of sortblock as more things from paper/source code need to be added 2025-09-01 09:39:40 -07:00
cf26d3d58e More progress on Sortblock 2025-08-31 20:26:49 -07:00
f655fcc5ce Progress on scaffolding for an EasyCache style implementation of Sortblock 2025-08-31 00:59:01 -07:00
e2491f44e8 Merge branch 'attention-select' into sortblock 2025-08-30 20:04:48 -07:00
66c4eb006b Remove AttentionOverrideTest node, that's something to cook up for later 2025-08-30 15:19:36 -07:00
dd0a5093f6 Satisfy ruff 2025-08-30 14:58:30 -07:00
c092b8a4ac Remove _register_core_attention_functions, as we wouldn't want someone to call that, just in case 2025-08-30 14:49:04 -07:00
eaa9433ff8 Remove attention logging code 2025-08-30 14:45:12 -07:00
4449e14769 ComfyUI version 0.3.56 2025-08-30 06:31:19 -04:00
720d0a88e6 Disable attention logs for now 2025-08-30 01:11:34 -07:00
d9bb4530b0 Merge branch 'master' into attention-select 2025-08-29 23:35:38 -07:00
cb959f9669 Add optimized to get_attention_function 2025-08-29 21:48:36 -07:00
885015eecf Lower ram usage on windows. (#9628) 2025-08-29 23:06:04 -04:00
d553073a1e Fixed WAN 2.1 VACE transformer_options passthrough 2025-08-29 13:20:43 -07:00
af288b9946 Fixed Wan2.1 Fun Camera transformer_options passthrough 2025-08-29 13:06:37 -07:00
a86aaa4301 ComfyUI v0.3.55 2025-08-29 06:03:41 -04:00
2efb2cbc38 Update template to 0.1.70 (#9620) 2025-08-29 06:03:25 -04:00
1ae6fe14a7 Fix WanI2VCrossAttention so that it expects to receive transformer_options 2025-08-29 02:31:16 -07:00
15aa9222c4 Trim audio to video when saving video. (#9617) 2025-08-29 04:12:00 -04:00
2d13bf1c7a Made SVD work with optimized_attention_override 2025-08-28 22:45:45 -07:00
8be3edb606 Made Chroma work with optimized_attention_override 2025-08-28 22:45:31 -07:00
d644aba6bc Made Lumina work with optimized_attention_override 2025-08-28 22:00:44 -07:00
17090c56be Made AuraFlow work with optimized_attention_override 2025-08-28 21:46:56 -07:00
034d6c12e6 Made StableCascade work with optimized_attention_override 2025-08-28 21:42:08 -07:00
09c84b31a2 Made Omnigen 2 work with optimized_attention_override 2025-08-28 21:30:18 -07:00
8fe2dea297 Made CosmosVideo work with optimized_attention_override 2025-08-28 21:23:03 -07:00
4a44ed4a76 Make CosmosPredict2 work with optimized_attention_override 2025-08-28 21:18:34 -07:00
8b9b4bbb62 Made Hunyuan3D work with optimized_attention_override 2025-08-28 21:06:44 -07:00
27ebd312ae Made optimized_attention_override work with ACE Step 2025-08-28 21:03:28 -07:00
9461f30387 Made StableAudio work with optimized_attention_override 2025-08-28 20:56:56 -07:00
2cda45d1b4 Made LTX work with optimized_attention_override 2025-08-28 20:42:22 -07:00
61b5c5fc75 Made Mochi work with optimized_attention_override 2025-08-28 20:34:06 -07:00
ef894cdf08 Made HunyuanVideo work with optimized_attention_override 2025-08-28 20:26:53 -07:00
0ac5c6344f Made SD3 work with optimized_attention_override 2025-08-28 20:21:14 -07:00
1ddfb5bb14 Made wan patches_replace work with optimized_attention_override 2025-08-28 20:13:51 -07:00
4cafd58f71 Made hidream work with optimized_attention_override 2025-08-28 20:10:50 -07:00
f752715aac Make Qwen work with optimized_attention_override 2025-08-28 19:52:52 -07:00
c7bb3e2bce Support the 5B fun inpaint model. (#9614)
Use the WanFunInpaintToVideo node without the clip_vision_output.
2025-08-28 22:46:57 -04:00
48ed71caf8 Add logs to verify optimized_attention_override is passed all the way into attention function 2025-08-28 19:43:39 -07:00
a7d70e42a0 Make flux work with optimized_attention_override 2025-08-28 19:33:02 -07:00
e80a14ad50 Support wan2.2 5B fun control model. (#9611)
Use the Wan22FunControlToVideo node.
2025-08-28 22:13:07 -04:00
1f499f0794 Turn off attention logging for now, make AttentionOverrideTestNode have a dropdown with available attention (this is a test node only) 2025-08-28 18:54:22 -07:00
51a30c2ad7 Make sure wrap_attn doesn't make itself recurse infinitely, attempt to load SageAttention and FlashAttention if not enabled so that they can be marked as available or not, create registry for available attention 2025-08-28 18:53:20 -07:00
d28b39d93d Add a LatentCut node to cut latents. (#9609) 2025-08-28 19:38:28 -04:00
1c184c29eb Fix issue with s2v node when extending past audio length. (#9608) 2025-08-28 18:34:01 -04:00
edde0b5043 WanSoundImageToVideoExtend node to manually extend s2v video. (#9606) 2025-08-28 17:59:48 -04:00
669b9ef8e6 Added **kwargs to all attention functions so transformer_options could potentially be passed through 2025-08-28 13:14:41 -07:00
0063610177 ComfyUI version 0.3.54 2025-08-28 10:44:57 -04:00
ce0052c087 Fix diffsynth controlnet regression. (#9597) 2025-08-28 10:37:42 -04:00
dd21b4aa51 Made WAN attention receive transformer_options, test node added to wan to test out attention override later 2025-08-27 17:56:21 -07:00
29b7990dc2 Fix memory usage issue with inspect 2025-08-27 17:55:35 -07:00
68b00e9c60 Created logging code for this branch so that it can be used to track down all the code paths where transformer_options would need to be added 2025-08-27 17:13:33 -07:00
b58db6934c Looking into a @wrap_attn decorator to look for 'optimized_attention_override' entry in transformer_options 2025-08-27 14:18:18 -07:00
38 changed files with 1454 additions and 273 deletions
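Taken together, these commits thread a transformer_options dict from each model's forward pass down into every attention call, so that an "optimized_attention_override" entry can intercept attention (see the wrap_attn decorator in the attention.py diff at the bottom of this compare). Below is a minimal sketch of what supplying such an override might look like, based only on how wrap_attn invokes it; the name my_attention_override and the wiring shown are illustrative, not part of the branch:

# Sketch, not part of the diff: wrap_attn calls
# transformer_options["optimized_attention_override"](func, *args, **kwargs),
# so an override receives the wrapped attention function first, followed by the
# original q, k, v, heads, ... arguments of the call it intercepted.
def my_attention_override(original_attention, *args, **kwargs):
    # inspect or modify the inputs here, then fall back to the wrapped implementation
    return original_attention(*args, **kwargs)

# Assumed wiring: place the override in the transformer_options dict that the model
# forward passes down; every patched attention call in this branch routes through it.
transformer_options = {"optimized_attention_override": my_attention_override}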

View File

@ -133,6 +133,7 @@ class Attention(nn.Module):
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
transformer_options={},
**cross_attention_kwargs,
) -> torch.Tensor:
return self.processor(
@ -140,6 +141,7 @@ class Attention(nn.Module):
hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=attention_mask,
transformer_options=transformer_options,
**cross_attention_kwargs,
)
@ -366,6 +368,7 @@ class CustomerAttnProcessor2_0:
encoder_attention_mask: Optional[torch.FloatTensor] = None,
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
transformer_options={},
*args,
**kwargs,
) -> torch.Tensor:
@ -433,7 +436,7 @@ class CustomerAttnProcessor2_0:
# the output of sdp = (batch, num_heads, seq_len, head_dim)
hidden_states = optimized_attention(
query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True,
query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True, transformer_options=transformer_options,
).to(query.dtype)
# linear proj
@ -697,6 +700,7 @@ class LinearTransformerBlock(nn.Module):
rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
temb: torch.FloatTensor = None,
transformer_options={},
):
N = hidden_states.shape[0]
@ -720,6 +724,7 @@ class LinearTransformerBlock(nn.Module):
encoder_attention_mask=encoder_attention_mask,
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=rotary_freqs_cis_cross,
transformer_options=transformer_options,
)
else:
attn_output, _ = self.attn(
@ -729,6 +734,7 @@ class LinearTransformerBlock(nn.Module):
encoder_attention_mask=None,
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=None,
transformer_options=transformer_options,
)
if self.use_adaln_single:
@ -743,6 +749,7 @@ class LinearTransformerBlock(nn.Module):
encoder_attention_mask=encoder_attention_mask,
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=rotary_freqs_cis_cross,
transformer_options=transformer_options,
)
hidden_states = attn_output + hidden_states

View File

@ -314,6 +314,7 @@ class ACEStepTransformer2DModel(nn.Module):
output_length: int = 0,
block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
controlnet_scale: Union[float, torch.Tensor] = 1.0,
transformer_options={},
):
embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
temb = self.t_block(embedded_timestep)
@ -339,6 +340,7 @@ class ACEStepTransformer2DModel(nn.Module):
rotary_freqs_cis=rotary_freqs_cis,
rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
temb=temb,
transformer_options=transformer_options,
)
output = self.final_layer(hidden_states, embedded_timestep, output_length)
@ -393,6 +395,7 @@ class ACEStepTransformer2DModel(nn.Module):
output_length = hidden_states.shape[-1]
transformer_options = kwargs.get("transformer_options", {})
output = self.decode(
hidden_states=hidden_states,
attention_mask=attention_mask,
@ -402,6 +405,7 @@ class ACEStepTransformer2DModel(nn.Module):
output_length=output_length,
block_controlnet_hidden_states=block_controlnet_hidden_states,
controlnet_scale=controlnet_scale,
transformer_options=transformer_options,
)
return output

View File

@ -298,7 +298,8 @@ class Attention(nn.Module):
mask = None,
context_mask = None,
rotary_pos_emb = None,
causal = None
causal = None,
transformer_options={},
):
h, kv_h, has_context = self.num_heads, self.kv_heads, context is not None
@ -363,7 +364,7 @@ class Attention(nn.Module):
heads_per_kv_head = h // kv_h
k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v))
out = optimized_attention(q, k, v, h, skip_reshape=True)
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
out = self.to_out(out)
if mask is not None:
@ -488,7 +489,8 @@ class TransformerBlock(nn.Module):
global_cond=None,
mask = None,
context_mask = None,
rotary_pos_emb = None
rotary_pos_emb = None,
transformer_options={}
):
if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:
@ -498,12 +500,12 @@ class TransformerBlock(nn.Module):
residual = x
x = self.pre_norm(x)
x = x * (1 + scale_self) + shift_self
x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb)
x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb, transformer_options=transformer_options)
x = x * torch.sigmoid(1 - gate_self)
x = x + residual
if context is not None:
x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask)
x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask, transformer_options=transformer_options)
if self.conformer is not None:
x = x + self.conformer(x)
@ -517,10 +519,10 @@ class TransformerBlock(nn.Module):
x = x + residual
else:
x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb)
x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb, transformer_options=transformer_options)
if context is not None:
x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask)
x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask, transformer_options=transformer_options)
if self.conformer is not None:
x = x + self.conformer(x)
@ -606,7 +608,8 @@ class ContinuousTransformer(nn.Module):
return_info = False,
**kwargs
):
patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {})
transformer_options = kwargs.get("transformer_options", {})
patches_replace = transformer_options.get("patches_replace", {})
batch, seq, device = *x.shape[:2], x.device
context = kwargs["context"]
@ -645,13 +648,13 @@ class ContinuousTransformer(nn.Module):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"])
out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"], transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context)
x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options)
# x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
if return_info:

View File

@ -85,7 +85,7 @@ class SingleAttention(nn.Module):
)
#@torch.compile()
def forward(self, c):
def forward(self, c, transformer_options={}):
bsz, seqlen1, _ = c.shape
@ -95,7 +95,7 @@ class SingleAttention(nn.Module):
v = v.view(bsz, seqlen1, self.n_heads, self.head_dim)
q, k = self.q_norm1(q), self.k_norm1(k)
output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True)
output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True, transformer_options=transformer_options)
c = self.w1o(output)
return c
@ -144,7 +144,7 @@ class DoubleAttention(nn.Module):
#@torch.compile()
def forward(self, c, x):
def forward(self, c, x, transformer_options={}):
bsz, seqlen1, _ = c.shape
bsz, seqlen2, _ = x.shape
@ -168,7 +168,7 @@ class DoubleAttention(nn.Module):
torch.cat([cv, xv], dim=1),
)
output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True)
output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True, transformer_options=transformer_options)
c, x = output.split([seqlen1, seqlen2], dim=1)
c = self.w1o(c)
@ -207,7 +207,7 @@ class MMDiTBlock(nn.Module):
self.is_last = is_last
#@torch.compile()
def forward(self, c, x, global_cond, **kwargs):
def forward(self, c, x, global_cond, transformer_options={}, **kwargs):
cres, xres = c, x
@ -225,7 +225,7 @@ class MMDiTBlock(nn.Module):
x = modulate(self.normX1(x), xshift_msa, xscale_msa)
# attention
c, x = self.attn(c, x)
c, x = self.attn(c, x, transformer_options=transformer_options)
c = self.normC2(cres + cgate_msa.unsqueeze(1) * c)
@ -255,13 +255,13 @@ class DiTBlock(nn.Module):
self.mlp = MLP(dim, hidden_dim=dim * 4, dtype=dtype, device=device, operations=operations)
#@torch.compile()
def forward(self, cx, global_cond, **kwargs):
def forward(self, cx, global_cond, transformer_options={}, **kwargs):
cxres = cx
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.modCX(
global_cond
).chunk(6, dim=1)
cx = modulate(self.norm1(cx), shift_msa, scale_msa)
cx = self.attn(cx)
cx = self.attn(cx, transformer_options=transformer_options)
cx = self.norm2(cxres + gate_msa.unsqueeze(1) * cx)
mlpout = self.mlp(modulate(cx, shift_mlp, scale_mlp))
cx = gate_mlp.unsqueeze(1) * mlpout
@ -473,13 +473,14 @@ class MMDiT(nn.Module):
out = {}
out["txt"], out["img"] = layer(args["txt"],
args["img"],
args["vec"])
args["vec"],
transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond, "transformer_options": transformer_options}, {"original_block": block_wrap})
c = out["txt"]
x = out["img"]
else:
c, x = layer(c, x, global_cond, **kwargs)
c, x = layer(c, x, global_cond, transformer_options=transformer_options, **kwargs)
if len(self.single_layers) > 0:
c_len = c.size(1)
@ -488,13 +489,13 @@ class MMDiT(nn.Module):
if ("single_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = layer(args["img"], args["vec"])
out["img"] = layer(args["img"], args["vec"], transformer_options=args["transformer_options"])
return out
out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond}, {"original_block": block_wrap})
out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond, "transformer_options": transformer_options}, {"original_block": block_wrap})
cx = out["img"]
else:
cx = layer(cx, global_cond, **kwargs)
cx = layer(cx, global_cond, transformer_options=transformer_options, **kwargs)
x = cx[:, c_len:]

View File

@ -32,12 +32,12 @@ class OptimizedAttention(nn.Module):
self.out_proj = operations.Linear(c, c, bias=True, dtype=dtype, device=device)
def forward(self, q, k, v):
def forward(self, q, k, v, transformer_options={}):
q = self.to_q(q)
k = self.to_k(k)
v = self.to_v(v)
out = optimized_attention(q, k, v, self.heads)
out = optimized_attention(q, k, v, self.heads, transformer_options=transformer_options)
return self.out_proj(out)
@ -47,13 +47,13 @@ class Attention2D(nn.Module):
self.attn = OptimizedAttention(c, nhead, dtype=dtype, device=device, operations=operations)
# self.attn = nn.MultiheadAttention(c, nhead, dropout=dropout, bias=True, batch_first=True, dtype=dtype, device=device)
def forward(self, x, kv, self_attn=False):
def forward(self, x, kv, self_attn=False, transformer_options={}):
orig_shape = x.shape
x = x.view(x.size(0), x.size(1), -1).permute(0, 2, 1) # Bx4xHxW -> Bx(HxW)x4
if self_attn:
kv = torch.cat([x, kv], dim=1)
# x = self.attn(x, kv, kv, need_weights=False)[0]
x = self.attn(x, kv, kv)
x = self.attn(x, kv, kv, transformer_options=transformer_options)
x = x.permute(0, 2, 1).view(*orig_shape)
return x
@ -114,9 +114,9 @@ class AttnBlock(nn.Module):
operations.Linear(c_cond, c, dtype=dtype, device=device)
)
def forward(self, x, kv):
def forward(self, x, kv, transformer_options={}):
kv = self.kv_mapper(kv)
x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn)
x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn, transformer_options=transformer_options)
return x

View File

@ -173,7 +173,7 @@ class StageB(nn.Module):
clip = self.clip_norm(clip)
return clip
def _down_encode(self, x, r_embed, clip):
def _down_encode(self, x, r_embed, clip, transformer_options={}):
level_outputs = []
block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
for down_block, downscaler, repmap in block_group:
@ -187,7 +187,7 @@ class StageB(nn.Module):
elif isinstance(block, AttnBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
AttnBlock)):
x = block(x, clip)
x = block(x, clip, transformer_options=transformer_options)
elif isinstance(block, TimestepBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
TimestepBlock)):
@ -199,7 +199,7 @@ class StageB(nn.Module):
level_outputs.insert(0, x)
return level_outputs
def _up_decode(self, level_outputs, r_embed, clip):
def _up_decode(self, level_outputs, r_embed, clip, transformer_options={}):
x = level_outputs[0]
block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
for i, (up_block, upscaler, repmap) in enumerate(block_group):
@ -216,7 +216,7 @@ class StageB(nn.Module):
elif isinstance(block, AttnBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
AttnBlock)):
x = block(x, clip)
x = block(x, clip, transformer_options=transformer_options)
elif isinstance(block, TimestepBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
TimestepBlock)):
@ -228,7 +228,7 @@ class StageB(nn.Module):
x = upscaler(x)
return x
def forward(self, x, r, effnet, clip, pixels=None, **kwargs):
def forward(self, x, r, effnet, clip, pixels=None, transformer_options={}, **kwargs):
if pixels is None:
pixels = x.new_zeros(x.size(0), 3, 8, 8)
@ -245,8 +245,8 @@ class StageB(nn.Module):
nn.functional.interpolate(effnet, size=x.shape[-2:], mode='bilinear', align_corners=True))
x = x + nn.functional.interpolate(self.pixels_mapper(pixels), size=x.shape[-2:], mode='bilinear',
align_corners=True)
level_outputs = self._down_encode(x, r_embed, clip)
x = self._up_decode(level_outputs, r_embed, clip)
level_outputs = self._down_encode(x, r_embed, clip, transformer_options=transformer_options)
x = self._up_decode(level_outputs, r_embed, clip, transformer_options=transformer_options)
return self.clf(x)
def update_weights_ema(self, src_model, beta=0.999):

View File

@ -182,7 +182,7 @@ class StageC(nn.Module):
clip = self.clip_norm(clip)
return clip
def _down_encode(self, x, r_embed, clip, cnet=None):
def _down_encode(self, x, r_embed, clip, cnet=None, transformer_options={}):
level_outputs = []
block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
for down_block, downscaler, repmap in block_group:
@ -201,7 +201,7 @@ class StageC(nn.Module):
elif isinstance(block, AttnBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
AttnBlock)):
x = block(x, clip)
x = block(x, clip, transformer_options=transformer_options)
elif isinstance(block, TimestepBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
TimestepBlock)):
@ -213,7 +213,7 @@ class StageC(nn.Module):
level_outputs.insert(0, x)
return level_outputs
def _up_decode(self, level_outputs, r_embed, clip, cnet=None):
def _up_decode(self, level_outputs, r_embed, clip, cnet=None, transformer_options={}):
x = level_outputs[0]
block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
for i, (up_block, upscaler, repmap) in enumerate(block_group):
@ -235,7 +235,7 @@ class StageC(nn.Module):
elif isinstance(block, AttnBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
AttnBlock)):
x = block(x, clip)
x = block(x, clip, transformer_options=transformer_options)
elif isinstance(block, TimestepBlock) or (
hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
TimestepBlock)):
@ -247,7 +247,7 @@ class StageC(nn.Module):
x = upscaler(x)
return x
def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, **kwargs):
def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, transformer_options={}, **kwargs):
# Process the conditioning embeddings
r_embed = self.gen_r_embedding(r).to(dtype=x.dtype)
for c in self.t_conds:
@ -262,8 +262,8 @@ class StageC(nn.Module):
# Model Blocks
x = self.embedding(x)
level_outputs = self._down_encode(x, r_embed, clip, cnet)
x = self._up_decode(level_outputs, r_embed, clip, cnet)
level_outputs = self._down_encode(x, r_embed, clip, cnet, transformer_options=transformer_options)
x = self._up_decode(level_outputs, r_embed, clip, cnet, transformer_options=transformer_options)
return self.clf(x)
def update_weights_ema(self, src_model, beta=0.999):

View File

@ -76,7 +76,7 @@ class DoubleStreamBlock(nn.Module):
)
self.flipped_img_txt = flipped_img_txt
def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None):
def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}):
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
# prepare image for attention
@ -95,7 +95,7 @@ class DoubleStreamBlock(nn.Module):
attn = attention(torch.cat((txt_q, img_q), dim=2),
torch.cat((txt_k, img_k), dim=2),
torch.cat((txt_v, img_v), dim=2),
pe=pe, mask=attn_mask)
pe=pe, mask=attn_mask, transformer_options=transformer_options)
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
@ -148,7 +148,7 @@ class SingleStreamBlock(nn.Module):
self.mlp_act = nn.GELU(approximate="tanh")
def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor:
def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor:
mod = vec
x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
@ -157,7 +157,7 @@ class SingleStreamBlock(nn.Module):
q, k = self.norm(q, k, v)
# compute attention
attn = attention(q, k, v, pe=pe, mask=attn_mask)
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
x.addcmul_(mod.gate, output)

View File

@ -193,14 +193,16 @@ class Chroma(nn.Module):
txt=args["txt"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
attn_mask=args.get("attn_mask"),
transformer_options=args.get("transformer_options"))
return out
out = blocks_replace[("double_block", i)]({"img": img,
"txt": txt,
"vec": double_mod,
"pe": pe,
"attn_mask": attn_mask},
"attn_mask": attn_mask,
"transformer_options": transformer_options},
{"original_block": block_wrap})
txt = out["txt"]
img = out["img"]
@ -209,7 +211,8 @@ class Chroma(nn.Module):
txt=txt,
vec=double_mod,
pe=pe,
attn_mask=attn_mask)
attn_mask=attn_mask,
transformer_options=transformer_options)
if control is not None: # Controlnet
control_i = control.get("input")
@ -229,17 +232,19 @@ class Chroma(nn.Module):
out["img"] = block(args["img"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
attn_mask=args.get("attn_mask"),
transformer_options=args.get("transformer_options"))
return out
out = blocks_replace[("single_block", i)]({"img": img,
"vec": single_mod,
"pe": pe,
"attn_mask": attn_mask},
"attn_mask": attn_mask,
"transformer_options": transformer_options},
{"original_block": block_wrap})
img = out["img"]
else:
img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask)
img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options)
if control is not None: # Controlnet
control_o = control.get("output")

View File

@ -176,6 +176,7 @@ class Attention(nn.Module):
context=None,
mask=None,
rope_emb=None,
transformer_options={},
**kwargs,
):
"""
@ -184,7 +185,7 @@ class Attention(nn.Module):
context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
"""
q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs)
out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True, transformer_options=transformer_options)
del q, k, v
out = rearrange(out, " b n s c -> s b (n c)")
return self.to_out(out)
@ -546,6 +547,7 @@ class VideoAttn(nn.Module):
context: Optional[torch.Tensor] = None,
crossattn_mask: Optional[torch.Tensor] = None,
rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
transformer_options: Optional[dict] = {},
) -> torch.Tensor:
"""
Forward pass for video attention.
@ -571,6 +573,7 @@ class VideoAttn(nn.Module):
context_M_B_D,
crossattn_mask,
rope_emb=rope_emb_L_1_1_D,
transformer_options=transformer_options,
)
x_T_H_W_B_D = rearrange(x_THW_B_D, "(t h w) b d -> t h w b d", h=H, w=W)
return x_T_H_W_B_D
@ -665,6 +668,7 @@ class DITBuildingBlock(nn.Module):
crossattn_mask: Optional[torch.Tensor] = None,
rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
adaln_lora_B_3D: Optional[torch.Tensor] = None,
transformer_options: Optional[dict] = {},
) -> torch.Tensor:
"""
Forward pass for dynamically configured blocks with adaptive normalization.
@ -702,6 +706,7 @@ class DITBuildingBlock(nn.Module):
adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
context=None,
rope_emb_L_1_1_D=rope_emb_L_1_1_D,
transformer_options=transformer_options,
)
elif self.block_type in ["cross_attn", "ca"]:
x = x + gate_1_1_1_B_D * self.block(
@ -709,6 +714,7 @@ class DITBuildingBlock(nn.Module):
context=crossattn_emb,
crossattn_mask=crossattn_mask,
rope_emb_L_1_1_D=rope_emb_L_1_1_D,
transformer_options=transformer_options,
)
else:
raise ValueError(f"Unknown block type: {self.block_type}")
@ -784,6 +790,7 @@ class GeneralDITTransformerBlock(nn.Module):
crossattn_mask: Optional[torch.Tensor] = None,
rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
adaln_lora_B_3D: Optional[torch.Tensor] = None,
transformer_options: Optional[dict] = {},
) -> torch.Tensor:
for block in self.blocks:
x = block(
@ -793,5 +800,6 @@ class GeneralDITTransformerBlock(nn.Module):
crossattn_mask,
rope_emb_L_1_1_D=rope_emb_L_1_1_D,
adaln_lora_B_3D=adaln_lora_B_3D,
transformer_options=transformer_options,
)
return x

View File

@ -520,6 +520,7 @@ class GeneralDIT(nn.Module):
x.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
), f"{x.shape} != {extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape} {original_shape}"
transformer_options = kwargs.get("transformer_options", {})
for _, block in self.blocks.items():
assert (
self.blocks["block0"].x_format == block.x_format
@ -534,6 +535,7 @@ class GeneralDIT(nn.Module):
crossattn_mask,
rope_emb_L_1_1_D=rope_emb_L_1_1_D,
adaln_lora_B_3D=adaln_lora_B_3D,
transformer_options=transformer_options,
)
x_B_T_H_W_D = rearrange(x, "T H W B D -> B T H W D")

View File

@ -44,7 +44,7 @@ class GPT2FeedForward(nn.Module):
return x
def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor) -> torch.Tensor:
def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor:
"""Computes multi-head attention using PyTorch's native implementation.
This function provides a PyTorch backend alternative to Transformer Engine's attention operation.
@ -71,7 +71,7 @@ def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H
q_B_H_S_D = rearrange(q_B_S_H_D, "b ... h k -> b h ... k").view(in_q_shape[0], in_q_shape[-2], -1, in_q_shape[-1])
k_B_H_S_D = rearrange(k_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
v_B_H_S_D = rearrange(v_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, in_q_shape[-2], skip_reshape=True)
return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, in_q_shape[-2], skip_reshape=True, transformer_options=transformer_options)
class Attention(nn.Module):
@ -180,8 +180,8 @@ class Attention(nn.Module):
return q, k, v
def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
result = self.attn_op(q, k, v) # [B, S, H, D]
def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor:
result = self.attn_op(q, k, v, transformer_options=transformer_options) # [B, S, H, D]
return self.output_dropout(self.output_proj(result))
def forward(
@ -189,6 +189,7 @@ class Attention(nn.Module):
x: torch.Tensor,
context: Optional[torch.Tensor] = None,
rope_emb: Optional[torch.Tensor] = None,
transformer_options: Optional[dict] = {},
) -> torch.Tensor:
"""
Args:
@ -196,7 +197,7 @@ class Attention(nn.Module):
context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
"""
q, k, v = self.compute_qkv(x, context, rope_emb=rope_emb)
return self.compute_attention(q, k, v)
return self.compute_attention(q, k, v, transformer_options=transformer_options)
class Timesteps(nn.Module):
@ -459,6 +460,7 @@ class Block(nn.Module):
rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
extra_per_block_pos_emb: Optional[torch.Tensor] = None,
transformer_options: Optional[dict] = {},
) -> torch.Tensor:
if extra_per_block_pos_emb is not None:
x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb
@ -512,6 +514,7 @@ class Block(nn.Module):
rearrange(normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
None,
rope_emb=rope_emb_L_1_1_D,
transformer_options=transformer_options,
),
"b (t h w) d -> b t h w d",
t=T,
@ -525,6 +528,7 @@ class Block(nn.Module):
layer_norm_cross_attn: Callable,
_scale_cross_attn_B_T_1_1_D: torch.Tensor,
_shift_cross_attn_B_T_1_1_D: torch.Tensor,
transformer_options: Optional[dict] = {},
) -> torch.Tensor:
_normalized_x_B_T_H_W_D = _fn(
_x_B_T_H_W_D, layer_norm_cross_attn, _scale_cross_attn_B_T_1_1_D, _shift_cross_attn_B_T_1_1_D
@ -534,6 +538,7 @@ class Block(nn.Module):
rearrange(_normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
crossattn_emb,
rope_emb=rope_emb_L_1_1_D,
transformer_options=transformer_options,
),
"b (t h w) d -> b t h w d",
t=T,
@ -547,6 +552,7 @@ class Block(nn.Module):
self.layer_norm_cross_attn,
scale_cross_attn_B_T_1_1_D,
shift_cross_attn_B_T_1_1_D,
transformer_options=transformer_options,
)
x_B_T_H_W_D = result_B_T_H_W_D * gate_cross_attn_B_T_1_1_D + x_B_T_H_W_D
@ -865,6 +871,7 @@ class MiniTrainDIT(nn.Module):
"rope_emb_L_1_1_D": rope_emb_L_1_1_D.unsqueeze(1).unsqueeze(0),
"adaln_lora_B_T_3D": adaln_lora_B_T_3D,
"extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
"transformer_options": kwargs.get("transformer_options", {}),
}
for block in self.blocks:
x_B_T_H_W_D = block(

View File

@ -159,7 +159,7 @@ class DoubleStreamBlock(nn.Module):
)
self.flipped_img_txt = flipped_img_txt
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
img_mod1, img_mod2 = self.img_mod(vec)
txt_mod1, txt_mod2 = self.txt_mod(vec)
@ -182,7 +182,7 @@ class DoubleStreamBlock(nn.Module):
attn = attention(torch.cat((img_q, txt_q), dim=2),
torch.cat((img_k, txt_k), dim=2),
torch.cat((img_v, txt_v), dim=2),
pe=pe, mask=attn_mask)
pe=pe, mask=attn_mask, transformer_options=transformer_options)
img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
else:
@ -190,7 +190,7 @@ class DoubleStreamBlock(nn.Module):
attn = attention(torch.cat((txt_q, img_q), dim=2),
torch.cat((txt_k, img_k), dim=2),
torch.cat((txt_v, img_v), dim=2),
pe=pe, mask=attn_mask)
pe=pe, mask=attn_mask, transformer_options=transformer_options)
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
@ -244,7 +244,7 @@ class SingleStreamBlock(nn.Module):
self.mlp_act = nn.GELU(approximate="tanh")
self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor:
mod, _ = self.modulation(vec)
qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
@ -252,7 +252,7 @@ class SingleStreamBlock(nn.Module):
q, k = self.norm(q, k, v)
# compute attention
attn = attention(q, k, v, pe=pe, mask=attn_mask)
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
# compute activation in mlp stream, cat again and run second linear layer
output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
x += apply_mod(output, mod.gate, None, modulation_dims)

View File

@ -6,7 +6,7 @@ from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
q_shape = q.shape
k_shape = k.shape
@ -17,7 +17,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
heads = q.shape[1]
x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask)
x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options)
return x

View File

@ -128,6 +128,7 @@ class Flux(nn.Module):
blocks_replace = patches_replace.get("dit", {})
for i, block in enumerate(self.double_blocks):
transformer_options["block"] = ("double_block", i, 2)
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
@ -135,14 +136,16 @@ class Flux(nn.Module):
txt=args["txt"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
attn_mask=args.get("attn_mask"),
transformer_options=args.get("transformer_options"))
return out
out = blocks_replace[("double_block", i)]({"img": img,
"txt": txt,
"vec": vec,
"pe": pe,
"attn_mask": attn_mask},
"attn_mask": attn_mask,
"transformer_options": transformer_options},
{"original_block": block_wrap})
txt = out["txt"]
img = out["img"]
@ -151,7 +154,8 @@ class Flux(nn.Module):
txt=txt,
vec=vec,
pe=pe,
attn_mask=attn_mask)
attn_mask=attn_mask,
transformer_options=transformer_options)
if control is not None: # Controlnet
control_i = control.get("input")
@ -166,23 +170,26 @@ class Flux(nn.Module):
img = torch.cat((txt, img), 1)
for i, block in enumerate(self.single_blocks):
transformer_options["block"] = ("single_block", i, 1)
if ("single_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
attn_mask=args.get("attn_mask"),
transformer_options=args.get("transformer_options"))
return out
out = blocks_replace[("single_block", i)]({"img": img,
"vec": vec,
"pe": pe,
"attn_mask": attn_mask},
"attn_mask": attn_mask,
"transformer_options": transformer_options},
{"original_block": block_wrap})
img = out["img"]
else:
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options)
if control is not None: # Controlnet
control_o = control.get("output")

View File

@ -109,6 +109,7 @@ class AsymmetricAttention(nn.Module):
scale_x: torch.Tensor, # (B, dim_x), modulation for pre-RMSNorm.
scale_y: torch.Tensor, # (B, dim_y), modulation for pre-RMSNorm.
crop_y,
transformer_options={},
**rope_rotation,
) -> Tuple[torch.Tensor, torch.Tensor]:
rope_cos = rope_rotation.get("rope_cos")
@ -143,7 +144,7 @@ class AsymmetricAttention(nn.Module):
xy = optimized_attention(q,
k,
v, self.num_heads, skip_reshape=True)
v, self.num_heads, skip_reshape=True, transformer_options=transformer_options)
x, y = torch.tensor_split(xy, (q_x.shape[1],), dim=1)
x = self.proj_x(x)
@ -224,6 +225,7 @@ class AsymmetricJointBlock(nn.Module):
x: torch.Tensor,
c: torch.Tensor,
y: torch.Tensor,
transformer_options={},
**attn_kwargs,
):
"""Forward pass of a block.
@ -256,6 +258,7 @@ class AsymmetricJointBlock(nn.Module):
y,
scale_x=scale_msa_x,
scale_y=scale_msa_y,
transformer_options=transformer_options,
**attn_kwargs,
)
@ -524,10 +527,11 @@ class AsymmDiTJoint(nn.Module):
args["txt"],
rope_cos=args["rope_cos"],
rope_sin=args["rope_sin"],
crop_y=args["num_tokens"]
crop_y=args["num_tokens"],
transformer_options=args["transformer_options"]
)
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens, "transformer_options": transformer_options}, {"original_block": block_wrap})
y_feat = out["txt"]
x = out["img"]
else:
@ -538,6 +542,7 @@ class AsymmDiTJoint(nn.Module):
rope_cos=rope_cos,
rope_sin=rope_sin,
crop_y=num_tokens,
transformer_options=transformer_options,
) # (B, M, D), (B, L, D)
del y_feat # Final layers don't use dense text features.

View File

@ -72,8 +72,8 @@ class TimestepEmbed(nn.Module):
return t_emb
def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])
def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, transformer_options={}):
return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2], transformer_options=transformer_options)
class HiDreamAttnProcessor_flashattn:
@ -86,6 +86,7 @@ class HiDreamAttnProcessor_flashattn:
image_tokens_masks: Optional[torch.FloatTensor] = None,
text_tokens: Optional[torch.FloatTensor] = None,
rope: torch.FloatTensor = None,
transformer_options={},
*args,
**kwargs,
) -> torch.FloatTensor:
@ -133,7 +134,7 @@ class HiDreamAttnProcessor_flashattn:
query = torch.cat([query_1, query_2], dim=-1)
key = torch.cat([key_1, key_2], dim=-1)
hidden_states = attention(query, key, value)
hidden_states = attention(query, key, value, transformer_options=transformer_options)
if not attn.single:
hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
@ -199,6 +200,7 @@ class HiDreamAttention(nn.Module):
image_tokens_masks: torch.FloatTensor = None,
norm_text_tokens: torch.FloatTensor = None,
rope: torch.FloatTensor = None,
transformer_options={},
) -> torch.Tensor:
return self.processor(
self,
@ -206,6 +208,7 @@ class HiDreamAttention(nn.Module):
image_tokens_masks = image_tokens_masks,
text_tokens = norm_text_tokens,
rope = rope,
transformer_options=transformer_options,
)
@ -406,7 +409,7 @@ class HiDreamImageSingleTransformerBlock(nn.Module):
text_tokens: Optional[torch.FloatTensor] = None,
adaln_input: Optional[torch.FloatTensor] = None,
rope: torch.FloatTensor = None,
transformer_options={},
) -> torch.FloatTensor:
wtype = image_tokens.dtype
shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
@ -419,6 +422,7 @@ class HiDreamImageSingleTransformerBlock(nn.Module):
norm_image_tokens,
image_tokens_masks,
rope = rope,
transformer_options=transformer_options,
)
image_tokens = gate_msa_i * attn_output_i + image_tokens
@ -483,6 +487,7 @@ class HiDreamImageTransformerBlock(nn.Module):
text_tokens: Optional[torch.FloatTensor] = None,
adaln_input: Optional[torch.FloatTensor] = None,
rope: torch.FloatTensor = None,
transformer_options={},
) -> torch.FloatTensor:
wtype = image_tokens.dtype
shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
@ -500,6 +505,7 @@ class HiDreamImageTransformerBlock(nn.Module):
image_tokens_masks,
norm_text_tokens,
rope = rope,
transformer_options=transformer_options,
)
image_tokens = gate_msa_i * attn_output_i + image_tokens
@ -550,6 +556,7 @@ class HiDreamImageBlock(nn.Module):
text_tokens: Optional[torch.FloatTensor] = None,
adaln_input: torch.FloatTensor = None,
rope: torch.FloatTensor = None,
transformer_options={},
) -> torch.FloatTensor:
return self.block(
image_tokens,
@ -557,6 +564,7 @@ class HiDreamImageBlock(nn.Module):
text_tokens,
adaln_input,
rope,
transformer_options=transformer_options,
)
@ -786,6 +794,7 @@ class HiDreamImageTransformer2DModel(nn.Module):
text_tokens = cur_encoder_hidden_states,
adaln_input = adaln_input,
rope = rope,
transformer_options=transformer_options,
)
initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
block_id += 1
@ -809,6 +818,7 @@ class HiDreamImageTransformer2DModel(nn.Module):
text_tokens=None,
adaln_input=adaln_input,
rope=rope,
transformer_options=transformer_options,
)
hidden_states = hidden_states[:, :hidden_states_seq_len]
block_id += 1

View File

@ -99,14 +99,16 @@ class Hunyuan3Dv2(nn.Module):
txt=args["txt"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
attn_mask=args.get("attn_mask"),
transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": img,
"txt": txt,
"vec": vec,
"pe": pe,
"attn_mask": attn_mask},
"attn_mask": attn_mask,
"transformer_options": transformer_options},
{"original_block": block_wrap})
txt = out["txt"]
img = out["img"]
@ -115,7 +117,8 @@ class Hunyuan3Dv2(nn.Module):
txt=txt,
vec=vec,
pe=pe,
attn_mask=attn_mask)
attn_mask=attn_mask,
transformer_options=transformer_options)
img = torch.cat((txt, img), 1)
@ -126,17 +129,19 @@ class Hunyuan3Dv2(nn.Module):
out["img"] = block(args["img"],
vec=args["vec"],
pe=args["pe"],
attn_mask=args.get("attn_mask"))
attn_mask=args.get("attn_mask"),
transformer_options=args["transformer_options"])
return out
out = blocks_replace[("single_block", i)]({"img": img,
"vec": vec,
"pe": pe,
"attn_mask": attn_mask},
"attn_mask": attn_mask,
"transformer_options": transformer_options},
{"original_block": block_wrap})
img = out["img"]
else:
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options)
img = img[:, txt.shape[1]:, ...]
img = self.final_layer(img, vec)

View File

@ -78,13 +78,13 @@ class TokenRefinerBlock(nn.Module):
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
def forward(self, x, c, mask):
def forward(self, x, c, mask, transformer_options={}):
mod1, mod2 = self.adaLN_modulation(c).chunk(2, dim=1)
norm_x = self.norm1(x)
qkv = self.self_attn.qkv(norm_x)
q, k, v = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, self.heads, -1).permute(2, 0, 3, 1, 4)
attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True)
attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True, transformer_options=transformer_options)
x = x + self.self_attn.proj(attn) * mod1.unsqueeze(1)
x = x + self.mlp(self.norm2(x)) * mod2.unsqueeze(1)
@ -115,14 +115,14 @@ class IndividualTokenRefiner(nn.Module):
]
)
def forward(self, x, c, mask):
def forward(self, x, c, mask, transformer_options={}):
m = None
if mask is not None:
m = mask.view(mask.shape[0], 1, 1, mask.shape[1]).repeat(1, 1, mask.shape[1], 1)
m = m + m.transpose(2, 3)
for block in self.blocks:
x = block(x, c, m)
x = block(x, c, m, transformer_options=transformer_options)
return x
@ -150,6 +150,7 @@ class TokenRefiner(nn.Module):
x,
timesteps,
mask,
transformer_options={},
):
t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
# m = mask.float().unsqueeze(-1)
@ -158,7 +159,7 @@ class TokenRefiner(nn.Module):
c = t + self.c_embedder(c.to(x.dtype))
x = self.input_embedder(x)
x = self.individual_token_refiner(x, c, mask)
x = self.individual_token_refiner(x, c, mask, transformer_options=transformer_options)
return x
class HunyuanVideo(nn.Module):
@ -267,7 +268,7 @@ class HunyuanVideo(nn.Module):
if txt_mask is not None and not torch.is_floating_point(txt_mask):
txt_mask = (txt_mask - 1).to(img.dtype) * torch.finfo(img.dtype).max
txt = self.txt_in(txt, timesteps, txt_mask)
txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options)
ids = torch.cat((img_ids, txt_ids), dim=1)
pe = self.pe_embedder(ids)
@ -285,14 +286,14 @@ class HunyuanVideo(nn.Module):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"])
out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"], transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt, 'transformer_options': transformer_options}, {"original_block": block_wrap})
txt = out["txt"]
img = out["img"]
else:
img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt)
img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt, transformer_options=transformer_options)
if control is not None: # Controlnet
control_i = control.get("input")
@ -307,13 +308,13 @@ class HunyuanVideo(nn.Module):
if ("single_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"])
out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"], transformer_options=args["transformer_options"])
return out
out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap})
out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims, 'transformer_options': transformer_options}, {"original_block": block_wrap})
img = out["img"]
else:
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)
img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims, transformer_options=transformer_options)
if control is not None: # Controlnet
control_o = control.get("output")

View File

@ -271,7 +271,7 @@ class CrossAttention(nn.Module):
self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout))
def forward(self, x, context=None, mask=None, pe=None):
def forward(self, x, context=None, mask=None, pe=None, transformer_options={}):
q = self.to_q(x)
context = x if context is None else context
k = self.to_k(context)
@ -285,9 +285,9 @@ class CrossAttention(nn.Module):
k = apply_rotary_emb(k, pe)
if mask is None:
out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision)
out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
else:
out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision)
out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
return self.to_out(out)
@ -303,12 +303,12 @@ class BasicTransformerBlock(nn.Module):
self.scale_shift_table = nn.Parameter(torch.empty(6, dim, device=device, dtype=dtype))
def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None):
def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None, transformer_options={}):
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2)
x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe) * gate_msa
x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe, transformer_options=transformer_options) * gate_msa
x += self.attn2(x, context=context, mask=attention_mask)
x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options)
y = comfy.ldm.common_dit.rms_norm(x) * (1 + scale_mlp) + shift_mlp
x += self.ff(y) * gate_mlp
@ -479,10 +479,10 @@ class LTXVModel(torch.nn.Module):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], context=args["txt"], attention_mask=args["attention_mask"], timestep=args["vec"], pe=args["pe"])
out["img"] = block(args["img"], context=args["txt"], attention_mask=args["attention_mask"], timestep=args["vec"], pe=args["pe"], transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "attention_mask": attention_mask, "vec": timestep, "pe": pe}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "attention_mask": attention_mask, "vec": timestep, "pe": pe, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
x = block(
@ -490,7 +490,8 @@ class LTXVModel(torch.nn.Module):
context=context,
attention_mask=attention_mask,
timestep=timestep,
pe=pe
pe=pe,
transformer_options=transformer_options,
)
# 3. Output

View File

@ -104,6 +104,7 @@ class JointAttention(nn.Module):
x: torch.Tensor,
x_mask: torch.Tensor,
freqs_cis: torch.Tensor,
transformer_options={},
) -> torch.Tensor:
"""
@ -140,7 +141,7 @@ class JointAttention(nn.Module):
if n_rep >= 1:
xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True)
output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True, transformer_options=transformer_options)
return self.out(output)
@ -268,6 +269,7 @@ class JointTransformerBlock(nn.Module):
x_mask: torch.Tensor,
freqs_cis: torch.Tensor,
adaln_input: Optional[torch.Tensor]=None,
transformer_options={},
):
"""
Perform a forward pass through the TransformerBlock.
@ -290,6 +292,7 @@ class JointTransformerBlock(nn.Module):
modulate(self.attention_norm1(x), scale_msa),
x_mask,
freqs_cis,
transformer_options=transformer_options,
)
)
x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
@ -304,6 +307,7 @@ class JointTransformerBlock(nn.Module):
self.attention_norm1(x),
x_mask,
freqs_cis,
transformer_options=transformer_options,
)
)
x = x + self.ffn_norm2(
@ -494,7 +498,7 @@ class NextDiT(nn.Module):
return imgs
def patchify_and_embed(
self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens
self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options={}
) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
bsz = len(x)
pH = pW = self.patch_size
@ -554,7 +558,7 @@ class NextDiT(nn.Module):
# refine context
for layer in self.context_refiner:
cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis)
cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options)
# refine image
flat_x = []
@ -573,7 +577,7 @@ class NextDiT(nn.Module):
padded_img_embed = self.x_embedder(padded_img_embed)
padded_img_mask = padded_img_mask.unsqueeze(1)
for layer in self.noise_refiner:
padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t)
padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t, transformer_options=transformer_options)
if cap_mask is not None:
mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device)
@ -616,12 +620,13 @@ class NextDiT(nn.Module):
cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute
transformer_options = kwargs.get("transformer_options", {})
x_is_tensor = isinstance(x, torch.Tensor)
x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens)
x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
freqs_cis = freqs_cis.to(x.device)
for layer in self.layers:
x = layer(x, mask, freqs_cis, adaln_input)
x = layer(x, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
x = self.final_layer(x, adaln_input)
x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]

View File

@ -5,8 +5,9 @@ import torch
import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange, repeat
from typing import Optional
from typing import Optional, Any, Callable, Union
import logging
import functools
from .diffusionmodules.util import AlphaBlender, timestep_embedding
from .sub_quadratic_attention import efficient_dot_product_attention
@ -17,23 +18,45 @@ if model_management.xformers_enabled():
import xformers
import xformers.ops
-if model_management.sage_attention_enabled():
-    try:
-        from sageattention import sageattn
-    except ModuleNotFoundError as e:
+SAGE_ATTENTION_IS_AVAILABLE = False
+try:
+    from sageattention import sageattn
+    SAGE_ATTENTION_IS_AVAILABLE = True
+except ModuleNotFoundError as e:
+    if model_management.sage_attention_enabled():
         if e.name == "sageattention":
             logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention")
         else:
             raise e
         exit(-1)

-if model_management.flash_attention_enabled():
-    try:
-        from flash_attn import flash_attn_func
-    except ModuleNotFoundError:
+FLASH_ATTENTION_IS_AVAILABLE = False
+try:
+    from flash_attn import flash_attn_func
+    FLASH_ATTENTION_IS_AVAILABLE = True
+except ModuleNotFoundError:
+    if model_management.flash_attention_enabled():
         logging.error(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn")
         exit(-1)
REGISTERED_ATTENTION_FUNCTIONS = {}
def register_attention_function(name: str, func: Callable):
# avoid replacing existing functions
if name not in REGISTERED_ATTENTION_FUNCTIONS:
REGISTERED_ATTENTION_FUNCTIONS[name] = func
else:
logging.warning(f"Attention function {name} already registered, skipping registration.")
def get_attention_function(name: str, default: Any=...) -> Union[Callable, None]:
if name == "optimized":
return optimized_attention
elif name not in REGISTERED_ATTENTION_FUNCTIONS:
if default is ...:
raise KeyError(f"Attention function {name} not found.")
else:
return default
return REGISTERED_ATTENTION_FUNCTIONS[name]
from comfy.cli_args import args
import comfy.ops
ops = comfy.ops.disable_weight_init
@ -91,7 +114,27 @@ class FeedForward(nn.Module):
def Normalize(in_channels, dtype=None, device=None):
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)
def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
def wrap_attn(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
remove_attn_wrapper_key = False
try:
if "_inside_attn_wrapper" not in kwargs:
transformer_options = kwargs.get("transformer_options", None)
remove_attn_wrapper_key = True
kwargs["_inside_attn_wrapper"] = True
if transformer_options is not None:
if "optimized_attention_override" in transformer_options:
return transformer_options["optimized_attention_override"](func, *args, **kwargs)
return func(*args, **kwargs)
finally:
if remove_attn_wrapper_key:
del kwargs["_inside_attn_wrapper"]
return wrapper
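For reference, the hook that wrap_attn dispatches to is a plain callable stored under transformer_options["optimized_attention_override"]; it receives the undecorated attention function followed by the original arguments. A minimal, hypothetical sketch of such an override (not part of this changeset):

def my_attention_override(func, *args, **kwargs):
    # args/kwargs are exactly what the caller passed (q, k, v, heads, mask, transformer_options, ...);
    # the _inside_attn_wrapper flag added by wrap_attn prevents this hook from re-entering itself.
    return func(*args, **kwargs)

# hypothetical attachment point, e.g. from a custom node:
# model.model_options["transformer_options"]["optimized_attention_override"] = my_attention_override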
@wrap_attn
def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
attn_precision = get_attn_precision(attn_precision, q.dtype)
if skip_reshape:
@ -159,8 +202,8 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
)
return out
def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
@wrap_attn
def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
attn_precision = get_attn_precision(attn_precision, query.dtype)
if skip_reshape:
@ -230,7 +273,8 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
hidden_states = hidden_states.unflatten(0, (-1, heads)).transpose(1,2).flatten(start_dim=2)
return hidden_states
def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
@wrap_attn
def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
attn_precision = get_attn_precision(attn_precision, q.dtype)
if skip_reshape:
@ -359,7 +403,8 @@ try:
except:
pass
def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
@wrap_attn
def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
b = q.shape[0]
dim_head = q.shape[-1]
# check to make sure xformers isn't broken
@ -374,7 +419,7 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
disabled_xformers = True
if disabled_xformers:
return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape)
return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape, **kwargs)
if skip_reshape:
# b h k d -> b k h d
@ -427,8 +472,8 @@ else:
#TODO: other GPUs ?
SDP_BATCH_LIMIT = 2**31
def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
@wrap_attn
def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
if skip_reshape:
b, _, _, dim_head = q.shape
else:
@ -470,8 +515,8 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head)
return out
def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
@wrap_attn
def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
if skip_reshape:
b, _, _, dim_head = q.shape
tensor_layout = "HND"
@ -501,7 +546,7 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
lambda t: t.transpose(1, 2),
(q, k, v),
)
return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=True, skip_output_reshape=skip_output_reshape)
return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=True, skip_output_reshape=skip_output_reshape, **kwargs)
if tensor_layout == "HND":
if not skip_output_reshape:
@ -534,8 +579,8 @@ except AttributeError as error:
dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
assert False, f"Could not define flash_attn_wrapper: {FLASH_ATTN_ERROR}"
def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
@wrap_attn
def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
if skip_reshape:
b, _, _, dim_head = q.shape
else:
@ -597,6 +642,19 @@ else:
optimized_attention_masked = optimized_attention
# register core-supported attention functions
if SAGE_ATTENTION_IS_AVAILABLE:
register_attention_function("sage", attention_sage)
if FLASH_ATTENTION_IS_AVAILABLE:
register_attention_function("flash", attention_flash)
if model_management.xformers_enabled():
register_attention_function("xformers", attention_xformers)
register_attention_function("pytorch", attention_pytorch)
register_attention_function("sub_quad", attention_sub_quad)
register_attention_function("split", attention_split)
def optimized_attention_for_device(device, mask=False, small_input=False):
if small_input:
if model_management.pytorch_attention_enabled():
@ -629,7 +687,7 @@ class CrossAttention(nn.Module):
self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout))
def forward(self, x, context=None, value=None, mask=None):
def forward(self, x, context=None, value=None, mask=None, transformer_options={}):
q = self.to_q(x)
context = default(context, x)
k = self.to_k(context)
@ -640,9 +698,9 @@ class CrossAttention(nn.Module):
v = self.to_v(context)
if mask is None:
out = optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision)
out = optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
else:
out = optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision)
out = optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
return self.to_out(out)
@ -746,7 +804,7 @@ class BasicTransformerBlock(nn.Module):
n = attn1_replace_patch[block_attn1](n, context_attn1, value_attn1, extra_options)
n = self.attn1.to_out(n)
else:
n = self.attn1(n, context=context_attn1, value=value_attn1)
n = self.attn1(n, context=context_attn1, value=value_attn1, transformer_options=transformer_options)
if "attn1_output_patch" in transformer_patches:
patch = transformer_patches["attn1_output_patch"]
@ -786,7 +844,7 @@ class BasicTransformerBlock(nn.Module):
n = attn2_replace_patch[block_attn2](n, context_attn2, value_attn2, extra_options)
n = self.attn2.to_out(n)
else:
n = self.attn2(n, context=context_attn2, value=value_attn2)
n = self.attn2(n, context=context_attn2, value=value_attn2, transformer_options=transformer_options)
if "attn2_output_patch" in transformer_patches:
patch = transformer_patches["attn2_output_patch"]
@ -1017,7 +1075,7 @@ class SpatialVideoTransformer(SpatialTransformer):
B, S, C = x_mix.shape
x_mix = rearrange(x_mix, "(b t) s c -> (b s) t c", t=timesteps)
x_mix = mix_block(x_mix, context=time_context) #TODO: transformer_options
x_mix = mix_block(x_mix, context=time_context, transformer_options=transformer_options)
x_mix = rearrange(
x_mix, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps
)

View File

@ -606,7 +606,7 @@ def block_mixing(*args, use_checkpoint=True, **kwargs):
return _block_mixing(*args, **kwargs)
def _block_mixing(context, x, context_block, x_block, c):
def _block_mixing(context, x, context_block, x_block, c, transformer_options={}):
context_qkv, context_intermediates = context_block.pre_attention(context, c)
if x_block.x_block_self_attn:
@ -622,6 +622,7 @@ def _block_mixing(context, x, context_block, x_block, c):
attn = optimized_attention(
qkv[0], qkv[1], qkv[2],
heads=x_block.attn.num_heads,
transformer_options=transformer_options,
)
context_attn, x_attn = (
attn[:, : context_qkv[0].shape[1]],
@ -637,6 +638,7 @@ def _block_mixing(context, x, context_block, x_block, c):
attn2 = optimized_attention(
x_qkv2[0], x_qkv2[1], x_qkv2[2],
heads=x_block.attn2.num_heads,
transformer_options=transformer_options,
)
x = x_block.post_attention_x(x_attn, attn2, *x_intermediates)
else:
@ -958,10 +960,10 @@ class MMDiT(nn.Module):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["txt"], out["img"] = self.joint_blocks[i](args["txt"], args["img"], c=args["vec"])
out["txt"], out["img"] = self.joint_blocks[i](args["txt"], args["img"], c=args["vec"], transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": c_mod}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": c_mod, "transformer_options": transformer_options}, {"original_block": block_wrap})
context = out["txt"]
x = out["img"]
else:
@ -970,6 +972,7 @@ class MMDiT(nn.Module):
x,
c=c_mod,
use_checkpoint=self.use_checkpoint,
transformer_options=transformer_options,
)
if control is not None:
control_o = control.get("output")

View File

@ -120,7 +120,7 @@ class Attention(nn.Module):
nn.Dropout(0.0)
)
def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, image_rotary_emb: Optional[torch.Tensor] = None) -> torch.Tensor:
def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, image_rotary_emb: Optional[torch.Tensor] = None, transformer_options={}) -> torch.Tensor:
batch_size, sequence_length, _ = hidden_states.shape
query = self.to_q(hidden_states)
@ -146,7 +146,7 @@ class Attention(nn.Module):
key = key.repeat_interleave(self.heads // self.kv_heads, dim=1)
value = value.repeat_interleave(self.heads // self.kv_heads, dim=1)
hidden_states = optimized_attention_masked(query, key, value, self.heads, attention_mask, skip_reshape=True)
hidden_states = optimized_attention_masked(query, key, value, self.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options)
hidden_states = self.to_out[0](hidden_states)
return hidden_states
@ -182,16 +182,16 @@ class OmniGen2TransformerBlock(nn.Module):
self.norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
self.ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, image_rotary_emb: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor:
def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, image_rotary_emb: torch.Tensor, temb: Optional[torch.Tensor] = None, transformer_options={}) -> torch.Tensor:
if self.modulation:
norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb)
attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb, transformer_options=transformer_options)
hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
else:
norm_hidden_states = self.norm1(hidden_states)
attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb)
attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb, transformer_options=transformer_options)
hidden_states = hidden_states + self.norm2(attn_output)
mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
hidden_states = hidden_states + self.ffn_norm2(mlp_output)
@ -390,7 +390,7 @@ class OmniGen2Transformer2DModel(nn.Module):
ref_img_sizes, img_sizes,
)
def img_patch_embed_and_refine(self, hidden_states, ref_image_hidden_states, padded_img_mask, padded_ref_img_mask, noise_rotary_emb, ref_img_rotary_emb, l_effective_ref_img_len, l_effective_img_len, temb):
def img_patch_embed_and_refine(self, hidden_states, ref_image_hidden_states, padded_img_mask, padded_ref_img_mask, noise_rotary_emb, ref_img_rotary_emb, l_effective_ref_img_len, l_effective_img_len, temb, transformer_options={}):
batch_size = len(hidden_states)
hidden_states = self.x_embedder(hidden_states)
@ -405,17 +405,17 @@ class OmniGen2Transformer2DModel(nn.Module):
shift += ref_img_len
for layer in self.noise_refiner:
hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb)
hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb, transformer_options=transformer_options)
if ref_image_hidden_states is not None:
for layer in self.ref_image_refiner:
ref_image_hidden_states = layer(ref_image_hidden_states, padded_ref_img_mask, ref_img_rotary_emb, temb)
ref_image_hidden_states = layer(ref_image_hidden_states, padded_ref_img_mask, ref_img_rotary_emb, temb, transformer_options=transformer_options)
hidden_states = torch.cat([ref_image_hidden_states, hidden_states], dim=1)
return hidden_states
def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, **kwargs):
def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs):
B, C, H, W = x.shape
hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
_, _, H_padded, W_padded = hidden_states.shape
@ -444,7 +444,7 @@ class OmniGen2Transformer2DModel(nn.Module):
)
for layer in self.context_refiner:
text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)
text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb, transformer_options=transformer_options)
img_len = hidden_states.shape[1]
combined_img_hidden_states = self.img_patch_embed_and_refine(
@ -453,13 +453,14 @@ class OmniGen2Transformer2DModel(nn.Module):
noise_rotary_emb, ref_img_rotary_emb,
l_effective_ref_img_len, l_effective_img_len,
temb,
transformer_options=transformer_options,
)
hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1)
attention_mask = None
for layer in self.layers:
hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb, transformer_options=transformer_options)
hidden_states = self.norm_out(hidden_states, temb)

View File

@ -132,6 +132,7 @@ class Attention(nn.Module):
encoder_hidden_states_mask: torch.FloatTensor = None,
attention_mask: Optional[torch.FloatTensor] = None,
image_rotary_emb: Optional[torch.Tensor] = None,
transformer_options={},
) -> Tuple[torch.Tensor, torch.Tensor]:
seq_txt = encoder_hidden_states.shape[1]
@ -159,7 +160,7 @@ class Attention(nn.Module):
joint_key = joint_key.flatten(start_dim=2)
joint_value = joint_value.flatten(start_dim=2)
joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask)
joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask, transformer_options=transformer_options)
txt_attn_output = joint_hidden_states[:, :seq_txt, :]
img_attn_output = joint_hidden_states[:, seq_txt:, :]
@ -226,6 +227,7 @@ class QwenImageTransformerBlock(nn.Module):
encoder_hidden_states_mask: torch.Tensor,
temb: torch.Tensor,
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
transformer_options={},
) -> Tuple[torch.Tensor, torch.Tensor]:
img_mod_params = self.img_mod(temb)
txt_mod_params = self.txt_mod(temb)
@ -242,6 +244,7 @@ class QwenImageTransformerBlock(nn.Module):
encoder_hidden_states=txt_modulated,
encoder_hidden_states_mask=encoder_hidden_states_mask,
image_rotary_emb=image_rotary_emb,
transformer_options=transformer_options,
)
hidden_states = hidden_states + img_gate1 * img_attn_output
@ -434,9 +437,9 @@ class QwenImageTransformer2DModel(nn.Module):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"])
out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"], transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
hidden_states = out["img"]
encoder_hidden_states = out["txt"]
else:
@ -446,11 +449,12 @@ class QwenImageTransformer2DModel(nn.Module):
encoder_hidden_states_mask=encoder_hidden_states_mask,
temb=temb,
image_rotary_emb=image_rotary_emb,
transformer_options=transformer_options,
)
if "double_block" in patches:
for p in patches["double_block"]:
out = p({"img": hidden_states, "txt": encoder_hidden_states, "x": x, "block_index": i})
out = p({"img": hidden_states, "txt": encoder_hidden_states, "x": x, "block_index": i, "transformer_options": transformer_options})
hidden_states = out["img"]
encoder_hidden_states = out["txt"]

View File

@ -52,7 +52,7 @@ class WanSelfAttention(nn.Module):
self.norm_q = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
self.norm_k = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
def forward(self, x, freqs):
def forward(self, x, freqs, transformer_options={}):
r"""
Args:
x(Tensor): Shape [B, L, num_heads, C / num_heads]
@ -75,6 +75,7 @@ class WanSelfAttention(nn.Module):
k.view(b, s, n * d),
v,
heads=self.num_heads,
transformer_options=transformer_options,
)
x = self.o(x)
@ -83,7 +84,7 @@ class WanSelfAttention(nn.Module):
class WanT2VCrossAttention(WanSelfAttention):
def forward(self, x, context, **kwargs):
def forward(self, x, context, transformer_options={}, **kwargs):
r"""
Args:
x(Tensor): Shape [B, L1, C]
@ -95,7 +96,7 @@ class WanT2VCrossAttention(WanSelfAttention):
v = self.v(context)
# compute attention
x = optimized_attention(q, k, v, heads=self.num_heads)
x = optimized_attention(q, k, v, heads=self.num_heads, transformer_options=transformer_options)
x = self.o(x)
return x
@ -116,7 +117,7 @@ class WanI2VCrossAttention(WanSelfAttention):
# self.alpha = nn.Parameter(torch.zeros((1, )))
self.norm_k_img = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
def forward(self, x, context, context_img_len):
def forward(self, x, context, context_img_len, transformer_options={}):
r"""
Args:
x(Tensor): Shape [B, L1, C]
@ -131,9 +132,9 @@ class WanI2VCrossAttention(WanSelfAttention):
v = self.v(context)
k_img = self.norm_k_img(self.k_img(context_img))
v_img = self.v_img(context_img)
img_x = optimized_attention(q, k_img, v_img, heads=self.num_heads)
img_x = optimized_attention(q, k_img, v_img, heads=self.num_heads, transformer_options=transformer_options)
# compute attention
x = optimized_attention(q, k, v, heads=self.num_heads)
x = optimized_attention(q, k, v, heads=self.num_heads, transformer_options=transformer_options)
# output
x = x + img_x
@ -206,6 +207,7 @@ class WanAttentionBlock(nn.Module):
freqs,
context,
context_img_len=257,
transformer_options={},
):
r"""
Args:
@ -224,12 +226,12 @@ class WanAttentionBlock(nn.Module):
# self-attention
y = self.self_attn(
torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)),
freqs)
freqs, transformer_options=transformer_options)
x = torch.addcmul(x, y, repeat_e(e[2], x))
# cross-attention & ffn
x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len)
x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options)
y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x)))
x = torch.addcmul(x, y, repeat_e(e[5], x))
return x
@ -559,12 +561,12 @@ class WanModel(torch.nn.Module):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
# head
x = self.head(x, e)
@ -742,17 +744,17 @@ class VaceWanModel(WanModel):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
ii = self.vace_layers_mapping.get(i, None)
if ii is not None:
for iii in range(len(c)):
c_skip, c[iii] = self.vace_blocks[ii](c[iii], x=x_orig, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
c_skip, c[iii] = self.vace_blocks[ii](c[iii], x=x_orig, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
x += c_skip * vace_strength[iii]
del c_skip
# head
@ -841,12 +843,12 @@ class CameraWanModel(WanModel):
if ("double_block", i) in blocks_replace:
def block_wrap(args):
out = {}
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
return out
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
# head
x = self.head(x, e)

View File

@ -1110,9 +1110,10 @@ class WAN21(BaseModel):
shape_image[1] = extra_channels
image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
else:
latent_dim = self.latent_format.latent_channels
image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
for i in range(0, image.shape[1], 16):
image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16])
for i in range(0, image.shape[1], latent_dim):
image[:, i: i + latent_dim] = self.process_latent_in(image[:, i: i + latent_dim])
image = utils.resize_to_batch_size(image, noise.shape[0])
if extra_channels != image.shape[1] + 4:
@ -1245,18 +1246,14 @@ class WAN22_S2V(WAN21):
out['reference_motion'] = reference_motion.shape
return out
class WAN22(BaseModel):
class WAN22(WAN21):
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
self.image_to_video = image_to_video
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
denoise_mask = kwargs.get("denoise_mask", None)
if denoise_mask is not None:
out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
return out

View File

@ -8,6 +8,7 @@ import av
import io
import json
import numpy as np
import math
import torch
from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
@ -282,8 +283,6 @@ class VideoFromComponents(VideoInput):
if self.__components.audio:
audio_sample_rate = int(self.__components.audio['sample_rate'])
audio_stream = output.add_stream('aac', rate=audio_sample_rate)
audio_stream.sample_rate = audio_sample_rate
audio_stream.format = 'fltp'
# Encode video
for i, frame in enumerate(self.__components.images):
@ -298,27 +297,12 @@ class VideoFromComponents(VideoInput):
output.mux(packet)
if audio_stream and self.__components.audio:
# Encode audio
samples_per_frame = int(audio_sample_rate / frame_rate)
num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame
for i in range(num_frames):
start = i * samples_per_frame
end = start + samples_per_frame
# TODO(Feature) - Add support for stereo audio
chunk = (
self.__components.audio["waveform"][0, 0, start:end]
.unsqueeze(0)
.contiguous()
.numpy()
)
audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono')
audio_frame.sample_rate = audio_sample_rate
audio_frame.pts = i * samples_per_frame
for packet in audio_stream.encode(audio_frame):
output.mux(packet)
# Flush audio
for packet in audio_stream.encode(None):
output.mux(packet)
waveform = self.__components.audio['waveform']
waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo')
frame.sample_rate = audio_sample_rate
frame.pts = 0
output.mux(audio_stream.encode(frame))
# Flush encoder
output.mux(audio_stream.encode(None))
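The replacement path trims the waveform to ceil(sample_rate / frame_rate * num_frames) samples and encodes it as a single AudioFrame, so the audio track can never run past the video. For instance, at 44100 Hz and 24 fps with 48 frames this keeps ceil(44100 / 24 * 48) = 88200 samples, i.e. exactly 2.0 s of audio for 2.0 s of video.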

View File

@ -0,0 +1,503 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from comfy_api.latest import io, ComfyExtension
import comfy.patcher_extension
import logging
import torch
import math
import comfy.model_patcher
if TYPE_CHECKING:
from uuid import UUID
def easysortblock_predict_noise_wrapper(executor, *args, **kwargs):
# get values from args
x: torch.Tensor = args[0]
timestep: float = args[1]
model_options: dict[str] = args[2]
easycache: EasySortblockHolder = model_options["transformer_options"]["easycache"]
# initialize predict_ratios
if easycache.initial_step:
sample_sigmas = model_options["transformer_options"]["sample_sigmas"]
relevant_sigmas = []
for i,sigma in enumerate(sample_sigmas):
if easycache.check_if_within_timesteps(sigma):
relevant_sigmas.append((i, sigma))
start_index = relevant_sigmas[0][0]
end_index = relevant_sigmas[-1][0]
easycache.predict_ratios = torch.linspace(easycache.start_predict_ratio, easycache.end_predict_ratio, end_index - start_index + 1)
easycache.predict_start_index = start_index
easycache.skip_current_step = False
if easycache.is_past_end_timestep(timestep):
return executor(*args, **kwargs)
# prepare next x_prev
next_x_prev = x
input_change = None
do_easycache = easycache.should_do_easycache(timestep)
if do_easycache:
easycache.check_metadata(x)
if easycache.has_x_prev_subsampled():

input_change = (easycache.subsample(x, clone=False) - easycache.x_prev_subsampled).flatten().abs().mean()
if easycache.has_output_prev_norm() and easycache.has_relative_transformation_rate():
approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm
easycache.cumulative_change_rate += approx_output_change_rate
if easycache.cumulative_change_rate < easycache.reuse_threshold:
if easycache.verbose:
logging.info(f"EasySortblock [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
# other conds should also skip this step
easycache.skip_current_step = True
easycache.steps_skipped.append(easycache.step_count)
else:
if easycache.verbose:
logging.info(f"EasySortblock [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
easycache.cumulative_change_rate = 0.0
output: torch.Tensor = executor(*args, **kwargs)
if easycache.has_output_prev_norm():
output_change = (easycache.subsample(output, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean()
if easycache.verbose:
output_change_rate = output_change / easycache.output_prev_norm
easycache.output_change_rates.append(output_change_rate.item())
if easycache.has_relative_transformation_rate():
approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm
easycache.approx_output_change_rates.append(approx_output_change_rate.item())
if easycache.verbose:
logging.info(f"EasySortblock [verbose] - approx_output_change_rate: {approx_output_change_rate}")
if input_change is not None:
easycache.relative_transformation_rate = output_change / input_change
if easycache.verbose:
logging.info(f"EasySortblock [verbose] - output_change_rate: {output_change_rate}")
easycache.x_prev_subsampled = easycache.subsample(next_x_prev)
easycache.output_prev_subsampled = easycache.subsample(output)
easycache.output_prev_norm = output.flatten().abs().mean()
if easycache.verbose:
logging.info(f"EasySortblock [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}")
# increment step count
easycache.step_count += 1
easycache.initial_step = False
return output
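Restating the decision rule in the wrapper above: the per-step input change, scaled by the last observed input-to-output transformation rate and normalized by the previous output norm, is accumulated, and the step is skipped while that running estimate stays below reuse_threshold. A compact sketch using the holder's own names (a restatement, not additional behavior):

approx_rate = easycache.relative_transformation_rate * input_change / easycache.output_prev_norm
easycache.cumulative_change_rate += approx_rate
skip_step = easycache.cumulative_change_rate < easycache.reuse_threshold
# on a non-skipped step the accumulator resets to 0.0 and the rate is re-estimated
# as relative_transformation_rate = output_change / input_change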
def easysortblock_outer_sample_wrapper(executor, *args, **kwargs):
"""
This OUTER_SAMPLE wrapper makes sure EasySortblock is prepped for current run, and all memory usage is cleared at the end.
"""
try:
guider = executor.class_obj
orig_model_options = guider.model_options
guider.model_options = comfy.model_patcher.create_model_options_clone(orig_model_options)
# clone and prepare timesteps
guider.model_options["transformer_options"]["easycache"] = guider.model_options["transformer_options"]["easycache"].clone().prepare_timesteps(guider.model_patcher.model.model_sampling)
easycache: EasySortblockHolder = guider.model_options['transformer_options']['easycache']
logging.info(f"{easycache.name} enabled - threshold: {easycache.reuse_threshold}, start_percent: {easycache.start_percent}, end_percent: {easycache.end_percent}")
return executor(*args, **kwargs)
finally:
easycache = guider.model_options['transformer_options']['easycache']
output_change_rates = easycache.output_change_rates
approx_output_change_rates = easycache.approx_output_change_rates
if easycache.verbose:
logging.info(f"{easycache.name} [verbose] - output_change_rates {len(output_change_rates)}: {output_change_rates}")
logging.info(f"{easycache.name} [verbose] - approx_output_change_rates {len(approx_output_change_rates)}: {approx_output_change_rates}")
total_steps = len(args[3])-1
logging.info(f"{easycache.name} - skipped {len(easycache.steps_skipped)}/{total_steps} steps")# ({total_steps/(total_steps-easycache.total_steps_skipped):.2f}x speedup).")
logging.info(f"{easycache.name} - skipped steps: {easycache.steps_skipped}")
easycache.reset()
guider.model_options = orig_model_options
def model_forward_wrapper(executor, *args, **kwargs):
# TODO: make work with batches of conds
transformer_options: dict[str] = args[-1]
if not isinstance(transformer_options, dict):
transformer_options = kwargs.get("transformer_options")
if not transformer_options:
transformer_options = args[-2]
sigmas = transformer_options["sigmas"]
sb_holder: EasySortblockHolder = transformer_options["easycache"]
# if initial step, prepare everything for Sortblock
if sb_holder.initial_step:
logging.info(f"EasySortblock: inside model {executor.class_obj.__class__.__name__}")
# TODO: generalize for other models
# these won't stick around past this step; should store on sb_holder instead
logging.info(f"EasySortblock: preparing {len(executor.class_obj.double_blocks)} double blocks and {len(executor.class_obj.single_blocks)} single blocks")
if hasattr(executor.class_obj, "double_blocks"):
for block in executor.class_obj.double_blocks:
prepare_block(block, sb_holder)
if hasattr(executor.class_obj, "single_blocks"):
for block in executor.class_obj.single_blocks:
prepare_block(block, sb_holder)
if hasattr(executor.class_obj, "blocks"):
for block in executor.class_obj.blocks:
prepare_block(block, sb_holder)
if sb_holder.skip_current_step:
predict_index = max(0, sb_holder.step_count - sb_holder.predict_start_index)
predict_ratio = sb_holder.predict_ratios[predict_index]
logging.info(f"EasySortblock: skipping step {sb_holder.step_count}, predict_ratio: {predict_ratio}")
# reuse_ratio = 1.0 - predict_ratio
for block_type, blocks in sb_holder.blocks_per_type.items():
for block in blocks:
cache: BlockCache = block.__block_cache
cache.allowed_to_skip = False
sorted_blocks = sorted(blocks, key=lambda x: (x.__block_cache.consecutive_skipped_steps, x.__block_cache.prev_change_rate))
# for block in sorted_blocks:
# pass
threshold_index = int(len(sorted_blocks) * predict_ratio)
# blocks with the fewest consecutive skips and the smallest previous change rate are allowed to skip and reuse their cached diff
for block in sorted_blocks[:threshold_index]:
cache: BlockCache = block.__block_cache
cache.allowed_to_skip = True
logging.info(f"EasySortblock: skip block {block.__class__.__name__} - consecutive_skipped_steps: {block.__block_cache.consecutive_skipped_steps}, prev_change_rate: {block.__block_cache.prev_change_rate}, index: {block.__block_cache.block_index}")
not_skipped = [block for block in blocks if not block.__block_cache.allowed_to_skip]
for block in not_skipped:
logging.info(f"EasySortblock: reco block {block.__class__.__name__} - consecutive_skipped_steps: {block.__block_cache.consecutive_skipped_steps}, prev_change_rate: {block.__block_cache.prev_change_rate}, index: {block.__block_cache.block_index}")
logging.info(f"EasySortblock: for {block_type}, selected {len(sorted_blocks[:threshold_index])} blocks for prediction and {len(sorted_blocks[threshold_index:])} blocks for recomputation")
# return executor(*args, **kwargs)
to_return = executor(*args, **kwargs)
return to_return
def block_forward_factory(func, block):
def block_forward_wrapper(*args, **kwargs):
transformer_options: dict[str] = kwargs.get("transformer_options")
sigmas = transformer_options["sigmas"]
sb_holder: EasySortblockHolder = transformer_options["easycache"]
cache: BlockCache = block.__block_cache
# make sure stream count is properly set for this block
if sb_holder.initial_step:
sb_holder.add_to_blocks_per_type(block, transformer_options['block'][0])
cache.block_index = transformer_options['block'][1]
cache.stream_count = transformer_options['block'][2]
if sb_holder.is_past_end_timestep(sigmas):
return func(*args, **kwargs)
# do sortblock stuff
x = cache.get_next_x_prev(args, kwargs)
# prepare next_x_prev
next_x_prev = cache.get_next_x_prev(args, kwargs, clone=True)
input_change = None
do_sortblock = sb_holder.should_do_easycache(sigmas)
if do_sortblock:
# TODO: checkmetadata
if cache.has_x_prev_subsampled():
input_change = (cache.subsample(x, clone=False) - cache.x_prev_subsampled).flatten().abs().mean()
if cache.has_output_prev_norm() and cache.has_relative_transformation_rate():
approx_output_change_rate = (cache.relative_transformation_rate * input_change) / cache.output_prev_norm
cache.cumulative_change_rate += approx_output_change_rate
if cache.allowed_to_skip:
# if cache.cumulative_change_rate < sb_holder.reuse_threshold:
# accumulate error + skip block
# cache.want_to_skip = True
# if cache.allowed_to_skip:
cache.consecutive_skipped_steps += 1
cache.prev_change_rate = approx_output_change_rate
return cache.apply_cache_diff(x, sb_holder)
else:
# reset error; NOT skipping block and recalculating
cache.cumulative_change_rate = 0.0
cache.prev_change_rate = approx_output_change_rate
cache.want_to_skip = False
cache.consecutive_skipped_steps = 0
# output_raw is expected to have cache.stream_count elements if count is greater than 1 (double block, etc.)
output_raw: Union[torch.Tensor, tuple[torch.Tensor, ...]] = func(*args, **kwargs)
# if more than one stream from block, only use first one
if isinstance(output_raw, tuple):
output = output_raw[0]
else:
output = output_raw
if cache.has_output_prev_norm():
output_change = (cache.subsample(output, clone=False) - cache.output_prev_subsampled).flatten().abs().mean()
# if verbose in future
output_change_rate = output_change / cache.output_prev_norm
cache.output_change_rates.append(output_change_rate.item())
if cache.has_relative_transformation_rate():
approx_output_change_rate = (cache.relative_transformation_rate * input_change) / cache.output_prev_norm
cache.approx_output_change_rates.append(approx_output_change_rate.item())
if input_change is not None:
cache.relative_transformation_rate = output_change / input_change
# TODO: allow cache_diff to be offloaded
cache.update_cache_diff(output_raw, next_x_prev)
cache.x_prev_subsampled = cache.subsample(next_x_prev)
cache.output_prev_subsampled = cache.subsample(output)
cache.output_prev_norm = output.flatten().abs().mean()
return output_raw
return block_forward_wrapper
def prepare_block(block, sb_holder: EasySortblockHolder, stream_count: int=1):
sb_holder.add_to_all_blocks(block)
block.__original_forward = block.forward
block.forward = block_forward_factory(block.__original_forward, block)
block.__block_cache = BlockCache(subsample_factor=sb_holder.subsample_factor, verbose=sb_holder.verbose)
def clean_block(block):
block.forward = block.__original_forward
del block.__original_forward
del block.__block_cache
class BlockCache:
def __init__(self, subsample_factor: int=8, verbose: bool=False):
self.subsample_factor = subsample_factor
self.verbose = verbose
self.stream_count = 1
self.block_index = 0
# control values
self.relative_transformation_rate: float = None
self.cumulative_change_rate = 0.0
self.prev_change_rate = 0.0
# cached values
self.x_prev_subsampled: torch.Tensor = None
self.output_prev_subsampled: torch.Tensor = None
self.output_prev_norm: torch.Tensor = None
self.cache_diff: list[torch.Tensor] = []
self.output_change_rates = []
self.approx_output_change_rates = []
self.steps_skipped: list[int] = []
self.consecutive_skipped_steps = 0
# self.state_metadata = None
self.want_to_skip = False
self.allowed_to_skip = False
def has_cache_diff(self) -> bool:
return len(self.cache_diff) > 0 and self.cache_diff[0] is not None
def has_x_prev_subsampled(self) -> bool:
return self.x_prev_subsampled is not None
def has_output_prev_subsampled(self) -> bool:
return self.output_prev_subsampled is not None
def has_output_prev_norm(self) -> bool:
return self.output_prev_norm is not None
def has_relative_transformation_rate(self) -> bool:
return self.relative_transformation_rate is not None
def get_next_x_prev(self, d_args: tuple[torch.Tensor, ...], d_kwargs: dict[str, torch.Tensor], clone: bool=False) -> tuple[torch.Tensor, ...]:
if self.stream_count == 1:
if clone:
return d_args[0].clone()
return d_args[0]
keys = list(d_kwargs.keys())[:self.stream_count]
orig_inputs = []
for key in keys:
if clone:
orig_inputs.append(d_kwargs[key].clone())
else:
orig_inputs.append(d_kwargs[key])
return tuple(orig_inputs)
def subsample(self, x: Union[torch.Tensor, tuple[torch.Tensor, ...]], clone: bool = True) -> torch.Tensor:
# subsample only the first component
if isinstance(x, tuple):
return self.subsample(x[0], clone)
if self.subsample_factor > 1:
to_return = x[..., ::self.subsample_factor, ::self.subsample_factor]
if clone:
return to_return.clone()
return to_return
if clone:
return x.clone()
return x
def apply_cache_diff(self, x: Union[torch.Tensor, tuple[torch.Tensor, ...]], sb_holder: EasySortblockHolder):
self.steps_skipped.append(sb_holder.step_count)
if not isinstance(x, tuple):
x = (x, )
to_return = tuple([x[i] + self.cache_diff[i] for i in range(self.stream_count)])
if len(to_return) == 1:
return to_return[0]
return to_return
def update_cache_diff(self, output_raw: Union[torch.Tensor, tuple[torch.Tensor, ...]], x: Union[torch.Tensor, tuple[torch.Tensor, ...]]):
if not isinstance(output_raw, tuple):
output_raw = (output_raw, )
if not isinstance(x, tuple):
x = (x, )
self.cache_diff = tuple([output_raw[i] - x[i] for i in range(self.stream_count)])
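In other words, each block caches the residual output - input per stream, and a skipped step simply replays that residual on top of the fresh input; for the single-stream case (names from BlockCache above):

predicted_output = x + cache.cache_diff[0]  # apply_cache_diff with stream_count == 1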
def reset(self):
self.relative_transformation_rate = 0.0
self.cumulative_change_rate = 0.0
self.prev_change_rate = 0.0
self.x_prev_subsampled = None
self.output_prev_subsampled = None
self.output_prev_norm = None
self.cache_diff = []
self.output_change_rates = []
self.approx_output_change_rates = []
self.steps_skipped = []
self.consecutive_skipped_steps = 0
self.want_to_skip = False
self.allowed_to_skip = False
return self
class EasySortblockHolder:
def __init__(self, reuse_threshold: float, start_predict_ratio: float, end_predict_ratio: float, max_skipped_steps: int,
start_percent: float, end_percent: float, subsample_factor: int, verbose: bool=False):
self.name = "EasySortblock"
self.reuse_threshold = reuse_threshold
self.start_predict_ratio = start_predict_ratio
self.end_predict_ratio = end_predict_ratio
self.max_skipped_steps = max_skipped_steps
self.start_percent = start_percent
self.end_percent = end_percent
self.subsample_factor = subsample_factor
self.verbose = verbose
# timestep values
self.start_t = 0.0
self.end_t = 0.0
# control values
self.relative_transformation_rate: float = None
self.cumulative_change_rate = 0.0
self.initial_step = True
self.step_count = 0
self.predict_ratios = []
self.skip_current_step = False
self.predict_start_index = 0
# cache values
self.x_prev_subsampled: torch.Tensor = None
self.output_prev_subsampled: torch.Tensor = None
self.output_prev_norm: torch.Tensor = None
self.steps_skipped: list[int] = []
self.output_change_rates = []
self.approx_output_change_rates = []
self.state_metadata = None
self.all_blocks = []
self.blocks_per_type = {}
def add_to_all_blocks(self, block):
self.all_blocks.append(block)
def add_to_blocks_per_type(self, block, block_type: str):
self.blocks_per_type.setdefault(block_type, []).append(block)
def is_past_end_timestep(self, timestep: float) -> bool:
return not (timestep[0] > self.end_t).item()
def should_do_easycache(self, timestep: float) -> bool:
return (timestep[0] <= self.start_t).item()
def check_if_within_timesteps(self, timestep: Union[float, torch.Tensor]) -> bool:
return (timestep <= self.start_t).item() and (timestep > self.end_t).item()
def has_x_prev_subsampled(self) -> bool:
return self.x_prev_subsampled is not None
def has_output_prev_subsampled(self) -> bool:
return self.output_prev_subsampled is not None
def has_output_prev_norm(self) -> bool:
return self.output_prev_norm is not None
def has_relative_transformation_rate(self) -> bool:
return self.relative_transformation_rate is not None
def prepare_timesteps(self, model_sampling):
self.start_t = model_sampling.percent_to_sigma(self.start_percent)
self.end_t = model_sampling.percent_to_sigma(self.end_percent)
return self
def subsample(self, x: torch.Tensor, clone: bool = True) -> torch.Tensor:
if self.subsample_factor > 1:
to_return = x[..., ::self.subsample_factor, ::self.subsample_factor]
if clone:
return to_return.clone()
return to_return
if clone:
return x.clone()
return x
def check_metadata(self, x: torch.Tensor) -> bool:
metadata = (x.device, x.dtype, x.shape)
if self.state_metadata is None:
self.state_metadata = metadata
return True
if metadata == self.state_metadata:
return True
logging.warning(f"{self.name} - Tensor shape, dtype or device changed, resetting state")
self.reset()
return False
def reset(self):
logging.info(f"EasySortblock: resetting {len(self.all_blocks)} blocks")
for block in self.all_blocks:
clean_block(block)
self.relative_transformation_rate = 0.0
self.cumulative_change_rate = 0.0
self.initial_step = True
self.step_count = 0
self.predict_ratios = []
self.skip_current_step = False
self.predict_start_index = 0
self.x_prev_subsampled = None
self.output_prev_subsampled = None
self.output_prev_norm = None
self.steps_skipped = []
self.output_change_rates = []
self.approx_output_change_rates = []
self.state_metadata = None
self.all_blocks = []
self.blocks_per_type = {}
return self
def clone(self):
return EasySortblockHolder(self.reuse_threshold, self.start_predict_ratio, self.end_predict_ratio, self.max_skipped_steps,
self.start_percent, self.end_percent, self.subsample_factor, self.verbose)
class EasySortblockScaledNode(io.ComfyNode):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="EasySortblockScaled",
display_name="EasySortblockScaled",
description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.",
category="advanced/debug/model",
is_experimental=True,
inputs=[
io.Model.Input("model", tooltip="The model to add Sortblock to."),
io.Float.Input("reuse_threshold", min=0.0, default=0.2, max=3.0, step=0.01, tooltip="The threshold for reusing cached steps."),
io.Float.Input("start_predict_ratio", min=0.0, default=0.2, max=1.0, step=0.01, tooltip="The ratio of blocks to predict."),
io.Float.Input("end_predict_ratio", min=0.0, default=0.9, max=1.0, step=0.01, tooltip="The ratio of blocks to predict."),
io.Int.Input("policy_refresh_interval", min=3, default=5, max=100, step=1, tooltip="The interval at which to refresh the policy."),
io.Float.Input("start_percent", min=0.0, default=0.15, max=1.0, step=0.01, tooltip="The relative sampling step to begin use of Sortblock."),
io.Float.Input("end_percent", min=0.0, default=0.95, max=1.0, step=0.01, tooltip="The relative sampling step to end use of Sortblock."),
io.Boolean.Input("verbose", default=False, tooltip="Whether to log verbose information."),
],
outputs=[
io.Model.Output(tooltip="The model with Sortblock."),
],
)
@classmethod
def execute(cls, model: io.Model.Type, reuse_threshold: float, start_predict_ratio: float, end_predict_ratio: float, policy_refresh_interval: int, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput:
# TODO: check for specific flavors of supported models
model = model.clone()
model.model_options["transformer_options"]["easycache"] = EasySortblockHolder(reuse_threshold, start_predict_ratio, end_predict_ratio, policy_refresh_interval, start_percent, end_percent, subsample_factor=8, verbose=verbose)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "sortblock", easysortblock_predict_noise_wrapper)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "sortblock", easysortblock_outer_sample_wrapper)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "sortblock", model_forward_wrapper)
return io.NodeOutput(model)
class EasySortblockExtension(ComfyExtension):
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [
# EasySortblockNode,
EasySortblockScaledNode,
]
def comfy_entrypoint():
return EasySortblockExtension()

View File

@ -1,6 +1,7 @@
import comfy.utils
import comfy_extras.nodes_post_processing
import torch
import nodes
def reshape_latent_to(target_shape, latent, repeat_batch=True):
@ -137,6 +138,41 @@ class LatentConcat:
samples_out["samples"] = torch.cat(c, dim=dim)
return (samples_out,)
class LatentCut:
@classmethod
def INPUT_TYPES(s):
return {"required": {"samples": ("LATENT",),
"dim": (["x", "y", "t"], ),
"index": ("INT", {"default": 0, "min": -nodes.MAX_RESOLUTION, "max": nodes.MAX_RESOLUTION, "step": 1}),
"amount": ("INT", {"default": 1, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 1})}}
RETURN_TYPES = ("LATENT",)
FUNCTION = "op"
CATEGORY = "latent/advanced"
def op(self, samples, dim, index, amount):
samples_out = samples.copy()
s1 = samples["samples"]
if "x" in dim:
dim = s1.ndim - 1
elif "y" in dim:
dim = s1.ndim - 2
elif "t" in dim:
dim = s1.ndim - 3
if index >= 0:
index = min(index, s1.shape[dim] - 1)
amount = min(s1.shape[dim] - index, amount)
else:
index = max(index, -s1.shape[dim])
amount = min(-index, amount)
samples_out["samples"] = torch.narrow(s1, dim, index, amount)
return (samples_out,)
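For illustration, on a 5-D video latent of shape [B, C, T, H, W] the "t" choice maps to the third-from-last axis, and torch.narrow keeps amount entries starting at index, with both clamped to the tensor bounds. An illustrative usage sketch with made-up shapes:

import torch

samples = {"samples": torch.zeros(1, 16, 8, 64, 64)}  # hypothetical [B, C, T, H, W] latent
(out,) = LatentCut().op(samples, dim="t", index=2, amount=3)
print(out["samples"].shape)  # torch.Size([1, 16, 3, 64, 64])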
class LatentBatch:
@classmethod
def INPUT_TYPES(s):
@ -312,6 +348,7 @@ NODE_CLASS_MAPPINGS = {
"LatentMultiply": LatentMultiply,
"LatentInterpolate": LatentInterpolate,
"LatentConcat": LatentConcat,
"LatentCut": LatentCut,
"LatentBatch": LatentBatch,
"LatentBatchSeedBehavior": LatentBatchSeedBehavior,
"LatentApplyOperation": LatentApplyOperation,

View File

@ -108,7 +108,7 @@ class DiffSynthCnetPatch:
img = kwargs.get("img")
block_index = kwargs.get("block_index")
spacial_compression = self.vae.spacial_compression_encode()
if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression):
if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression):
image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1)))

View File

@ -0,0 +1,462 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Union
from comfy_api.latest import io, ComfyExtension
import comfy.patcher_extension
import logging
import torch
import math
import comfy.model_patcher
if TYPE_CHECKING:
from uuid import UUID
def prepare_noise_wrapper(executor, *args, **kwargs):
try:
transformer_options: dict[str] = args[2]["transformer_options"]
sb_holder: SortblockHolder = transformer_options["sortblock"]
if sb_holder.initial_step:
sample_sigmas = transformer_options["sample_sigmas"]
relevant_sigmas = []
# find start and end steps, then use to interpolate between start and end predict ratios
for i,sigma in enumerate(sample_sigmas):
if sb_holder.check_if_within_timesteps(sigma):
relevant_sigmas.append((i, sigma))
start_index = relevant_sigmas[0][0]
end_index = relevant_sigmas[-1][0]
sb_holder.predict_ratios = torch.linspace(sb_holder.start_predict_ratio, sb_holder.end_predict_ratio, end_index - start_index + 1)
sb_holder.predict_start_index = start_index
return executor(*args, **kwargs)
finally:
transformer_options: dict[str] = args[2]["transformer_options"]
sb_holder: SortblockHolder = transformer_options["sortblock"]
sb_holder.step_count += 1
if sb_holder.should_do_sortblock():
sb_holder.active_steps += 1
def outer_sample_wrapper(executor, *args, **kwargs):
try:
logging.info("Sortblock: inside outer_sample!")
guider = executor.class_obj
orig_model_options = guider.model_options
guider.model_options = comfy.model_patcher.create_model_options_clone(orig_model_options)
# clone and prepare timesteps
sb_holder = guider.model_options["transformer_options"]["sortblock"]
guider.model_options["transformer_options"]["sortblock"] = sb_holder.clone().prepare_timesteps(guider.model_patcher.model.model_sampling)
sb_holder: SortblockHolder = guider.model_options["transformer_options"]["sortblock"]
logging.info(f"Sortblock: enabled - threshold: {sb_holder.start_predict_ratio}, start_percent: {sb_holder.start_percent}, end_percent: {sb_holder.end_percent}")
return executor(*args, **kwargs)
finally:
sb_holder = guider.model_options["transformer_options"]["sortblock"]
logging.info(f"Sortblock: final step count: {sb_holder.step_count}")
sb_holder.reset()
guider.model_options = orig_model_options
def model_forward_wrapper(executor, *args, **kwargs):
# TODO: make work with batches of conds
transformer_options: dict[str] = args[-1]
if not isinstance(transformer_options, dict):
transformer_options = kwargs.get("transformer_options")
if not transformer_options:
transformer_options = args[-2]
sigmas = transformer_options["sigmas"]
sb_holder: SortblockHolder = transformer_options["sortblock"]
sb_holder.update_should_do_sortblock(sigmas)
# if initial step, prepare everything for Sortblock
if sb_holder.initial_step:
logging.info(f"Sortblock: inside model {executor.class_obj.__class__.__name__}")
# TODO: generalize for other models
# these won't stick around past this step; should store on sb_holder instead
logging.info(f"Sortblock: preparing {len(executor.class_obj.double_blocks)} double blocks and {len(executor.class_obj.single_blocks)} single blocks")
if hasattr(executor.class_obj, "double_blocks"):
for block in executor.class_obj.double_blocks:
prepare_block(block, sb_holder)
if hasattr(executor.class_obj, "single_blocks"):
for block in executor.class_obj.single_blocks:
prepare_block(block, sb_holder)
if hasattr(executor.class_obj, "blocks"):
for block in executor.class_obj.blocks:
prepare_block(block, sb_holder)
# when 0: Initialization(1)
if sb_holder.step_modulus == 0:
logging.info(f"Sortblock: for step {sb_holder.step_count}, all blocks are marked for recomputation")
# all features are computed, input-output changes for all DiT blocks are stored for relative step 'k'
sb_holder.activated_steps.append(sb_holder.step_count)
for block in sb_holder.all_blocks:
cache: BlockCache = block.__block_cache
cache.mark_recompute()
# all block operations are performed in forward pass of model
to_return = executor(*args, **kwargs)
# when 1: Select DiT blocks(4)
if sb_holder.step_modulus == 1:
predict_index = max(0, sb_holder.step_count - sb_holder.predict_start_index)
predict_ratio = sb_holder.predict_ratios[predict_index]
logging.info(f"Sortblock: for step {sb_holder.step_count}, selecting blocks for recomputation and prediction, predict_ratio: {predict_ratio}")
reuse_ratio = 1.0 - predict_ratio
for block_type, blocks in sb_holder.blocks_per_type.items():
sorted_blocks = sorted(blocks, key=lambda x: x.__block_cache.cosine_similarity)
threshold_index = int(len(sorted_blocks) * reuse_ratio)
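# e.g. 40 blocks with predict_ratio=0.8 -> reuse_ratio=0.2 and threshold_index=8: the 8 least-similar blocks are recomputed, the other 32 are predicted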
# blocks with lower similarity are marked for recomputation
for block in sorted_blocks[:threshold_index]:
cache: BlockCache = block.__block_cache
cache.mark_recompute()
# blocks with higher similarity are marked for prediction
for block in sorted_blocks[threshold_index:]:
cache: BlockCache = block.__block_cache
cache.mark_predict()
logging.info(f"Sortblock: for {block_type}, selected {len(sorted_blocks[:threshold_index])} blocks for recomputation and {len(sorted_blocks[threshold_index:])} blocks for prediction")
if sb_holder.initial_step:
sb_holder.initial_step = False
return to_return
def block_forward_factory(func, block):
def block_forward_wrapper(*args, **kwargs):
transformer_options: dict[str] = kwargs.get("transformer_options")
sb_holder: SortblockHolder = transformer_options["sortblock"]
cache: BlockCache = block.__block_cache
# make sure stream count is properly set for this block
if sb_holder.initial_step:
sb_holder.add_to_blocks_per_type(block, transformer_options['block'][0])
cache.block_index = transformer_options['block'][1]
cache.stream_count = transformer_options['block'][2]
# do sortblock stuff
if cache.recompute and sb_holder.step_modulus != 1:
# clone relevant inputs
orig_inputs = cache.get_orig_inputs(args, kwargs, clone=True)
# get block outputs
# NOTE: output_raw is expected to have cache.stream_count elements if count is greater than 1 (double block, etc.)
output_raw: Union[torch.Tensor, tuple[torch.Tensor, ...]] = func(*args, **kwargs)
# perform derivative approximation;
cache.derivative_approximation(sb_holder, output_raw, orig_inputs)
# if step_modulus is 0, input-output changes for DiT block are stored
if sb_holder.step_modulus == 0:
cache.cache_previous_residual(output_raw, orig_inputs)
else:
# if not marked for recomputation, predict this block's features for the current timestep
orig_inputs = cache.get_orig_inputs(args, kwargs, clone=False)
# when 1: Linear Prediction(2)
# if step_modulus is 1, store block residuals as 'current' after applying taylor_formula
if sb_holder.step_modulus == 1:
cache.cache_current_residual(sb_holder)
# based on features computed in last timestep, all features for current timestep are predicted using Eq. 4,
# input-output changes for all DiT blocks are stored for relative step 'k+1'
output_raw = cache.apply_linear_prediction(sb_holder, orig_inputs)
# when 1: Identify Changes(3)
if sb_holder.step_modulus == 1:
# compare the residual predicted for this timestep against the residual cached at the last full-recompute step, so blocks can be re-ranked by how much they are changing
cache.calculate_cosine_similarity()
return output_raw
return block_forward_wrapper
def perform_sortblock(blocks: list):
...
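# Monkey-patch a transformer block: stash its original forward, wrap it with block_forward_wrapper, and attach a BlockCache that holds its cached residuals and Taylor factors.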
def prepare_block(block, sb_holder: SortblockHolder, stream_count: int=1):
sb_holder.add_to_all_blocks(block)
block.__original_forward = block.forward
block.forward = block_forward_factory(block.__original_forward, block)
block.__block_cache = BlockCache(subsample_factor=sb_holder.subsample_factor, verbose=sb_holder.verbose)
def clean_block(block):
block.forward = block.__original_forward
del block.__original_forward
del block.__block_cache
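# Strided subsampling over the last two dimensions; keeps a cheap fingerprint of a residual so cosine similarity can be computed without storing the full tensor.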
def subsample(x: torch.Tensor, factor: int, clone: bool=True) -> torch.Tensor:
if factor > 1:
to_return = x[..., ::factor, ::factor]
if clone:
return to_return.clone()
return to_return
if clone:
return x.clone()
return x
class BlockCache:
def __init__(self, subsample_factor: int=8, verbose: bool=False):
self.subsample_factor = subsample_factor
self.verbose = verbose
self.stream_count = 1
self.recompute = False
self.block_index = 0
# cached values
self.previous_residual_subsampled: torch.Tensor = None
self.current_residual_subsampled: torch.Tensor = None
self.cosine_similarity: float = None
self.previous_taylor_factors: dict[int, torch.Tensor] = {}
self.current_taylor_factors: dict[int, torch.Tensor] = {}
def mark_recompute(self):
self.recompute = True
def mark_predict(self):
self.recompute = False
def cache_previous_residual(self, output_raw: Union[torch.Tensor, tuple[torch.Tensor, ...]], orig_inputs: Union[torch.Tensor, tuple[torch.Tensor, ...]]):
if isinstance(output_raw, tuple):
output_raw = output_raw[0]
if isinstance(orig_inputs, tuple):
orig_inputs = orig_inputs[0]
del self.previous_residual_subsampled
self.previous_residual_subsampled = subsample(output_raw - orig_inputs, self.subsample_factor, clone=True)
def cache_current_residual(self, sb_holder: SortblockHolder):
del self.current_residual_subsampled
self.current_residual_subsampled = subsample(self.use_taylor_formula(sb_holder)[0], self.subsample_factor, clone=True)
def get_orig_inputs(self, d_args: tuple, d_kwargs: dict, clone: bool=True) -> tuple[torch.Tensor, ...]:
if self.stream_count == 1:
if clone:
return d_args[0].clone()
return d_args[0]
keys = list(d_kwargs.keys())[:self.stream_count]
orig_inputs = []
for key in keys:
if clone:
orig_inputs.append(d_kwargs[key].clone())
else:
orig_inputs.append(d_kwargs[key])
return tuple(orig_inputs)
def apply_linear_prediction(self, sb_holder: SortblockHolder, orig_inputs: Union[torch.Tensor, tuple[torch.Tensor, ...]]) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
drop_tuple = False
if not isinstance(orig_inputs, tuple):
orig_inputs = (orig_inputs,)
drop_tuple = True
taylor_results = self.use_taylor_formula(sb_holder)
for output, taylor_result in zip(orig_inputs, taylor_results):
# shapes are expected to match; the predicted residual is added in place to the (uncloned) block input
output += taylor_result
if drop_tuple:
orig_inputs = orig_inputs[0]
return orig_inputs
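# Compare the residual predicted for the current step against the residual cached at the last full-recompute step; high similarity suggests this block changes slowly and is a good candidate for prediction.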
def calculate_cosine_similarity(self) -> None:
self.cosine_similarity = torch.nn.functional.cosine_similarity(self.previous_residual_subsampled, self.current_residual_subsampled, dim=-1).mean().item()
def derivative_approximation(self, sb_holder: SortblockHolder, output_raw: Union[torch.Tensor, tuple[torch.Tensor, ...]], orig_inputs: Union[torch.Tensor, tuple[torch.Tensor, ...]]):
activation_distance = sb_holder.activated_steps[-1] - sb_holder.activated_steps[-2]
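# activation_distance is the number of sampler steps between the last two full-recompute steps; it serves as the delta for the finite-difference derivative estimated below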
# make tuple if not already tuple, so that works with both single and double blocks
if not isinstance(output_raw, tuple):
output_raw = (output_raw,)
if not isinstance(orig_inputs, tuple):
orig_inputs = (orig_inputs,)
for i, (output, x) in enumerate(zip(output_raw, orig_inputs)):
feature = output.clone() - x
has_previous_taylor_factor = self.previous_taylor_factors.get(i, None) is not None
# NOTE: not sure why - 2, but that's what's in the original implementation. Maybe consider changing values?
if has_previous_taylor_factor and sb_holder.step_count > (sb_holder.first_enhance - 2):
self.current_taylor_factors[i] = (
feature - self.previous_taylor_factors[i]
) / activation_distance
self.previous_taylor_factors[i] = feature
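# First-order Taylor prediction of the block residual: residual(k + d) ~= f + f' * d, where previous_taylor_factors hold f (the last measured residual),
# current_taylor_factors hold the finite-difference estimate of f', and d is the distance from the last full-recompute step.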
def use_taylor_formula(self, sb_holder: SortblockHolder) -> tuple[torch.Tensor, ...]:
step_distance = sb_holder.step_count - sb_holder.activated_steps[-1]
output_predicted = []
for key in self.previous_taylor_factors.keys():
previous_tf = self.previous_taylor_factors[key]
current_tf = self.current_taylor_factors.get(key, None)
predicted = taylor_formula(previous_tf, 0, step_distance)
# guard against the first prediction cycle, where the first-order factor may not have been computed yet
if current_tf is not None:
predicted += taylor_formula(current_tf, 1, step_distance)
output_predicted.append(predicted)
return tuple(output_predicted)
def reset(self):
self.recompute = False
self.current_residual_subsampled = None
self.previous_residual_subsampled = None
self.cosine_similarity = None
self.previous_taylor_factors = {}
self.current_taylor_factors = {}
def taylor_formula(taylor_factor: torch.Tensor, i: int, step_distance: int):
return (
(1 / math.factorial(i))
* taylor_factor
* (step_distance ** i)
)
class SortblockHolder:
def __init__(self, start_predict_ratio: float, end_predict_ratio: float, policy_refresh_interval: int,
start_percent: float, end_percent: float, subsample_factor: int=8, verbose: bool=False):
self.start_predict_ratio = start_predict_ratio
self.end_predict_ratio = end_predict_ratio
self.start_percent = start_percent
self.end_percent = end_percent
self.subsample_factor = subsample_factor
self.verbose = verbose
# NOTE: number represents steps
self.policy_refresh_interval = policy_refresh_interval
self.active_policy_refresh_interval = 1
self.first_enhance = 3 # NOTE: this value is 2 higher than the one actually used in code (subtracted by 2 in derivative_approximation)
# timestep values
self.start_t = 0.0
self.end_t = 0.0
self.curr_t = 0.0
# control values
self.initial_step = True
self.step_count = 0
self.activated_steps: list[int] = [0]
self.step_modulus = 0
self.do_sortblock = False
self.active_steps = 0
self.predict_ratios = []
self.predict_start_index = 0
# cache values
self.all_blocks = []
self.blocks_per_type = {}
def add_to_all_blocks(self, block):
self.all_blocks.append(block)
def add_to_blocks_per_type(self, block, block_type: str):
self.blocks_per_type.setdefault(block_type, []).append(block)
def prepare_timesteps(self, model_sampling):
self.start_t = model_sampling.percent_to_sigma(self.start_percent)
self.end_t = model_sampling.percent_to_sigma(self.end_percent)
return self
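# Sigmas decrease as sampling progresses, so start_t (from start_percent) is the larger sigma; a step is within the window when start_t >= sigma > end_t.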
def check_if_within_timesteps(self, timestep: Union[float, torch.Tensor]) -> bool:
return (timestep <= self.start_t).item() and (timestep > self.end_t).item()
def update_should_do_sortblock(self, timestep: float) -> bool:
self.do_sortblock = (timestep[0] <= self.start_t).item() and (timestep[0] > self.end_t).item()
self.curr_t = timestep
if self.do_sortblock:
self.active_policy_refresh_interval = self.policy_refresh_interval
else:
self.active_policy_refresh_interval = 1
self.update_step_modulus()
return self.do_sortblock
def update_step_modulus(self):
self.step_modulus = int(self.step_count % self.active_policy_refresh_interval)
def should_do_sortblock(self) -> bool:
return self.do_sortblock
def reset(self):
self.initial_step = True
self.curr_t = 0.0
logging.info(f"Sortblock: resetting {len(self.all_blocks)} blocks")
for block in self.all_blocks:
clean_block(block)
self.all_blocks = []
self.blocks_per_type = {}
self.step_count = 0
self.activated_steps = [0]
self.step_modulus = 0
self.active_steps = 0
self.predict_ratios = []
self.do_sortblock = False
self.predict_start_index = 0
return self
def clone(self):
return SortblockHolder(start_predict_ratio=self.start_predict_ratio, end_predict_ratio=self.end_predict_ratio, policy_refresh_interval=self.policy_refresh_interval,
start_percent=self.start_percent, end_percent=self.end_percent, subsample_factor=self.subsample_factor,
verbose=self.verbose)
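# The nodes below register three cooperating wrappers on the cloned model: PREDICT_NOISE interpolates the per-step predict ratio over the active window,
# OUTER_SAMPLE clones/prepares the holder and cleans up after sampling, and DIFFUSION_MODEL drives the per-step block selection described above.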
class SortblockNode(io.ComfyNode):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="Sortblock",
display_name="Sortblock",
description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.",
category="advanced/debug/model",
is_experimental=True,
inputs=[
io.Model.Input("model", tooltip="The model to add Sortblock to."),
io.Float.Input("predict_ratio", min=0.0, default=0.8, max=3.0, step=0.01, tooltip="The ratio of blocks to predict."),
io.Int.Input("policy_refresh_interval", min=3, default=5, max=100, step=1, tooltip="The interval at which to refresh the policy."),
io.Float.Input("start_percent", min=0.0, default=0.15, max=1.0, step=0.01, tooltip="The relative sampling step to begin use of Sortblock."),
io.Float.Input("end_percent", min=0.0, default=0.95, max=1.0, step=0.01, tooltip="The relative sampling step to end use of Sortblock."),
io.Boolean.Input("verbose", default=False, tooltip="Whether to log verbose information."),
],
outputs=[
io.Model.Output(tooltip="The model with Sortblock."),
],
)
@classmethod
def execute(cls, model: io.Model.Type, predict_ratio: float, policy_refresh_interval: int, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput:
# TODO: check for specific flavors of supported models
model = model.clone()
model.model_options["transformer_options"]["sortblock"] = SortblockHolder(start_predict_ratio=predict_ratio, end_predict_ratio=predict_ratio, policy_refresh_interval=policy_refresh_interval,
start_percent=start_percent, end_percent=end_percent, subsample_factor=8, verbose=verbose)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "sortblock", prepare_noise_wrapper)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "sortblock", outer_sample_wrapper)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "sortblock", model_forward_wrapper)
return io.NodeOutput(model)
class SortblockScaledNode(io.ComfyNode):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="SortblockScaled",
display_name="SortblockScaled",
description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.",
category="advanced/debug/model",
is_experimental=True,
inputs=[
io.Model.Input("model", tooltip="The model to add Sortblock to."),
io.Float.Input("start_predict_ratio", min=0.0, default=0.2, max=1.0, step=0.01, tooltip="The ratio of blocks to predict."),
io.Float.Input("end_predict_ratio", min=0.0, default=0.9, max=1.0, step=0.01, tooltip="The ratio of blocks to predict."),
io.Int.Input("policy_refresh_interval", min=3, default=5, max=100, step=1, tooltip="The interval at which to refresh the policy."),
io.Float.Input("start_percent", min=0.0, default=0.15, max=1.0, step=0.01, tooltip="The relative sampling step to begin use of Sortblock."),
io.Float.Input("end_percent", min=0.0, default=0.95, max=1.0, step=0.01, tooltip="The relative sampling step to end use of Sortblock."),
io.Boolean.Input("verbose", default=False, tooltip="Whether to log verbose information."),
],
outputs=[
io.Model.Output(tooltip="The model with Sortblock."),
],
)
@classmethod
def execute(cls, model: io.Model.Type, start_predict_ratio: float, end_predict_ratio: float, policy_refresh_interval: int, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput:
# TODO: check for specific flavors of supported models
model = model.clone()
model.model_options["transformer_options"]["sortblock"] = SortblockHolder(start_predict_ratio, end_predict_ratio, policy_refresh_interval, start_percent, end_percent, subsample_factor=8, verbose=verbose)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "sortblock", prepare_noise_wrapper)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "sortblock", outer_sample_wrapper)
model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "sortblock", model_forward_wrapper)
return io.NodeOutput(model)
class SortblockExtension(ComfyExtension):
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [
SortblockNode,
SortblockScaledNode,
]
def comfy_entrypoint():
return SortblockExtension()

View File

@ -139,16 +139,21 @@ class Wan22FunControlToVideo(io.ComfyNode):
@classmethod
def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None) -> io.NodeOutput:
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
spacial_scale = vae.spacial_compression_encode()
latent_channels = vae.latent_channels
latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device())
concat_latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device())
if latent_channels == 48:
concat_latent = comfy.latent_formats.Wan22().process_out(concat_latent)
else:
concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
concat_latent = concat_latent.repeat(1, 2, 1, 1, 1)
mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))
if start_image is not None:
start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
concat_latent_image = vae.encode(start_image[:, :, :, :3])
concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
concat_latent[:,latent_channels:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
mask[:, :, :start_image.shape[0] + 3] = 0.0
ref_latent = None
@ -159,11 +164,11 @@ class Wan22FunControlToVideo(io.ComfyNode):
if control_video is not None:
control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
concat_latent_image = vae.encode(control_video[:, :, :, :3])
concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
concat_latent[:,:latent_channels,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16})
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16})
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels})
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels})
if ref_latent is not None:
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
@ -201,7 +206,8 @@ class WanFirstLastFrameToVideo(io.ComfyNode):
@classmethod
def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None) -> io.NodeOutput:
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
spacial_scale = vae.spacial_compression_encode()
latent = torch.zeros([batch_size, vae.latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device())
if start_image is not None:
start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
if end_image is not None:
@ -877,6 +883,68 @@ def get_audio_embed_bucket_fps(audio_embed, fps=16, batch_frames=81, m=0, video_
return batch_audio_eb, min_batch_num
def wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=0, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None, ref_motion_latent=None):
latent_t = ((length - 1) // 4) + 1
if audio_encoder_output is not None:
feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"])
video_rate = 30
fps = 16
feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate)
batch_frames = latent_t * 4
audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=batch_frames, m=0, video_rate=video_rate)
audio_embed_bucket = audio_embed_bucket.unsqueeze(0)
if len(audio_embed_bucket.shape) == 3:
audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1)
elif len(audio_embed_bucket.shape) == 4:
audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1)
audio_embed_bucket = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames]
if audio_embed_bucket.shape[3] > 0:
positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})
frame_offset += batch_frames
if ref_image is not None:
ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
ref_latent = vae.encode(ref_image[:, :, :, :3])
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
if ref_motion is not None:
if ref_motion.shape[0] > 73:
ref_motion = ref_motion[-73:]
ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
if ref_motion.shape[0] < 73:
r = torch.ones([73, height, width, 3]) * 0.5
r[-ref_motion.shape[0]:] = ref_motion
ref_motion = r
ref_motion_latent = vae.encode(ref_motion[:, :, :, :3])
if ref_motion_latent is not None:
ref_motion_latent = ref_motion_latent[:, :, -19:]
positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion_latent})
negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion_latent})
latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device())
control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent))
if control_video is not None:
control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
control_video = vae.encode(control_video[:, :, :, :3])
control_video_out[:, :, :control_video.shape[2]] = control_video
# TODO: check if zero is better than none if none provided
positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out})
negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out})
out_latent = {}
out_latent["samples"] = latent
return positive, negative, out_latent, frame_offset
class WanSoundImageToVideo(io.ComfyNode):
@classmethod
def define_schema(cls):
@ -906,57 +974,44 @@ class WanSoundImageToVideo(io.ComfyNode):
@classmethod
def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None) -> io.NodeOutput:
latent_t = ((length - 1) // 4) + 1
if audio_encoder_output is not None:
feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"])
video_rate = 30
fps = 16
feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate)
audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=latent_t * 4, m=0, video_rate=video_rate)
audio_embed_bucket = audio_embed_bucket.unsqueeze(0)
if len(audio_embed_bucket.shape) == 3:
audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1)
elif len(audio_embed_bucket.shape) == 4:
audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1)
positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, ref_image=ref_image, audio_encoder_output=audio_encoder_output,
control_video=control_video, ref_motion=ref_motion)
return io.NodeOutput(positive, negative, out_latent)
positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})
if ref_image is not None:
ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
ref_latent = vae.encode(ref_image[:, :, :, :3])
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
class WanSoundImageToVideoExtend(io.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
node_id="WanSoundImageToVideoExtend",
category="conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
io.Vae.Input("vae"),
io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4),
io.Latent.Input("video_latent"),
io.AudioEncoderOutput.Input("audio_encoder_output", optional=True),
io.Image.Input("ref_image", optional=True),
io.Image.Input("control_video", optional=True),
],
outputs=[
io.Conditioning.Output(display_name="positive"),
io.Conditioning.Output(display_name="negative"),
io.Latent.Output(display_name="latent"),
],
is_experimental=True,
)
if ref_motion is not None:
if ref_motion.shape[0] > 73:
ref_motion = ref_motion[-73:]
ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
if ref_motion.shape[0] < 73:
r = torch.ones([73, height, width, 3]) * 0.5
r[-ref_motion.shape[0]:] = ref_motion
ref_motion = r
ref_motion = vae.encode(ref_motion[:, :, :, :3])
positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion})
negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion})
latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device())
control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent))
if control_video is not None:
control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
control_video = vae.encode(control_video[:, :, :, :3])
control_video_out[:, :, :control_video.shape[2]] = control_video
# TODO: check if zero is better than none if none provided
positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out})
negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out})
out_latent = {}
out_latent["samples"] = latent
@classmethod
def execute(cls, positive, negative, vae, length, video_latent, ref_image=None, audio_encoder_output=None, control_video=None) -> io.NodeOutput:
video_latent = video_latent["samples"]
width = video_latent.shape[-1] * 8
height = video_latent.shape[-2] * 8
batch_size = video_latent.shape[0]
frame_offset = video_latent.shape[-3] * 4
positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=frame_offset, ref_image=ref_image, audio_encoder_output=audio_encoder_output,
control_video=control_video, ref_motion=None, ref_motion_latent=video_latent)
return io.NodeOutput(positive, negative, out_latent)
@ -1019,6 +1074,7 @@ class WanExtension(ComfyExtension):
WanCameraImageToVideo,
WanPhantomSubjectToVideo,
WanSoundImageToVideo,
WanSoundImageToVideoExtend,
Wan22ImageToVideoLatent,
]

View File

@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.3.53"
__version__ = "0.3.56"

View File

@ -112,6 +112,7 @@ import gc
if os.name == "nt":
os.environ['MIMALLOC_PURGE_DELAY'] = '0'
logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage())
if __name__ == "__main__":

View File

@ -2325,6 +2325,8 @@ async def init_builtin_extra_nodes():
"nodes_model_patch.py",
"nodes_easycache.py",
"nodes_audio_encoder.py",
"nodes_sortblock.py",
"nodes_easysortblock.py",
]
import_failed = []

View File

@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.3.53"
version = "0.3.56"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.9"

View File

@ -1,5 +1,5 @@
comfyui-frontend-package==1.25.11
comfyui-workflow-templates==0.1.68
comfyui-workflow-templates==0.1.70
comfyui-embedded-docs==0.2.6
torch
torchsde