Merge branch 'master' into mattmiller/veo-4k-model-gating

chore: update workflow templates to v0.9.59 (#13507)
Allow logging in comfy app files. (#13505)
2026-05-06 10:17:59 +08:00 · 2026-04-21 22:08:25 -07:00 · 2026-04-21 20:45:25 -07:00 · 2026-04-21 22:59:31 -04:00 · 2026-04-21 19:33:24 -07:00 · 2026-04-21 17:58:59 -07:00
23 changed files with 1055 additions and 211 deletions
--- a/README.md
+++ b/README.md
@ -195,7 +195,9 @@ The portable above currently comes with python 3.13 and pytorch cuda 13.0. Updat

 #### Alternative Downloads:

-[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
+[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
+
+[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)

 [Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).

--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@ -67,7 +67,7 @@ class InternalRoutes:
                (entry for entry in os.scandir(directory) if is_visible_file(entry)),
                key=lambda entry: -entry.stat().st_mtime
            )
-            return web.json_response([entry.name for entry in sorted_files], status=200)
+            return web.json_response([f"{entry.name} [{directory_type}]" for entry in sorted_files], status=200)


    def get_app(self):
--- a/comfy/ldm/ernie/model.py
+++ b/comfy/ldm/ernie/model.py
@ -118,8 +118,6 @@ class ErnieImageAttention(nn.Module):
            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)

-        query, key = query.to(x.dtype), key.to(x.dtype)
-
        q_flat = query.reshape(B, S, -1)
        k_flat = key.reshape(B, S, -1)

@ -161,16 +159,16 @@ class ErnieImageSharedAdaLNBlock(nn.Module):

        residual = x
        x_norm = self.adaLN_sa_ln(x)
-        x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
+        x_norm = x_norm * (1 + scale_msa) + shift_msa

        attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
-        x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
+        x = residual + gate_msa * attn_out

        residual = x
        x_norm = self.adaLN_mlp_ln(x)
-        x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
+        x_norm = x_norm * (1 + scale_mlp) + shift_mlp

-        return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype)
+        return residual + gate_mlp * self.mlp(x_norm)

 class ErnieImageAdaLNContinuous(nn.Module):
    def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
@ -183,7 +181,7 @@ class ErnieImageAdaLNContinuous(nn.Module):
    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
        scale, shift = self.linear(conditioning).chunk(2, dim=-1)
        x = self.norm(x)
-        x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+        x = torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1))  # shift + x * (1 + scale), same result as the removed line in one fused call
        return x

 class ErnieImageModel(nn.Module):
--- a/comfy/ldm/lightricks/vae/audio_vae.py
+++ b/comfy/ldm/lightricks/vae/audio_vae.py
@ -4,9 +4,6 @@ import math
 import torch
 import torchaudio

-import comfy.model_management
-import comfy.model_patcher
-import comfy.utils as utils
 from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
 from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
 from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
@ -43,30 +40,6 @@ class AudioVAEComponentConfig:

        return cls(autoencoder=audio_config, vocoder=vocoder_config)

-
-class ModelDeviceManager:
-    """Manages device placement and GPU residency for the composed model."""
-
-    def __init__(self, module: torch.nn.Module):
-        load_device = comfy.model_management.get_torch_device()
-        offload_device = comfy.model_management.vae_offload_device()
-        self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device)
-
-    def ensure_model_loaded(self) -> None:
-        comfy.model_management.free_memory(
-            self.patcher.model_size(),
-            self.patcher.load_device,
-        )
-        comfy.model_management.load_model_gpu(self.patcher)
-
-    def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
-        return tensor.to(self.patcher.load_device)
-
-    @property
-    def load_device(self):
-        return self.patcher.load_device
-
-
 class AudioLatentNormalizer:
    """Applies per-channel statistics in patch space and restores original layout."""

@ -132,23 +105,17 @@ class AudioPreprocessor:
 class AudioVAE(torch.nn.Module):
    """High-level Audio VAE wrapper exposing encode and decode entry points."""

-    def __init__(self, state_dict: dict, metadata: dict):
+    def __init__(self, metadata: dict):
        super().__init__()

        component_config = AudioVAEComponentConfig.from_metadata(metadata)

-        vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
-        vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)
-
        self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
        if "bwe" in component_config.vocoder:
            self.vocoder = VocoderWithBWE(config=component_config.vocoder)
        else:
            self.vocoder = Vocoder(config=component_config.vocoder)

-        self.autoencoder.load_state_dict(vae_sd, strict=False)
-        self.vocoder.load_state_dict(vocoder_sd, strict=False)
-
        autoencoder_config = self.autoencoder.get_config()
        self.normalizer = AudioLatentNormalizer(
            AudioPatchifier(
@ -168,18 +135,12 @@ class AudioVAE(torch.nn.Module):
            n_fft=autoencoder_config["n_fft"],
        )

-        self.device_manager = ModelDeviceManager(self)
-
-    def encode(self, audio: dict) -> torch.Tensor:
+    def encode(self, audio, sample_rate=44100) -> torch.Tensor:
        """Encode a waveform dictionary into normalized latent tensors."""

-        waveform = audio["waveform"]
-        waveform_sample_rate = audio["sample_rate"]
+        waveform = audio
+        waveform_sample_rate = sample_rate
        input_device = waveform.device
-        # Ensure that Audio VAE is loaded on the correct device.
-        self.device_manager.ensure_model_loaded()
-
-        waveform = self.device_manager.move_to_load_device(waveform)
        expected_channels = self.autoencoder.encoder.in_channels
        if waveform.shape[1] != expected_channels:
            if waveform.shape[1] == 1:
@ -190,7 +151,7 @@ class AudioVAE(torch.nn.Module):
                )

        mel_spec = self.preprocessor.waveform_to_mel(
-            waveform, waveform_sample_rate, device=self.device_manager.load_device
+            waveform, waveform_sample_rate, device=waveform.device
        )

        latents = self.autoencoder.encode(mel_spec)
@ -204,17 +165,13 @@ class AudioVAE(torch.nn.Module):
        """Decode normalized latent tensors into an audio waveform."""
        original_shape = latents.shape

-        # Ensure that Audio VAE is loaded on the correct device.
-        self.device_manager.ensure_model_loaded()
-
-        latents = self.device_manager.move_to_load_device(latents)
        latents = self.normalizer.denormalize(latents)

        target_shape = self.target_shape_from_latents(original_shape)
        mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)

        waveform = self.run_vocoder(mel_spec)
-        return self.device_manager.move_to_load_device(waveform)
+        return waveform

    def target_shape_from_latents(self, latents_shape):
        batch, _, time, _ = latents_shape
--- a/comfy/ldm/modules/diffusionmodules/openaimodel.py
+++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@ -34,6 +34,16 @@ class TimestepBlock(nn.Module):
 #This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index"
 def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None):
    for layer in ts:
+        if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
+            found_patched = False
+            for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
+                if isinstance(layer, class_type):
+                    x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
+                    found_patched = True
+                    break
+            if found_patched:
+                continue
+
        if isinstance(layer, VideoResBlock):
            x = layer(x, emb, num_video_frames, image_only_indicator)
        elif isinstance(layer, TimestepBlock):
@ -49,15 +59,6 @@ def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, out
        elif isinstance(layer, Upsample):
            x = layer(x, output_shape=output_shape)
        else:
-            if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
-                found_patched = False
-                for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
-                    if isinstance(layer, class_type):
-                        x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
-                        found_patched = True
-                        break
-                if found_patched:
-                    continue
            x = layer(x)
    return x

@ -894,6 +895,12 @@ class UNetModel(nn.Module):
            h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
        h = apply_control(h, control, 'middle')

+        if "middle_block_after_patch" in transformer_patches:
+            patch = transformer_patches["middle_block_after_patch"]
+            for p in patch:
+                out = p({"h": h, "x": x, "emb": emb, "context": context, "y": y,
+                         "timesteps": timesteps, "transformer_options": transformer_options})
+                h = out["h"]

        for id, module in enumerate(self.output_blocks):
            transformer_options["block"] = ("output", id)
@ -905,8 +912,9 @@ class UNetModel(nn.Module):
                for p in patch:
                    h, hsp = p(h, hsp, transformer_options)

-            h = th.cat([h, hsp], dim=1)
-            del hsp
+            if hsp is not None:
+                h = th.cat([h, hsp], dim=1)
+                del hsp
            if len(hs) > 0:
                output_shape = hs[-1].shape
            else:
--- a/comfy/ldm/supir/init.py
+++ b/comfy/ldm/supir/init.py
--- a/comfy/ldm/supir/supir_modules.py
+++ b/comfy/ldm/supir/supir_modules.py
@ -0,0 +1,226 @@
+import torch
+import torch.nn as nn
+
+from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
+from comfy.ldm.modules.diffusionmodules.openaimodel import Downsample, TimestepEmbedSequential, ResBlock, SpatialTransformer
+from comfy.ldm.modules.attention import optimized_attention
+
+
+class ZeroSFT(nn.Module):  # adapter: group-normalizes h, then modulates it with gamma/beta predicted from control features c
+    def __init__(self, label_nc, norm_nc, concat_channels=0, dtype=None, device=None, operations=None):
+        super().__init__()
+
+        ks = 3
+        pw = ks // 2  # padding of 1 for the 3x3 convs below
+
+        self.param_free_norm = operations.GroupNorm(32, norm_nc + concat_channels, dtype=dtype, device=device)
+
+        nhidden = 128  # hidden width feeding both the gamma and the beta head
+
+        self.mlp_shared = nn.Sequential(
+            operations.Conv2d(label_nc, nhidden, kernel_size=ks, padding=pw, dtype=dtype, device=device),
+            nn.SiLU()
+        )
+        self.zero_mul = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)  # yields gamma in forward()
+        self.zero_add = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)  # yields beta in forward()
+
+        self.zero_conv = operations.Conv2d(label_nc, norm_nc, 1, 1, 0, dtype=dtype, device=device)  # maps c to norm_nc channels; its output is added to h in forward()
+        self.pre_concat = bool(concat_channels != 0)  # True: h_ori is concatenated before the norm; False: after the modulation
+
+    def forward(self, c, h, h_ori=None, control_scale=1):
+        if h_ori is not None and self.pre_concat:
+            h_raw = torch.cat([h_ori, h], dim=1)  # un-adapted reference that the result is blended against at the end
+        else:
+            h_raw = h
+
+        h = h + self.zero_conv(c)
+        if h_ori is not None and self.pre_concat:
+            h = torch.cat([h_ori, h], dim=1)
+        actv = self.mlp_shared(c)
+        gamma = self.zero_mul(actv)
+        beta = self.zero_add(actv)
+        h = self.param_free_norm(h)
+        h = torch.addcmul(h + beta, h, gamma)  # (h + beta) + h * gamma == h * (1 + gamma) + beta
+        if h_ori is not None and not self.pre_concat:
+            h = torch.cat([h_ori, h], dim=1)
+        return torch.lerp(h_raw, h, control_scale)  # h_raw + control_scale * (h - h_raw)
+
+
+class _CrossAttnInner(nn.Module):
+    """Inner cross-attention module matching the state_dict layout of the original CrossAttention."""
+    def __init__(self, query_dim, context_dim, heads, dim_head, dtype=None, device=None, operations=None):
+        super().__init__()
+        inner_dim = dim_head * heads  # combined width of all heads
+        self.heads = heads
+        self.to_q = operations.Linear(query_dim, inner_dim, bias=False, dtype=dtype, device=device)  # queries are projected from x
+        self.to_k = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)  # keys and values are projected from context
+        self.to_v = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
+        self.to_out = nn.Sequential(  # single Linear kept inside a Sequential, so its parameters live under to_out.0.*
+            operations.Linear(inner_dim, query_dim, dtype=dtype, device=device),
+        )
+
+    def forward(self, x, context):
+        q = self.to_q(x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+        return self.to_out(optimized_attention(q, k, v, self.heads))  # attention result projected back to query_dim
+
+
+class ZeroCrossAttn(nn.Module):  # residual cross-attention adapter: x attends to context, the result is scaled and added back to x
+    def __init__(self, context_dim, query_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        heads = query_dim // 64  # head count follows from the fixed head width below
+        dim_head = 64
+        self.attn = _CrossAttnInner(query_dim, context_dim, heads, dim_head, dtype=dtype, device=device, operations=operations)
+        self.norm1 = operations.GroupNorm(32, query_dim, dtype=dtype, device=device)  # applied to x (query side)
+        self.norm2 = operations.GroupNorm(32, context_dim, dtype=dtype, device=device)  # applied to context (key/value side)
+
+    def forward(self, context, x, control_scale=1):
+        b, c, h, w = x.shape
+        x_in = x
+
+        x = self.attn(
+            self.norm1(x).flatten(2).transpose(1, 2),  # (b, c, h, w) -> (b, h*w, c)
+            self.norm2(context).flatten(2).transpose(1, 2),  # same flattening for context
+        ).transpose(1, 2).unflatten(2, (h, w))  # restore the h x w grid of x
+
+        return x_in + x * control_scale
+
+
+class GLVControl(nn.Module):
+    """SUPIR's Guided Latent Vector control encoder. Truncated UNet (input + middle blocks only)."""
+    def __init__(
+        self,
+        in_channels=4,
+        model_channels=320,
+        num_res_blocks=2,
+        attention_resolutions=(4, 2),
+        channel_mult=(1, 2, 4),
+        num_head_channels=64,
+        transformer_depth=(1, 2, 10),
+        context_dim=2048,
+        adm_in_channels=2816,
+        use_linear_in_transformer=True,
+        use_checkpoint=False,
+        dtype=None,
+        device=None,
+        operations=None,
+        **kwargs,  # accepted but not read anywhere in this constructor
+    ):
+        super().__init__()
+        self.model_channels = model_channels
+        time_embed_dim = model_channels * 4
+
+        self.time_embed = nn.Sequential(
+            operations.Linear(model_channels, time_embed_dim, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
+        )
+
+        self.label_emb = nn.Sequential(  # nested Sequential, so parameters live under label_emb.0.*
+            nn.Sequential(
+                operations.Linear(adm_in_channels, time_embed_dim, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
+            )
+        )
+
+        self.input_blocks = nn.ModuleList([  # starts with the stem conv; the loop below appends the rest
+            TimestepEmbedSequential(
+                operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
+            )
+        ])
+        ch = model_channels  # running channel count of the features
+        ds = 1  # running downsample factor, checked against attention_resolutions
+        for level, mult in enumerate(channel_mult):
+            for nr in range(num_res_blocks):
+                layers = [
+                    ResBlock(ch, time_embed_dim, 0, out_channels=mult * model_channels,
+                             dtype=dtype, device=device, operations=operations)
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:  # a transformer is added only at the listed downsample factors
+                    num_heads = ch // num_head_channels
+                    layers.append(
+                        SpatialTransformer(ch, num_heads, num_head_channels,
+                                           depth=transformer_depth[level], context_dim=context_dim,
+                                           use_linear=use_linear_in_transformer,
+                                           use_checkpoint=use_checkpoint,
+                                           dtype=dtype, device=device, operations=operations)
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+            if level != len(channel_mult) - 1:  # every level except the last ends with a Downsample
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        Downsample(ch, True, out_channels=ch, dtype=dtype, device=device, operations=operations)
+                    )
+                )
+                ds *= 2
+
+        num_heads = ch // num_head_channels
+        self.middle_block = TimestepEmbedSequential(  # ResBlock -> SpatialTransformer -> ResBlock at the final channel count
+            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
+            SpatialTransformer(ch, num_heads, num_head_channels,
+                               depth=transformer_depth[-1], context_dim=context_dim,
+                               use_linear=use_linear_in_transformer,
+                               use_checkpoint=use_checkpoint,
+                               dtype=dtype, device=device, operations=operations),
+            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
+        )
+
+        self.input_hint_block = TimestepEmbedSequential(  # applied to x in forward(); its output is added after the first input block
+            operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
+        )
+
+    def forward(self, x, timesteps, xt, context=None, y=None, **kwargs):
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
+        emb = self.time_embed(t_emb) + self.label_emb(y)  # timestep embedding plus the embedding of y
+
+        guided_hint = self.input_hint_block(x, emb, context)
+
+        hs = []  # collected features: one per input block, then the middle block
+        h = xt  # the input blocks run on xt; x only enters through guided_hint
+        for module in self.input_blocks:
+            if guided_hint is not None:  # only taken on the first iteration
+                h = module(h, emb, context)
+                h += guided_hint
+                guided_hint = None  # consumed; all later blocks take the else branch
+            else:
+                h = module(h, emb, context)
+            hs.append(h)
+        h = self.middle_block(h, emb, context)
+        hs.append(h)  # the middle block output is the last entry
+        return hs
+
+
+class SUPIR(nn.Module):
+    """
+    SUPIR model containing GLVControl (control encoder) and project_modules (adapters).
+    State dict keys match the original SUPIR checkpoint layout:
+      control_model.*           -> GLVControl
+      project_modules.*         -> nn.ModuleList of ZeroSFT/ZeroCrossAttn
+    """
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+
+        self.control_model = GLVControl(dtype=dtype, device=device, operations=operations)  # built with GLVControl's default hyperparameters
+
+        project_channel_scale = 2
+        cond_output_channels = [320] * 4 + [640] * 3 + [1280] * 3  # 10 entries, one per ZeroSFT
+        project_channels = [int(c * project_channel_scale) for c in [160] * 4 + [320] * 3 + [640] * 3]  # evaluates to the same values as cond_output_channels
+        concat_channels = [320] * 2 + [640] * 3 + [1280] * 4 + [0]  # the final 0 makes the last ZeroSFT use pre_concat=False
+        cross_attn_insert_idx = [6, 3]  # inserted in this order: the later insert at 3 shifts the first one to index 7
+
+        self.project_modules = nn.ModuleList()
+        for i in range(len(cond_output_channels)):
+            self.project_modules.append(ZeroSFT(
+                project_channels[i], cond_output_channels[i],
+                concat_channels=concat_channels[i],
+                dtype=dtype, device=device, operations=operations,
+            ))
+
+        for i in cross_attn_insert_idx:
+            self.project_modules.insert(i, ZeroCrossAttn(
+                cond_output_channels[i], concat_channels[i],  # positional order is (context_dim, query_dim)
+                dtype=dtype, device=device, operations=operations,
+            ))
--- a/comfy/ldm/supir/supir_patch.py
+++ b/comfy/ldm/supir/supir_patch.py
@ -0,0 +1,103 @@
+import torch
+from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample
+
+
+class SUPIRPatch:
+    """
+    Holds GLVControl (control encoder) + project_modules (ZeroSFT/ZeroCrossAttn adapters).
+    Runs GLVControl lazily on first patch invocation per step, applies adapters through
+    middle_block_after_patch, output_block_merge_patch, and forward_timestep_embed_patch.
+    """
+    SIGMA_MAX = 14.6146  # sigmas at or above this value give the full strength_start in _get_control_scale()
+
+    def __init__(self, model_patch, project_modules, hint_latent, strength_start, strength_end):
+        self.model_patch = model_patch           # CoreModelPatcher wrapping GLVControl
+        self.project_modules = project_modules   # nn.ModuleList of ZeroSFT/ZeroCrossAttn
+        self.hint_latent = hint_latent           # encoded LQ image latent
+        self.strength_start = strength_start
+        self.strength_end = strength_end
+        self.cached_features = None  # control_model outputs for the current forward pass
+        self.adapter_idx = 0  # position in project_modules; counts down during a pass
+        self.control_idx = 0  # position in cached_features; counts down during a pass
+        self.current_control_idx = 0  # control_idx as of the latest output_block() call; read by pre_upsample()
+        self.active = True  # set to False by middle_after() when the control scale is <= 0
+
+    def _ensure_features(self, kwargs):
+        """Run GLVControl on first call per step, cache results."""
+        if self.cached_features is not None:
+            return
+        x = kwargs["x"]
+        b = x.shape[0]
+        hint = self.hint_latent.to(device=x.device, dtype=x.dtype)
+        if hint.shape[0] != b:
+            hint = hint.expand(b, -1, -1, -1) if hint.shape[0] == 1 else hint.repeat((b + hint.shape[0] - 1) // hint.shape[0], 1, 1, 1)[:b]  # broadcast a single hint, otherwise tile and truncate to b
+        self.cached_features = self.model_patch.model.control_model(
+            hint, kwargs["timesteps"], x,
+            kwargs["context"], kwargs["y"]
+        )
+        self.adapter_idx = len(self.project_modules) - 1  # adapters and features are consumed from last to first
+        self.control_idx = len(self.cached_features) - 1
+
+    def _get_control_scale(self, kwargs):
+        if self.strength_start == self.strength_end:  # constant strength, nothing to interpolate
+            return self.strength_end
+        sigma = kwargs["transformer_options"].get("sigmas")
+        if sigma is None:
+            return self.strength_end
+        s = sigma[0].item() if sigma.dim() > 0 else sigma.item()  # only the first sigma is used
+        t = min(s / self.SIGMA_MAX, 1.0)
+        return t * (self.strength_start - self.strength_end) + self.strength_end  # linear: strength_end at t=0, strength_start at t=1
+
+    def middle_after(self, kwargs):
+        """middle_block_after_patch: run GLVControl lazily, apply last adapter after middle block."""
+        self.cached_features = None  # reset from previous step
+        self.current_scale = self._get_control_scale(kwargs)  # NOTE(review): current_scale is first assigned here, not in __init__
+        self.active = self.current_scale > 0
+        if not self.active:
+            return {"h": kwargs["h"]}  # inactive: hand h back unchanged
+        self._ensure_features(kwargs)
+        h = kwargs["h"]
+        h = self.project_modules[self.adapter_idx](
+            self.cached_features[self.control_idx], h, control_scale=self.current_scale
+        )
+        self.adapter_idx -= 1
+        self.control_idx -= 1
+        return {"h": h}
+
+    def output_block(self, h, hsp, transformer_options):
+        """output_block_patch: ZeroSFT adapter fusion replaces cat([h, hsp]). Returns (h, None) to skip cat."""
+        if not self.active:
+            return h, hsp
+        self.current_control_idx = self.control_idx  # remembered so pre_upsample() can reuse the same feature
+        h = self.project_modules[self.adapter_idx](
+            self.cached_features[self.control_idx], hsp, h, control_scale=self.current_scale  # skip tensor hsp is the second argument, h the third
+        )
+        self.adapter_idx -= 1
+        self.control_idx -= 1
+        return h, None
+
+    def pre_upsample(self, layer, x, emb, context, transformer_options, output_shape, *args, **kw):
+        """forward_timestep_embed_patch for Upsample: extra cross-attn adapter before upsample."""
+        block_type, _ = transformer_options["block"]
+        if block_type == "output" and self.active and self.cached_features is not None:
+            x = self.project_modules[self.adapter_idx](
+                self.cached_features[self.current_control_idx], x, control_scale=self.current_scale
+            )
+            self.adapter_idx -= 1  # only the adapter index advances here; control_idx is left alone
+        return layer(x, output_shape=output_shape)
+
+    def to(self, device_or_dtype):
+        if isinstance(device_or_dtype, torch.device):  # any argument that is not a torch.device is ignored
+            self.cached_features = None
+            if self.hint_latent is not None:
+                self.hint_latent = self.hint_latent.to(device_or_dtype)
+        return self
+
+    def models(self):
+        return [self.model_patch]
+
+    def register(self, model_patcher):
+        """Register all patches on a cloned model patcher."""
+        model_patcher.set_model_patch(self.middle_after, "middle_block_after_patch")
+        model_patcher.set_model_output_block_patch(self.output_block)
+        model_patcher.set_model_patch((Upsample, self.pre_upsample), "forward_timestep_embed_patch")  # registered as a (layer class, handler) pair
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -506,6 +506,10 @@ class ModelPatcher:
    def set_model_noise_refiner_patch(self, patch):
        self.set_model_patch(patch, "noise_refiner")

+    def set_model_middle_block_after_patch(self, patch):  # shorthand for set_model_patch under the "middle_block_after_patch" key
+        self.set_model_patch(patch, "middle_block_after_patch")
+
+
    def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
        rope_options = self.model_options["transformer_options"].get("rope_options", {})
        rope_options["scale_x"] = scale_x
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -12,6 +12,7 @@ from .ldm.cascade.stage_c_coder import StageC_coder
 from .ldm.audio.autoencoder import AudioOobleckVAE
 import comfy.ldm.genmo.vae.model
 import comfy.ldm.lightricks.vae.causal_video_autoencoder
+import comfy.ldm.lightricks.vae.audio_vae
 import comfy.ldm.cosmos.vae
 import comfy.ldm.wan.vae
 import comfy.ldm.wan.vae2_2
@ -805,6 +806,24 @@ class VAE:
                    self.downscale_index_formula = (4, 8, 8)
                    self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
                    self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
+            elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio
+                sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder."})
+                self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata)
+                self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
+                self.latent_channels = self.first_stage_model.latent_channels
+                self.audio_sample_rate_output = self.first_stage_model.output_sample_rate
+                self.autoencoder = self.first_stage_model.autoencoder  # TODO: remove hack for ltxv custom nodes
+                self.output_channels = 2
+                self.pad_channel_value = "replicate"
+                self.upscale_ratio = 4096
+                self.downscale_ratio = 4096
+                self.latent_dim = 2
+                self.process_output = lambda audio: audio
+                self.process_input = lambda audio: audio
+                self.working_dtypes = [torch.float32]
+                self.disable_offload = True
+                self.extra_1d_channel = 16
            else:
                logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
                self.first_stage_model = None
--- a/comfy_api_nodes/apis/bytedance.py
+++ b/comfy_api_nodes/apis/bytedance.py
@ -158,10 +158,17 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
    ("Custom", None, None),
 ]

-# Seedance 2.0 reference video pixel count limits per model.
+# Seedance 2.0 reference video pixel count limits per model and output resolution.
 SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
-    "dreamina-seedance-2-0-260128": {"min": 409_600, "max": 927_408},
-    "dreamina-seedance-2-0-fast-260128": {"min": 409_600, "max": 927_408},
+    "dreamina-seedance-2-0-260128": {
+        "480p": {"min": 409_600, "max": 927_408},
+        "720p": {"min": 409_600, "max": 927_408},
+        "1080p": {"min": 409_600, "max": 2_073_600},
+    },
+    "dreamina-seedance-2-0-fast-260128": {
+        "480p": {"min": 409_600, "max": 927_408},
+        "720p": {"min": 409_600, "max": 927_408},
+    },
 }

 # The time in this dictionary are given for 10 seconds duration.
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@ -35,6 +35,7 @@ from comfy_api_nodes.util import (
    get_number_of_images,
    image_tensor_pair_to_batch,
    poll_op,
+    resize_video_to_pixel_budget,
    sync_op,
    upload_audio_to_comfyapi,
    upload_image_to_comfyapi,
@ -69,9 +70,12 @@ DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-2504
 logger = logging.getLogger(__name__)


-def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None:
-    """Validate reference video pixel count against Seedance 2.0 model limits."""
-    limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
+def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: str, index: int) -> None:
+    """Validate reference video pixel count against Seedance 2.0 model limits for the selected resolution."""
+    model_limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
+    if not model_limits:
+        return
+    limits = model_limits.get(resolution)
    if not limits:
        return
    try:
@ -1373,6 +1377,14 @@ def _seedance2_reference_inputs(resolutions: list[str]):
                min=0,
            ),
        ),
+        IO.Boolean.Input(
+            "auto_downscale",
+            default=False,
+            advanced=True,
+            optional=True,
+            tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
+            "for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
+        ),
    ]


@ -1480,10 +1492,23 @@ class ByteDance2ReferenceNode(IO.ComfyNode):

        model_id = SEEDANCE_MODELS[model["model"]]
        has_video_input = len(reference_videos) > 0
+
+        if model.get("auto_downscale") and reference_videos:
+            max_px = (
+                SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {})
+                .get(model["resolution"], {})
+                .get("max")
+            )
+            if max_px:
+                for key in reference_videos:
+                    reference_videos[key] = resize_video_to_pixel_budget(
+                        reference_videos[key], max_px
+                    )
+
        total_video_duration = 0.0
        for i, key in enumerate(reference_videos, 1):
            video = reference_videos[key]
-            _validate_ref_video_pixels(video, model_id, i)
+            _validate_ref_video_pixels(video, model_id, model["resolution"], i)
            try:
                dur = video.get_duration()
                if dur < 1.8:
--- a/comfy_api_nodes/nodes_openai.py
+++ b/comfy_api_nodes/nodes_openai.py
@ -363,7 +363,7 @@ class OpenAIGPTImage1(IO.ComfyNode):
    def define_schema(cls):
        return IO.Schema(
            node_id="OpenAIGPTImage1",
-            display_name="OpenAI GPT Image 1.5",
+            display_name="OpenAI GPT Image 2",
            category="api node/image/OpenAI",
            description="Generates images synchronously via OpenAI's GPT Image endpoint.",
            inputs=[
@ -427,8 +427,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
                ),
                IO.Combo.Input(
                    "model",
-                    options=["gpt-image-1", "gpt-image-1.5"],
-                    default="gpt-image-1.5",
+                    options=["gpt-image-1", "gpt-image-1.5", 'gpt-image-2'],
+                    default="gpt-image-2",
                    optional=True,
                ),
            ],
@ -487,6 +487,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
            price_extractor = calculate_tokens_price_image_1
        elif model == "gpt-image-1.5":
            price_extractor = calculate_tokens_price_image_1_5
+        elif model == "gpt-image-2":
+            price_extractor = calculate_tokens_price_image_1_5
        else:
            raise ValueError(f"Unknown model: {model}")

--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@ -24,8 +24,9 @@ from comfy_api_nodes.util import (
 AVERAGE_DURATION_VIDEO_GEN = 32
 MODELS_MAP = {
    "veo-2.0-generate-001": "veo-2.0-generate-001",
-    "veo-3.1-generate": "veo-3.1-generate-preview",
-    "veo-3.1-fast-generate": "veo-3.1-fast-generate-preview",
+    "veo-3.1-generate": "veo-3.1-generate-001",
+    "veo-3.1-fast-generate": "veo-3.1-fast-generate-001",
+    "veo-3.1-lite": "veo-3.1-lite-generate-001",
    "veo-3.0-generate-001": "veo-3.0-generate-001",
    "veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001",
 }
@ -247,17 +248,8 @@ class VeoVideoGenerationNode(IO.ComfyNode):
        raise Exception("Video generation completed but no video was returned")


-class Veo3VideoGenerationNode(VeoVideoGenerationNode):
-    """
-    Generates videos from text prompts using Google's Veo 3 API.
-
-    Supported models:
-    - veo-3.0-generate-001
-    - veo-3.0-fast-generate-001
-
-    This node extends the base Veo node with Veo 3 specific features including
-    audio generation and fixed 8-second duration.
-    """
+class Veo3VideoGenerationNode(IO.ComfyNode):
+    """Generates videos from text prompts using Google's Veo 3 API."""

    @classmethod
    def define_schema(cls):
@ -279,6 +271,13 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
                    default="16:9",
                    tooltip="Aspect ratio of the output video",
                ),
+                IO.Combo.Input(
+                    "resolution",
+                    options=["720p", "1080p", "4k"],
+                    default="720p",
+                    tooltip="Output video resolution. 4K is not available for veo-3.1-lite and veo-3.0 models.",
+                    optional=True,
+                ),
                IO.String.Input(
                    "negative_prompt",
                    multiline=True,
@ -289,11 +288,11 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
                IO.Int.Input(
                    "duration_seconds",
                    default=8,
-                    min=8,
+                    min=4,
                    max=8,
-                    step=1,
+                    step=2,
                    display_mode=IO.NumberDisplay.number,
-                    tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)",
+                    tooltip="Duration of the output video in seconds",
                    optional=True,
                ),
                IO.Boolean.Input(
@ -332,10 +331,10 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
                    options=[
                        "veo-3.1-generate",
                        "veo-3.1-fast-generate",
+                        "veo-3.1-lite",
                        "veo-3.0-generate-001",
                        "veo-3.0-fast-generate-001",
                    ],
-                    default="veo-3.0-generate-001",
                    tooltip="Veo 3 model to use for video generation",
                    optional=True,
                ),
@ -356,21 +355,111 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio"]),
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "resolution", "duration_seconds"]),
                expr="""
                (
                  $m := widgets.model;
+                  $r := widgets.resolution;
                  $a := widgets.generate_audio;
-                  ($contains($m,"veo-3.0-fast-generate-001") or $contains($m,"veo-3.1-fast-generate"))
-                    ? {"type":"usd","usd": ($a ? 1.2 : 0.8)}
-                    : ($contains($m,"veo-3.0-generate-001") or $contains($m,"veo-3.1-generate"))
-                      ? {"type":"usd","usd": ($a ? 3.2 : 1.6)}
-                      : {"type":"range_usd","min_usd":0.8,"max_usd":3.2}
+                  $seconds := widgets.duration_seconds;
+                  $pps :=
+                    $contains($m, "lite")
+                      ? ($r = "1080p" ? ($a ? 0.08 : 0.05) : ($a ? 0.05 : 0.03))
+                    : $contains($m, "3.1-fast")
+                      ? ($r = "4k" ? ($a ? 0.30 : 0.25) : $r = "1080p" ? ($a ? 0.12 : 0.10) : ($a ? 0.10 : 0.08))
+                    : $contains($m, "3.1-generate")
+                      ? ($r = "4k" ? ($a ? 0.60 : 0.40) : ($a ? 0.40 : 0.20))
+                    : $contains($m, "3.0-fast")
+                      ? ($a ? 0.15 : 0.10)
+                    : ($a ? 0.40 : 0.20);
+                  {"type":"usd","usd": $pps * $seconds}
                )
                """,
            ),
        )

+    @classmethod
+    async def execute(  # Submit a Veo 3 generation request, poll until it is done, return the video.
+        cls,
+        prompt,
+        aspect_ratio="16:9",
+        resolution="720p",
+        negative_prompt="",
+        duration_seconds=8,
+        enhance_prompt=True,  # accepted but not read below; "enhancePrompt" is always sent as True
+        person_generation="ALLOW",
+        seed=0,
+        image=None,
+        model="veo-3.0-generate-001",
+        generate_audio=False,
+    ):
+        if resolution == "4k" and ("lite" in model or "3.0" in model):  # reject before any API call
+            raise Exception("4K resolution is not supported by the veo-3.1-lite or veo-3.0 models.")
+
+        model = MODELS_MAP[model]  # widget name -> API model id; KeyError for an unmapped name
+
+        instances = [{"prompt": prompt}]
+        if image is not None:
+            image_base64 = tensor_to_base64_string(image)
+            if image_base64:  # attach the image only when encoding produced data
+                instances[0]["image"] = {"bytesBase64Encoded": image_base64, "mimeType": "image/png"}
+
+        parameters = {
+            "aspectRatio": aspect_ratio,
+            "personGeneration": person_generation,
+            "durationSeconds": duration_seconds,
+            "enhancePrompt": True,
+            "generateAudio": generate_audio,
+        }
+        if negative_prompt:  # omitted from the request when empty
+            parameters["negativePrompt"] = negative_prompt
+        if seed > 0:  # a seed of 0 is not sent
+            parameters["seed"] = seed
+        if "veo-3.1" in model:  # resolution is forwarded only for veo-3.1 model ids
+            parameters["resolution"] = resolution
+
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"),
+            response_model=VeoGenVidResponse,
+            data=VeoGenVidRequest(
+                instances=instances,
+                parameters=parameters,
+            ),
+        )
+
+        poll_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"),
+            response_model=VeoGenVidPollResponse,
+            status_extractor=lambda r: "completed" if r.done else "pending",
+            data=VeoGenVidPollRequest(operationName=initial_response.name),  # poll the operation started above
+            poll_interval=9.0,
+            estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
+        )
+
+        if poll_response.error:  # operation finished but reported an error
+            raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})")
+
+        response = poll_response.response
+        filtered_count = response.raiMediaFilteredCount
+        if filtered_count:  # a non-zero filtered count is treated as a hard failure
+            reasons = response.raiMediaFilteredReasons or []
+            reason_part = f": {reasons[0]}" if reasons else ""  # only the first reason is reported
+            raise Exception(
+                f"Content blocked by Google's Responsible AI filters{reason_part} "
+                f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)."
+            )
+
+        if response.videos:
+            video = response.videos[0]  # only the first returned video is used
+            if video.bytesBase64Encoded:  # inline bytes take precedence over a URI
+                return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
+            if video.gcsUri:
+                return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
+            raise Exception("Video returned but no data or URL was provided")
+        raise Exception("Video generation completed but no video was returned")
+

 class Veo3FirstLastFrameNode(IO.ComfyNode):

@ -394,7 +483,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
                    default="",
                    tooltip="Negative text prompt to guide what to avoid in the video",
                ),
-                IO.Combo.Input("resolution", options=["720p", "1080p"]),
+                IO.Combo.Input("resolution", options=["720p", "1080p", "4k"]),
                IO.Combo.Input(
                    "aspect_ratio",
                    options=["16:9", "9:16"],
@ -424,8 +513,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
                IO.Image.Input("last_frame", tooltip="End frame"),
                IO.Combo.Input(
                    "model",
-                    options=["veo-3.1-generate", "veo-3.1-fast-generate"],
-                    default="veo-3.1-fast-generate",
+                    options=["veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.1-lite"],
                ),
                IO.Boolean.Input(
                    "generate_audio",
@ -443,26 +531,20 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration"]),
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration", "resolution"]),
                expr="""
                (
-                  $prices := {
-                    "veo-3.1-fast-generate": { "audio": 0.15, "no_audio": 0.10 },
-                    "veo-3.1-generate":      { "audio": 0.40, "no_audio": 0.20 }
-                  };
                  $m := widgets.model;
-                  $ga := (widgets.generate_audio = "true");
+                  $r := widgets.resolution;
+                  $ga := widgets.generate_audio;
                  $seconds := widgets.duration;
-                  $modelKey :=
-                    $contains($m, "veo-3.1-fast-generate") ? "veo-3.1-fast-generate" :
-                    $contains($m, "veo-3.1-generate")      ? "veo-3.1-generate" :
-                    "";
-                  $audioKey := $ga ? "audio" : "no_audio";
-                  $modelPrices := $lookup($prices, $modelKey);
-                  $pps := $lookup($modelPrices, $audioKey);
-                  ($pps != null)
-                    ? {"type":"usd","usd": $pps * $seconds}
-                    : {"type":"range_usd","min_usd": 0.4, "max_usd": 3.2}
+                  $pps :=
+                    $contains($m, "lite")
+                      ? ($r = "1080p" ? ($ga ? 0.08 : 0.05) : ($ga ? 0.05 : 0.03))
+                    : $contains($m, "fast")
+                      ? ($r = "4k" ? ($ga ? 0.30 : 0.25) : $r = "1080p" ? ($ga ? 0.12 : 0.10) : ($ga ? 0.10 : 0.08))
+                    : ($r = "4k" ? ($ga ? 0.60 : 0.40) : ($ga ? 0.40 : 0.20));
+                  {"type":"usd","usd": $pps * $seconds}
                )
                """,
            ),
@ -482,6 +564,9 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
        model: str,
        generate_audio: bool,
    ):
+        if "lite" in model and resolution == "4k":
+            raise Exception("4K resolution is not supported by the veo-3.1-lite model.")
+
        model = MODELS_MAP[model]
        initial_response = await sync_op(
            cls,
@ -519,7 +604,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
            data=VeoGenVidPollRequest(
                operationName=initial_response.name,
            ),
-            poll_interval=5.0,
+            poll_interval=9.0,
            estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
        )

--- a/comfy_api_nodes/util/init.py
+++ b/comfy_api_nodes/util/init.py
@ -19,6 +19,7 @@ from .conversions import (
    image_tensor_pair_to_batch,
    pil_to_bytesio,
    resize_mask_to_image,
+    resize_video_to_pixel_budget,
    tensor_to_base64_string,
    tensor_to_bytesio,
    tensor_to_pil,
@ -90,6 +91,7 @@ __all__ = [
    "image_tensor_pair_to_batch",
    "pil_to_bytesio",
    "resize_mask_to_image",
+    "resize_video_to_pixel_budget",
    "tensor_to_base64_string",
    "tensor_to_bytesio",
    "tensor_to_pil",
--- a/comfy_api_nodes/util/conversions.py
+++ b/comfy_api_nodes/util/conversions.py
@ -129,22 +129,38 @@ def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO:
    return img_byte_arr


+def _compute_downscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
+    """Compute even (w, h) that fit within ``total_pixels``; return None when no downscale is needed.
+
+    Both sides are scaled by the same factor, floored, clamped to at least 2 and rounded down to an even
+    value (many codecs require divisible-by-2), so the aspect ratio may drift by a fraction of a percent.
+    """
+    src_area = src_w * src_h
+    if src_area <= total_pixels:
+        return None
+    factor = math.sqrt(total_pixels / src_area)
+    dims = []
+    for side in (src_w, src_h):
+        scaled = max(2, int(side * factor))
+        dims.append(scaled - scaled % 2)  # round down to an even value
+    return dims[0], dims[1]
+
+
 def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor:
-    """Downscale input image tensor to roughly the specified total pixels."""
+    """Downscale input image tensor to roughly the specified total pixels.
+
+    Output dimensions are rounded down to even values so that the result is guaranteed to fit within ``total_pixels``
+    and is compatible with codecs that require even dimensions (e.g. yuv420p).
+    """
    samples = image.movedim(-1, 1)
-    total = int(total_pixels)
-    scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
-    if scale_by >= 1:
+    dims = _compute_downscale_dims(samples.shape[3], samples.shape[2], int(total_pixels))
+    if dims is None:
        return image
-    width = round(samples.shape[3] * scale_by)
-    height = round(samples.shape[2] * scale_by)
-
-    s = common_upscale(samples, width, height, "lanczos", "disabled")
-    s = s.movedim(1, -1)
-    return s
+    new_w, new_h = dims
+    return common_upscale(samples, new_w, new_h, "lanczos", "disabled").movedim(1, -1)


-def downscale_image_tensor_by_max_side(image: torch.Tensor, *,  max_side: int) -> torch.Tensor:
+def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
    """Downscale input image tensor so the largest dimension is at most max_side pixels."""
    samples = image.movedim(-1, 1)
    height, width = samples.shape[2], samples.shape[3]
@ -399,6 +415,72 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video:
        raise RuntimeError(f"Failed to trim video: {str(e)}") from e


+def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video:
+    """Shrink ``video`` so that width * height does not exceed ``total_pixels``.
+
+    A video that already fits is returned as the same object, untouched. Otherwise it is re-encoded at
+    the reduced size, keeping frame rate, duration and audio; aspect ratio may drift slightly (even dims).
+    """
+    width, height = video.get_dimensions()
+    target = _compute_downscale_dims(width, height, total_pixels)
+    if target is not None:
+        return _apply_video_scale(video, target)
+    return video
+
+
+def _apply_video_scale(video: Input.Video, scale_dims: tuple[int, int]) -> Input.Video:
+    """Re-encode ``video`` scaled to ``scale_dims`` with a single decode/encode pass.
+
+    The video stream is re-encoded as H.264 (yuv420p) into an in-memory MP4; the first audio stream,
+    if any, is re-encoded as AAC. Context managers close both containers on every exit path.
+
+    Args:
+        video: Source video, read via ``get_stream_source()`` and ``get_frame_rate()``.
+        scale_dims: Target ``(width, height)`` in pixels.
+
+    Raises:
+        RuntimeError: If opening, decoding, encoding or muxing fails.
+    """
+    out_w, out_h = scale_dims
+    output_buffer = BytesIO()
+
+    try:
+        source = video.get_stream_source()
+        with av.open(source, mode="r") as src, av.open(output_buffer, mode="w", format="mp4") as dst:
+            video_stream = dst.add_stream("h264", rate=video.get_frame_rate())
+            video_stream.width = out_w
+            video_stream.height = out_h
+            video_stream.pix_fmt = "yuv420p"
+
+            # Mirror the first input audio stream (if present) as an AAC output stream.
+            audio_in = next((s for s in src.streams if isinstance(s, av.AudioStream)), None)
+            audio_stream = None
+            if audio_in is not None:
+                audio_stream = dst.add_stream("aac", rate=audio_in.sample_rate)
+                audio_stream.sample_rate = audio_in.sample_rate
+                audio_stream.layout = audio_in.layout
+
+            for frame in src.decode(video=0):
+                frame = frame.reformat(width=out_w, height=out_h, format="yuv420p")
+                for packet in video_stream.encode(frame):
+                    dst.mux(packet)
+            for packet in video_stream.encode():  # flush the video encoder
+                dst.mux(packet)
+
+            if audio_stream is not None:
+                src.seek(0)  # rewind: the video pass already read through the input
+                for audio_frame in src.decode(audio=0):
+                    for packet in audio_stream.encode(audio_frame):
+                        dst.mux(packet)
+                for packet in audio_stream.encode():  # flush the audio encoder
+                    dst.mux(packet)
+    except Exception as e:
+        raise RuntimeError(f"Failed to resize video: {str(e)}") from e
+
+    output_buffer.seek(0)
+    return InputImpl.VideoFromFile(output_buffer)
+
+
 def _f32_pcm(wav: torch.Tensor) -> torch.Tensor:
    """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file."""
    if wav.dtype.is_floating_point:
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@ -3,136 +3,136 @@ from typing_extensions import override

 import comfy.model_management
 import node_helpers
-from comfy_api.latest import ComfyExtension, io
+from comfy_api.latest import ComfyExtension, IO


-class TextEncodeAceStepAudio(io.ComfyNode):
+class TextEncodeAceStepAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
-        return io.Schema(
+        return IO.Schema(
            node_id="TextEncodeAceStepAudio",
            category="conditioning",
            inputs=[
-                io.Clip.Input("clip"),
-                io.String.Input("tags", multiline=True, dynamic_prompts=True),
-                io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
-                io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
+                IO.Clip.Input("clip"),
+                IO.String.Input("tags", multiline=True, dynamic_prompts=True),
+                IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
+                IO.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
            ],
-            outputs=[io.Conditioning.Output()],
+            outputs=[IO.Conditioning.Output()],
        )

    @classmethod
-    def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput:
+    def execute(cls, clip, tags, lyrics, lyrics_strength) -> IO.NodeOutput:
        tokens = clip.tokenize(tags, lyrics=lyrics)
        conditioning = clip.encode_from_tokens_scheduled(tokens)
        conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
-        return io.NodeOutput(conditioning)
+        return IO.NodeOutput(conditioning)

-class TextEncodeAceStepAudio15(io.ComfyNode):
+class TextEncodeAceStepAudio15(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
-        return io.Schema(
+        return IO.Schema(
            node_id="TextEncodeAceStepAudio1.5",
            category="conditioning",
            inputs=[
-                io.Clip.Input("clip"),
-                io.String.Input("tags", multiline=True, dynamic_prompts=True),
-                io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
-                io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
-                io.Int.Input("bpm", default=120, min=10, max=300),
-                io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
-                io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
-                io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
-                io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
-                io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
-                io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
-                io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
-                io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
-                io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
-                io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
+                IO.Clip.Input("clip"),
+                IO.String.Input("tags", multiline=True, dynamic_prompts=True),
+                IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
+                IO.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
+                IO.Int.Input("bpm", default=120, min=10, max=300),
+                IO.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
+                IO.Combo.Input("timesignature", options=['2', '3', '4', '6']),
+                IO.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
+                IO.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
+                IO.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
+                IO.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
+                IO.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
+                IO.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
+                IO.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
+                IO.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
            ],
-            outputs=[io.Conditioning.Output()],
+            outputs=[IO.Conditioning.Output()],
        )

    @classmethod
-    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput:
+    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> IO.NodeOutput:
        tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p)
        conditioning = clip.encode_from_tokens_scheduled(tokens)
-        return io.NodeOutput(conditioning)
+        return IO.NodeOutput(conditioning)


-class EmptyAceStepLatentAudio(io.ComfyNode):
+class EmptyAceStepLatentAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
-        return io.Schema(
+        return IO.Schema(
            node_id="EmptyAceStepLatentAudio",
            display_name="Empty Ace Step 1.0 Latent Audio",
            category="latent/audio",
            inputs=[
-                io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
-                io.Int.Input(
+                IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
+                IO.Int.Input(
                    "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
                ),
            ],
-            outputs=[io.Latent.Output()],
+            outputs=[IO.Latent.Output()],
        )

    @classmethod
-    def execute(cls, seconds, batch_size) -> io.NodeOutput:
+    def execute(cls, seconds, batch_size) -> IO.NodeOutput:
        length = int(seconds * 44100 / 512 / 8)
        latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
-        return io.NodeOutput({"samples": latent, "type": "audio"})
+        return IO.NodeOutput({"samples": latent, "type": "audio"})


-class EmptyAceStep15LatentAudio(io.ComfyNode):
+class EmptyAceStep15LatentAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
-        return io.Schema(
+        return IO.Schema(
            node_id="EmptyAceStep1.5LatentAudio",
            display_name="Empty Ace Step 1.5 Latent Audio",
            category="latent/audio",
            inputs=[
-                io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
-                io.Int.Input(
+                IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
+                IO.Int.Input(
                    "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
                ),
            ],
-            outputs=[io.Latent.Output()],
+            outputs=[IO.Latent.Output()],
        )

    @classmethod
-    def execute(cls, seconds, batch_size) -> io.NodeOutput:
+    def execute(cls, seconds, batch_size) -> IO.NodeOutput:
        length = round((seconds * 48000 / 1920))
        latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
-        return io.NodeOutput({"samples": latent, "type": "audio"})
+        return IO.NodeOutput({"samples": latent, "type": "audio"})

-class ReferenceAudio(io.ComfyNode):
+class ReferenceAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
-        return io.Schema(
+        return IO.Schema(
            node_id="ReferenceTimbreAudio",
            display_name="Reference Audio",
            category="advanced/conditioning/audio",
            is_experimental=True,
            description="This node sets the reference audio for ace step 1.5",
            inputs=[
-                io.Conditioning.Input("conditioning"),
-                io.Latent.Input("latent", optional=True),
+                IO.Conditioning.Input("conditioning"),
+                IO.Latent.Input("latent", optional=True),
            ],
            outputs=[
-                io.Conditioning.Output(),
+                IO.Conditioning.Output(),
            ]
        )

    @classmethod
-    def execute(cls, conditioning, latent=None) -> io.NodeOutput:
+    def execute(cls, conditioning, latent=None) -> IO.NodeOutput:
        if latent is not None:
            conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_audio_timbre_latents": [latent["samples"]]}, append=True)
-        return io.NodeOutput(conditioning)
+        return IO.NodeOutput(conditioning)

 class AceExtension(ComfyExtension):
    @override
-    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        return [
            TextEncodeAceStepAudio,
            EmptyAceStepLatentAudio,
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@ -104,7 +104,7 @@ def vae_decode_audio(vae, samples, tile=None, overlap=None):
    std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
    std[std < 1.0] = 1.0
    audio /= std
-    vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
+    vae_sample_rate = getattr(vae, "audio_sample_rate_output", getattr(vae, "audio_sample_rate", 44100))
    return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}


--- a/comfy_extras/nodes_lt_audio.py
+++ b/comfy_extras/nodes_lt_audio.py
@ -3,9 +3,8 @@ import comfy.utils
 import comfy.model_management
 import torch

-from comfy.ldm.lightricks.vae.audio_vae import AudioVAE
 from comfy_api.latest import ComfyExtension, io
-
+from comfy_extras.nodes_audio import VAEEncodeAudio

 class LTXVAudioVAELoader(io.ComfyNode):
    @classmethod
@ -28,10 +27,14 @@ class LTXVAudioVAELoader(io.ComfyNode):
    def execute(cls, ckpt_name: str) -> io.NodeOutput:
        ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
        sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
-        return io.NodeOutput(AudioVAE(sd, metadata))
+        sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder.", "vocoder.": "vocoder."}, filter_keys=True)
+        vae = comfy.sd.VAE(sd=sd, metadata=metadata)
+        vae.throw_exception_if_invalid()
+
+        return io.NodeOutput(vae)


-class LTXVAudioVAEEncode(io.ComfyNode):
+class LTXVAudioVAEEncode(VAEEncodeAudio):
    @classmethod
    def define_schema(cls) -> io.Schema:
        return io.Schema(
@ -50,15 +53,8 @@ class LTXVAudioVAEEncode(io.ComfyNode):
        )

    @classmethod
-    def execute(cls, audio, audio_vae: AudioVAE) -> io.NodeOutput:
-        audio_latents = audio_vae.encode(audio)
-        return io.NodeOutput(
-            {
-                "samples": audio_latents,
-                "sample_rate": int(audio_vae.sample_rate),
-                "type": "audio",
-            }
-        )
+    def execute(cls, audio, audio_vae) -> io.NodeOutput:
+        return super().execute(audio_vae, audio)


 class LTXVAudioVAEDecode(io.ComfyNode):
@ -80,12 +76,12 @@ class LTXVAudioVAEDecode(io.ComfyNode):
        )

    @classmethod
-    def execute(cls, samples, audio_vae: AudioVAE) -> io.NodeOutput:
+    def execute(cls, samples, audio_vae) -> io.NodeOutput:
        audio_latent = samples["samples"]
        if audio_latent.is_nested:
            audio_latent = audio_latent.unbind()[-1]
-        audio = audio_vae.decode(audio_latent).to(audio_latent.device)
-        output_audio_sample_rate = audio_vae.output_sample_rate
+        audio = audio_vae.decode(audio_latent).movedim(-1, 1).to(audio_latent.device)
+        output_audio_sample_rate = audio_vae.first_stage_model.output_sample_rate
        return io.NodeOutput(
            {
                "waveform": audio,
@ -143,17 +139,17 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
        frames_number: int,
        frame_rate: int,
        batch_size: int,
-        audio_vae: AudioVAE,
+        audio_vae,
    ) -> io.NodeOutput:
        """Generate empty audio latents matching the reference pipeline structure."""

        assert audio_vae is not None, "Audio VAE model is required"

        z_channels = audio_vae.latent_channels
-        audio_freq = audio_vae.latent_frequency_bins
-        sampling_rate = int(audio_vae.sample_rate)
+        audio_freq = audio_vae.first_stage_model.latent_frequency_bins
+        sampling_rate = int(audio_vae.first_stage_model.sample_rate)

-        num_audio_latents = audio_vae.num_of_latents_from_frames(frames_number, frame_rate)
+        num_audio_latents = audio_vae.first_stage_model.num_of_latents_from_frames(frames_number, frame_rate)

        audio_latents = torch.zeros(
            (batch_size, z_channels, num_audio_latents, audio_freq),
--- a/comfy_extras/nodes_model_patch.py
+++ b/comfy_extras/nodes_model_patch.py
@ -7,7 +7,10 @@ import comfy.model_management
 import comfy.ldm.common_dit
 import comfy.latent_formats
 import comfy.ldm.lumina.controlnet
+import comfy.ldm.supir.supir_modules
 from comfy.ldm.wan.model_multitalk import WanMultiTalkAttentionBlock, MultiTalkAudioProjModel
+from comfy_api.latest import io
+from comfy.ldm.supir.supir_patch import SUPIRPatch


 class BlockWiseControlBlock(torch.nn.Module):
@ -266,6 +269,27 @@ class ModelPatchLoader:
                    out_dim=sd["audio_proj.norm.weight"].shape[0],
                    device=comfy.model_management.unet_offload_device(),
                    operations=comfy.ops.manual_cast)
+        elif 'model.control_model.input_hint_block.0.weight' in sd or 'control_model.input_hint_block.0.weight' in sd:
+            prefix_replace = {}
+            if 'model.control_model.input_hint_block.0.weight' in sd:
+                prefix_replace["model.control_model."] = "control_model."
+                prefix_replace["model.diffusion_model.project_modules."] = "project_modules."
+            else:
+                prefix_replace["control_model."] = "control_model."
+                prefix_replace["project_modules."] = "project_modules."
+
+            # Extract denoise_encoder weights before filter_keys discards them
+            de_prefix = "first_stage_model.denoise_encoder."
+            denoise_encoder_sd = {}
+            for k in list(sd.keys()):
+                if k.startswith(de_prefix):
+                    denoise_encoder_sd[k[len(de_prefix):]] = sd.pop(k)
+
+            sd = comfy.utils.state_dict_prefix_replace(sd, prefix_replace, filter_keys=True)
+            sd.pop("control_model.mask_LQ", None)
+            model = comfy.ldm.supir.supir_modules.SUPIR(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
+            if denoise_encoder_sd:
+                model.denoise_encoder_sd = denoise_encoder_sd

        model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device())
        model.load_state_dict(sd, assign=model_patcher.is_dynamic())
@ -565,9 +589,89 @@ class MultiTalkModelPatch(torch.nn.Module):
        )


+class SUPIRApply(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="SUPIRApply",
+            category="model_patches/supir",
+            is_experimental=True,
+            inputs=[
+                io.Model.Input("model"),
+                io.ModelPatch.Input("model_patch"),
+                io.Vae.Input("vae"),
+                io.Image.Input("image"),
+                io.Float.Input("strength_start", default=1.0, min=0.0, max=10.0, step=0.01,
+                               tooltip="Control strength at the start of sampling (high sigma)."),
+                io.Float.Input("strength_end", default=1.0, min=0.0, max=10.0, step=0.01,
+                               tooltip="Control strength at the end of sampling (low sigma). Linearly interpolated from start."),
+                io.Float.Input("restore_cfg", default=4.0, min=0.0, max=20.0, step=0.1, advanced=True,
+                               tooltip="Pulls denoised output toward the input latent. Higher = stronger fidelity to input. 0 to disable."),
+                io.Float.Input("restore_cfg_s_tmin", default=0.05, min=0.0, max=1.0, step=0.01, advanced=True,
+                               tooltip="Sigma threshold below which restore_cfg is disabled."),
+            ],
+            outputs=[io.Model.Output()],
+        )
+
+    @classmethod
+    def _encode_with_denoise_encoder(cls, vae, model_patch, image):
+        """Encode using denoise_encoder weights from SUPIR checkpoint if available."""
+        denoise_sd = getattr(model_patch.model, 'denoise_encoder_sd', None)
+        if not denoise_sd:
+            return vae.encode(image)
+
+        # Clone VAE patcher, apply denoise_encoder weights to clone, encode
+        orig_patcher = vae.patcher
+        vae.patcher = orig_patcher.clone()
+        patches = {f"encoder.{k}": (v,) for k, v in denoise_sd.items()}
+        vae.patcher.add_patches(patches, strength_patch=1.0, strength_model=0.0)
+        try:
+            return vae.encode(image)
+        finally:
+            vae.patcher = orig_patcher
+
+    @classmethod
+    def execute(cls, *, model: io.Model.Type, model_patch: io.ModelPatch.Type, vae: io.Vae.Type, image: io.Image.Type,
+                strength_start: float, strength_end: float, restore_cfg: float, restore_cfg_s_tmin: float) -> io.NodeOutput:
+        model_patched = model.clone()
+        hint_latent = model.get_model_object("latent_format").process_in(
+            cls._encode_with_denoise_encoder(vae, model_patch, image[:, :, :, :3]))
+        patch = SUPIRPatch(model_patch, model_patch.model.project_modules, hint_latent, strength_start, strength_end)
+        patch.register(model_patched)
+
+        if restore_cfg > 0.0:
+            # Round-trip to match original pipeline: decode hint, re-encode with regular VAE
+            latent_format = model.get_model_object("latent_format")
+            decoded = vae.decode(latent_format.process_out(hint_latent))
+            x_center = latent_format.process_in(vae.encode(decoded[:, :, :, :3]))
+            sigma_max = 14.6146
+
+            def restore_cfg_function(args):
+                denoised = args["denoised"]
+                sigma = args["sigma"]
+                if sigma.dim() > 0:
+                    s = sigma[0].item()
+                else:
+                    s = sigma.item()
+                if s > restore_cfg_s_tmin:
+                    ref = x_center.to(device=denoised.device, dtype=denoised.dtype)
+                    b = denoised.shape[0]
+                    if ref.shape[0] != b:
+                        ref = ref.expand(b, -1, -1, -1) if ref.shape[0] == 1 else ref.repeat((b + ref.shape[0] - 1) // ref.shape[0], 1, 1, 1)[:b]
+                    sigma_val = sigma.view(-1, 1, 1, 1) if sigma.dim() > 0 else sigma
+                    d_center = denoised - ref
+                    denoised = denoised - d_center * ((sigma_val / sigma_max) ** restore_cfg)
+                return denoised
+
+            model_patched.set_model_sampler_post_cfg_function(restore_cfg_function)
+
+        return io.NodeOutput(model_patched)
+
+
 NODE_CLASS_MAPPINGS = {
    "ModelPatchLoader": ModelPatchLoader,
    "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet,
    "ZImageFunControlnet": ZImageFunControlnet,
    "USOStyleReference": USOStyleReference,
+    "SUPIRApply": SUPIRApply,
 }
--- a/comfy_extras/nodes_post_processing.py
+++ b/comfy_extras/nodes_post_processing.py
@ -6,6 +6,7 @@ from PIL import Image
 import math
 from enum import Enum
 from typing import TypedDict, Literal
+import kornia

 import comfy.utils
 import comfy.model_management
@ -660,6 +661,228 @@ class BatchImagesMasksLatentsNode(io.ComfyNode):
        return io.NodeOutput(batched)


+class ColorTransfer(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ColorTransfer",
+            category="image/postprocessing",
+            description="Match the colors of one image to another using various algorithms.",
+            search_aliases=["color match", "color grading", "color correction", "match colors", "color transform", "mkl", "reinhard", "histogram"],
+            inputs=[
+                io.Image.Input("image_target", tooltip="Image(s) to apply the color transform to."),
+                io.Image.Input("image_ref", optional=True, tooltip="Reference image(s) to match colors to. If not provided, processing is skipped"),
+                io.Combo.Input("method", options=['reinhard_lab', 'mkl_lab', 'histogram'],),
+                io.DynamicCombo.Input("source_stats",
+                    tooltip="per_frame: each frame matched to image_ref individually. uniform: pool stats across all source frames as baseline, match to image_ref. target_frame: use one chosen frame as the baseline for the transform to image_ref, applied uniformly to all frames (preserves relative differences)",
+                    options=[
+                        io.DynamicCombo.Option("per_frame", []),
+                        io.DynamicCombo.Option("uniform", []),
+                        io.DynamicCombo.Option("target_frame", [
+                            io.Int.Input("target_index", default=0, min=0, max=10000,
+                                tooltip="Frame index used as the source baseline for computing the transform to image_ref"),
+                        ]),
+                    ]),
+                io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
+            ],
+            outputs=[
+                io.Image.Output(display_name="image"),
+            ],
+        )
+
+    @staticmethod
+    def _to_lab(images, i, device):
+        return kornia.color.rgb_to_lab(
+            images[i:i+1].to(device, dtype=torch.float32).permute(0, 3, 1, 2))
+
+    @staticmethod
+    def _pool_stats(images, device, is_reinhard, eps):
+        """Two-pass pooled mean + std/cov across all frames."""
+        N, C = images.shape[0], images.shape[3]
+        HW = images.shape[1] * images.shape[2]
+        mean = torch.zeros(C, 1, device=device, dtype=torch.float32)
+        for i in range(N):
+            mean += ColorTransfer._to_lab(images, i, device).view(C, -1).mean(dim=-1, keepdim=True)
+        mean /= N
+        acc = torch.zeros(C, 1 if is_reinhard else C, device=device, dtype=torch.float32)
+        for i in range(N):
+            centered = ColorTransfer._to_lab(images, i, device).view(C, -1) - mean
+            if is_reinhard:
+                acc += (centered * centered).mean(dim=-1, keepdim=True)
+            else:
+                acc += centered @ centered.T / HW
+        if is_reinhard:
+            return mean, torch.sqrt(acc / N).clamp_min_(eps)
+        return mean, acc / N
+
+    @staticmethod
+    def _frame_stats(lab_flat, hw, is_reinhard, eps):
+        """Per-frame mean + std/cov."""
+        mean = lab_flat.mean(dim=-1, keepdim=True)
+        if is_reinhard:
+            return mean, lab_flat.std(dim=-1, keepdim=True, unbiased=False).clamp_min_(eps)
+        centered = lab_flat - mean
+        return mean, centered @ centered.T / hw
+
+    @staticmethod
+    def _mkl_matrix(cov_s, cov_r, eps):
+        """Compute MKL 3x3 transform matrix from source and ref covariances."""
+        eig_val_s, eig_vec_s = torch.linalg.eigh(cov_s)
+        sqrt_val_s = torch.sqrt(eig_val_s.clamp_min(0)).clamp_min_(eps)
+
+        scaled_V = eig_vec_s * sqrt_val_s.unsqueeze(0)
+        mid = scaled_V.T @ cov_r @ scaled_V
+        eig_val_m, eig_vec_m = torch.linalg.eigh(mid)
+        sqrt_m = torch.sqrt(eig_val_m.clamp_min(0))
+
+        inv_sqrt_s = 1.0 / sqrt_val_s
+        inv_scaled_V = eig_vec_s * inv_sqrt_s.unsqueeze(0)
+        M_half = (eig_vec_m * sqrt_m.unsqueeze(0)) @ eig_vec_m.T
+        return inv_scaled_V @ M_half @ inv_scaled_V.T
+
+    @staticmethod
+    def _histogram_lut(src, ref, bins=256):
+        """Build per-channel LUT from source and ref histograms. src/ref: (C, HW) in [0,1]."""
+        s_bins = (src * (bins - 1)).long().clamp(0, bins - 1)
+        r_bins = (ref * (bins - 1)).long().clamp(0, bins - 1)
+        s_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
+        r_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
+        ones_s = torch.ones_like(src)
+        ones_r = torch.ones_like(ref)
+        s_hist.scatter_add_(1, s_bins, ones_s)
+        r_hist.scatter_add_(1, r_bins, ones_r)
+        s_cdf = s_hist.cumsum(1)
+        s_cdf = s_cdf / s_cdf[:, -1:]
+        r_cdf = r_hist.cumsum(1)
+        r_cdf = r_cdf / r_cdf[:, -1:]
+        return torch.searchsorted(r_cdf, s_cdf).clamp_max_(bins - 1).float() / (bins - 1)
+
+    @classmethod
+    def _pooled_cdf(cls, images, device, num_bins=256):
+        """Build pooled CDF across all frames, one frame at a time."""
+        C = images.shape[3]
+        hist = torch.zeros(C, num_bins, device=device, dtype=torch.float32)
+        for i in range(images.shape[0]):
+            frame = images[i].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
+            bins = (frame * (num_bins - 1)).long().clamp(0, num_bins - 1)
+            hist.scatter_add_(1, bins, torch.ones_like(frame))
+        cdf = hist.cumsum(1)
+        return cdf / cdf[:, -1:]
+
+    @classmethod
+    def _build_histogram_transform(cls, image_target, image_ref, device, stats_mode, target_index, B):
+        """Build per-frame or uniform LUT transform for histogram mode."""
+        if stats_mode == 'per_frame':
+            return None  # LUT computed per-frame in the apply loop
+
+        r_cdf = cls._pooled_cdf(image_ref, device)
+        if stats_mode == 'target_frame':
+            ti = min(target_index, B - 1)
+            s_cdf = cls._pooled_cdf(image_target[ti:ti+1], device)
+        else:
+            s_cdf = cls._pooled_cdf(image_target, device)
+        return torch.searchsorted(r_cdf, s_cdf).clamp_max_(255).float() / 255.0
+
+    @classmethod
+    def _build_lab_transform(cls, image_target, image_ref, device, stats_mode, target_index, is_reinhard):
+        """Build transform parameters for Lab-based methods. Returns a transform function."""
+        eps = 1e-6
+        B, H, W, C = image_target.shape
+        B_ref = image_ref.shape[0]
+        single_ref = B_ref == 1
+        HW = H * W
+        HW_ref = image_ref.shape[1] * image_ref.shape[2]
+
+        # Precompute ref stats
+        if single_ref or stats_mode in ('uniform', 'target_frame'):
+            ref_mean, ref_sc = cls._pool_stats(image_ref, device, is_reinhard, eps)
+
+        # Uniform/target_frame: precompute single affine transform
+        if stats_mode in ('uniform', 'target_frame'):
+            if stats_mode == 'target_frame':
+                ti = min(target_index, B - 1)
+                s_lab = cls._to_lab(image_target, ti, device).view(C, -1)
+                s_mean, s_sc = cls._frame_stats(s_lab, HW, is_reinhard, eps)
+            else:
+                s_mean, s_sc = cls._pool_stats(image_target, device, is_reinhard, eps)
+
+            if is_reinhard:
+                scale = ref_sc / s_sc
+                offset = ref_mean - scale * s_mean
+                return lambda src_flat, **_: src_flat * scale + offset
+            T = cls._mkl_matrix(s_sc, ref_sc, eps)
+            offset = ref_mean - T @ s_mean
+            return lambda src_flat, **_: T @ src_flat + offset
+
+        # per_frame
+        def per_frame_transform(src_flat, frame_idx):
+            s_mean, s_sc = cls._frame_stats(src_flat, HW, is_reinhard, eps)
+
+            if single_ref:
+                r_mean, r_sc = ref_mean, ref_sc
+            else:
+                ri = min(frame_idx, B_ref - 1)
+                r_mean, r_sc = cls._frame_stats(cls._to_lab(image_ref, ri, device).view(C, -1), HW_ref, is_reinhard, eps)
+
+            centered = src_flat - s_mean
+            if is_reinhard:
+                return centered * (r_sc / s_sc) + r_mean
+            T = cls._mkl_matrix(centered @ centered.T / HW, r_sc, eps)
+            return T @ centered + r_mean
+
+        return per_frame_transform
+
+    @classmethod
+    def execute(cls, image_target, image_ref, method, source_stats, strength=1.0) -> io.NodeOutput:
+        stats_mode = source_stats["source_stats"]
+        target_index = source_stats.get("target_index", 0)
+
+        if strength == 0 or image_ref is None:
+            return io.NodeOutput(image_target)
+
+        device = comfy.model_management.get_torch_device()
+        intermediate_device = comfy.model_management.intermediate_device()
+        intermediate_dtype = comfy.model_management.intermediate_dtype()
+
+        B, H, W, C = image_target.shape
+        B_ref = image_ref.shape[0]
+        pbar = comfy.utils.ProgressBar(B)
+        out = torch.empty(B, H, W, C, device=intermediate_device, dtype=intermediate_dtype)
+
+        if method == 'histogram':
+            uniform_lut = cls._build_histogram_transform(
+                image_target, image_ref, device, stats_mode, target_index, B)
+
+            for i in range(B):
+                src = image_target[i].to(device, dtype=torch.float32).permute(2, 0, 1)
+                src_flat = src.reshape(C, -1)
+                if uniform_lut is not None:
+                    lut = uniform_lut
+                else:
+                    ri = min(i, B_ref - 1)
+                    ref = image_ref[ri].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
+                    lut = cls._histogram_lut(src_flat, ref)
+                bin_idx = (src_flat * 255).long().clamp(0, 255)
+                matched = lut.gather(1, bin_idx).view(C, H, W)
+                result = matched if strength == 1.0 else torch.lerp(src, matched, strength)
+                out[i] = result.permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
+                pbar.update(1)
+        else:
+            transform = cls._build_lab_transform(image_target, image_ref, device, stats_mode, target_index, is_reinhard=method == "reinhard_lab")
+
+            for i in range(B):
+                src_frame = cls._to_lab(image_target, i, device)
+                corrected = transform(src_frame.view(C, -1), frame_idx=i)
+                if strength == 1.0:
+                    result = kornia.color.lab_to_rgb(corrected.view(1, C, H, W))
+                else:
+                    result = kornia.color.lab_to_rgb(torch.lerp(src_frame, corrected.view(1, C, H, W), strength))
+                out[i] = result.squeeze(0).permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
+                pbar.update(1)
+
+        return io.NodeOutput(out)
+
+
 class PostProcessingExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
@ -673,6 +896,7 @@ class PostProcessingExtension(ComfyExtension):
            BatchImagesNode,
            BatchMasksNode,
            BatchLatentsNode,
+            ColorTransfer,
            # BatchImagesMasksLatentsNode,
        ]

--- a/main.py
+++ b/main.py
@ -9,6 +9,8 @@ import folder_paths
 import time
 from comfy.cli_args import args, enables_dynamic_vram
 from app.logger import setup_logger
+setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
+
 from app.assets.seeder import asset_seeder
 from app.assets.services import register_output_files
 import itertools
@ -27,8 +29,6 @@ if __name__ == "__main__":
    os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
    os.environ['DO_NOT_TRACK'] = '1'

-setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
-
 faulthandler.enable(file=sys.stderr, all_threads=False)

 import comfy_aimdo.control
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
-comfyui-frontend-package==1.42.11
-comfyui-workflow-templates==0.9.57
+comfyui-frontend-package==1.42.14
+comfyui-workflow-templates==0.9.59
 comfyui-embedded-docs==0.4.3
 torch
 torchsde
@ -19,7 +19,7 @@ scipy
 tqdm
 psutil
 alembic
-SQLAlchemy
+SQLAlchemy>=2.0
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
Author	SHA1	Message	Date
Jedrzej Kosinski	174f873cce	Merge branch 'master' into mattmiller/veo-4k-model-gating	2026-04-21 22:08:25 -07:00
Daxiong (Lin)	6045c11d8b	chore: update workflow templates to v0.9.59 (#13507)	2026-04-21 20:45:25 -07:00
comfyanonymous	529c80255f	Allow logging in comfy app files. (#13505)	2026-04-21 22:59:31 -04:00
Matt Miller	65eb54a4aa	fix(veo): reject 4K resolution for veo-3.0 models in Veo3VideoGenerationNode — The tooltip on the resolution input states that 4K is not available for veo-3.1-lite or veo-3.0 models, but the execute guard only rejected the lite combination. Selecting 4K with veo-3.0-generate-001 or veo-3.0-fast-generate-001 would fall through and hit the upstream API with an invalid request. Broaden the guard to match the documented behavior and update the error message accordingly.	2026-04-21 19:33:24 -07:00
AustinMroz	43a1263b60	Add gpt-image-2 as version option (#13501)	2026-04-21 17:58:59 -07:00
Comfy Org PR Bot	102773cd2c	Bump comfyui-frontend-package to 1.42.14 (#13493)	2026-04-21 11:35:45 -07:00
Alexander Piskun	1e1d4f1254	[Partner Nodes] added 4K resolution for Veo models; added Veo 3 Lite model (#13330) — squashed commits: (1) feat(api nodes): added 4K resolution for Veo models; added Veo 3 Lite model (Signed-off-by: bigcat88 <bigcat88@icloud.com>); (2) increase poll_interval from 5 to 9. Trailers: Signed-off-by: bigcat88 <bigcat88@icloud.com>; Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>	2026-04-21 11:27:35 -07:00
Jukka Seppänen	eb22225387	Support standalone LTXV audio VAEs (#13499)	2026-04-21 10:46:37 -07:00
Alexander Piskun	b38dd0ff23	feat(api-nodes): add automatic downscaling of videos for ByteDance 2 nodes (#13465)	2026-04-21 10:45:10 -07:00
comfyanonymous	ad94d47221	Make the ltx audio vae more native. (#13486)	2026-04-21 11:02:42 -04:00
Comfy Org PR Bot	e75f775ae8	Bump comfyui-frontend-package to 1.42.12 (#13489)	2026-04-21 00:43:11 -07:00
comfyanonymous	c514890325	Refactor io to IO in nodes_ace.py (#13485)	2026-04-20 21:59:26 -04:00
Octopus	543e9fba64	fix: pin SQLAlchemy>=2.0 in requirements.txt (fixes #13036) (#13316)	2026-04-20 15:30:23 -07:00
comfyanonymous	fc5f4a996b	Add link to Intel portable to Readme. (#13477)	2026-04-19 20:26:12 -04:00
Abdul Rehman	138571da95	fix: append directory type annotation to internal files endpoint response (#13078) (#13305)	2026-04-18 23:21:22 -04:00
comfyanonymous	3d816db07f	Some optimizations to make Ernie inference a bit faster. (#13472)	2026-04-18 23:02:29 -04:00
Jukka Seppänen	b9dedea57d	feat: SUPIR model support (CORE-17) (#13250)	2026-04-18 23:02:01 -04:00