Compare commits

...

13 Commits

Author SHA1 Message Date
7139d6d93f ComfyUI version 0.3.51 2025-08-20 03:15:30 -04:00
2f52e8f05f Bump template to 0.1.62 (#9419)
* Bump template to 0.1.61

* Bump template to 0.1.62
2025-08-20 03:15:09 -04:00
8d38ea3bbf Fix bf16 precision issue with qwen image embeddings. (#9441) 2025-08-20 02:58:54 -04:00
5a8f502db5 Disable prompt weights for qwen. (#9438) 2025-08-20 01:08:11 -04:00
7cd2c4bd6a Qwen rotary embeddings should now match reference code. (#9437) 2025-08-20 00:45:27 -04:00
dfa791eb4b Rope fix for qwen vl. (#9435) 2025-08-19 20:47:42 -04:00
bddd69618b Change the TextEncodeQwenImageEdit node to use logic closer to reference. (#9432) 2025-08-19 16:49:01 -04:00
54d8fdbed0 feat(api-nodes): add Vidu Video nodes (#9368) 2025-08-19 16:30:06 -04:00
d844d8b13b api_nodes: added release version of google's models (#9304) 2025-08-19 16:29:24 -04:00
07a927517c api_nodes: add GPT-5 series models (#9325) 2025-08-19 16:29:01 -04:00
f16a70ba67 api_nodes: add MinimaxHailuoVideoNode node (#9262) 2025-08-19 16:28:27 -04:00
36b5127fd3 api_nodes: add kling-v2-1 and v2-1-master (#9257) 2025-08-19 16:28:07 -04:00
4977f203fa P2 of qwen edit model. (#9412)
* P2 of qwen edit model.

* Typo.

* Fix normal qwen.

* Fix.

* Make the TextEncodeQwenImageEdit also set the ref latent.

If you don't want it to set the ref latent and want to use the
ReferenceLatent node with your custom latent instead, just disconnect the
VAE.
2025-08-18 22:38:34 -04:00
21 changed files with 1456 additions and 34 deletions

View File

@ -97,7 +97,7 @@ class CLIPTextModel_(torch.nn.Module):
self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)
-def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32, embeds_info=[]):
if embeds is not None:
x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
else:

View File

@ -347,10 +347,10 @@ class QwenImageTransformer2DModel(nn.Module):
h_offset = ((h_offset + (patch_size // 2)) // patch_size)
w_offset = ((w_offset + (patch_size // 2)) // patch_size)
-img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
+img_ids = torch.zeros((h_len, w_len, 3), device=x.device)
img_ids[:, :, 0] = img_ids[:, :, 1] + index
-img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
-img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
+img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2)
+img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2)
return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape
def forward(
@ -396,10 +396,11 @@ class QwenImageTransformer2DModel(nn.Module):
hidden_states = torch.cat([hidden_states, kontext], dim=1)
img_ids = torch.cat([img_ids, kontext_ids], dim=1)
-txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size)))
-txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3)
+txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2))
+txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3)
ids = torch.cat((txt_ids, img_ids), dim=1)
image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype)
del ids, txt_ids, img_ids
hidden_states = self.img_in(hidden_states)
encoder_hidden_states = self.txt_norm(encoder_hidden_states)
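
The changes above center the spatial rope coordinates so they straddle zero instead of starting at it, and the txt_start change halves the offset where text ids begin. A minimal standalone sketch of the height axis (illustrative values, plain PyTorch):

import torch

# Height-axis image position ids, before and after the centering fix.
h_len, h_offset = 4, 0
pos = torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len)

before = pos                # tensor([0., 1., 2., 3.])
after = pos - (h_len // 2)  # tensor([-2., -1., 0., 1.]), centered on the image

# Text ids then start at half the larger spatial extent instead of the full extent.
print(before, after)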

View File

@ -1325,6 +1325,7 @@ class Omnigen2(BaseModel):
class QwenImage(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel)
self.memory_usage_factor_conds = ("ref_latents",)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
@ -1342,3 +1343,10 @@ class QwenImage(BaseModel):
if ref_latents_method is not None:
out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method)
return out
def extra_conds_shapes(self, **kwargs):
out = {}
ref_latents = kwargs.get("reference_latents", None)
if ref_latents is not None:
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
return out
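
extra_conds_shapes reports a synthetic [batch, channels, tokens] shape so the memory estimator can account for reference latents of any size. The arithmetic, worked for a single hypothetical 1x16x128x128 reference latent:

import math
import torch

ref_latents = [torch.zeros(1, 16, 128, 128)]

# Total elements across all reference latents, expressed as a
# 16-channel token count; the same sum/prod as in extra_conds_shapes.
elements = sum(math.prod(a.size()) for a in ref_latents)  # 262144
shape = [1, 16, elements // 16]                           # [1, 16, 16384]
print(shape)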

View File

@ -204,17 +204,19 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32)
index = 0
pad_extra = 0
embeds_info = []
for o in other_embeds:
emb = o[1]
if torch.is_tensor(emb):
emb = {"type": "embedding", "data": emb}
extra = None
emb_type = emb.get("type", None)
if emb_type == "embedding":
emb = emb.get("data", None)
else:
if hasattr(self.transformer, "preprocess_embed"):
-emb = self.transformer.preprocess_embed(emb, device=device)
+emb, extra = self.transformer.preprocess_embed(emb, device=device)
else:
emb = None
@ -229,6 +231,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1)
attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:]
index += emb_shape - 1
embeds_info.append({"type": emb_type, "index": ind, "size": emb_shape, "extra": extra})
else:
index += -1
pad_extra += emb_shape
@ -243,11 +246,11 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
attention_masks.append(attention_mask)
num_tokens.append(sum(attention_mask))
-return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens
+return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens, embeds_info
def forward(self, tokens):
device = self.transformer.get_input_embeddings().weight.device
-embeds, attention_mask, num_tokens = self.process_tokens(tokens, device)
+embeds, attention_mask, num_tokens, embeds_info = self.process_tokens(tokens, device)
attention_mask_model = None
if self.enable_attention_masks:
@ -258,7 +261,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
else:
intermediate_output = self.layer_idx
-outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
+outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32, embeds_info=embeds_info)
if self.layer == "last":
z = outputs[0].float()
@ -531,7 +534,10 @@ class SDTokenizer:
min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding)
text = escape_important(text)
-parsed_weights = token_weights(text, 1.0)
+if kwargs.get("disable_weights", False):
+parsed_weights = [(text, 1.0)]
+else:
+parsed_weights = token_weights(text, 1.0)
# tokenize words
tokens = []
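
Each non-token embedding spliced into the sequence is now recorded in embeds_info and handed through to the transformer's forward. A sketch of one entry as appended above (field names from the diff, values illustrative):

embeds_info_entry = {
    "type": "image",  # emb_type of the spliced-in embedding
    "index": 12,      # position where it was inserted into the token sequence
    "size": 247,      # number of embedding vectors it contributed
    "extra": None,    # side data from preprocess_embed (the vision grid for Qwen VL)
}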

View File

@ -116,7 +116,7 @@ class BertModel_(torch.nn.Module):
self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)
-def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]):
x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype)
mask = None
if attention_mask is not None:

View File

@ -2,12 +2,14 @@ import torch
import torch.nn as nn
from dataclasses import dataclass
from typing import Optional, Any
import math
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.model_management
import comfy.ldm.common_dit
import comfy.model_management
from . import qwen_vl
@dataclass
class Llama2Config:
@ -25,6 +27,7 @@ class Llama2Config:
rms_norm_add = False
mlp_activation = "silu"
qkv_bias = False
rope_dims = None
@dataclass
class Qwen25_3BConfig:
@ -42,6 +45,7 @@ class Qwen25_3BConfig:
rms_norm_add = False
mlp_activation = "silu"
qkv_bias = True
rope_dims = None
@dataclass
class Qwen25_7BVLI_Config:
@ -59,6 +63,7 @@ class Qwen25_7BVLI_Config:
rms_norm_add = False
mlp_activation = "silu"
qkv_bias = True
rope_dims = [16, 24, 24]
@dataclass
class Gemma2_2B_Config:
@ -76,6 +81,7 @@ class Gemma2_2B_Config:
rms_norm_add = True
mlp_activation = "gelu_pytorch_tanh"
qkv_bias = False
rope_dims = None
class RMSNorm(nn.Module):
def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None):
@ -100,24 +106,30 @@ def rotate_half(x):
return torch.cat((-x2, x1), dim=-1)
-def precompute_freqs_cis(head_dim, seq_len, theta, device=None):
+def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None):
theta_numerator = torch.arange(0, head_dim, 2, device=device).float()
inv_freq = 1.0 / (theta ** (theta_numerator / head_dim))
-position_ids = torch.arange(0, seq_len, device=device).unsqueeze(0)
inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos()
sin = emb.sin()
+if rope_dims is not None and position_ids.shape[0] > 1:
+mrope_section = rope_dims * 2
+cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
+sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
+else:
+cos = cos.unsqueeze(1)
+sin = sin.unsqueeze(1)
return (cos, sin)
def apply_rope(xq, xk, freqs_cis):
-cos = freqs_cis[0].unsqueeze(1)
-sin = freqs_cis[1].unsqueeze(1)
+cos = freqs_cis[0]
+sin = freqs_cis[1]
q_embed = (xq * cos) + (rotate_half(xq) * sin)
k_embed = (xk * cos) + (rotate_half(xk) * sin)
return q_embed, k_embed
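
When rope_dims is set and position_ids carries one row per axis, the cos/sin tables are sliced per axis and reassembled. Note that mrope_section = rope_dims * 2 is Python list repetition, so [16, 24, 24] becomes six sections [16, 24, 24, 16, 24, 24], matching the (freqs, freqs) duplication in emb. A runnable sketch with the Qwen2.5-VL dims:

import torch

rope_dims = [16, 24, 24]       # t/h/w rotary dims; the doubled layout sums to head_dim 128
mrope_section = rope_dims * 2  # [16, 24, 24, 16, 24, 24]

cos = torch.randn(3, 10, 128)  # one 128-wide table per position axis (t, h, w)

# Six chunks; i % 3 cycles t, h, w twice across the duplicated halves.
merged = torch.cat(
    [m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))],
    dim=-1,
).unsqueeze(0)
print(merged.shape)  # torch.Size([1, 10, 128])
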
@ -277,7 +289,7 @@ class Llama2_(nn.Module):
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
# self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
-def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]):
if embeds is not None:
x = embeds
else:
@ -286,9 +298,13 @@ class Llama2_(nn.Module):
if self.normalize_in:
x *= self.config.hidden_size ** 0.5
+if position_ids is None:
+position_ids = torch.arange(0, x.shape[1], device=x.device).unsqueeze(0)
freqs_cis = precompute_freqs_cis(self.config.head_dim,
-x.shape[1],
+position_ids,
self.config.rope_theta,
+self.config.rope_dims,
device=x.device)
mask = None
@ -372,8 +388,38 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
self.num_layers = config.num_hidden_layers
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
self.visual = qwen_vl.Qwen2VLVisionTransformer(hidden_size=1280, output_hidden_size=config.hidden_size, device=device, dtype=dtype, ops=operations)
self.dtype = dtype
def preprocess_embed(self, embed, device):
if embed["type"] == "image":
image, grid = qwen_vl.process_qwen2vl_images(embed["data"])
return self.visual(image.to(device, dtype=torch.float32), grid), grid
return None, None
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]):
grid = None
for e in embeds_info:
if e.get("type") == "image":
grid = e.get("extra", None)
position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
start = e.get("index")
position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
end = e.get("size") + start
len_max = int(grid.max()) // 2
start_next = len_max + start
position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device)
position_ids[0, start:end] = start
max_d = int(grid[0][1]) // 2
position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
max_d = int(grid[0][2]) // 2
position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
if grid is None:
position_ids = None
return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids)
class Gemma2_2B(BaseLlama, torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
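
The new forward builds three-axis position ids around each image span: text before the image counts up normally on all axes, the image region freezes the temporal axis and enumerates grid rows and columns on the other two, and text after it resumes past the larger spatial extent. A simplified sketch with illustrative numbers (the diff additionally handles ragged spans via math.ceil and slicing):

import torch

start, grid_h, grid_w = 5, 4, 6          # image embeddings begin at index 5 on a 4x6 merged grid
end = start + grid_h * grid_w
seq_len = end + 3                        # a few trailing text tokens

position_ids = torch.zeros((3, seq_len))
position_ids[:, :start] = torch.arange(0, start)  # plain text: all three axes agree
position_ids[0, start:end] = start                # temporal axis frozen over the image
position_ids[1, start:end] = torch.arange(start, start + grid_h).unsqueeze(1).repeat(1, grid_w).flatten()
position_ids[2, start:end] = torch.arange(start, start + grid_w).unsqueeze(0).repeat(grid_h, 1).flatten()
start_next = start + max(grid_h, grid_w)          # resume after the larger extent
position_ids[:, end:] = torch.arange(start_next, start_next + (seq_len - end))
print(position_ids[:, start:start + 8])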

View File

@ -15,13 +15,27 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer)
self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
-def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs):
+def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs):
if llama_template is None:
-llama_text = self.llama_template.format(text)
+if len(images) > 0:
+llama_text = self.llama_template_images.format(text)
+else:
+llama_text = self.llama_template.format(text)
else:
llama_text = llama_template.format(text)
-return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs)
+tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
+key_name = next(iter(tokens))
+embed_count = 0
+qwen_tokens = tokens[key_name]
+for r in qwen_tokens:
+for i in range(len(r)):
+if r[i][0] == 151655:
+if len(images) > embed_count:
+r[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"},) + r[i][1:]
+embed_count += 1
+return tokens
class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
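
tokenize_with_weights now fills the template, tokenizes with prompt weights disabled, then replaces each <|image_pad|> token (id 151655) with an embedding dict that preprocess_embed later turns into vision features. A toy sketch of that substitution (ids illustrative apart from 151655):

images = ["img0"]  # stand-in for a list of image tensors
embed_count = 0

row = [(100, 1.0), (151655, 1.0), (200, 1.0)]  # (token_id, weight) pairs
for i in range(len(row)):
    if row[i][0] == 151655 and len(images) > embed_count:
        row[i] = ({"type": "image", "data": images[embed_count],
                   "original_type": "image"},) + row[i][1:]
        embed_count += 1
print(row[1])  # ({'type': 'image', ...}, 1.0)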

View File

@ -0,0 +1,428 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple
import math
from comfy.ldm.modules.attention import optimized_attention_for_device
def process_qwen2vl_images(
images: torch.Tensor,
min_pixels: int = 3136,
max_pixels: int = 12845056,
patch_size: int = 14,
temporal_patch_size: int = 2,
merge_size: int = 2,
image_mean: list = None,
image_std: list = None,
):
if image_mean is None:
image_mean = [0.48145466, 0.4578275, 0.40821073]
if image_std is None:
image_std = [0.26862954, 0.26130258, 0.27577711]
batch_size, height, width, channels = images.shape
device = images.device
# dtype = images.dtype
images = images.permute(0, 3, 1, 2)
grid_thw_list = []
img = images[0]
factor = patch_size * merge_size
h_bar = round(height / factor) * factor
w_bar = round(width / factor) * factor
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = max(factor, math.floor(height / beta / factor) * factor)
w_bar = max(factor, math.floor(width / beta / factor) * factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = math.ceil(height * beta / factor) * factor
w_bar = math.ceil(width * beta / factor) * factor
img_resized = F.interpolate(
img.unsqueeze(0),
size=(h_bar, w_bar),
mode='bilinear',
align_corners=False
).squeeze(0)
normalized = img_resized.clone()
for c in range(3):
normalized[c] = (img_resized[c] - image_mean[c]) / image_std[c]
grid_h = h_bar // patch_size
grid_w = w_bar // patch_size
grid_thw = torch.tensor([1, grid_h, grid_w], device=device, dtype=torch.long)
pixel_values = normalized
grid_thw_list.append(grid_thw)
image_grid_thw = torch.stack(grid_thw_list)
grid_t = 1
channel = pixel_values.shape[0]
pixel_values = pixel_values.unsqueeze(0).repeat(2, 1, 1, 1)
patches = pixel_values.reshape(
grid_t,
temporal_patch_size,
channel,
grid_h // merge_size,
merge_size,
patch_size,
grid_w // merge_size,
merge_size,
patch_size,
)
patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
flatten_patches = patches.reshape(
grid_t * grid_h * grid_w,
channel * temporal_patch_size * patch_size * patch_size
)
return flatten_patches, image_grid_thw
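
process_qwen2vl_images snaps the image to a patch_size * merge_size = 28-pixel grid while keeping the area inside [min_pixels, max_pixels]. The rounding, worked for a 1024x1024 input at the defaults:

import math

height = width = 1024
patch_size, merge_size = 14, 2
factor = patch_size * merge_size         # 28
min_pixels, max_pixels = 3136, 12845056

h_bar = round(height / factor) * factor  # round(36.57) * 28 = 1036
w_bar = round(width / factor) * factor   # 1036
assert min_pixels <= h_bar * w_bar <= max_pixels

grid_h, grid_w = h_bar // patch_size, w_bar // patch_size  # 74 x 74 patches
merged = (grid_h // merge_size) * (grid_w // merge_size)   # 37 * 37 = 1369 tokens after merging
print(h_bar, w_bar, grid_h, grid_w, merged)
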
class VisionPatchEmbed(nn.Module):
def __init__(
self,
patch_size: int = 14,
temporal_patch_size: int = 2,
in_channels: int = 3,
embed_dim: int = 3584,
device=None,
dtype=None,
ops=None,
):
super().__init__()
self.patch_size = patch_size
self.temporal_patch_size = temporal_patch_size
self.in_channels = in_channels
self.embed_dim = embed_dim
kernel_size = [temporal_patch_size, patch_size, patch_size]
self.proj = ops.Conv3d(
in_channels,
embed_dim,
kernel_size=kernel_size,
stride=kernel_size,
bias=False,
device=device,
dtype=dtype
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = hidden_states.view(
-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
)
hidden_states = self.proj(hidden_states)
return hidden_states.view(-1, self.embed_dim)
def rotate_half(x):
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb_vision(q, k, cos, sin):
cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
class VisionRotaryEmbedding(nn.Module):
def __init__(self, dim: int, theta: float = 10000.0):
super().__init__()
self.dim = dim
self.theta = theta
def forward(self, seqlen: int, device) -> torch.Tensor:
inv_freq = 1.0 / (self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float, device=device) / self.dim))
seq = torch.arange(seqlen, device=inv_freq.device, dtype=inv_freq.dtype)
freqs = torch.outer(seq, inv_freq)
return freqs
class PatchMerger(nn.Module):
def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2, device=None, dtype=None, ops=None):
super().__init__()
self.hidden_size = context_dim * (spatial_merge_size ** 2)
self.ln_q = ops.RMSNorm(context_dim, eps=1e-6, device=device, dtype=dtype)
self.mlp = nn.Sequential(
ops.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype),
nn.GELU(),
ops.Linear(self.hidden_size, dim, device=device, dtype=dtype),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.ln_q(x).reshape(-1, self.hidden_size)
x = self.mlp(x)
return x
class VisionAttention(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, device=None, dtype=None, ops=None):
super().__init__()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.head_dim = hidden_size // num_heads
self.scaling = self.head_dim ** -0.5
self.qkv = ops.Linear(hidden_size, hidden_size * 3, bias=True, device=device, dtype=dtype)
self.proj = ops.Linear(hidden_size, hidden_size, bias=True, device=device, dtype=dtype)
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
cu_seqlens=None,
optimized_attention=None,
) -> torch.Tensor:
if hidden_states.dim() == 2:
seq_length, _ = hidden_states.shape
batch_size = 1
hidden_states = hidden_states.unsqueeze(0)
else:
batch_size, seq_length, _ = hidden_states.shape
qkv = self.qkv(hidden_states)
qkv = qkv.reshape(batch_size, seq_length, 3, self.num_heads, self.head_dim)
query_states, key_states, value_states = qkv.reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
if position_embeddings is not None:
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
query_states = query_states.transpose(0, 1).unsqueeze(0)
key_states = key_states.transpose(0, 1).unsqueeze(0)
value_states = value_states.transpose(0, 1).unsqueeze(0)
lengths = cu_seqlens[1:] - cu_seqlens[:-1]
splits = [
torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
]
attn_outputs = [
optimized_attention(q, k, v, self.num_heads, skip_reshape=True)
for q, k, v in zip(*splits)
]
attn_output = torch.cat(attn_outputs, dim=1)
attn_output = attn_output.reshape(seq_length, -1)
attn_output = self.proj(attn_output)
return attn_output
class VisionMLP(nn.Module):
def __init__(self, hidden_size: int, intermediate_size: int, device=None, dtype=None, ops=None):
super().__init__()
self.gate_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype)
self.up_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype)
self.down_proj = ops.Linear(intermediate_size, hidden_size, bias=True, device=device, dtype=dtype)
self.act_fn = nn.SiLU()
def forward(self, hidden_state):
return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
class VisionBlock(nn.Module):
def __init__(self, hidden_size: int, intermediate_size: int, num_heads: int, device=None, dtype=None, ops=None):
super().__init__()
self.norm1 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype)
self.norm2 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype)
self.attn = VisionAttention(hidden_size, num_heads, device=device, dtype=dtype, ops=ops)
self.mlp = VisionMLP(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops)
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
cu_seqlens=None,
optimized_attention=None,
) -> torch.Tensor:
residual = hidden_states
hidden_states = self.norm1(hidden_states)
hidden_states = self.attn(hidden_states, position_embeddings, cu_seqlens, optimized_attention)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
class Qwen2VLVisionTransformer(nn.Module):
def __init__(
self,
hidden_size: int = 3584,
output_hidden_size: int = 3584,
intermediate_size: int = 3420,
num_heads: int = 16,
num_layers: int = 32,
patch_size: int = 14,
temporal_patch_size: int = 2,
spatial_merge_size: int = 2,
window_size: int = 112,
device=None,
dtype=None,
ops=None
):
super().__init__()
self.hidden_size = hidden_size
self.patch_size = patch_size
self.spatial_merge_size = spatial_merge_size
self.window_size = window_size
self.fullatt_block_indexes = [7, 15, 23, 31]
self.patch_embed = VisionPatchEmbed(
patch_size=patch_size,
temporal_patch_size=temporal_patch_size,
in_channels=3,
embed_dim=hidden_size,
device=device,
dtype=dtype,
ops=ops,
)
head_dim = hidden_size // num_heads
self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
self.blocks = nn.ModuleList([
VisionBlock(hidden_size, intermediate_size, num_heads, device, dtype, ops)
for _ in range(num_layers)
])
self.merger = PatchMerger(
dim=output_hidden_size,
context_dim=hidden_size,
spatial_merge_size=spatial_merge_size,
device=device,
dtype=dtype,
ops=ops,
)
def get_window_index(self, grid_thw):
window_index = []
cu_window_seqlens = [0]
window_index_id = 0
vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size
for grid_t, grid_h, grid_w in grid_thw:
llm_grid_h = grid_h // self.spatial_merge_size
llm_grid_w = grid_w // self.spatial_merge_size
index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
index_padded = index_padded.reshape(
grid_t,
num_windows_h,
vit_merger_window_size,
num_windows_w,
vit_merger_window_size,
)
index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
grid_t,
num_windows_h * num_windows_w,
vit_merger_window_size,
vit_merger_window_size,
)
seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
index_padded = index_padded.reshape(-1)
index_new = index_padded[index_padded != -100]
window_index.append(index_new + window_index_id)
cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_size * self.spatial_merge_size + cu_window_seqlens[-1]
cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
window_index = torch.cat(window_index, dim=0)
return window_index, cu_window_seqlens
def get_position_embeddings(self, grid_thw, device):
pos_ids = []
for t, h, w in grid_thw:
hpos_ids = torch.arange(h, device=device).unsqueeze(1).expand(-1, w)
hpos_ids = hpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()
wpos_ids = torch.arange(w, device=device).unsqueeze(0).expand(h, -1)
wpos_ids = wpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
pos_ids = torch.cat(pos_ids, dim=0)
max_grid_size = grid_thw[:, 1:].max()
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size, device)
return rotary_pos_emb_full[pos_ids].flatten(1)
def forward(
self,
pixel_values: torch.Tensor,
image_grid_thw: Optional[torch.Tensor] = None,
) -> torch.Tensor:
optimized_attention = optimized_attention_for_device(pixel_values.device, mask=False, small_input=True)
hidden_states = self.patch_embed(pixel_values)
window_index, cu_window_seqlens = self.get_window_index(image_grid_thw)
cu_window_seqlens = torch.tensor(cu_window_seqlens, device=hidden_states.device)
cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
position_embeddings = self.get_position_embeddings(image_grid_thw, hidden_states.device)
seq_len, _ = hidden_states.size()
spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
hidden_states = hidden_states.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1)
hidden_states = hidden_states[window_index, :, :]
hidden_states = hidden_states.reshape(seq_len, -1)
position_embeddings = position_embeddings.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1)
position_embeddings = position_embeddings[window_index, :, :]
position_embeddings = position_embeddings.reshape(seq_len, -1)
position_embeddings = torch.cat((position_embeddings, position_embeddings), dim=-1)
position_embeddings = (position_embeddings.cos(), position_embeddings.sin())
cu_seqlens = torch.repeat_interleave(image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0]).cumsum(
dim=0,
dtype=torch.int32,
)
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
for i, block in enumerate(self.blocks):
if i in self.fullatt_block_indexes:
cu_seqlens_now = cu_seqlens
else:
cu_seqlens_now = cu_window_seqlens
hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention)
hidden_states = self.merger(hidden_states)
return hidden_states

View File

@ -199,7 +199,7 @@ class T5Stack(torch.nn.Module):
self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
# self.dropout = nn.Dropout(config.dropout_rate)
-def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]):
mask = None
if attention_mask is not None:
mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])

View File

@ -1315,6 +1315,7 @@ class KlingTaskStatus(str, Enum):
class KlingTextToVideoModelName(str, Enum):
kling_v1 = 'kling-v1'
kling_v1_6 = 'kling-v1-6'
kling_v2_1_master = 'kling-v2-1-master'
class KlingVideoGenAspectRatio(str, Enum):
@ -1347,6 +1348,8 @@ class KlingVideoGenModelName(str, Enum):
kling_v1_5 = 'kling-v1-5'
kling_v1_6 = 'kling-v1-6'
kling_v2_master = 'kling-v2-master'
kling_v2_1 = 'kling-v2-1'
kling_v2_1_master = 'kling-v2-1-master'
class KlingVideoResult(BaseModel):
@ -1620,13 +1623,14 @@ class MinimaxTaskResultResponse(BaseModel):
task_id: str = Field(..., description='The task ID being queried.')
-class Model(str, Enum):
+class MiniMaxModel(str, Enum):
T2V_01_Director = 'T2V-01-Director'
I2V_01_Director = 'I2V-01-Director'
S2V_01 = 'S2V-01'
I2V_01 = 'I2V-01'
I2V_01_live = 'I2V-01-live'
T2V_01 = 'T2V-01'
Hailuo_02 = 'MiniMax-Hailuo-02'
class SubjectReferenceItem(BaseModel):
@ -1648,7 +1652,7 @@ class MinimaxVideoGenerationRequest(BaseModel):
None,
description='URL or base64 encoding of the first frame image. Required when model is I2V-01, I2V-01-Director, or I2V-01-live.',
)
-model: Model = Field(
+model: MiniMaxModel = Field(
...,
description='Required. ID of model. Options: T2V-01-Director, I2V-01-Director, S2V-01, I2V-01, I2V-01-live, T2V-01',
)
@ -1665,6 +1669,14 @@ class MinimaxVideoGenerationRequest(BaseModel):
None,
description='Only available when model is S2V-01. The model will generate a video based on the subject uploaded through this parameter.',
)
duration: Optional[int] = Field(
None,
description="The length of the output video in seconds."
)
resolution: Optional[str] = Field(
None,
description="The dimensions of the video display. 1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels."
)
class MinimaxVideoGenerationResponse(BaseModel):

View File

@ -46,6 +46,8 @@ class GeminiModel(str, Enum):
gemini_2_5_pro_preview_05_06 = "gemini-2.5-pro-preview-05-06"
gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17"
gemini_2_5_pro = "gemini-2.5-pro"
gemini_2_5_flash = "gemini-2.5-flash"
def get_gemini_endpoint(
@ -97,7 +99,7 @@ class GeminiNode(ComfyNodeABC):
{
"tooltip": "The Gemini model to use for generating responses.",
"options": [model.value for model in GeminiModel],
"default": GeminiModel.gemini_2_5_pro_preview_05_06.value,
"default": GeminiModel.gemini_2_5_pro.value,
},
),
"seed": (

View File

@ -421,6 +421,8 @@ class KlingTextToVideoNode(KlingNodeBase):
"pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"),
"standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"),
"standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"),
"pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"),
"pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"),
}
@classmethod

View File

@ -1,3 +1,4 @@
from inspect import cleandoc
from typing import Union
import logging
import torch
@ -10,7 +11,7 @@ from comfy_api_nodes.apis import (
MinimaxFileRetrieveResponse,
MinimaxTaskResultResponse,
SubjectReferenceItem,
-Model
+MiniMaxModel
)
from comfy_api_nodes.apis.client import (
ApiEndpoint,
@ -84,7 +85,6 @@ class MinimaxTextToVideoNode:
FUNCTION = "generate_video"
CATEGORY = "api node/video/MiniMax"
API_NODE = True
-OUTPUT_NODE = True
async def generate_video(
self,
@ -121,7 +121,7 @@ class MinimaxTextToVideoNode:
response_model=MinimaxVideoGenerationResponse,
),
request=MinimaxVideoGenerationRequest(
-model=Model(model),
+model=MiniMaxModel(model),
prompt=prompt_text,
callback_url=None,
first_frame_image=image_url,
@ -251,7 +251,6 @@ class MinimaxImageToVideoNode(MinimaxTextToVideoNode):
FUNCTION = "generate_video"
CATEGORY = "api node/video/MiniMax"
API_NODE = True
-OUTPUT_NODE = True
class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode):
@ -313,7 +312,181 @@ class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode):
FUNCTION = "generate_video"
CATEGORY = "api node/video/MiniMax"
API_NODE = True
-OUTPUT_NODE = True
class MinimaxHailuoVideoNode:
"""Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model."""
@classmethod
def INPUT_TYPES(s):
return {
"required": {
"prompt_text": (
"STRING",
{
"multiline": True,
"default": "",
"tooltip": "Text prompt to guide the video generation.",
},
),
},
"optional": {
"seed": (
IO.INT,
{
"default": 0,
"min": 0,
"max": 0xFFFFFFFFFFFFFFFF,
"control_after_generate": True,
"tooltip": "The random seed used for creating the noise.",
},
),
"first_frame_image": (
IO.IMAGE,
{
"tooltip": "Optional image to use as the first frame to generate a video."
},
),
"prompt_optimizer": (
IO.BOOLEAN,
{
"tooltip": "Optimize prompt to improve generation quality when needed.",
"default": True,
},
),
"duration": (
IO.COMBO,
{
"tooltip": "The length of the output video in seconds.",
"default": 6,
"options": [6, 10],
},
),
"resolution": (
IO.COMBO,
{
"tooltip": "The dimensions of the video display. "
"1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels.",
"default": "768P",
"options": ["768P", "1080P"],
},
),
},
"hidden": {
"auth_token": "AUTH_TOKEN_COMFY_ORG",
"comfy_api_key": "API_KEY_COMFY_ORG",
"unique_id": "UNIQUE_ID",
},
}
RETURN_TYPES = ("VIDEO",)
DESCRIPTION = cleandoc(__doc__ or "")
FUNCTION = "generate_video"
CATEGORY = "api node/video/MiniMax"
API_NODE = True
async def generate_video(
self,
prompt_text,
seed=0,
first_frame_image: torch.Tensor=None, # used for ImageToVideo
prompt_optimizer=True,
duration=6,
resolution="768P",
model="MiniMax-Hailuo-02",
unique_id: Union[str, None]=None,
**kwargs,
):
if first_frame_image is None:
validate_string(prompt_text, field_name="prompt_text")
if model == "MiniMax-Hailuo-02" and resolution.upper() == "1080P" and duration != 6:
raise Exception(
"When model is MiniMax-Hailuo-02 and resolution is 1080P, duration is limited to 6 seconds."
)
# upload image, if passed in
image_url = None
if first_frame_image is not None:
image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=kwargs))[0]
video_generate_operation = SynchronousOperation(
endpoint=ApiEndpoint(
path="/proxy/minimax/video_generation",
method=HttpMethod.POST,
request_model=MinimaxVideoGenerationRequest,
response_model=MinimaxVideoGenerationResponse,
),
request=MinimaxVideoGenerationRequest(
model=MiniMaxModel(model),
prompt=prompt_text,
callback_url=None,
first_frame_image=image_url,
prompt_optimizer=prompt_optimizer,
duration=duration,
resolution=resolution,
),
auth_kwargs=kwargs,
)
response = await video_generate_operation.execute()
task_id = response.task_id
if not task_id:
raise Exception(f"MiniMax generation failed: {response.base_resp}")
average_duration = 120 if resolution == "768P" else 240
video_generate_operation = PollingOperation(
poll_endpoint=ApiEndpoint(
path="/proxy/minimax/query/video_generation",
method=HttpMethod.GET,
request_model=EmptyRequest,
response_model=MinimaxTaskResultResponse,
query_params={"task_id": task_id},
),
completed_statuses=["Success"],
failed_statuses=["Fail"],
status_extractor=lambda x: x.status.value,
estimated_duration=average_duration,
node_id=unique_id,
auth_kwargs=kwargs,
)
task_result = await video_generate_operation.execute()
file_id = task_result.file_id
if file_id is None:
raise Exception("Request was not successful. Missing file ID.")
file_retrieve_operation = SynchronousOperation(
endpoint=ApiEndpoint(
path="/proxy/minimax/files/retrieve",
method=HttpMethod.GET,
request_model=EmptyRequest,
response_model=MinimaxFileRetrieveResponse,
query_params={"file_id": int(file_id)},
),
request=EmptyRequest(),
auth_kwargs=kwargs,
)
file_result = await file_retrieve_operation.execute()
file_url = file_result.file.download_url
if file_url is None:
raise Exception(
f"No video was found in the response. Full response: {file_result.model_dump()}"
)
logging.info(f"Generated video URL: {file_url}")
if unique_id:
if hasattr(file_result.file, "backup_download_url"):
message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}"
else:
message = f"Result URL: {file_url}"
PromptServer.instance.send_progress_text(message, unique_id)
video_io = await download_url_to_bytesio(file_url)
if video_io is None:
error_msg = f"Failed to download video from {file_url}"
logging.error(error_msg)
raise Exception(error_msg)
return (VideoFromFile(video_io),)
# A dictionary that contains all nodes you want to export with their names
@ -322,6 +495,7 @@ NODE_CLASS_MAPPINGS = {
"MinimaxTextToVideoNode": MinimaxTextToVideoNode,
"MinimaxImageToVideoNode": MinimaxImageToVideoNode,
# "MinimaxSubjectToVideoNode": MinimaxSubjectToVideoNode,
"MinimaxHailuoVideoNode": MinimaxHailuoVideoNode,
}
# A dictionary that contains the friendly/humanly readable titles for the nodes
@ -329,4 +503,5 @@ NODE_DISPLAY_NAME_MAPPINGS = {
"MinimaxTextToVideoNode": "MiniMax Text to Video",
"MinimaxImageToVideoNode": "MiniMax Image to Video",
"MinimaxSubjectToVideoNode": "MiniMax Subject to Video",
"MinimaxHailuoVideoNode": "MiniMax Hailuo Video",
}

View File

@ -80,6 +80,9 @@ class SupportedOpenAIModel(str, Enum):
gpt_4_1 = "gpt-4.1"
gpt_4_1_mini = "gpt-4.1-mini"
gpt_4_1_nano = "gpt-4.1-nano"
gpt_5 = "gpt-5"
gpt_5_mini = "gpt-5-mini"
gpt_5_nano = "gpt-5-nano"
class OpenAIDalle2(ComfyNodeABC):

View File

@ -0,0 +1,622 @@
import logging
from enum import Enum
from typing import Any, Callable, Optional, Literal, TypeVar
from typing_extensions import override
import torch
from pydantic import BaseModel, Field
from comfy_api.latest import ComfyExtension, io as comfy_io
from comfy_api_nodes.util.validation_utils import (
validate_aspect_ratio_closeness,
validate_image_dimensions,
validate_image_aspect_ratio_range,
get_number_of_images,
)
from comfy_api_nodes.apis.client import (
ApiEndpoint,
HttpMethod,
SynchronousOperation,
PollingOperation,
EmptyRequest,
)
from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi
VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video"
VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video"
VIDU_REFERENCE_VIDEO = "/proxy/vidu/reference2video"
VIDU_START_END_VIDEO = "/proxy/vidu/start-end2video"
VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations"
R = TypeVar("R")
class VideoModelName(str, Enum):
vidu_q1 = 'viduq1'
class AspectRatio(str, Enum):
r_16_9 = "16:9"
r_9_16 = "9:16"
r_1_1 = "1:1"
class Resolution(str, Enum):
r_1080p = "1080p"
class MovementAmplitude(str, Enum):
auto = "auto"
small = "small"
medium = "medium"
large = "large"
class TaskCreationRequest(BaseModel):
model: VideoModelName = VideoModelName.vidu_q1
prompt: Optional[str] = Field(None, max_length=1500)
duration: Optional[Literal[5]] = 5
seed: Optional[int] = Field(0, ge=0, le=2147483647)
aspect_ratio: Optional[AspectRatio] = AspectRatio.r_16_9
resolution: Optional[Resolution] = Resolution.r_1080p
movement_amplitude: Optional[MovementAmplitude] = MovementAmplitude.auto
images: Optional[list[str]] = Field(None, description="Base64 encoded string or image URL")
class TaskStatus(str, Enum):
created = "created"
queueing = "queueing"
processing = "processing"
success = "success"
failed = "failed"
class TaskCreationResponse(BaseModel):
task_id: str = Field(...)
state: TaskStatus = Field(...)
created_at: str = Field(...)
code: Optional[int] = Field(None, description="Error code")
class TaskResult(BaseModel):
id: str = Field(..., description="Creation id")
url: str = Field(..., description="The URL of the generated results, valid for one hour")
cover_url: str = Field(..., description="The cover URL of the generated results, valid for one hour")
class TaskStatusResponse(BaseModel):
state: TaskStatus = Field(...)
err_code: Optional[str] = Field(None)
creations: list[TaskResult] = Field(..., description="Generated results")
async def poll_until_finished(
auth_kwargs: dict[str, str],
api_endpoint: ApiEndpoint[Any, R],
result_url_extractor: Optional[Callable[[R], str]] = None,
estimated_duration: Optional[int] = None,
node_id: Optional[str] = None,
) -> R:
return await PollingOperation(
poll_endpoint=api_endpoint,
completed_statuses=[TaskStatus.success.value],
failed_statuses=[TaskStatus.failed.value],
status_extractor=lambda response: response.state.value,
auth_kwargs=auth_kwargs,
result_url_extractor=result_url_extractor,
estimated_duration=estimated_duration,
node_id=node_id,
poll_interval=16.0,
max_poll_attempts=256,
).execute()
def get_video_url_from_response(response) -> Optional[str]:
if response.creations:
return response.creations[0].url
return None
def get_video_from_response(response) -> TaskResult:
if not response.creations:
error_msg = f"Vidu request does not contain results. State: {response.state}, Error Code: {response.err_code}"
logging.info(error_msg)
raise RuntimeError(error_msg)
logging.info("Vidu task %s succeeded. Video URL: %s", response.creations[0].id, response.creations[0].url)
return response.creations[0]
async def execute_task(
vidu_endpoint: str,
auth_kwargs: Optional[dict[str, str]],
payload: TaskCreationRequest,
estimated_duration: int,
node_id: str,
) -> R:
response = await SynchronousOperation(
endpoint=ApiEndpoint(
path=vidu_endpoint,
method=HttpMethod.POST,
request_model=TaskCreationRequest,
response_model=TaskCreationResponse,
),
request=payload,
auth_kwargs=auth_kwargs,
).execute()
if response.state == TaskStatus.failed:
error_msg = f"Vidu request failed. Code: {response.code}"
logging.error(error_msg)
raise RuntimeError(error_msg)
return await poll_until_finished(
auth_kwargs,
ApiEndpoint(
path=VIDU_GET_GENERATION_STATUS % response.task_id,
method=HttpMethod.GET,
request_model=EmptyRequest,
response_model=TaskStatusResponse,
),
result_url_extractor=get_video_url_from_response,
estimated_duration=estimated_duration,
node_id=node_id,
)
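
execute_task wraps the shared Vidu flow: one POST to create the task, then polling tasks/<task_id>/creations every 16 seconds until the state reaches success or failed. A hedged usage sketch, as the nodes below call it (auth values are placeholders):

payload = TaskCreationRequest(prompt="a red fox at dawn", seed=7)
auth = {"auth_token": "<token>", "comfy_api_key": "<key>"}

# Inside an async node method:
#   results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, node_id)
#   video = get_video_from_response(results)  # TaskResult; .url is valid for one hour
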
class ViduTextToVideoNode(comfy_io.ComfyNode):
@classmethod
def define_schema(cls):
return comfy_io.Schema(
node_id="ViduTextToVideoNode",
display_name="Vidu Text To Video Generation",
category="api node/video/Vidu",
description="Generate video from text prompt",
inputs=[
comfy_io.Combo.Input(
"model",
options=[model.value for model in VideoModelName],
default=VideoModelName.vidu_q1.value,
tooltip="Model name",
),
comfy_io.String.Input(
"prompt",
multiline=True,
tooltip="A textual description for video generation",
),
comfy_io.Int.Input(
"duration",
default=5,
min=5,
max=5,
step=1,
display_mode=comfy_io.NumberDisplay.number,
tooltip="Duration of the output video in seconds",
optional=True,
),
comfy_io.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
step=1,
display_mode=comfy_io.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed for video generation (0 for random)",
optional=True,
),
comfy_io.Combo.Input(
"aspect_ratio",
options=[model.value for model in AspectRatio],
default=AspectRatio.r_16_9.value,
tooltip="The aspect ratio of the output video",
optional=True,
),
comfy_io.Combo.Input(
"resolution",
options=[model.value for model in Resolution],
default=Resolution.r_1080p.value,
tooltip="Supported values may vary by model & duration",
optional=True,
),
comfy_io.Combo.Input(
"movement_amplitude",
options=[model.value for model in MovementAmplitude],
default=MovementAmplitude.auto.value,
tooltip="The movement amplitude of objects in the frame",
optional=True,
),
],
outputs=[
comfy_io.Video.Output(),
],
hidden=[
comfy_io.Hidden.auth_token_comfy_org,
comfy_io.Hidden.api_key_comfy_org,
comfy_io.Hidden.unique_id,
],
is_api_node=True,
)
@classmethod
async def execute(
cls,
model: str,
prompt: str,
duration: int,
seed: int,
aspect_ratio: str,
resolution: str,
movement_amplitude: str,
) -> comfy_io.NodeOutput:
if not prompt:
raise ValueError("The prompt field is required and cannot be empty.")
payload = TaskCreationRequest(
model_name=model,
prompt=prompt,
duration=duration,
seed=seed,
aspect_ratio=aspect_ratio,
resolution=resolution,
movement_amplitude=movement_amplitude,
)
auth = {
"auth_token": cls.hidden.auth_token_comfy_org,
"comfy_api_key": cls.hidden.api_key_comfy_org,
}
results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id)
return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
class ViduImageToVideoNode(comfy_io.ComfyNode):
@classmethod
def define_schema(cls):
return comfy_io.Schema(
node_id="ViduImageToVideoNode",
display_name="Vidu Image To Video Generation",
category="api node/video/Vidu",
description="Generate video from image and optional prompt",
inputs=[
comfy_io.Combo.Input(
"model",
options=[model.value for model in VideoModelName],
default=VideoModelName.vidu_q1.value,
tooltip="Model name",
),
comfy_io.Image.Input(
"image",
tooltip="An image to be used as the start frame of the generated video",
),
comfy_io.String.Input(
"prompt",
multiline=True,
default="",
tooltip="A textual description for video generation",
optional=True,
),
comfy_io.Int.Input(
"duration",
default=5,
min=5,
max=5,
step=1,
display_mode=comfy_io.NumberDisplay.number,
tooltip="Duration of the output video in seconds",
optional=True,
),
comfy_io.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
step=1,
display_mode=comfy_io.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed for video generation (0 for random)",
optional=True,
),
comfy_io.Combo.Input(
"resolution",
options=[model.value for model in Resolution],
default=Resolution.r_1080p.value,
tooltip="Supported values may vary by model & duration",
optional=True,
),
comfy_io.Combo.Input(
"movement_amplitude",
options=[model.value for model in MovementAmplitude],
default=MovementAmplitude.auto.value,
tooltip="The movement amplitude of objects in the frame",
optional=True,
),
],
outputs=[
comfy_io.Video.Output(),
],
hidden=[
comfy_io.Hidden.auth_token_comfy_org,
comfy_io.Hidden.api_key_comfy_org,
comfy_io.Hidden.unique_id,
],
is_api_node=True,
)
@classmethod
async def execute(
cls,
model: str,
image: torch.Tensor,
prompt: str,
duration: int,
seed: int,
resolution: str,
movement_amplitude: str,
) -> comfy_io.NodeOutput:
if get_number_of_images(image) > 1:
raise ValueError("Only one input image is allowed.")
validate_image_aspect_ratio_range(image, (1, 4), (4, 1))
payload = TaskCreationRequest(
model_name=model,
prompt=prompt,
duration=duration,
seed=seed,
resolution=resolution,
movement_amplitude=movement_amplitude,
)
auth = {
"auth_token": cls.hidden.auth_token_comfy_org,
"comfy_api_key": cls.hidden.api_key_comfy_org,
}
payload.images = await upload_images_to_comfyapi(
image,
max_images=1,
mime_type="image/png",
auth_kwargs=auth,
)
results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id)
return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
class ViduReferenceVideoNode(comfy_io.ComfyNode):
@classmethod
def define_schema(cls):
return comfy_io.Schema(
node_id="ViduReferenceVideoNode",
display_name="Vidu Reference To Video Generation",
category="api node/video/Vidu",
description="Generate video from multiple images and prompt",
inputs=[
comfy_io.Combo.Input(
"model",
options=[model.value for model in VideoModelName],
default=VideoModelName.vidu_q1.value,
tooltip="Model name",
),
comfy_io.Image.Input(
"images",
tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).",
),
comfy_io.String.Input(
"prompt",
multiline=True,
tooltip="A textual description for video generation",
),
comfy_io.Int.Input(
"duration",
default=5,
min=5,
max=5,
step=1,
display_mode=comfy_io.NumberDisplay.number,
tooltip="Duration of the output video in seconds",
optional=True,
),
comfy_io.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
step=1,
display_mode=comfy_io.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed for video generation (0 for random)",
optional=True,
),
comfy_io.Combo.Input(
"aspect_ratio",
options=[model.value for model in AspectRatio],
default=AspectRatio.r_16_9.value,
tooltip="The aspect ratio of the output video",
optional=True,
),
comfy_io.Combo.Input(
"resolution",
options=[model.value for model in Resolution],
default=Resolution.r_1080p.value,
tooltip="Supported values may vary by model & duration",
optional=True,
),
comfy_io.Combo.Input(
"movement_amplitude",
options=[model.value for model in MovementAmplitude],
default=MovementAmplitude.auto.value,
tooltip="The movement amplitude of objects in the frame",
optional=True,
),
],
outputs=[
comfy_io.Video.Output(),
],
hidden=[
comfy_io.Hidden.auth_token_comfy_org,
comfy_io.Hidden.api_key_comfy_org,
comfy_io.Hidden.unique_id,
],
is_api_node=True,
)
@classmethod
async def execute(
cls,
model: str,
images: torch.Tensor,
prompt: str,
duration: int,
seed: int,
aspect_ratio: str,
resolution: str,
movement_amplitude: str,
) -> comfy_io.NodeOutput:
if not prompt:
raise ValueError("The prompt field is required and cannot be empty.")
a = get_number_of_images(images)
if a > 7:
raise ValueError("Too many images, maximum allowed is 7.")
for image in images:
validate_image_aspect_ratio_range(image, (1, 4), (4, 1))
validate_image_dimensions(image, min_width=128, min_height=128)
payload = TaskCreationRequest(
model_name=model,
prompt=prompt,
duration=duration,
seed=seed,
aspect_ratio=aspect_ratio,
resolution=resolution,
movement_amplitude=movement_amplitude,
)
auth = {
"auth_token": cls.hidden.auth_token_comfy_org,
"comfy_api_key": cls.hidden.api_key_comfy_org,
}
payload.images = await upload_images_to_comfyapi(
images,
max_images=7,
mime_type="image/png",
auth_kwargs=auth,
)
results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id)
return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
class ViduStartEndToVideoNode(comfy_io.ComfyNode):
@classmethod
def define_schema(cls):
return comfy_io.Schema(
node_id="ViduStartEndToVideoNode",
display_name="Vidu Start End To Video Generation",
category="api node/video/Vidu",
description="Generate a video from start and end frames and a prompt",
inputs=[
comfy_io.Combo.Input(
"model",
options=[model.value for model in VideoModelName],
default=VideoModelName.vidu_q1.value,
tooltip="Model name",
),
comfy_io.Image.Input(
"first_frame",
tooltip="Start frame",
),
comfy_io.Image.Input(
"end_frame",
tooltip="End frame",
),
comfy_io.String.Input(
"prompt",
multiline=True,
tooltip="A textual description for video generation",
optional=True,
),
comfy_io.Int.Input(
"duration",
default=5,
min=5,
max=5,
step=1,
display_mode=comfy_io.NumberDisplay.number,
tooltip="Duration of the output video in seconds",
optional=True,
),
comfy_io.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
step=1,
display_mode=comfy_io.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed for video generation (0 for random)",
optional=True,
),
comfy_io.Combo.Input(
"resolution",
options=[model.value for model in Resolution],
default=Resolution.r_1080p.value,
tooltip="Supported values may vary by model & duration",
optional=True,
),
comfy_io.Combo.Input(
"movement_amplitude",
options=[model.value for model in MovementAmplitude],
default=MovementAmplitude.auto.value,
tooltip="The movement amplitude of objects in the frame",
optional=True,
),
],
outputs=[
comfy_io.Video.Output(),
],
hidden=[
comfy_io.Hidden.auth_token_comfy_org,
comfy_io.Hidden.api_key_comfy_org,
comfy_io.Hidden.unique_id,
],
is_api_node=True,
)
@classmethod
async def execute(
cls,
model: str,
first_frame: torch.Tensor,
end_frame: torch.Tensor,
prompt: str,
duration: int,
seed: int,
resolution: str,
movement_amplitude: str,
) -> comfy_io.NodeOutput:
validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False)
payload = TaskCreationRequest(
model_name=model,
prompt=prompt,
duration=duration,
seed=seed,
resolution=resolution,
movement_amplitude=movement_amplitude,
)
auth = {
"auth_token": cls.hidden.auth_token_comfy_org,
"comfy_api_key": cls.hidden.api_key_comfy_org,
}
payload.images = [
(await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0]
for frame in (first_frame, end_frame)
]
results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id)
return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url))
class ViduExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]:
return [
ViduTextToVideoNode,
ViduImageToVideoNode,
ViduReferenceVideoNode,
ViduStartEndToVideoNode,
]
async def comfy_entrypoint() -> ViduExtension:
return ViduExtension()

View File

@ -53,6 +53,53 @@ def validate_image_aspect_ratio(
)
def validate_image_aspect_ratio_range(
image: torch.Tensor,
min_ratio: tuple[float, float], # e.g. (1, 4)
max_ratio: tuple[float, float], # e.g. (4, 1)
*,
strict: bool = True, # True -> (min, max); False -> [min, max]
) -> float:
a1, b1 = min_ratio
a2, b2 = max_ratio
if a1 <= 0 or b1 <= 0 or a2 <= 0 or b2 <= 0:
raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).")
lo, hi = (a1 / b1), (a2 / b2)
if lo > hi:
lo, hi = hi, lo
a1, b1, a2, b2 = a2, b2, a1, b1 # swap only for error text
w, h = get_image_dimensions(image)
if w <= 0 or h <= 0:
raise ValueError(f"Invalid image dimensions: {w}x{h}")
ar = w / h
ok = (lo < ar < hi) if strict else (lo <= ar <= hi)
if not ok:
op = "<" if strict else ""
raise ValueError(f"Image aspect ratio {ar:.6g} is outside allowed range: {a1}:{b1} {op} ratio {op} {a2}:{b2}")
return ar
def validate_aspect_ratio_closeness(
start_img,
end_img,
min_rel: float,
max_rel: float,
*,
strict: bool = False, # True => exclusive, False => inclusive
) -> None:
w1, h1 = get_image_dimensions(start_img)
w2, h2 = get_image_dimensions(end_img)
if min(w1, h1, w2, h2) <= 0:
raise ValueError("Invalid image dimensions")
ar1 = w1 / h1
ar2 = w2 / h2
# Normalize so it is symmetric (no need to check both ar1/ar2 and ar2/ar1)
closeness = max(ar1, ar2) / min(ar1, ar2)
limit = max(max_rel, 1.0 / min_rel) # for 0.8..1.25 this is 1.25
if (closeness >= limit) if strict else (closeness > limit):
raise ValueError(f"Aspect ratios must be close: start/end={ar1/ar2:.4f}, allowed range {min_rel}{max_rel}.")
def validate_video_dimensions(
video: VideoInput,
min_width: Optional[int] = None,
@ -98,3 +145,9 @@ def validate_video_duration(
raise ValueError(
f"Video duration must be at most {max_duration}s, got {duration}s"
)
def get_number_of_images(images):
if isinstance(images, torch.Tensor):
return images.shape[0] if images.ndim >= 4 else 1
return len(images)
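
A quick check of the closeness rule with the 0.8..1.25 bounds the start/end node uses: a 16:9 start frame against a 4:3 end frame is rejected, since the ratio of their aspect ratios exceeds the limit.

ar1, ar2 = 16 / 9, 4 / 3                   # start and end frame aspect ratios
closeness = max(ar1, ar2) / min(ar1, ar2)  # ~1.3333
limit = max(1.25, 1.0 / 0.8)               # 1.25
print(closeness > limit)                   # True, so validate_aspect_ratio_closeness raises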

View File

@ -0,0 +1,48 @@
import node_helpers
import comfy.utils
import math
class TextEncodeQwenImageEdit:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"clip": ("CLIP", ),
"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
},
"optional": {"vae": ("VAE", ),
"image": ("IMAGE", ),}}
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "encode"
CATEGORY = "advanced/conditioning"
def encode(self, clip, prompt, vae=None, image=None):
ref_latent = None
if image is None:
images = []
else:
samples = image.movedim(-1, 1)
total = int(1024 * 1024)
scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
width = round(samples.shape[3] * scale_by)
height = round(samples.shape[2] * scale_by)
s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
image = s.movedim(1, -1)
images = [image[:, :, :, :3]]
if vae is not None:
ref_latent = vae.encode(image[:, :, :, :3])
tokens = clip.tokenize(prompt, images=images)
conditioning = clip.encode_from_tokens_scheduled(tokens)
if ref_latent is not None:
conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [ref_latent]}, append=True)
return (conditioning, )
NODE_CLASS_MAPPINGS = {
"TextEncodeQwenImageEdit": TextEncodeQwenImageEdit,
}
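
The node rescales any input image to roughly one megapixel, preserving aspect ratio, before tokenizing and (optionally) VAE-encoding it. The arithmetic for a 1920x1080 input:

import math

h, w = 1080, 1920
total = 1024 * 1024
scale_by = math.sqrt(total / (w * h))  # sqrt(1048576 / 2073600), about 0.7111
width, height = round(w * scale_by), round(h * scale_by)
print(width, height)                   # 1365 768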

View File

@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.3.50"
__version__ = "0.3.51"

View File

@ -2321,6 +2321,7 @@ async def init_builtin_extra_nodes():
"nodes_edit_model.py",
"nodes_tcfg.py",
"nodes_context_windows.py",
"nodes_qwen.py",
]
import_failed = []
@ -2350,6 +2351,7 @@ async def init_builtin_api_nodes():
"nodes_moonvalley.py",
"nodes_rodin.py",
"nodes_gemini.py",
"nodes_vidu.py",
]
if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"):

View File

@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.3.50"
version = "0.3.51"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.9"

View File

@ -1,5 +1,5 @@
comfyui-frontend-package==1.25.9
-comfyui-workflow-templates==0.1.60
+comfyui-workflow-templates==0.1.62
comfyui-embedded-docs==0.2.6
torch
torchsde