Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-05-17 23:26:17 +08:00)

Compare commits: release/v0...feat/gpt-i (14 commits)

| SHA1 |
|---|
| e9f8aa9346 |
| 43a1263b60 |
| 102773cd2c |
| 1e1d4f1254 |
| eb22225387 |
| b38dd0ff23 |
| ad94d47221 |
| e75f775ae8 |
| c514890325 |
| 543e9fba64 |
| fc5f4a996b |
| 138571da95 |
| 3d816db07f |
| b9dedea57d |
@@ -195,7 +195,9 @@ The portable above currently comes with python 3.13 and pytorch cuda 13.0. Updat

#### Alternative Downloads:

[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)

[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)

[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).

@@ -67,7 +67,7 @@ class InternalRoutes:
(entry for entry in os.scandir(directory) if is_visible_file(entry)),
key=lambda entry: -entry.stat().st_mtime
)
return web.json_response([entry.name for entry in sorted_files], status=200)
return web.json_response([f"{entry.name} [{directory_type}]" for entry in sorted_files], status=200)

def get_app(self):
@@ -118,8 +118,6 @@ class ErnieImageAttention(nn.Module):
query = apply_rotary_emb(query, image_rotary_emb)
key = apply_rotary_emb(key, image_rotary_emb)

query, key = query.to(x.dtype), key.to(x.dtype)

q_flat = query.reshape(B, S, -1)
k_flat = key.reshape(B, S, -1)

@@ -161,16 +159,16 @@ class ErnieImageSharedAdaLNBlock(nn.Module):

residual = x
x_norm = self.adaLN_sa_ln(x)
x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
x_norm = x_norm * (1 + scale_msa) + shift_msa

attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
x = residual + gate_msa * attn_out

residual = x
x_norm = self.adaLN_mlp_ln(x)
x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
x_norm = x_norm * (1 + scale_mlp) + shift_mlp

return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype)
return residual + gate_mlp * self.mlp(x_norm)

class ErnieImageAdaLNContinuous(nn.Module):
def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
@@ -183,7 +181,7 @@ class ErnieImageAdaLNContinuous(nn.Module):
def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
scale, shift = self.linear(conditioning).chunk(2, dim=-1)
x = self.norm(x)
x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
x = torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1))
return x

class ErnieImageModel(nn.Module):

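The ErnieImage hunks above drop the intermediate float32 upcasts from the AdaLN modulation and, in ErnieImageAdaLNContinuous, express x * (1 + scale) + shift as a single torch.addcmul call. A small standalone sketch (not part of the diff) checking that the two forms are equivalent:

```python
import torch

# torch.addcmul(input, t1, t2) computes input + t1 * t2, so
# addcmul(shift, x, 1 + scale) == x * (1 + scale) + shift.
x = torch.randn(2, 5, 8)
scale = torch.randn(2, 8)
shift = torch.randn(2, 8)

a = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
b = torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1))
assert torch.allclose(a, b, atol=1e-6)
```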
@@ -4,9 +4,6 @@ import math
import torch
import torchaudio

import comfy.model_management
import comfy.model_patcher
import comfy.utils as utils
from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
@@ -43,30 +40,6 @@ class AudioVAEComponentConfig:

return cls(autoencoder=audio_config, vocoder=vocoder_config)


class ModelDeviceManager:
"""Manages device placement and GPU residency for the composed model."""

def __init__(self, module: torch.nn.Module):
load_device = comfy.model_management.get_torch_device()
offload_device = comfy.model_management.vae_offload_device()
self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device)

def ensure_model_loaded(self) -> None:
comfy.model_management.free_memory(
self.patcher.model_size(),
self.patcher.load_device,
)
comfy.model_management.load_model_gpu(self.patcher)

def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(self.patcher.load_device)

@property
def load_device(self):
return self.patcher.load_device


class AudioLatentNormalizer:
"""Applies per-channel statistics in patch space and restores original layout."""

@@ -132,23 +105,17 @@ class AudioPreprocessor:
class AudioVAE(torch.nn.Module):
"""High-level Audio VAE wrapper exposing encode and decode entry points."""

def __init__(self, state_dict: dict, metadata: dict):
def __init__(self, metadata: dict):
super().__init__()

component_config = AudioVAEComponentConfig.from_metadata(metadata)

vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)

self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
if "bwe" in component_config.vocoder:
self.vocoder = VocoderWithBWE(config=component_config.vocoder)
else:
self.vocoder = Vocoder(config=component_config.vocoder)

self.autoencoder.load_state_dict(vae_sd, strict=False)
self.vocoder.load_state_dict(vocoder_sd, strict=False)

autoencoder_config = self.autoencoder.get_config()
self.normalizer = AudioLatentNormalizer(
AudioPatchifier(
@@ -168,18 +135,12 @@ class AudioVAE(torch.nn.Module):
n_fft=autoencoder_config["n_fft"],
)

self.device_manager = ModelDeviceManager(self)

def encode(self, audio: dict) -> torch.Tensor:
def encode(self, audio, sample_rate=44100) -> torch.Tensor:
"""Encode a waveform dictionary into normalized latent tensors."""

waveform = audio["waveform"]
waveform_sample_rate = audio["sample_rate"]
waveform = audio
waveform_sample_rate = sample_rate
input_device = waveform.device
# Ensure that Audio VAE is loaded on the correct device.
self.device_manager.ensure_model_loaded()

waveform = self.device_manager.move_to_load_device(waveform)
expected_channels = self.autoencoder.encoder.in_channels
if waveform.shape[1] != expected_channels:
if waveform.shape[1] == 1:
@@ -190,7 +151,7 @@ class AudioVAE(torch.nn.Module):
)

mel_spec = self.preprocessor.waveform_to_mel(
waveform, waveform_sample_rate, device=self.device_manager.load_device
waveform, waveform_sample_rate, device=waveform.device
)

latents = self.autoencoder.encode(mel_spec)
@@ -204,17 +165,13 @@ class AudioVAE(torch.nn.Module):
"""Decode normalized latent tensors into an audio waveform."""
original_shape = latents.shape

# Ensure that Audio VAE is loaded on the correct device.
self.device_manager.ensure_model_loaded()

latents = self.device_manager.move_to_load_device(latents)
latents = self.normalizer.denormalize(latents)

target_shape = self.target_shape_from_latents(original_shape)
mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)

waveform = self.run_vocoder(mel_spec)
return self.device_manager.move_to_load_device(waveform)
return waveform

def target_shape_from_latents(self, latents_shape):
batch, _, time, _ = latents_shape

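With this change AudioVAE no longer owns device management or state-dict loading (the caller does that), and encode() takes a raw waveform tensor plus an explicit sample rate instead of the audio dict. A minimal usage sketch, assuming an already-constructed vae object and a stereo waveform of shape [batch, channels, samples]; the variable names are illustrative, not from the diff:

```python
import torch

waveform = torch.randn(1, 2, 44100)   # one second of stereo audio at 44.1 kHz

# Old call style (dict input):
# latents = vae.encode({"waveform": waveform, "sample_rate": 44100})

# New call style after this change (tensor + explicit sample rate):
latents = vae.encode(waveform, sample_rate=44100)
decoded = vae.decode(latents)
```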
@@ -34,6 +34,16 @@ class TimestepBlock(nn.Module):
#This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index"
def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None):
for layer in ts:
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
found_patched = False
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
if isinstance(layer, class_type):
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
found_patched = True
break
if found_patched:
continue

if isinstance(layer, VideoResBlock):
x = layer(x, emb, num_video_frames, image_only_indicator)
elif isinstance(layer, TimestepBlock):
@@ -49,15 +59,6 @@ def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, out
elif isinstance(layer, Upsample):
x = layer(x, output_shape=output_shape)
else:
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
found_patched = False
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
if isinstance(layer, class_type):
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
found_patched = True
break
if found_patched:
continue
x = layer(x)
return x

@@ -894,6 +895,12 @@ class UNetModel(nn.Module):
h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
h = apply_control(h, control, 'middle')

if "middle_block_after_patch" in transformer_patches:
patch = transformer_patches["middle_block_after_patch"]
for p in patch:
out = p({"h": h, "x": x, "emb": emb, "context": context, "y": y,
"timesteps": timesteps, "transformer_options": transformer_options})
h = out["h"]

for id, module in enumerate(self.output_blocks):
transformer_options["block"] = ("output", id)
@@ -905,8 +912,9 @@ class UNetModel(nn.Module):
for p in patch:
h, hsp = p(h, hsp, transformer_options)

h = th.cat([h, hsp], dim=1)
del hsp
if hsp is not None:
h = th.cat([h, hsp], dim=1)
del hsp
if len(hs) > 0:
output_shape = hs[-1].shape
else:

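These hunks hoist the forward_timestep_embed_patch dispatch to the top of the layer loop (so a registered handler intercepts a layer type before the built-in isinstance chain), add a middle_block_after_patch hook after the middle block, and let an output-block patch return hsp=None to skip the skip-connection concat. A minimal sketch of what a forward_timestep_embed_patch entry looks like; the handler name and body are illustrative only:

```python
from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample

def my_upsample_handler(layer, x, emb, context, transformer_options,
                        output_shape, time_context, num_video_frames,
                        image_only_indicator):
    # adjust x here if needed, then run the layer as forward_timestep_embed would
    return layer(x, output_shape=output_shape)

# Each entry is a (class_type, handler) pair; forward_timestep_embed checks
# isinstance(layer, class_type) and calls the handler instead of the layer.
transformer_options = {
    "patches": {"forward_timestep_embed_patch": [(Upsample, my_upsample_handler)]}
}
```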
comfy/ldm/supir/__init__.py (new file, 0 lines)
comfy/ldm/supir/supir_modules.py (new file, 226 lines)
@@ -0,0 +1,226 @@
import torch
import torch.nn as nn

from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
from comfy.ldm.modules.diffusionmodules.openaimodel import Downsample, TimestepEmbedSequential, ResBlock, SpatialTransformer
from comfy.ldm.modules.attention import optimized_attention


class ZeroSFT(nn.Module):
def __init__(self, label_nc, norm_nc, concat_channels=0, dtype=None, device=None, operations=None):
super().__init__()

ks = 3
pw = ks // 2

self.param_free_norm = operations.GroupNorm(32, norm_nc + concat_channels, dtype=dtype, device=device)

nhidden = 128

self.mlp_shared = nn.Sequential(
operations.Conv2d(label_nc, nhidden, kernel_size=ks, padding=pw, dtype=dtype, device=device),
nn.SiLU()
)
self.zero_mul = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)
self.zero_add = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)

self.zero_conv = operations.Conv2d(label_nc, norm_nc, 1, 1, 0, dtype=dtype, device=device)
self.pre_concat = bool(concat_channels != 0)

def forward(self, c, h, h_ori=None, control_scale=1):
if h_ori is not None and self.pre_concat:
h_raw = torch.cat([h_ori, h], dim=1)
else:
h_raw = h

h = h + self.zero_conv(c)
if h_ori is not None and self.pre_concat:
h = torch.cat([h_ori, h], dim=1)
actv = self.mlp_shared(c)
gamma = self.zero_mul(actv)
beta = self.zero_add(actv)
h = self.param_free_norm(h)
h = torch.addcmul(h + beta, h, gamma)
if h_ori is not None and not self.pre_concat:
h = torch.cat([h_ori, h], dim=1)
return torch.lerp(h_raw, h, control_scale)

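Two one-liners in ZeroSFT.forward carry the method: torch.addcmul(h + beta, h, gamma) is the SFT modulation h * (1 + gamma) + beta, and torch.lerp(h_raw, h, control_scale) blends the modulated features back toward the unmodified ones (scale 0 returns h_raw unchanged, scale 1 returns the fully modulated h). A small standalone check of both identities:

```python
import torch

h = torch.randn(1, 4, 8, 8)
gamma = torch.randn_like(h)
beta = torch.randn_like(h)
h_raw = torch.randn_like(h)

mod = torch.addcmul(h + beta, h, gamma)      # h + beta + h * gamma
assert torch.allclose(mod, h * (1 + gamma) + beta, atol=1e-6)

blend = torch.lerp(h_raw, mod, 0.3)          # h_raw + 0.3 * (mod - h_raw)
assert torch.allclose(blend, h_raw + 0.3 * (mod - h_raw), atol=1e-6)
```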
class _CrossAttnInner(nn.Module):
"""Inner cross-attention module matching the state_dict layout of the original CrossAttention."""
def __init__(self, query_dim, context_dim, heads, dim_head, dtype=None, device=None, operations=None):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.to_q = operations.Linear(query_dim, inner_dim, bias=False, dtype=dtype, device=device)
self.to_k = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
self.to_v = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
self.to_out = nn.Sequential(
operations.Linear(inner_dim, query_dim, dtype=dtype, device=device),
)

def forward(self, x, context):
q = self.to_q(x)
k = self.to_k(context)
v = self.to_v(context)
return self.to_out(optimized_attention(q, k, v, self.heads))


class ZeroCrossAttn(nn.Module):
def __init__(self, context_dim, query_dim, dtype=None, device=None, operations=None):
super().__init__()
heads = query_dim // 64
dim_head = 64
self.attn = _CrossAttnInner(query_dim, context_dim, heads, dim_head, dtype=dtype, device=device, operations=operations)
self.norm1 = operations.GroupNorm(32, query_dim, dtype=dtype, device=device)
self.norm2 = operations.GroupNorm(32, context_dim, dtype=dtype, device=device)

def forward(self, context, x, control_scale=1):
b, c, h, w = x.shape
x_in = x

x = self.attn(
self.norm1(x).flatten(2).transpose(1, 2),
self.norm2(context).flatten(2).transpose(1, 2),
).transpose(1, 2).unflatten(2, (h, w))

return x_in + x * control_scale


class GLVControl(nn.Module):
"""SUPIR's Guided Latent Vector control encoder. Truncated UNet (input + middle blocks only)."""
def __init__(
self,
in_channels=4,
model_channels=320,
num_res_blocks=2,
attention_resolutions=(4, 2),
channel_mult=(1, 2, 4),
num_head_channels=64,
transformer_depth=(1, 2, 10),
context_dim=2048,
adm_in_channels=2816,
use_linear_in_transformer=True,
use_checkpoint=False,
dtype=None,
device=None,
operations=None,
**kwargs,
):
super().__init__()
self.model_channels = model_channels
time_embed_dim = model_channels * 4

self.time_embed = nn.Sequential(
operations.Linear(model_channels, time_embed_dim, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
)

self.label_emb = nn.Sequential(
nn.Sequential(
operations.Linear(adm_in_channels, time_embed_dim, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
)
)

self.input_blocks = nn.ModuleList([
TimestepEmbedSequential(
operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
)
])
ch = model_channels
ds = 1
for level, mult in enumerate(channel_mult):
for nr in range(num_res_blocks):
layers = [
ResBlock(ch, time_embed_dim, 0, out_channels=mult * model_channels,
dtype=dtype, device=device, operations=operations)
]
ch = mult * model_channels
if ds in attention_resolutions:
num_heads = ch // num_head_channels
layers.append(
SpatialTransformer(ch, num_heads, num_head_channels,
depth=transformer_depth[level], context_dim=context_dim,
use_linear=use_linear_in_transformer,
use_checkpoint=use_checkpoint,
dtype=dtype, device=device, operations=operations)
)
self.input_blocks.append(TimestepEmbedSequential(*layers))
if level != len(channel_mult) - 1:
self.input_blocks.append(
TimestepEmbedSequential(
Downsample(ch, True, out_channels=ch, dtype=dtype, device=device, operations=operations)
)
)
ds *= 2

num_heads = ch // num_head_channels
self.middle_block = TimestepEmbedSequential(
ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
SpatialTransformer(ch, num_heads, num_head_channels,
depth=transformer_depth[-1], context_dim=context_dim,
use_linear=use_linear_in_transformer,
use_checkpoint=use_checkpoint,
dtype=dtype, device=device, operations=operations),
ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
)

self.input_hint_block = TimestepEmbedSequential(
operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
)

def forward(self, x, timesteps, xt, context=None, y=None, **kwargs):
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
emb = self.time_embed(t_emb) + self.label_emb(y)

guided_hint = self.input_hint_block(x, emb, context)

hs = []
h = xt
for module in self.input_blocks:
if guided_hint is not None:
h = module(h, emb, context)
h += guided_hint
guided_hint = None
else:
h = module(h, emb, context)
hs.append(h)
h = self.middle_block(h, emb, context)
hs.append(h)
return hs


class SUPIR(nn.Module):
"""
SUPIR model containing GLVControl (control encoder) and project_modules (adapters).
State dict keys match the original SUPIR checkpoint layout:
control_model.* -> GLVControl
project_modules.* -> nn.ModuleList of ZeroSFT/ZeroCrossAttn
"""
def __init__(self, device=None, dtype=None, operations=None):
super().__init__()

self.control_model = GLVControl(dtype=dtype, device=device, operations=operations)

project_channel_scale = 2
cond_output_channels = [320] * 4 + [640] * 3 + [1280] * 3
project_channels = [int(c * project_channel_scale) for c in [160] * 4 + [320] * 3 + [640] * 3]
concat_channels = [320] * 2 + [640] * 3 + [1280] * 4 + [0]
cross_attn_insert_idx = [6, 3]

self.project_modules = nn.ModuleList()
for i in range(len(cond_output_channels)):
self.project_modules.append(ZeroSFT(
project_channels[i], cond_output_channels[i],
concat_channels=concat_channels[i],
dtype=dtype, device=device, operations=operations,
))

for i in cross_attn_insert_idx:
self.project_modules.insert(i, ZeroCrossAttn(
cond_output_channels[i], concat_channels[i],
dtype=dtype, device=device, operations=operations,
))
comfy/ldm/supir/supir_patch.py (new file, 103 lines)

@@ -0,0 +1,103 @@
import torch
from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample


class SUPIRPatch:
"""
Holds GLVControl (control encoder) + project_modules (ZeroSFT/ZeroCrossAttn adapters).
Runs GLVControl lazily on first patch invocation per step, applies adapters through
middle_block_after_patch, output_block_merge_patch, and forward_timestep_embed_patch.
"""
SIGMA_MAX = 14.6146

def __init__(self, model_patch, project_modules, hint_latent, strength_start, strength_end):
self.model_patch = model_patch  # CoreModelPatcher wrapping GLVControl
self.project_modules = project_modules  # nn.ModuleList of ZeroSFT/ZeroCrossAttn
self.hint_latent = hint_latent  # encoded LQ image latent
self.strength_start = strength_start
self.strength_end = strength_end
self.cached_features = None
self.adapter_idx = 0
self.control_idx = 0
self.current_control_idx = 0
self.active = True

def _ensure_features(self, kwargs):
"""Run GLVControl on first call per step, cache results."""
if self.cached_features is not None:
return
x = kwargs["x"]
b = x.shape[0]
hint = self.hint_latent.to(device=x.device, dtype=x.dtype)
if hint.shape[0] != b:
hint = hint.expand(b, -1, -1, -1) if hint.shape[0] == 1 else hint.repeat((b + hint.shape[0] - 1) // hint.shape[0], 1, 1, 1)[:b]
self.cached_features = self.model_patch.model.control_model(
hint, kwargs["timesteps"], x,
kwargs["context"], kwargs["y"]
)
self.adapter_idx = len(self.project_modules) - 1
self.control_idx = len(self.cached_features) - 1

def _get_control_scale(self, kwargs):
if self.strength_start == self.strength_end:
return self.strength_end
sigma = kwargs["transformer_options"].get("sigmas")
if sigma is None:
return self.strength_end
s = sigma[0].item() if sigma.dim() > 0 else sigma.item()
t = min(s / self.SIGMA_MAX, 1.0)
return t * (self.strength_start - self.strength_end) + self.strength_end

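_get_control_scale interpolates linearly between strength_start (high noise) and strength_end (low noise) using the current sigma, clamped at SIGMA_MAX. With strength_start=1.0 and strength_end=0.5, the scale is 1.0 at sigma >= 14.6146, about 0.75 at half of SIGMA_MAX, and approaches 0.5 as sigma goes to 0. The same curve as a standalone function, for illustration only:

```python
def control_scale(sigma, strength_start, strength_end, sigma_max=14.6146):
    t = min(sigma / sigma_max, 1.0)
    return t * (strength_start - strength_end) + strength_end

assert abs(control_scale(14.6146, 1.0, 0.5) - 1.0) < 1e-6
assert abs(control_scale(7.3073, 1.0, 0.5) - 0.75) < 1e-6
assert abs(control_scale(0.0, 1.0, 0.5) - 0.5) < 1e-6
```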
def middle_after(self, kwargs):
"""middle_block_after_patch: run GLVControl lazily, apply last adapter after middle block."""
self.cached_features = None  # reset from previous step
self.current_scale = self._get_control_scale(kwargs)
self.active = self.current_scale > 0
if not self.active:
return {"h": kwargs["h"]}
self._ensure_features(kwargs)
h = kwargs["h"]
h = self.project_modules[self.adapter_idx](
self.cached_features[self.control_idx], h, control_scale=self.current_scale
)
self.adapter_idx -= 1
self.control_idx -= 1
return {"h": h}

def output_block(self, h, hsp, transformer_options):
"""output_block_patch: ZeroSFT adapter fusion replaces cat([h, hsp]). Returns (h, None) to skip cat."""
if not self.active:
return h, hsp
self.current_control_idx = self.control_idx
h = self.project_modules[self.adapter_idx](
self.cached_features[self.control_idx], hsp, h, control_scale=self.current_scale
)
self.adapter_idx -= 1
self.control_idx -= 1
return h, None

def pre_upsample(self, layer, x, emb, context, transformer_options, output_shape, *args, **kw):
"""forward_timestep_embed_patch for Upsample: extra cross-attn adapter before upsample."""
block_type, _ = transformer_options["block"]
if block_type == "output" and self.active and self.cached_features is not None:
x = self.project_modules[self.adapter_idx](
self.cached_features[self.current_control_idx], x, control_scale=self.current_scale
)
self.adapter_idx -= 1
return layer(x, output_shape=output_shape)

def to(self, device_or_dtype):
if isinstance(device_or_dtype, torch.device):
self.cached_features = None
if self.hint_latent is not None:
self.hint_latent = self.hint_latent.to(device_or_dtype)
return self

def models(self):
return [self.model_patch]

def register(self, model_patcher):
"""Register all patches on a cloned model patcher."""
model_patcher.set_model_patch(self.middle_after, "middle_block_after_patch")
model_patcher.set_model_output_block_patch(self.output_block)
model_patcher.set_model_patch((Upsample, self.pre_upsample), "forward_timestep_embed_patch")
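The node-side wiring is not part of this compare, but register() suggests the shape of the caller: clone the sampling model's patcher, construct a SUPIRPatch, and register it. A hedged sketch; everything below except SUPIRPatch itself and ModelPatcher.clone() is an assumption about the surrounding node code:

```python
# hypothetical node-side wiring, for illustration only
patched_model = model.clone()                      # ModelPatcher.clone()
supir_patch = SUPIRPatch(
    model_patch=supir_model_patch,                 # patcher wrapping the loaded SUPIR module (assumed; _ensure_features reads .model.control_model)
    project_modules=supir_module.project_modules,  # adapters from the SUPIR checkpoint (assumed)
    hint_latent=vae.encode(lq_image),              # encoded low-quality input latent (assumed)
    strength_start=1.0,
    strength_end=0.5,
)
supir_patch.register(patched_model)                # installs the three patches shown above
```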
@@ -578,8 +578,8 @@ class Stable_Zero123(BaseModel):
def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
super().__init__(model_config, model_type, device=device)
self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device)
self.cc_projection.weight = torch.nn.Parameter(cc_projection_weight.clone())
self.cc_projection.bias = torch.nn.Parameter(cc_projection_bias.clone())
self.cc_projection.weight.copy_(cc_projection_weight)
self.cc_projection.bias.copy_(cc_projection_bias)

def extra_conds(self, **kwargs):
out = {}

@@ -506,6 +506,10 @@ class ModelPatcher:
def set_model_noise_refiner_patch(self, patch):
self.set_model_patch(patch, "noise_refiner")

def set_model_middle_block_after_patch(self, patch):
self.set_model_patch(patch, "middle_block_after_patch")


def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
rope_options = self.model_options["transformer_options"].get("rope_options", {})
rope_options["scale_x"] = scale_x

comfy/sd.py (19 changed lines)

@@ -12,6 +12,7 @@ from .ldm.cascade.stage_c_coder import StageC_coder
from .ldm.audio.autoencoder import AudioOobleckVAE
import comfy.ldm.genmo.vae.model
import comfy.ldm.lightricks.vae.causal_video_autoencoder
import comfy.ldm.lightricks.vae.audio_vae
import comfy.ldm.cosmos.vae
import comfy.ldm.wan.vae
import comfy.ldm.wan.vae2_2
@@ -805,6 +806,24 @@ class VAE:
self.downscale_index_formula = (4, 8, 8)
self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio
sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder."})
self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata)
self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
self.latent_channels = self.first_stage_model.latent_channels
self.audio_sample_rate_output = self.first_stage_model.output_sample_rate
self.autoencoder = self.first_stage_model.autoencoder # TODO: remove hack for ltxv custom nodes
self.output_channels = 2
self.pad_channel_value = "replicate"
self.upscale_ratio = 4096
self.downscale_ratio = 4096
self.latent_dim = 2
self.process_output = lambda audio: audio
self.process_input = lambda audio: audio
self.working_dtypes = [torch.float32]
self.disable_offload = True
self.extra_1d_channel = 16
else:
logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
self.first_stage_model = None

@@ -122,41 +122,6 @@ class TaskStatusResponse(BaseModel):
usage: TaskStatusUsage | None = Field(None)


class GetAssetResponse(BaseModel):
id: str = Field(...)
name: str | None = Field(None)
url: str | None = Field(None)
asset_type: str = Field(...)
group_id: str = Field(...)
status: str = Field(...)
error: TaskStatusError | None = Field(None)


class SeedanceCreateVisualValidateSessionResponse(BaseModel):
session_id: str = Field(...)
h5_link: str = Field(...)


class SeedanceGetVisualValidateSessionResponse(BaseModel):
session_id: str = Field(...)
status: str = Field(...)
group_id: str | None = Field(None)
error_code: str | None = Field(None)
error_message: str | None = Field(None)


class SeedanceCreateAssetRequest(BaseModel):
group_id: str = Field(...)
url: str = Field(...)
asset_type: str = Field(...)
name: str | None = Field(None, max_length=64)
project_name: str | None = Field(None)


class SeedanceCreateAssetResponse(BaseModel):
asset_id: str = Field(...)


# Dollars per 1K tokens, keyed by (model_id, has_video_input).
SEEDANCE2_PRICE_PER_1K_TOKENS = {
("dreamina-seedance-2-0-260128", False): 0.007,

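SEEDANCE2_PRICE_PER_1K_TOKENS stores a dollar rate per 1,000 tokens keyed by (model_id, has_video_input); only the first entry is visible in this compare. A quick illustration of the arithmetic the price extractor presumably performs (the token count and the final multiplication below are hypothetical, not taken from the diff):

```python
rate = SEEDANCE2_PRICE_PER_1K_TOKENS[("dreamina-seedance-2-0-260128", False)]  # 0.007 USD per 1K tokens
total_tokens = 50_000                   # hypothetical usage reported for a finished task
price_usd = total_tokens / 1000 * rate  # 50 * 0.007 = 0.35 USD
```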
@@ -1,6 +1,5 @@
import logging
import math
import re

import torch
from typing_extensions import override
@@ -12,14 +11,9 @@ from comfy_api_nodes.apis.bytedance import (
SEEDANCE2_PRICE_PER_1K_TOKENS,
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS,
VIDEO_TASKS_EXECUTION_TIME,
GetAssetResponse,
Image2VideoTaskCreationRequest,
ImageTaskCreationResponse,
Seedance2TaskCreationRequest,
SeedanceCreateAssetRequest,
SeedanceCreateAssetResponse,
SeedanceCreateVisualValidateSessionResponse,
SeedanceGetVisualValidateSessionResponse,
Seedream4Options,
Seedream4TaskCreationRequest,
TaskAudioContent,
@@ -50,16 +44,10 @@ from comfy_api_nodes.util import (
validate_image_aspect_ratio,
validate_image_dimensions,
validate_string,
validate_video_dimensions,
validate_video_duration,
)
from server import PromptServer

BYTEPLUS_IMAGE_ENDPOINT = "/proxy/byteplus/api/v3/images/generations"

_VERIFICATION_POLL_TIMEOUT_SEC = 120
_VERIFICATION_POLL_INTERVAL_SEC = 3

SEEDREAM_MODELS = {
"seedream 5.0 lite": "seedream-5-0-260128",
"seedream-4-5-251128": "seedream-4-5-251128",
@@ -108,169 +96,6 @@ def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: st
)


async def _resolve_reference_assets(
cls: type[IO.ComfyNode],
asset_ids: list[str],
) -> tuple[dict[str, str], dict[str, str], dict[str, str]]:
"""Look up each asset, validate Active status, group by asset_type.

Returns (image_assets, video_assets, audio_assets), each mapping asset_id -> "asset://<asset_id>".
"""
image_assets: dict[str, str] = {}
video_assets: dict[str, str] = {}
audio_assets: dict[str, str] = {}
for i, raw_id in enumerate(asset_ids, 1):
asset_id = (raw_id or "").strip()
if not asset_id:
continue
result = await sync_op(
cls,
ApiEndpoint(path=f"/proxy/seedance/assets/{asset_id}"),
response_model=GetAssetResponse,
)
if result.status != "Active":
extra = f" {result.error.code}: {result.error.message}" if result.error else ""
raise ValueError(f"Reference asset {i} (Id={asset_id}) is not Active (Status={result.status}).{extra}")
asset_uri = f"asset://{asset_id}"
if result.asset_type == "Image":
image_assets[asset_id] = asset_uri
elif result.asset_type == "Video":
video_assets[asset_id] = asset_uri
elif result.asset_type == "Audio":
audio_assets[asset_id] = asset_uri
return image_assets, video_assets, audio_assets


_ASSET_REF_RE = re.compile(r"\basset ?(\d{1,2})\b", re.IGNORECASE)


def _build_asset_labels(
reference_assets: dict[str, str],
image_asset_uris: dict[str, str],
video_asset_uris: dict[str, str],
audio_asset_uris: dict[str, str],
n_reference_images: int,
n_reference_videos: int,
n_reference_audios: int,
) -> dict[int, str]:
"""Map asset slot number (from 'asset_N' keys) to its positional label.

Asset entries are appended to `content` after the reference_images/videos/audios,
so their 1-indexed labels continue from the count of existing same-type refs:
one reference_images entry + one Image-type asset -> asset labelled "Image 2".
"""
image_n = n_reference_images
video_n = n_reference_videos
audio_n = n_reference_audios
labels: dict[int, str] = {}
for slot_key, raw_id in reference_assets.items():
asset_id = (raw_id or "").strip()
if not asset_id:
continue
try:
slot_num = int(slot_key.rsplit("_", 1)[-1])
except ValueError:
continue
if asset_id in image_asset_uris:
image_n += 1
labels[slot_num] = f"Image {image_n}"
elif asset_id in video_asset_uris:
video_n += 1
labels[slot_num] = f"Video {video_n}"
elif asset_id in audio_asset_uris:
audio_n += 1
labels[slot_num] = f"Audio {audio_n}"
return labels


def _rewrite_asset_refs(prompt: str, labels: dict[int, str]) -> str:
"""Case-insensitively replace 'assetNN' (1-2 digit) tokens with their labels."""
if not labels:
return prompt

def _sub(m: "re.Match[str]") -> str:
return labels.get(int(m.group(1)), m.group(0))

return _ASSET_REF_RE.sub(_sub, prompt)

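_build_asset_labels and _rewrite_asset_refs together turn "asset_N" slot references in the prompt into the positional labels the API expects. A small worked example, assuming one inline reference image plus one Image-type and one Video-type asset (the IDs are made up):

```python
labels = _build_asset_labels(
    reference_assets={"asset_1": "img-abc", "asset_2": "vid-def"},
    image_asset_uris={"img-abc": "asset://img-abc"},
    video_asset_uris={"vid-def": "asset://vid-def"},
    audio_asset_uris={},
    n_reference_images=1,   # one ordinary reference_images entry already in content
    n_reference_videos=0,
    n_reference_audios=0,
)
# labels == {1: "Image 2", 2: "Video 1"}

_rewrite_asset_refs("blend asset1 with asset 2", labels)
# -> "blend Image 2 with Video 1"
```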
async def _obtain_group_id_via_h5_auth(cls: type[IO.ComfyNode]) -> str:
session = await sync_op(
cls,
ApiEndpoint(path="/proxy/seedance/visual-validate/sessions", method="POST"),
response_model=SeedanceCreateVisualValidateSessionResponse,
)
logger.warning("Seedance authentication required. Open link: %s", session.h5_link)

h5_text = f"Open this link in your browser and complete face verification:\n\n{session.h5_link}"

result = await poll_op(
cls,
ApiEndpoint(path=f"/proxy/seedance/visual-validate/sessions/{session.session_id}"),
response_model=SeedanceGetVisualValidateSessionResponse,
status_extractor=lambda r: r.status,
completed_statuses=["completed"],
failed_statuses=["failed"],
poll_interval=_VERIFICATION_POLL_INTERVAL_SEC,
max_poll_attempts=(_VERIFICATION_POLL_TIMEOUT_SEC // _VERIFICATION_POLL_INTERVAL_SEC) - 1,
estimated_duration=_VERIFICATION_POLL_TIMEOUT_SEC - 1,
extra_text=h5_text,
)

if not result.group_id:
raise RuntimeError(f"Seedance session {session.session_id} completed without a group_id")

logger.warning("Seedance authentication complete. New GroupId: %s", result.group_id)
PromptServer.instance.send_progress_text(
f"Authentication complete. New GroupId: {result.group_id}", cls.hidden.unique_id
)
return result.group_id


async def _resolve_group_id(cls: type[IO.ComfyNode], group_id: str) -> str:
if group_id and group_id.strip():
return group_id.strip()
return await _obtain_group_id_via_h5_auth(cls)


async def _create_seedance_asset(
cls: type[IO.ComfyNode],
*,
group_id: str,
url: str,
name: str,
asset_type: str,
) -> str:
req = SeedanceCreateAssetRequest(
group_id=group_id,
url=url,
asset_type=asset_type,
name=name or None,
)
result = await sync_op(
cls,
ApiEndpoint(path="/proxy/seedance/assets", method="POST"),
response_model=SeedanceCreateAssetResponse,
data=req,
)
return result.asset_id


async def _wait_for_asset_active(cls: type[IO.ComfyNode], asset_id: str, group_id: str) -> GetAssetResponse:
"""Poll the newly created asset until its status becomes Active."""
return await poll_op(
cls,
ApiEndpoint(path=f"/proxy/seedance/assets/{asset_id}"),
response_model=GetAssetResponse,
status_extractor=lambda r: r.status,
completed_statuses=["Active"],
failed_statuses=["Failed"],
poll_interval=5,
max_poll_attempts=1200,
extra_text=f"Waiting for asset pre-processing...\n\nasset_id: {asset_id}\n\ngroup_id: {group_id}",
)


def _seedance2_price_extractor(model_id: str, has_video_input: bool):
"""Returns a price_extractor closure for Seedance 2.0 poll_op."""
rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input))
@@ -1403,27 +1228,12 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
IO.Image.Input(
"first_frame",
tooltip="First frame image for the video.",
optional=True,
),
IO.Image.Input(
"last_frame",
tooltip="Last frame image for the video.",
optional=True,
),
IO.String.Input(
"first_frame_asset_id",
default="",
tooltip="Seedance asset_id to use as the first frame. "
"Mutually exclusive with the first_frame image input.",
optional=True,
),
IO.String.Input(
"last_frame_asset_id",
default="",
tooltip="Seedance asset_id to use as the last frame. "
"Mutually exclusive with the last_frame image input.",
optional=True,
),
IO.Int.Input(
"seed",
default=0,
@@ -1476,54 +1286,24 @@
async def execute(
cls,
model: dict,
first_frame: Input.Image,
seed: int,
watermark: bool,
first_frame: Input.Image | None = None,
last_frame: Input.Image | None = None,
first_frame_asset_id: str = "",
last_frame_asset_id: str = "",
) -> IO.NodeOutput:
validate_string(model["prompt"], strip_whitespace=True, min_length=1)
model_id = SEEDANCE_MODELS[model["model"]]

first_frame_asset_id = first_frame_asset_id.strip()
last_frame_asset_id = last_frame_asset_id.strip()

if first_frame is not None and first_frame_asset_id:
raise ValueError("Provide only one of first_frame or first_frame_asset_id, not both.")
if first_frame is None and not first_frame_asset_id:
raise ValueError("Either first_frame or first_frame_asset_id is required.")
if last_frame is not None and last_frame_asset_id:
raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.")

asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a]
image_assets: dict[str, str] = {}
if asset_ids_to_resolve:
image_assets, _, _ = await _resolve_reference_assets(cls, asset_ids_to_resolve)
for aid in asset_ids_to_resolve:
if aid not in image_assets:
raise ValueError(f"Asset {aid} is not an Image asset.")

if first_frame_asset_id:
first_frame_url = image_assets[first_frame_asset_id]
else:
first_frame_url = await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame.")

content: list[TaskTextContent | TaskImageContent] = [
TaskTextContent(text=model["prompt"]),
TaskImageContent(
image_url=TaskImageContentUrl(url=first_frame_url),
image_url=TaskImageContentUrl(
url=await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame.")
),
role="first_frame",
),
]
if last_frame_asset_id:
content.append(
TaskImageContent(
image_url=TaskImageContentUrl(url=image_assets[last_frame_asset_id]),
role="last_frame",
),
)
elif last_frame is not None:
if last_frame is not None:
content.append(
TaskImageContent(
image_url=TaskImageContentUrl(
@@ -1605,24 +1385,6 @@ def _seedance2_reference_inputs(resolutions: list[str]):
tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
"for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
),
IO.Autogrow.Input(
"reference_assets",
template=IO.Autogrow.TemplateNames(
IO.String.Input("reference_asset"),
names=[
"asset_1",
"asset_2",
"asset_3",
"asset_4",
"asset_5",
"asset_6",
"asset_7",
"asset_8",
"asset_9",
],
min=0,
),
),
]


@@ -1724,42 +1486,24 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
reference_images = model.get("reference_images", {})
reference_videos = model.get("reference_videos", {})
reference_audios = model.get("reference_audios", {})
reference_assets = model.get("reference_assets", {})

reference_image_assets, reference_video_assets, reference_audio_assets = await _resolve_reference_assets(
cls, list(reference_assets.values())
)

if not reference_images and not reference_videos and not reference_image_assets and not reference_video_assets:
raise ValueError("At least one reference image or video or asset is required.")

total_images = len(reference_images) + len(reference_image_assets)
if total_images > 9:
raise ValueError(
f"Too many reference images: {total_images} "
f"(images={len(reference_images)}, image assets={len(reference_image_assets)}). Maximum is 9."
)
total_videos = len(reference_videos) + len(reference_video_assets)
if total_videos > 3:
raise ValueError(
f"Too many reference videos: {total_videos} "
f"(videos={len(reference_videos)}, video assets={len(reference_video_assets)}). Maximum is 3."
)
total_audios = len(reference_audios) + len(reference_audio_assets)
if total_audios > 3:
raise ValueError(
f"Too many reference audios: {total_audios} "
f"(audios={len(reference_audios)}, audio assets={len(reference_audio_assets)}). Maximum is 3."
)
if not reference_images and not reference_videos:
raise ValueError("At least one reference image or video is required.")

model_id = SEEDANCE_MODELS[model["model"]]
has_video_input = total_videos > 0
has_video_input = len(reference_videos) > 0

if model.get("auto_downscale") and reference_videos:
max_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("max")
max_px = (
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {})
.get(model["resolution"], {})
.get("max")
)
if max_px:
for key in reference_videos:
reference_videos[key] = resize_video_to_pixel_budget(reference_videos[key], max_px)
reference_videos[key] = resize_video_to_pixel_budget(
reference_videos[key], max_px
)

total_video_duration = 0.0
for i, key in enumerate(reference_videos, 1):
@@ -1787,19 +1531,8 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
if total_audio_duration > 15.1:
raise ValueError(f"Total reference audio duration is {total_audio_duration:.1f}s. Maximum is 15.1 seconds.")

asset_labels = _build_asset_labels(
reference_assets,
reference_image_assets,
reference_video_assets,
reference_audio_assets,
len(reference_images),
len(reference_videos),
len(reference_audios),
)
prompt_text = _rewrite_asset_refs(model["prompt"], asset_labels)

content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = [
TaskTextContent(text=prompt_text),
TaskTextContent(text=model["prompt"]),
]
for i, key in enumerate(reference_images, 1):
content.append(
@@ -1840,21 +1573,6 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
),
),
)
for url in reference_image_assets.values():
content.append(
TaskImageContent(
image_url=TaskImageContentUrl(url=url),
role="reference_image",
),
)
for url in reference_video_assets.values():
content.append(
TaskVideoContent(video_url=TaskVideoContentUrl(url=url)),
)
for url in reference_audio_assets.values():
content.append(
TaskAudioContent(audio_url=TaskAudioContentUrl(url=url)),
)
initial_response = await sync_op(
cls,
ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
@@ -1909,156 +1627,6 @@ async def process_video_task(
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))


class ByteDanceCreateImageAsset(IO.ComfyNode):

@classmethod
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="ByteDanceCreateImageAsset",
display_name="ByteDance Create Image Asset",
category="api node/image/ByteDance",
description=(
"Create a Seedance 2.0 personal image asset. Uploads the input image and "
"registers it in the given asset group. If group_id is empty, runs a real-person "
"H5 authentication flow to create a new group before adding the asset."
),
inputs=[
IO.Image.Input("image", tooltip="Image to register as a personal asset."),
IO.String.Input(
"group_id",
default="",
tooltip="Reuse an existing Seedance asset group ID to skip repeated human verification for the "
"same person. Leave empty to run real-person authentication in the browser and create a new group.",
),
# IO.String.Input(
#     "name",
#     default="",
#     tooltip="Asset name (up to 64 characters).",
# ),
],
outputs=[
IO.String.Output(display_name="asset_id"),
IO.String.Output(display_name="group_id"),
],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
# is_api_node=True,
)

@classmethod
async def execute(
cls,
image: Input.Image,
group_id: str = "",
# name: str = "",
) -> IO.NodeOutput:
# if len(name) > 64:
#     raise ValueError("Name of asset can not be greater then 64 symbols")
validate_image_dimensions(image, min_width=300, max_width=6000, min_height=300, max_height=6000)
validate_image_aspect_ratio(image, min_ratio=(0.4, 1), max_ratio=(2.5, 1))
resolved_group = await _resolve_group_id(cls, group_id)
asset_id = await _create_seedance_asset(
cls,
group_id=resolved_group,
url=await upload_image_to_comfyapi(cls, image),
name="",
asset_type="Image",
)
await _wait_for_asset_active(cls, asset_id, resolved_group)
PromptServer.instance.send_progress_text(
f"Please save the asset_id and group_id for reuse.\n\nasset_id: {asset_id}\n\n"
f"group_id: {resolved_group}",
cls.hidden.unique_id,
)
return IO.NodeOutput(asset_id, resolved_group)


class ByteDanceCreateVideoAsset(IO.ComfyNode):

@classmethod
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="ByteDanceCreateVideoAsset",
display_name="ByteDance Create Video Asset",
category="api node/video/ByteDance",
description=(
"Create a Seedance 2.0 personal video asset. Uploads the input video and "
"registers it in the given asset group. If group_id is empty, runs a real-person "
"H5 authentication flow to create a new group before adding the asset."
),
inputs=[
IO.Video.Input("video", tooltip="Video to register as a personal asset."),
IO.String.Input(
"group_id",
default="",
tooltip="Reuse an existing Seedance asset group ID to skip repeated human verification for the "
"same person. Leave empty to run real-person authentication in the browser and create a new group.",
),
# IO.String.Input(
#     "name",
#     default="",
#     tooltip="Asset name (up to 64 characters).",
# ),
],
outputs=[
IO.String.Output(display_name="asset_id"),
IO.String.Output(display_name="group_id"),
],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
# is_api_node=True,
)

@classmethod
async def execute(
cls,
video: Input.Video,
group_id: str = "",
# name: str = "",
) -> IO.NodeOutput:
# if len(name) > 64:
#     raise ValueError("Name of asset can not be greater then 64 symbols")
validate_video_duration(video, min_duration=2, max_duration=15)
validate_video_dimensions(video, min_width=300, max_width=6000, min_height=300, max_height=6000)

w, h = video.get_dimensions()
if h > 0:
ratio = w / h
if not (0.4 <= ratio <= 2.5):
raise ValueError(f"Asset video aspect ratio (W/H) must be in [0.4, 2.5], got {ratio:.3f} ({w}x{h}).")
pixels = w * h
if not (409_600 <= pixels <= 927_408):
raise ValueError(
f"Asset video total pixels (W×H) must be in [409600, 927408], " f"got {pixels:,} ({w}x{h})."
)

fps = float(video.get_frame_rate())
if not (24 <= fps <= 60):
raise ValueError(f"Asset video FPS must be in [24, 60], got {fps:.2f}.")

resolved_group = await _resolve_group_id(cls, group_id)
asset_id = await _create_seedance_asset(
cls,
group_id=resolved_group,
url=await upload_video_to_comfyapi(cls, video),
name="",
asset_type="Video",
)
await _wait_for_asset_active(cls, asset_id, resolved_group)
PromptServer.instance.send_progress_text(
f"Please save the asset_id and group_id for reuse.\n\nasset_id: {asset_id}\n\n"
f"group_id: {resolved_group}",
cls.hidden.unique_id,
)
return IO.NodeOutput(asset_id, resolved_group)


class ByteDanceExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -2072,8 +1640,6 @@ class ByteDanceExtension(ComfyExtension):
ByteDance2TextToVideoNode,
ByteDance2FirstLastFrameNode,
ByteDance2ReferenceNode,
ByteDanceCreateImageAsset,
ByteDanceCreateVideoAsset,
]

@@ -276,7 +276,6 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusRe
cls,
ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"),
response_model=TaskStatusResponse,
max_poll_attempts=280,
status_extractor=lambda r: (r.data.task_status if r.data else None),
)
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
@@ -863,7 +862,7 @@ class OmniProTextToVideoNode(IO.ComfyNode):
),
IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
IO.Int.Input("duration", default=5, min=3, max=15, display_mode=IO.NumberDisplay.slider),
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
IO.DynamicCombo.Input(
"storyboards",
options=[
@@ -905,13 +904,12 @@ class OmniProTextToVideoNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
$res := widgets.resolution;
$mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
$mode := (widgets.resolution = "720p") ? "std" : "pro";
$isV3 := $contains(widgets.model_name, "v3");
$audio := $isV3 and widgets.generate_audio;
$rates := $audio
? {"std": 0.112, "pro": 0.14, "4k": 0.42}
: {"std": 0.084, "pro": 0.112, "4k": 0.42};
? {"std": 0.112, "pro": 0.14}
: {"std": 0.084, "pro": 0.112};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
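This price-badge change removes the 4k tier from the JSONata expression together with the 4k resolution option. A Python transcription of the remaining logic, for illustration of the arithmetic only (the rates come straight from the expression above):

```python
def kling_omni_price(resolution: str, model_name: str, generate_audio: bool, duration: int) -> float:
    mode = "std" if resolution == "720p" else "pro"
    audio = ("v3" in model_name) and generate_audio
    rates = {"std": 0.112, "pro": 0.14} if audio else {"std": 0.084, "pro": 0.112}
    return rates[mode] * duration

# e.g. 1080p, a v3 model with audio, 5 s  -> 0.14 * 5   = 0.70 USD
# e.g. 720p, audio off, 10 s              -> 0.084 * 10 = 0.84 USD
```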
@@ -936,8 +934,6 @@ class OmniProTextToVideoNode(IO.ComfyNode):
raise ValueError("kling-video-o1 only supports durations of 5 or 10 seconds.")
if generate_audio:
raise ValueError("kling-video-o1 does not support audio generation.")
if resolution == "4k":
raise ValueError("kling-video-o1 does not support 4k resolution.")
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
if stories_enabled and model_name == "kling-video-o1":
raise ValueError("kling-video-o1 does not support storyboards.")
@@ -967,12 +963,6 @@ class OmniProTextToVideoNode(IO.ComfyNode):
f"must equal the global duration ({duration}s)."
)

if resolution == "4k":
mode = "4k"
elif resolution == "1080p":
mode = "pro"
else:
mode = "std"
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
@@ -982,7 +972,7 @@ class OmniProTextToVideoNode(IO.ComfyNode):
prompt=prompt,
aspect_ratio=aspect_ratio,
duration=str(duration),
mode=mode,
mode="pro" if resolution == "1080p" else "std",
multi_shot=multi_shot,
multi_prompt=multi_prompt_list,
shot_type="customize" if multi_shot else None,
@@ -1024,7 +1014,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
optional=True,
tooltip="Up to 6 additional reference images.",
),
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
IO.DynamicCombo.Input(
"storyboards",
options=[
@@ -1071,13 +1061,12 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
$res := widgets.resolution;
$mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
$mode := (widgets.resolution = "720p") ? "std" : "pro";
$isV3 := $contains(widgets.model_name, "v3");
$audio := $isV3 and widgets.generate_audio;
$rates := $audio
? {"std": 0.112, "pro": 0.14, "4k": 0.42}
: {"std": 0.084, "pro": 0.112, "4k": 0.42};
? {"std": 0.112, "pro": 0.14}
: {"std": 0.084, "pro": 0.112};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -1104,8 +1093,6 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
if generate_audio:
raise ValueError("kling-video-o1 does not support audio generation.")
if resolution == "4k":
raise ValueError("kling-video-o1 does not support 4k resolution.")
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
if stories_enabled and model_name == "kling-video-o1":
raise ValueError("kling-video-o1 does not support storyboards.")
@@ -1174,12 +1161,6 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference frame(s)"):
image_list.append(OmniParamImage(image_url=i))
if resolution == "4k":
mode = "4k"
elif resolution == "1080p":
mode = "pro"
else:
mode = "std"
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
@@ -1189,7 +1170,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
prompt=prompt,
duration=str(duration),
image_list=image_list,
mode=mode,
mode="pro" if resolution == "1080p" else "std",
sound="on" if generate_audio else "off",
multi_shot=multi_shot,
multi_prompt=multi_prompt_list,
@@ -1223,7 +1204,7 @@ class OmniProImageToVideoNode(IO.ComfyNode):
"reference_images",
tooltip="Up to 7 reference images.",
),
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
IO.DynamicCombo.Input(
"storyboards",
options=[
@@ -1270,13 +1251,12 @@ class OmniProImageToVideoNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
$res := widgets.resolution;
$mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
$mode := (widgets.resolution = "720p") ? "std" : "pro";
$isV3 := $contains(widgets.model_name, "v3");
$audio := $isV3 and widgets.generate_audio;
$rates := $audio
? {"std": 0.112, "pro": 0.14, "4k": 0.42}
: {"std": 0.084, "pro": 0.112, "4k": 0.42};
? {"std": 0.112, "pro": 0.14}
: {"std": 0.084, "pro": 0.112};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -1302,8 +1282,6 @@ class OmniProImageToVideoNode(IO.ComfyNode):
raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
if generate_audio:
raise ValueError("kling-video-o1 does not support audio generation.")
if resolution == "4k":
raise ValueError("kling-video-o1 does not support 4k resolution.")
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
if stories_enabled and model_name == "kling-video-o1":
raise ValueError("kling-video-o1 does not support storyboards.")
@@ -1342,12 +1320,6 @@ class OmniProImageToVideoNode(IO.ComfyNode):
image_list: list[OmniParamImage] = []
for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
image_list.append(OmniParamImage(image_url=i))
if resolution == "4k":
mode = "4k"
elif resolution == "1080p":
mode = "pro"
else:
mode = "std"
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
@@ -1358,7 +1330,7 @@ class OmniProImageToVideoNode(IO.ComfyNode):
aspect_ratio=aspect_ratio,
duration=str(duration),
image_list=image_list,
mode=mode,
mode="pro" if resolution == "1080p" else "std",
sound="on" if generate_audio else "off",
multi_shot=multi_shot,
multi_prompt=multi_prompt_list,
@@ -2888,7 +2860,7 @@ class KlingVideoNode(IO.ComfyNode):
IO.DynamicCombo.Option(
"kling-v3",
[
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p"),
IO.Combo.Input("resolution", options=["1080p", "720p"]),
|
||||
IO.Combo.Input(
|
||||
"aspect_ratio",
|
||||
options=["16:9", "9:16", "1:1"],
|
||||
@ -2941,11 +2913,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
@ -2975,12 +2943,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
start_frame: Input.Image | None = None,
|
||||
) -> IO.NodeOutput:
|
||||
_ = seed
|
||||
if model["resolution"] == "4k":
|
||||
mode = "4k"
|
||||
elif model["resolution"] == "1080p":
|
||||
mode = "pro"
|
||||
else:
|
||||
mode = "std"
|
||||
mode = "pro" if model["resolution"] == "1080p" else "std"
|
||||
custom_multi_shot = False
|
||||
if multi_shot["multi_shot"] == "disabled":
|
||||
shot_type = None
|
||||
@ -3062,7 +3025,6 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
cls,
|
||||
ApiEndpoint(path=poll_path),
|
||||
response_model=TaskStatusResponse,
|
||||
max_poll_attempts=280,
|
||||
status_extractor=lambda r: (r.data.task_status if r.data else None),
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
|
||||
@ -3095,7 +3057,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
IO.DynamicCombo.Option(
|
||||
"kling-v3",
|
||||
[
|
||||
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p"),
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"]),
|
||||
],
|
||||
),
|
||||
],
|
||||
@ -3127,11 +3089,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
@ -3160,12 +3118,6 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
validate_image_aspect_ratio(end_frame, (1, 2.5), (2.5, 1))
|
||||
image_url = await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame")
|
||||
image_tail_url = await upload_image_to_comfyapi(cls, end_frame, wait_label="Uploading end frame")
|
||||
if model["resolution"] == "4k":
|
||||
mode = "4k"
|
||||
elif model["resolution"] == "1080p":
|
||||
mode = "pro"
|
||||
else:
|
||||
mode = "std"
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"),
|
||||
@ -3175,7 +3127,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
image=image_url,
|
||||
image_tail=image_tail_url,
|
||||
prompt=prompt,
|
||||
mode=mode,
|
||||
mode="pro" if model["resolution"] == "1080p" else "std",
|
||||
duration=str(duration),
|
||||
sound="on" if generate_audio else "off",
|
||||
),
|
||||
@ -3188,7 +3140,6 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
cls,
|
||||
ApiEndpoint(path=f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"),
|
||||
response_model=TaskStatusResponse,
|
||||
max_poll_attempts=280,
|
||||
status_extractor=lambda r: (r.data.task_status if r.data else None),
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
|
||||
|
||||
@ -357,7 +357,8 @@ def calculate_tokens_price_image_1_5(response: OpenAIImageGenerationResponse) ->
|
||||
return ((response.usage.input_tokens * 8.0) + (response.usage.output_tokens * 32.0)) / 1_000_000.0
|
||||
|
||||
|
||||
def calculate_tokens_price_image_2_0(response: OpenAIImageGenerationResponse) -> float | None:
|
||||
def calculate_tokens_price_image_2(response: OpenAIImageGenerationResponse) -> float | None:
|
||||
# https://platform.openai.com/docs/pricing - gpt-image-2: input $8/1M, output $30/1M
|
||||
return ((response.usage.input_tokens * 8.0) + (response.usage.output_tokens * 30.0)) / 1_000_000.0
|
||||
|
||||
|
||||
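A quick sanity check of the gpt-image-2 formula above ($8/1M input, $30/1M output), using made-up token counts; the response object itself is not constructed here.

input_tokens, output_tokens = 500, 196
price = (input_tokens * 8.0 + output_tokens * 30.0) / 1_000_000.0   # $8/1M input + $30/1M output
assert abs(price - 0.00988) < 1e-12                                 # = $0.00988 for this response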
@ -367,7 +368,7 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="OpenAIGPTImage1",
|
||||
display_name="OpenAI GPT Image 2",
|
||||
display_name="OpenAI GPT Image 1 & 1.5",
|
||||
category="api node/image/OpenAI",
|
||||
description="Generates images synchronously via OpenAI's GPT Image endpoint.",
|
||||
inputs=[
|
||||
@ -405,17 +406,7 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
IO.Combo.Input(
|
||||
"size",
|
||||
default="auto",
|
||||
options=[
|
||||
"auto",
|
||||
"1024x1024",
|
||||
"1024x1536",
|
||||
"1536x1024",
|
||||
"2048x2048",
|
||||
"2048x1152",
|
||||
"1152x2048",
|
||||
"3840x2160",
|
||||
"2160x3840",
|
||||
],
|
||||
options=["auto", "1024x1024", "1024x1536", "1536x1024"],
|
||||
tooltip="Image size",
|
||||
optional=True,
|
||||
),
|
||||
@ -441,8 +432,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"model",
|
||||
options=["gpt-image-1", "gpt-image-1.5", "gpt-image-2"],
|
||||
default="gpt-image-2",
|
||||
options=["gpt-image-1", "gpt-image-1.5"],
|
||||
default="gpt-image-1.5",
|
||||
optional=True,
|
||||
),
|
||||
],
|
||||
@ -459,33 +450,28 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["quality", "n", "model"]),
|
||||
expr="""
|
||||
(
|
||||
$ranges := {
|
||||
"gpt-image-1": {
|
||||
"low": [0.011, 0.02],
|
||||
"medium": [0.042, 0.07],
|
||||
"high": [0.167, 0.25]
|
||||
},
|
||||
"gpt-image-1.5": {
|
||||
"low": [0.009, 0.02],
|
||||
"medium": [0.034, 0.062],
|
||||
"high": [0.133, 0.22]
|
||||
},
|
||||
"gpt-image-2": {
|
||||
"low": [0.0048, 0.012],
|
||||
"medium": [0.041, 0.112],
|
||||
"high": [0.165, 0.43]
|
||||
}
|
||||
};
|
||||
$range := $lookup($lookup($ranges, widgets.model), widgets.quality);
|
||||
$nRaw := widgets.n;
|
||||
$n := ($nRaw != null and $nRaw != 0) ? $nRaw : 1;
|
||||
$m := widgets.model;
|
||||
$ranges :=
|
||||
$contains($m, "gpt-image-1.5")
|
||||
? {
|
||||
"low": [0.009, 0.016],
|
||||
"medium": [0.037, 0.056],
|
||||
"high": [0.134, 0.240]
|
||||
}
|
||||
: {
|
||||
"low": [0.011, 0.020],
|
||||
"medium": [0.046, 0.070],
|
||||
"high": [0.167, 0.300]
|
||||
};
|
||||
$range := $lookup($ranges, widgets.quality);
|
||||
$n := widgets.n;
|
||||
($n = 1)
|
||||
? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1], "format": {"approximate": true}}
|
||||
? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1]}
|
||||
: {
|
||||
"type":"range_usd",
|
||||
"min_usd": $range[0] * $n,
|
||||
"max_usd": $range[1] * $n,
|
||||
"format": { "suffix": "/Run", "approximate": true }
|
||||
"min_usd": $range[0],
|
||||
"max_usd": $range[1],
|
||||
"format": { "suffix": " x " & $string($n) & "/Run" }
|
||||
}
|
||||
)
|
||||
""",
|
||||
@ -510,18 +496,10 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
if mask is not None and image is None:
|
||||
raise ValueError("Cannot use a mask without an input image")
|
||||
|
||||
if model in ("gpt-image-1", "gpt-image-1.5"):
|
||||
if size not in ("auto", "1024x1024", "1024x1536", "1536x1024"):
|
||||
raise ValueError(f"Resolution {size} is only supported by GPT Image 2 model")
|
||||
|
||||
if model == "gpt-image-1":
|
||||
price_extractor = calculate_tokens_price_image_1
|
||||
elif model == "gpt-image-1.5":
|
||||
price_extractor = calculate_tokens_price_image_1_5
|
||||
elif model == "gpt-image-2":
|
||||
price_extractor = calculate_tokens_price_image_2_0
|
||||
if background == "transparent":
|
||||
raise ValueError("Transparent background is not supported for GPT Image 2 model")
|
||||
else:
|
||||
raise ValueError(f"Unknown model: {model}")
|
||||
|
||||
@ -599,6 +577,261 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
return IO.NodeOutput(await validate_and_cast_response(response))
|
||||
|
||||
|
||||
_GPT_IMAGE_2_SIZES = [
|
||||
"auto",
|
||||
"1024x1024",
|
||||
"1536x1024",
|
||||
"1024x1536",
|
||||
"2048x2048",
|
||||
"2048x1152",
|
||||
"3840x2160",
|
||||
"2160x3840",
|
||||
]
|
||||
|
||||
|
||||
def _resolve_gpt_image_2_size(size: str, custom_width: int, custom_height: int) -> str:
|
||||
if custom_width <= 0 or custom_height <= 0:
|
||||
return size
|
||||
w, h = custom_width, custom_height
|
||||
if max(w, h) > 3840:
|
||||
raise ValueError(f"Maximum edge length must be ≤ 3840px, got {max(w, h)}")
|
||||
if w % 16 != 0 or h % 16 != 0:
|
||||
raise ValueError(f"Both edges must be multiples of 16px, got {w}x{h}")
|
||||
if max(w, h) / min(w, h) > 3:
|
||||
raise ValueError(f"Long-to-short edge ratio must not exceed 3:1, got {max(w, h) / min(w, h):.2f}:1")
|
||||
total = w * h
|
||||
if total < 655_360 or total > 8_294_400:
|
||||
raise ValueError(f"Total pixels must be between 655,360 and 8,294,400, got {total:,}")
|
||||
return f"{w}x{h}"
|
||||
|
||||
|
||||
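A few illustrative calls against the helper above; they mirror the unit tests added at the bottom of this diff, and assume _resolve_gpt_image_2_size is imported as-is from comfy_api_nodes.nodes_openai.

assert _resolve_gpt_image_2_size("1024x1024", 0, 0) == "1024x1024"   # custom dims left at 0 -> keep the preset
assert _resolve_gpt_image_2_size("auto", 2048, 1152) == "2048x1152"  # both custom dims set -> override the preset
try:
    _resolve_gpt_image_2_size("auto", 3840, 1024)                    # 3.75:1 exceeds the 3:1 ratio limit
except ValueError as err:
    print(err)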
class OpenAIGPTImage2(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="OpenAIGPTImage2",
|
||||
display_name="OpenAI GPT Image 2",
|
||||
category="api node/image/OpenAI",
|
||||
description="Generates images synchronously via OpenAI's GPT-Image-2 endpoint.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
default="",
|
||||
multiline=True,
|
||||
tooltip="Text prompt for GPT Image 2",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2**31 - 1,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
tooltip="not implemented yet in backend",
|
||||
optional=True,
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"quality",
|
||||
default="auto",
|
||||
options=["auto", "low", "medium", "high"],
|
||||
tooltip="Image quality. 'auto' lets the model decide based on the prompt. Square images are typically fastest.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"background",
|
||||
default="auto",
|
||||
options=["auto", "opaque"],
|
||||
tooltip="Background style. GPT-Image-2 does not support transparent backgrounds.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"size",
|
||||
default="auto",
|
||||
options=_GPT_IMAGE_2_SIZES,
|
||||
tooltip="Output image dimensions. Ignored when custom_width and custom_height are both non-zero.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_width",
|
||||
default=0,
|
||||
min=0,
|
||||
max=3840,
|
||||
step=16,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
tooltip="Custom output width in pixels. Set to 0 (default) to use the size preset. When both width and height are non-zero, they override the size preset. Slider enforces multiples of 16 and max edge 3840px. Additional constraints checked at generation: ratio ≤ 3:1, total pixels 655,360–8,294,400.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_height",
|
||||
default=0,
|
||||
min=0,
|
||||
max=3840,
|
||||
step=16,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
tooltip="Custom output height in pixels. Set to 0 (default) to use the size preset. When both width and height are non-zero, they override the size preset. Slider enforces multiples of 16 and max edge 3840px. Additional constraints checked at generation: ratio ≤ 3:1, total pixels 655,360–8,294,400.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"num_images",
|
||||
default=1,
|
||||
min=1,
|
||||
max=8,
|
||||
step=1,
|
||||
tooltip="Number of images to generate per run.",
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
optional=True,
|
||||
),
|
||||
IO.Image.Input(
|
||||
"image",
|
||||
tooltip="Optional reference image for image editing.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Mask.Input(
|
||||
"mask",
|
||||
tooltip="Optional mask for inpainting (white areas will be replaced).",
|
||||
optional=True,
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"model",
|
||||
options=["gpt-image-2"],
|
||||
default="gpt-image-2",
|
||||
tooltip="Model used for image generation.",
|
||||
optional=True,
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
IO.Image.Output(),
|
||||
],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["quality", "num_images"]),
|
||||
expr="""
|
||||
(
|
||||
$ranges := {
|
||||
"low": [0.005, 0.010],
|
||||
"medium": [0.041, 0.060],
|
||||
"high": [0.165, 0.250]
|
||||
};
|
||||
$q := widgets.quality;
|
||||
$n := widgets.num_images;
|
||||
$n := ($n != null and $n != 0) ? $n : 1;
|
||||
$range := $lookup($ranges, $q);
|
||||
$lo := $range ? $range[0] : 0.005;
|
||||
$hi := $range ? $range[1] : 0.250;
|
||||
($n = 1)
|
||||
? {"type":"range_usd","min_usd": $lo, "max_usd": $hi, "format": {"approximate": ($range ? false : true)}}
|
||||
: {
|
||||
"type":"range_usd",
|
||||
"min_usd": $lo,
|
||||
"max_usd": $hi,
|
||||
"format": {"approximate": ($range ? false : true), "suffix": " x " & $string($n) & "/Run"}
|
||||
}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
seed: int = 0,
|
||||
quality: str = "auto",
|
||||
background: str = "auto",
|
||||
image: Input.Image | None = None,
|
||||
mask: Input.Image | None = None,
|
||||
num_images: int = 1,
|
||||
size: str = "auto",
|
||||
custom_width: int = 0,
|
||||
custom_height: int = 0,
|
||||
model: str = "gpt-image-2",
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
|
||||
if mask is not None and image is None:
|
||||
raise ValueError("Cannot use a mask without an input image")
|
||||
|
||||
resolved_size = _resolve_gpt_image_2_size(size, custom_width, custom_height)
|
||||
|
||||
if image is not None:
|
||||
files = []
|
||||
batch_size = image.shape[0]
|
||||
for i in range(batch_size):
|
||||
single_image = image[i : i + 1]
|
||||
scaled_image = downscale_image_tensor(single_image, total_pixels=2048 * 2048).squeeze()
|
||||
|
||||
image_np = (scaled_image.numpy() * 255).astype(np.uint8)
|
||||
img = Image.fromarray(image_np)
|
||||
img_byte_arr = BytesIO()
|
||||
img.save(img_byte_arr, format="PNG")
|
||||
img_byte_arr.seek(0)
|
||||
|
||||
if batch_size == 1:
|
||||
files.append(("image", (f"image_{i}.png", img_byte_arr, "image/png")))
|
||||
else:
|
||||
files.append(("image[]", (f"image_{i}.png", img_byte_arr, "image/png")))
|
||||
|
||||
if mask is not None:
|
||||
if image.shape[0] != 1:
|
||||
raise Exception("Cannot use a mask with multiple image")
|
||||
if mask.shape[1:] != image.shape[1:-1]:
|
||||
raise Exception("Mask and Image must be the same size")
|
||||
_, height, width = mask.shape
|
||||
rgba_mask = torch.zeros(height, width, 4, device="cpu")
|
||||
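# White (1.0) mask pixels get alpha 0 below, i.e. fully transparent regions; the OpenAI image-edit
# endpoint treats transparent areas as the regions to repaint, matching the "white areas will be replaced" tooltip.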
rgba_mask[:, :, 3] = 1 - mask.squeeze().cpu()
|
||||
|
||||
scaled_mask = downscale_image_tensor(rgba_mask.unsqueeze(0), total_pixels=2048 * 2048).squeeze()
|
||||
|
||||
mask_np = (scaled_mask.numpy() * 255).astype(np.uint8)
|
||||
mask_img = Image.fromarray(mask_np)
|
||||
mask_img_byte_arr = BytesIO()
|
||||
mask_img.save(mask_img_byte_arr, format="PNG")
|
||||
mask_img_byte_arr.seek(0)
|
||||
files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png")))
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/openai/images/edits", method="POST"),
|
||||
response_model=OpenAIImageGenerationResponse,
|
||||
data=OpenAIImageEditRequest(
|
||||
model=model,
|
||||
prompt=prompt,
|
||||
quality=quality,
|
||||
background=background,
|
||||
n=num_images,
|
||||
size=resolved_size,
|
||||
moderation="low",
|
||||
),
|
||||
content_type="multipart/form-data",
|
||||
files=files,
|
||||
price_extractor=calculate_tokens_price_image_2,
|
||||
)
|
||||
else:
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/openai/images/generations", method="POST"),
|
||||
response_model=OpenAIImageGenerationResponse,
|
||||
data=OpenAIImageGenerationRequest(
|
||||
model=model,
|
||||
prompt=prompt,
|
||||
quality=quality,
|
||||
background=background,
|
||||
n=num_images,
|
||||
size=resolved_size,
|
||||
moderation="low",
|
||||
),
|
||||
price_extractor=calculate_tokens_price_image_2,
|
||||
)
|
||||
return IO.NodeOutput(await validate_and_cast_response(response))
|
||||
|
||||
|
||||
class OpenAIChatNode(IO.ComfyNode):
|
||||
"""
|
||||
Node to generate text responses from an OpenAI model.
|
||||
@ -948,6 +1181,7 @@ class OpenAIExtension(ComfyExtension):
|
||||
OpenAIDalle2,
|
||||
OpenAIDalle3,
|
||||
OpenAIGPTImage1,
|
||||
OpenAIGPTImage2,
|
||||
OpenAIChatNode,
|
||||
OpenAIInputFiles,
|
||||
OpenAIChatConfig,
|
||||
|
||||
@ -393,8 +393,8 @@ class Veo3VideoGenerationNode(IO.ComfyNode):
|
||||
model="veo-3.0-generate-001",
|
||||
generate_audio=False,
|
||||
):
|
||||
if resolution == "4k" and ("lite" in model or "3.0" in model):
|
||||
raise Exception("4K resolution is not supported by the veo-3.1-lite or veo-3.0 models.")
|
||||
if "lite" in model and resolution == "4k":
|
||||
raise Exception("4K resolution is not supported by the veo-3.1-lite model.")
|
||||
|
||||
model = MODELS_MAP[model]
|
||||
|
||||
|
||||
@ -156,7 +156,6 @@ async def poll_op(
|
||||
estimated_duration: int | None = None,
|
||||
cancel_endpoint: ApiEndpoint | None = None,
|
||||
cancel_timeout: float = 10.0,
|
||||
extra_text: str | None = None,
|
||||
) -> M:
|
||||
raw = await poll_op_raw(
|
||||
cls,
|
||||
@ -177,7 +176,6 @@ async def poll_op(
|
||||
estimated_duration=estimated_duration,
|
||||
cancel_endpoint=cancel_endpoint,
|
||||
cancel_timeout=cancel_timeout,
|
||||
extra_text=extra_text,
|
||||
)
|
||||
if not isinstance(raw, dict):
|
||||
raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).")
|
||||
@ -262,7 +260,6 @@ async def poll_op_raw(
|
||||
estimated_duration: int | None = None,
|
||||
cancel_endpoint: ApiEndpoint | None = None,
|
||||
cancel_timeout: float = 10.0,
|
||||
extra_text: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Polls an endpoint until the task reaches a terminal state. Displays time while queued/processing,
|
||||
@ -302,7 +299,6 @@ async def poll_op_raw(
|
||||
price=state.price,
|
||||
is_queued=state.is_queued,
|
||||
processing_elapsed_seconds=int(proc_elapsed),
|
||||
extra_text=extra_text,
|
||||
)
|
||||
await asyncio.sleep(1.0)
|
||||
except Exception as exc:
|
||||
@ -393,7 +389,6 @@ async def poll_op_raw(
|
||||
price=state.price,
|
||||
is_queued=False,
|
||||
processing_elapsed_seconds=int(state.base_processing_elapsed),
|
||||
extra_text=extra_text,
|
||||
)
|
||||
return resp_json
|
||||
|
||||
@ -467,7 +462,6 @@ def _display_time_progress(
|
||||
price: float | None = None,
|
||||
is_queued: bool | None = None,
|
||||
processing_elapsed_seconds: int | None = None,
|
||||
extra_text: str | None = None,
|
||||
) -> None:
|
||||
if estimated_total is not None and estimated_total > 0 and is_queued is False:
|
||||
pe = processing_elapsed_seconds if processing_elapsed_seconds is not None else elapsed_seconds
|
||||
@ -475,8 +469,7 @@ def _display_time_progress(
|
||||
time_line = f"Time elapsed: {int(elapsed_seconds)}s (~{remaining}s remaining)"
|
||||
else:
|
||||
time_line = f"Time elapsed: {int(elapsed_seconds)}s"
|
||||
text = f"{time_line}\n\n{extra_text}" if extra_text else time_line
|
||||
_display_text(node_cls, text, status=status, price=price)
|
||||
_display_text(node_cls, time_line, status=status, price=price)
|
||||
|
||||
|
||||
async def _diagnose_connectivity() -> dict[str, bool]:
|
||||
|
||||
@ -3,136 +3,136 @@ from typing_extensions import override
|
||||
|
||||
import comfy.model_management
|
||||
import node_helpers
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from comfy_api.latest import ComfyExtension, IO
|
||||
|
||||
|
||||
class TextEncodeAceStepAudio(io.ComfyNode):
|
||||
class TextEncodeAceStepAudio(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
return IO.Schema(
|
||||
node_id="TextEncodeAceStepAudio",
|
||||
category="conditioning",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
IO.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
|
||||
],
|
||||
outputs=[io.Conditioning.Output()],
|
||||
outputs=[IO.Conditioning.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput:
|
||||
def execute(cls, clip, tags, lyrics, lyrics_strength) -> IO.NodeOutput:
|
||||
tokens = clip.tokenize(tags, lyrics=lyrics)
|
||||
conditioning = clip.encode_from_tokens_scheduled(tokens)
|
||||
conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
|
||||
return io.NodeOutput(conditioning)
|
||||
return IO.NodeOutput(conditioning)
|
||||
|
||||
class TextEncodeAceStepAudio15(io.ComfyNode):
|
||||
class TextEncodeAceStepAudio15(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
return IO.Schema(
|
||||
node_id="TextEncodeAceStepAudio1.5",
|
||||
category="conditioning",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
|
||||
io.Int.Input("bpm", default=120, min=10, max=300),
|
||||
io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
|
||||
io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
|
||||
io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
|
||||
io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
|
||||
io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
|
||||
io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
|
||||
io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
|
||||
io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
|
||||
io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
|
||||
io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
IO.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
|
||||
IO.Int.Input("bpm", default=120, min=10, max=300),
|
||||
IO.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
|
||||
IO.Combo.Input("timesignature", options=['2', '3', '4', '6']),
|
||||
IO.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
|
||||
IO.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
|
||||
IO.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
|
||||
IO.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
|
||||
IO.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
|
||||
IO.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
|
||||
IO.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
|
||||
IO.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
|
||||
],
|
||||
outputs=[io.Conditioning.Output()],
|
||||
outputs=[IO.Conditioning.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput:
|
||||
def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> IO.NodeOutput:
|
||||
tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p)
|
||||
conditioning = clip.encode_from_tokens_scheduled(tokens)
|
||||
return io.NodeOutput(conditioning)
|
||||
return IO.NodeOutput(conditioning)
|
||||
|
||||
|
||||
class EmptyAceStepLatentAudio(io.ComfyNode):
|
||||
class EmptyAceStepLatentAudio(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
return IO.Schema(
|
||||
node_id="EmptyAceStepLatentAudio",
|
||||
display_name="Empty Ace Step 1.0 Latent Audio",
|
||||
category="latent/audio",
|
||||
inputs=[
|
||||
io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
|
||||
io.Int.Input(
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
|
||||
IO.Int.Input(
|
||||
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
|
||||
),
|
||||
],
|
||||
outputs=[io.Latent.Output()],
|
||||
outputs=[IO.Latent.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, seconds, batch_size) -> io.NodeOutput:
|
||||
def execute(cls, seconds, batch_size) -> IO.NodeOutput:
|
||||
length = int(seconds * 44100 / 512 / 8)
|
||||
latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
return io.NodeOutput({"samples": latent, "type": "audio"})
|
||||
return IO.NodeOutput({"samples": latent, "type": "audio"})
|
||||
|
||||
|
||||
class EmptyAceStep15LatentAudio(io.ComfyNode):
|
||||
class EmptyAceStep15LatentAudio(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
return IO.Schema(
|
||||
node_id="EmptyAceStep1.5LatentAudio",
|
||||
display_name="Empty Ace Step 1.5 Latent Audio",
|
||||
category="latent/audio",
|
||||
inputs=[
|
||||
io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
|
||||
io.Int.Input(
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
|
||||
IO.Int.Input(
|
||||
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
|
||||
),
|
||||
],
|
||||
outputs=[io.Latent.Output()],
|
||||
outputs=[IO.Latent.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, seconds, batch_size) -> io.NodeOutput:
|
||||
def execute(cls, seconds, batch_size) -> IO.NodeOutput:
|
||||
length = round((seconds * 48000 / 1920))
|
||||
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
return io.NodeOutput({"samples": latent, "type": "audio"})
|
||||
return IO.NodeOutput({"samples": latent, "type": "audio"})
|
||||
|
||||
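For reference, the two latent-length formulas above work out as follows for a 120 s request (an illustrative check, not part of the nodes themselves).

seconds = 120.0
ace_10_length = int(seconds * 44100 / 512 / 8)    # EmptyAceStepLatentAudio    -> 1291
ace_15_length = round(seconds * 48000 / 1920)     # EmptyAceStep1.5LatentAudio -> 3000
assert (ace_10_length, ace_15_length) == (1291, 3000)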
class ReferenceAudio(io.ComfyNode):
|
||||
class ReferenceAudio(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
return IO.Schema(
|
||||
node_id="ReferenceTimbreAudio",
|
||||
display_name="Reference Audio",
|
||||
category="advanced/conditioning/audio",
|
||||
is_experimental=True,
|
||||
description="This node sets the reference audio for ace step 1.5",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Latent.Input("latent", optional=True),
|
||||
IO.Conditioning.Input("conditioning"),
|
||||
IO.Latent.Input("latent", optional=True),
|
||||
],
|
||||
outputs=[
|
||||
io.Conditioning.Output(),
|
||||
IO.Conditioning.Output(),
|
||||
]
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, conditioning, latent=None) -> io.NodeOutput:
|
||||
def execute(cls, conditioning, latent=None) -> IO.NodeOutput:
|
||||
if latent is not None:
|
||||
conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_audio_timbre_latents": [latent["samples"]]}, append=True)
|
||||
return io.NodeOutput(conditioning)
|
||||
return IO.NodeOutput(conditioning)
|
||||
|
||||
class AceExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
return [
|
||||
TextEncodeAceStepAudio,
|
||||
EmptyAceStepLatentAudio,
|
||||
|
||||
@ -104,7 +104,7 @@ def vae_decode_audio(vae, samples, tile=None, overlap=None):
|
||||
std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
|
||||
std[std < 1.0] = 1.0
|
||||
audio /= std
|
||||
vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
|
||||
vae_sample_rate = getattr(vae, "audio_sample_rate_output", getattr(vae, "audio_sample_rate", 44100))
|
||||
return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}
|
||||
|
||||
|
||||
|
||||
@ -3,9 +3,8 @@ import comfy.utils
|
||||
import comfy.model_management
|
||||
import torch
|
||||
|
||||
from comfy.ldm.lightricks.vae.audio_vae import AudioVAE
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
from comfy_extras.nodes_audio import VAEEncodeAudio
|
||||
|
||||
class LTXVAudioVAELoader(io.ComfyNode):
|
||||
@classmethod
|
||||
@ -28,10 +27,14 @@ class LTXVAudioVAELoader(io.ComfyNode):
|
||||
def execute(cls, ckpt_name: str) -> io.NodeOutput:
|
||||
ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
|
||||
sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
|
||||
return io.NodeOutput(AudioVAE(sd, metadata))
|
||||
sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder.", "vocoder.": "vocoder."}, filter_keys=True)
|
||||
vae = comfy.sd.VAE(sd=sd, metadata=metadata)
|
||||
vae.throw_exception_if_invalid()
|
||||
|
||||
return io.NodeOutput(vae)
|
||||
|
||||
|
||||
class LTXVAudioVAEEncode(io.ComfyNode):
|
||||
class LTXVAudioVAEEncode(VAEEncodeAudio):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
@ -50,15 +53,8 @@ class LTXVAudioVAEEncode(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, audio, audio_vae: AudioVAE) -> io.NodeOutput:
|
||||
audio_latents = audio_vae.encode(audio)
|
||||
return io.NodeOutput(
|
||||
{
|
||||
"samples": audio_latents,
|
||||
"sample_rate": int(audio_vae.sample_rate),
|
||||
"type": "audio",
|
||||
}
|
||||
)
|
||||
def execute(cls, audio, audio_vae) -> io.NodeOutput:
|
||||
return super().execute(audio_vae, audio)
|
||||
|
||||
|
||||
class LTXVAudioVAEDecode(io.ComfyNode):
|
||||
@ -80,12 +76,12 @@ class LTXVAudioVAEDecode(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, samples, audio_vae: AudioVAE) -> io.NodeOutput:
|
||||
def execute(cls, samples, audio_vae) -> io.NodeOutput:
|
||||
audio_latent = samples["samples"]
|
||||
if audio_latent.is_nested:
|
||||
audio_latent = audio_latent.unbind()[-1]
|
||||
audio = audio_vae.decode(audio_latent).to(audio_latent.device)
|
||||
output_audio_sample_rate = audio_vae.output_sample_rate
|
||||
audio = audio_vae.decode(audio_latent).movedim(-1, 1).to(audio_latent.device)
|
||||
output_audio_sample_rate = audio_vae.first_stage_model.output_sample_rate
|
||||
return io.NodeOutput(
|
||||
{
|
||||
"waveform": audio,
|
||||
@ -143,17 +139,17 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
|
||||
frames_number: int,
|
||||
frame_rate: int,
|
||||
batch_size: int,
|
||||
audio_vae: AudioVAE,
|
||||
audio_vae,
|
||||
) -> io.NodeOutput:
|
||||
"""Generate empty audio latents matching the reference pipeline structure."""
|
||||
|
||||
assert audio_vae is not None, "Audio VAE model is required"
|
||||
|
||||
z_channels = audio_vae.latent_channels
|
||||
audio_freq = audio_vae.latent_frequency_bins
|
||||
sampling_rate = int(audio_vae.sample_rate)
|
||||
audio_freq = audio_vae.first_stage_model.latent_frequency_bins
|
||||
sampling_rate = int(audio_vae.first_stage_model.sample_rate)
|
||||
|
||||
num_audio_latents = audio_vae.num_of_latents_from_frames(frames_number, frame_rate)
|
||||
num_audio_latents = audio_vae.first_stage_model.num_of_latents_from_frames(frames_number, frame_rate)
|
||||
|
||||
audio_latents = torch.zeros(
|
||||
(batch_size, z_channels, num_audio_latents, audio_freq),
|
||||
|
||||
@ -7,7 +7,10 @@ import comfy.model_management
|
||||
import comfy.ldm.common_dit
|
||||
import comfy.latent_formats
|
||||
import comfy.ldm.lumina.controlnet
|
||||
import comfy.ldm.supir.supir_modules
|
||||
from comfy.ldm.wan.model_multitalk import WanMultiTalkAttentionBlock, MultiTalkAudioProjModel
|
||||
from comfy_api.latest import io
|
||||
from comfy.ldm.supir.supir_patch import SUPIRPatch
|
||||
|
||||
|
||||
class BlockWiseControlBlock(torch.nn.Module):
|
||||
@ -266,6 +269,27 @@ class ModelPatchLoader:
|
||||
out_dim=sd["audio_proj.norm.weight"].shape[0],
|
||||
device=comfy.model_management.unet_offload_device(),
|
||||
operations=comfy.ops.manual_cast)
|
||||
elif 'model.control_model.input_hint_block.0.weight' in sd or 'control_model.input_hint_block.0.weight' in sd:
|
||||
prefix_replace = {}
|
||||
if 'model.control_model.input_hint_block.0.weight' in sd:
|
||||
prefix_replace["model.control_model."] = "control_model."
|
||||
prefix_replace["model.diffusion_model.project_modules."] = "project_modules."
|
||||
else:
|
||||
prefix_replace["control_model."] = "control_model."
|
||||
prefix_replace["project_modules."] = "project_modules."
|
||||
|
||||
# Extract denoise_encoder weights before filter_keys discards them
|
||||
de_prefix = "first_stage_model.denoise_encoder."
|
||||
denoise_encoder_sd = {}
|
||||
for k in list(sd.keys()):
|
||||
if k.startswith(de_prefix):
|
||||
denoise_encoder_sd[k[len(de_prefix):]] = sd.pop(k)
|
||||
|
||||
sd = comfy.utils.state_dict_prefix_replace(sd, prefix_replace, filter_keys=True)
|
||||
sd.pop("control_model.mask_LQ", None)
|
||||
model = comfy.ldm.supir.supir_modules.SUPIR(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
|
||||
if denoise_encoder_sd:
|
||||
model.denoise_encoder_sd = denoise_encoder_sd
|
||||
|
||||
model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device())
|
||||
model.load_state_dict(sd, assign=model_patcher.is_dynamic())
|
||||
@ -565,9 +589,89 @@ class MultiTalkModelPatch(torch.nn.Module):
|
||||
)
|
||||
|
||||
|
||||
class SUPIRApply(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="SUPIRApply",
|
||||
category="model_patches/supir",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.ModelPatch.Input("model_patch"),
|
||||
io.Vae.Input("vae"),
|
||||
io.Image.Input("image"),
|
||||
io.Float.Input("strength_start", default=1.0, min=0.0, max=10.0, step=0.01,
|
||||
tooltip="Control strength at the start of sampling (high sigma)."),
|
||||
io.Float.Input("strength_end", default=1.0, min=0.0, max=10.0, step=0.01,
|
||||
tooltip="Control strength at the end of sampling (low sigma). Linearly interpolated from start."),
|
||||
io.Float.Input("restore_cfg", default=4.0, min=0.0, max=20.0, step=0.1, advanced=True,
|
||||
tooltip="Pulls denoised output toward the input latent. Higher = stronger fidelity to input. 0 to disable."),
|
||||
io.Float.Input("restore_cfg_s_tmin", default=0.05, min=0.0, max=1.0, step=0.01, advanced=True,
|
||||
tooltip="Sigma threshold below which restore_cfg is disabled."),
|
||||
],
|
||||
outputs=[io.Model.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _encode_with_denoise_encoder(cls, vae, model_patch, image):
|
||||
"""Encode using denoise_encoder weights from SUPIR checkpoint if available."""
|
||||
denoise_sd = getattr(model_patch.model, 'denoise_encoder_sd', None)
|
||||
if not denoise_sd:
|
||||
return vae.encode(image)
|
||||
|
||||
# Clone VAE patcher, apply denoise_encoder weights to clone, encode
|
||||
orig_patcher = vae.patcher
|
||||
vae.patcher = orig_patcher.clone()
|
||||
patches = {f"encoder.{k}": (v,) for k, v in denoise_sd.items()}
|
||||
vae.patcher.add_patches(patches, strength_patch=1.0, strength_model=0.0)
|
||||
try:
|
||||
return vae.encode(image)
|
||||
finally:
|
||||
vae.patcher = orig_patcher
|
||||
|
||||
@classmethod
|
||||
def execute(cls, *, model: io.Model.Type, model_patch: io.ModelPatch.Type, vae: io.Vae.Type, image: io.Image.Type,
|
||||
strength_start: float, strength_end: float, restore_cfg: float, restore_cfg_s_tmin: float) -> io.NodeOutput:
|
||||
model_patched = model.clone()
|
||||
hint_latent = model.get_model_object("latent_format").process_in(
|
||||
cls._encode_with_denoise_encoder(vae, model_patch, image[:, :, :, :3]))
|
||||
patch = SUPIRPatch(model_patch, model_patch.model.project_modules, hint_latent, strength_start, strength_end)
|
||||
patch.register(model_patched)
|
||||
|
||||
if restore_cfg > 0.0:
|
||||
# Round-trip to match original pipeline: decode hint, re-encode with regular VAE
|
||||
latent_format = model.get_model_object("latent_format")
|
||||
decoded = vae.decode(latent_format.process_out(hint_latent))
|
||||
x_center = latent_format.process_in(vae.encode(decoded[:, :, :, :3]))
|
||||
sigma_max = 14.6146
|
||||
|
||||
def restore_cfg_function(args):
|
||||
denoised = args["denoised"]
|
||||
sigma = args["sigma"]
|
||||
if sigma.dim() > 0:
|
||||
s = sigma[0].item()
|
||||
else:
|
||||
s = sigma.item()
|
||||
if s > restore_cfg_s_tmin:
|
||||
ref = x_center.to(device=denoised.device, dtype=denoised.dtype)
|
||||
b = denoised.shape[0]
|
||||
if ref.shape[0] != b:
|
||||
ref = ref.expand(b, -1, -1, -1) if ref.shape[0] == 1 else ref.repeat((b + ref.shape[0] - 1) // ref.shape[0], 1, 1, 1)[:b]
|
||||
sigma_val = sigma.view(-1, 1, 1, 1) if sigma.dim() > 0 else sigma
|
||||
d_center = denoised - ref
|
||||
denoised = denoised - d_center * ((sigma_val / sigma_max) ** restore_cfg)
|
||||
return denoised
|
||||
|
||||
model_patched.set_model_sampler_post_cfg_function(restore_cfg_function)
|
||||
|
||||
return io.NodeOutput(model_patched)
|
||||
|
||||
|
||||
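The restore_cfg hook above blends the model output back toward the re-encoded input with a sigma-dependent weight. A minimal sketch of that blend on plain tensors, assuming the sigma_max value hard-coded in SUPIRApply; the other numbers are arbitrary examples.

import torch

sigma_max, restore_cfg = 14.6146, 4.0
sigma = torch.tensor(7.0)
denoised = torch.randn(1, 4, 8, 8)
x_center = torch.randn(1, 4, 8, 8)   # stand-in for the re-encoded hint latent

w = (sigma / sigma_max) ** restore_cfg            # ~0.053 at mid sigma, approaches 1 near sigma_max
blended = denoised - (denoised - x_center) * w    # pulls denoised toward x_center early in sampling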
NODE_CLASS_MAPPINGS = {
|
||||
"ModelPatchLoader": ModelPatchLoader,
|
||||
"QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet,
|
||||
"ZImageFunControlnet": ZImageFunControlnet,
|
||||
"USOStyleReference": USOStyleReference,
|
||||
"SUPIRApply": SUPIRApply,
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@ from PIL import Image
|
||||
import math
|
||||
from enum import Enum
|
||||
from typing import TypedDict, Literal
|
||||
import kornia
|
||||
|
||||
import comfy.utils
|
||||
import comfy.model_management
|
||||
@ -660,6 +661,228 @@ class BatchImagesMasksLatentsNode(io.ComfyNode):
|
||||
return io.NodeOutput(batched)
|
||||
|
||||
|
||||
class ColorTransfer(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ColorTransfer",
|
||||
category="image/postprocessing",
|
||||
description="Match the colors of one image to another using various algorithms.",
|
||||
search_aliases=["color match", "color grading", "color correction", "match colors", "color transform", "mkl", "reinhard", "histogram"],
|
||||
inputs=[
|
||||
io.Image.Input("image_target", tooltip="Image(s) to apply the color transform to."),
|
||||
io.Image.Input("image_ref", optional=True, tooltip="Reference image(s) to match colors to. If not provided, processing is skipped"),
|
||||
io.Combo.Input("method", options=['reinhard_lab', 'mkl_lab', 'histogram'],),
|
||||
io.DynamicCombo.Input("source_stats",
|
||||
tooltip="per_frame: each frame matched to image_ref individually. uniform: pool stats across all source frames as baseline, match to image_ref. target_frame: use one chosen frame as the baseline for the transform to image_ref, applied uniformly to all frames (preserves relative differences)",
|
||||
options=[
|
||||
io.DynamicCombo.Option("per_frame", []),
|
||||
io.DynamicCombo.Option("uniform", []),
|
||||
io.DynamicCombo.Option("target_frame", [
|
||||
io.Int.Input("target_index", default=0, min=0, max=10000,
|
||||
tooltip="Frame index used as the source baseline for computing the transform to image_ref"),
|
||||
]),
|
||||
]),
|
||||
io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output(display_name="image"),
|
||||
],
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _to_lab(images, i, device):
|
||||
return kornia.color.rgb_to_lab(
|
||||
images[i:i+1].to(device, dtype=torch.float32).permute(0, 3, 1, 2))
|
||||
|
||||
@staticmethod
|
||||
def _pool_stats(images, device, is_reinhard, eps):
|
||||
"""Two-pass pooled mean + std/cov across all frames."""
|
||||
N, C = images.shape[0], images.shape[3]
|
||||
HW = images.shape[1] * images.shape[2]
|
||||
mean = torch.zeros(C, 1, device=device, dtype=torch.float32)
|
||||
for i in range(N):
|
||||
mean += ColorTransfer._to_lab(images, i, device).view(C, -1).mean(dim=-1, keepdim=True)
|
||||
mean /= N
|
||||
acc = torch.zeros(C, 1 if is_reinhard else C, device=device, dtype=torch.float32)
|
||||
for i in range(N):
|
||||
centered = ColorTransfer._to_lab(images, i, device).view(C, -1) - mean
|
||||
if is_reinhard:
|
||||
acc += (centered * centered).mean(dim=-1, keepdim=True)
|
||||
else:
|
||||
acc += centered @ centered.T / HW
|
||||
if is_reinhard:
|
||||
return mean, torch.sqrt(acc / N).clamp_min_(eps)
|
||||
return mean, acc / N
|
||||
|
||||
@staticmethod
|
||||
def _frame_stats(lab_flat, hw, is_reinhard, eps):
|
||||
"""Per-frame mean + std/cov."""
|
||||
mean = lab_flat.mean(dim=-1, keepdim=True)
|
||||
if is_reinhard:
|
||||
return mean, lab_flat.std(dim=-1, keepdim=True, unbiased=False).clamp_min_(eps)
|
||||
centered = lab_flat - mean
|
||||
return mean, centered @ centered.T / hw
|
||||
|
||||
@staticmethod
|
||||
def _mkl_matrix(cov_s, cov_r, eps):
|
||||
"""Compute MKL 3x3 transform matrix from source and ref covariances."""
|
||||
eig_val_s, eig_vec_s = torch.linalg.eigh(cov_s)
|
||||
sqrt_val_s = torch.sqrt(eig_val_s.clamp_min(0)).clamp_min_(eps)
|
||||
|
||||
scaled_V = eig_vec_s * sqrt_val_s.unsqueeze(0)
|
||||
mid = scaled_V.T @ cov_r @ scaled_V
|
||||
eig_val_m, eig_vec_m = torch.linalg.eigh(mid)
|
||||
sqrt_m = torch.sqrt(eig_val_m.clamp_min(0))
|
||||
|
||||
inv_sqrt_s = 1.0 / sqrt_val_s
|
||||
inv_scaled_V = eig_vec_s * inv_sqrt_s.unsqueeze(0)
|
||||
M_half = (eig_vec_m * sqrt_m.unsqueeze(0)) @ eig_vec_m.T
|
||||
return inv_scaled_V @ M_half @ inv_scaled_V.T
|
||||
|
||||
@staticmethod
|
||||
def _histogram_lut(src, ref, bins=256):
|
||||
"""Build per-channel LUT from source and ref histograms. src/ref: (C, HW) in [0,1]."""
|
||||
s_bins = (src * (bins - 1)).long().clamp(0, bins - 1)
|
||||
r_bins = (ref * (bins - 1)).long().clamp(0, bins - 1)
|
||||
s_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
|
||||
r_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
|
||||
ones_s = torch.ones_like(src)
|
||||
ones_r = torch.ones_like(ref)
|
||||
s_hist.scatter_add_(1, s_bins, ones_s)
|
||||
r_hist.scatter_add_(1, r_bins, ones_r)
|
||||
s_cdf = s_hist.cumsum(1)
|
||||
s_cdf = s_cdf / s_cdf[:, -1:]
|
||||
r_cdf = r_hist.cumsum(1)
|
||||
r_cdf = r_cdf / r_cdf[:, -1:]
|
||||
return torch.searchsorted(r_cdf, s_cdf).clamp_max_(bins - 1).float() / (bins - 1)
|
||||
|
||||
@classmethod
|
||||
def _pooled_cdf(cls, images, device, num_bins=256):
|
||||
"""Build pooled CDF across all frames, one frame at a time."""
|
||||
C = images.shape[3]
|
||||
hist = torch.zeros(C, num_bins, device=device, dtype=torch.float32)
|
||||
for i in range(images.shape[0]):
|
||||
frame = images[i].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
|
||||
bins = (frame * (num_bins - 1)).long().clamp(0, num_bins - 1)
|
||||
hist.scatter_add_(1, bins, torch.ones_like(frame))
|
||||
cdf = hist.cumsum(1)
|
||||
return cdf / cdf[:, -1:]
|
||||
|
||||
@classmethod
|
||||
def _build_histogram_transform(cls, image_target, image_ref, device, stats_mode, target_index, B):
|
||||
"""Build per-frame or uniform LUT transform for histogram mode."""
|
||||
if stats_mode == 'per_frame':
|
||||
return None # LUT computed per-frame in the apply loop
|
||||
|
||||
r_cdf = cls._pooled_cdf(image_ref, device)
|
||||
if stats_mode == 'target_frame':
|
||||
ti = min(target_index, B - 1)
|
||||
s_cdf = cls._pooled_cdf(image_target[ti:ti+1], device)
|
||||
else:
|
||||
s_cdf = cls._pooled_cdf(image_target, device)
|
||||
return torch.searchsorted(r_cdf, s_cdf).clamp_max_(255).float() / 255.0
|
||||
|
||||
@classmethod
|
||||
def _build_lab_transform(cls, image_target, image_ref, device, stats_mode, target_index, is_reinhard):
|
||||
"""Build transform parameters for Lab-based methods. Returns a transform function."""
|
||||
eps = 1e-6
|
||||
B, H, W, C = image_target.shape
|
||||
B_ref = image_ref.shape[0]
|
||||
single_ref = B_ref == 1
|
||||
HW = H * W
|
||||
HW_ref = image_ref.shape[1] * image_ref.shape[2]
|
||||
|
||||
# Precompute ref stats
|
||||
if single_ref or stats_mode in ('uniform', 'target_frame'):
|
||||
ref_mean, ref_sc = cls._pool_stats(image_ref, device, is_reinhard, eps)
|
||||
|
||||
# Uniform/target_frame: precompute single affine transform
|
||||
if stats_mode in ('uniform', 'target_frame'):
|
||||
if stats_mode == 'target_frame':
|
||||
ti = min(target_index, B - 1)
|
||||
s_lab = cls._to_lab(image_target, ti, device).view(C, -1)
|
||||
s_mean, s_sc = cls._frame_stats(s_lab, HW, is_reinhard, eps)
|
||||
else:
|
||||
s_mean, s_sc = cls._pool_stats(image_target, device, is_reinhard, eps)
|
||||
|
||||
if is_reinhard:
|
||||
scale = ref_sc / s_sc
|
||||
offset = ref_mean - scale * s_mean
|
||||
return lambda src_flat, **_: src_flat * scale + offset
|
||||
T = cls._mkl_matrix(s_sc, ref_sc, eps)
|
||||
offset = ref_mean - T @ s_mean
|
||||
return lambda src_flat, **_: T @ src_flat + offset
|
||||
|
||||
# per_frame
|
||||
def per_frame_transform(src_flat, frame_idx):
|
||||
s_mean, s_sc = cls._frame_stats(src_flat, HW, is_reinhard, eps)
|
||||
|
||||
if single_ref:
|
||||
r_mean, r_sc = ref_mean, ref_sc
|
||||
else:
|
||||
ri = min(frame_idx, B_ref - 1)
|
||||
r_mean, r_sc = cls._frame_stats(cls._to_lab(image_ref, ri, device).view(C, -1), HW_ref, is_reinhard, eps)
|
||||
|
||||
centered = src_flat - s_mean
|
||||
if is_reinhard:
|
||||
return centered * (r_sc / s_sc) + r_mean
|
||||
T = cls._mkl_matrix(centered @ centered.T / HW, r_sc, eps)
|
||||
return T @ centered + r_mean
|
||||
|
||||
return per_frame_transform
|
||||
|
||||
@classmethod
|
||||
def execute(cls, image_target, image_ref, method, source_stats, strength=1.0) -> io.NodeOutput:
|
||||
stats_mode = source_stats["source_stats"]
|
||||
target_index = source_stats.get("target_index", 0)
|
||||
|
||||
if strength == 0 or image_ref is None:
|
||||
return io.NodeOutput(image_target)
|
||||
|
||||
device = comfy.model_management.get_torch_device()
|
||||
intermediate_device = comfy.model_management.intermediate_device()
|
||||
intermediate_dtype = comfy.model_management.intermediate_dtype()
|
||||
|
||||
B, H, W, C = image_target.shape
|
||||
B_ref = image_ref.shape[0]
|
||||
pbar = comfy.utils.ProgressBar(B)
|
||||
out = torch.empty(B, H, W, C, device=intermediate_device, dtype=intermediate_dtype)
|
||||
|
||||
if method == 'histogram':
|
||||
uniform_lut = cls._build_histogram_transform(
|
||||
image_target, image_ref, device, stats_mode, target_index, B)
|
||||
|
||||
for i in range(B):
|
||||
src = image_target[i].to(device, dtype=torch.float32).permute(2, 0, 1)
|
||||
src_flat = src.reshape(C, -1)
|
||||
if uniform_lut is not None:
|
||||
lut = uniform_lut
|
||||
else:
|
||||
ri = min(i, B_ref - 1)
|
||||
ref = image_ref[ri].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
|
||||
lut = cls._histogram_lut(src_flat, ref)
|
||||
bin_idx = (src_flat * 255).long().clamp(0, 255)
|
||||
matched = lut.gather(1, bin_idx).view(C, H, W)
|
||||
result = matched if strength == 1.0 else torch.lerp(src, matched, strength)
|
||||
out[i] = result.permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
|
||||
pbar.update(1)
|
||||
else:
|
||||
transform = cls._build_lab_transform(image_target, image_ref, device, stats_mode, target_index, is_reinhard=method == "reinhard_lab")
|
||||
|
||||
for i in range(B):
|
||||
src_frame = cls._to_lab(image_target, i, device)
|
||||
corrected = transform(src_frame.view(C, -1), frame_idx=i)
|
||||
if strength == 1.0:
|
||||
result = kornia.color.lab_to_rgb(corrected.view(1, C, H, W))
|
||||
else:
|
||||
result = kornia.color.lab_to_rgb(torch.lerp(src_frame, corrected.view(1, C, H, W), strength))
|
||||
out[i] = result.squeeze(0).permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
|
||||
pbar.update(1)
|
||||
|
||||
return io.NodeOutput(out)
|
||||
|
||||
|
||||
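To see the CDF-matching idea behind _histogram_lut and _pooled_cdf in isolation, here is a toy single-channel sketch; it is illustrative only and does not use the node itself.

import torch

bins = 256
src = torch.rand(1, 4096)               # flattened source channel in [0, 1]
ref = torch.rand(1, 4096) * 0.5 + 0.5   # reference channel biased toward bright values

def channel_cdf(x):
    idx = (x * (bins - 1)).long().clamp(0, bins - 1)
    hist = torch.zeros(x.shape[0], bins).scatter_add_(1, idx, torch.ones_like(x))
    cdf = hist.cumsum(1)
    return cdf / cdf[:, -1:]

lut = torch.searchsorted(channel_cdf(ref), channel_cdf(src)).clamp_max_(bins - 1).float() / (bins - 1)
matched = lut.gather(1, (src * (bins - 1)).long().clamp(0, bins - 1))
# matched now approximately follows the brightness distribution of ref, as in the histogram method above.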
class PostProcessingExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
@ -673,6 +896,7 @@ class PostProcessingExtension(ComfyExtension):
|
||||
BatchImagesNode,
|
||||
BatchMasksNode,
|
||||
BatchLatentsNode,
|
||||
ColorTransfer,
|
||||
# BatchImagesMasksLatentsNode,
|
||||
]
|
||||
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.19.5"
|
||||
__version__ = "0.19.3"
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.19.5"
|
||||
version = "0.19.3"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.10"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.42.14
|
||||
comfyui-workflow-templates==0.9.61
|
||||
comfyui-workflow-templates==0.9.57
|
||||
comfyui-embedded-docs==0.4.3
|
||||
torch
|
||||
torchsde
|
||||
@ -19,7 +19,7 @@ scipy
|
||||
tqdm
|
||||
psutil
|
||||
alembic
|
||||
SQLAlchemy
|
||||
SQLAlchemy>=2.0
|
||||
filelock
|
||||
av>=14.2.0
|
||||
comfy-kitchen>=0.2.8
|
||||
|
||||
246
tests-unit/comfy_api_test/openai_nodes_test.py
Normal file
@ -0,0 +1,246 @@
|
||||
import pytest
|
||||
|
||||
from comfy_api_nodes.nodes_openai import (
|
||||
OpenAIGPTImage1,
|
||||
OpenAIGPTImage2,
|
||||
_GPT_IMAGE_2_SIZES,
|
||||
_resolve_gpt_image_2_size,
|
||||
calculate_tokens_price_image_1,
|
||||
calculate_tokens_price_image_1_5,
|
||||
calculate_tokens_price_image_2,
|
||||
)
|
||||
from comfy_api_nodes.apis.openai import OpenAIImageGenerationResponse, Usage
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_response(input_tokens: int, output_tokens: int) -> OpenAIImageGenerationResponse:
|
||||
return OpenAIImageGenerationResponse(
|
||||
data=[],
|
||||
usage=Usage(input_tokens=input_tokens, output_tokens=output_tokens),
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Price extractor tests
# ---------------------------------------------------------------------------

def test_price_image_1_formula():
    response = _make_response(input_tokens=1_000_000, output_tokens=1_000_000)
    assert calculate_tokens_price_image_1(response) == pytest.approx(50.0)


def test_price_image_1_5_formula():
    response = _make_response(input_tokens=1_000_000, output_tokens=1_000_000)
    assert calculate_tokens_price_image_1_5(response) == pytest.approx(40.0)


def test_price_image_2_formula():
    response = _make_response(input_tokens=1_000_000, output_tokens=1_000_000)
    assert calculate_tokens_price_image_2(response) == pytest.approx(38.0)


def test_price_image_2_cheaper_than_1():
    response = _make_response(input_tokens=500, output_tokens=196)
    assert calculate_tokens_price_image_2(response) < calculate_tokens_price_image_1(response)


def test_price_image_2_cheaper_output_than_1_5():
    # gpt-image-2 output rate ($30/1M) is lower than gpt-image-1.5 ($32/1M)
    response = _make_response(input_tokens=0, output_tokens=1_000_000)
    assert calculate_tokens_price_image_2(response) < calculate_tokens_price_image_1_5(response)


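The three formula tests above fix the totals at 1,000,000 input plus 1,000,000 output tokens ($50, $40 and $38), and the comment in the last test gives the per-1M output rates for gpt-image-1.5 ($32) and gpt-image-2 ($30). A minimal sketch of a price helper consistent with those numbers follows; the rate table, the input/output split, and the names used are assumptions made for illustration, not the implementation in nodes_openai.py.

# Illustrative sketch only; rates are inferred from the tests above.
_SKETCH_RATES_PER_1M_USD = {
    "gpt-image-1": (10.0, 40.0),   # (input, output): 10 + 40 = 50 at 1M/1M
    "gpt-image-1.5": (8.0, 32.0),  # 8 + 32 = 40 at 1M/1M
    "gpt-image-2": (8.0, 30.0),    # 8 + 30 = 38 at 1M/1M
}

def _sketch_tokens_price(model: str, input_tokens: int, output_tokens: int) -> float:
    input_rate, output_rate = _SKETCH_RATES_PER_1M_USD[model]
    return (input_tokens * input_rate + output_tokens * output_rate) / 1_000_000

Under these assumed rates the comparison tests also hold, e.g. 500 input and 196 output tokens cost $0.00988 on gpt-image-2 versus $0.01284 on gpt-image-1.
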
# ---------------------------------------------------------------------------
# _resolve_gpt_image_2_size tests
# ---------------------------------------------------------------------------

def test_resolve_preset_passthrough_when_custom_zero():
    # 0/0 means "use size preset"
    assert _resolve_gpt_image_2_size("1024x1024", 0, 0) == "1024x1024"
    assert _resolve_gpt_image_2_size("auto", 0, 0) == "auto"
    assert _resolve_gpt_image_2_size("3840x2160", 0, 0) == "3840x2160"


def test_resolve_preset_passthrough_when_only_one_dim_set():
    # only one dimension set → still use preset
    assert _resolve_gpt_image_2_size("auto", 1024, 0) == "auto"
    assert _resolve_gpt_image_2_size("auto", 0, 1024) == "auto"


def test_resolve_custom_overrides_preset():
    assert _resolve_gpt_image_2_size("auto", 1024, 1024) == "1024x1024"
    assert _resolve_gpt_image_2_size("1024x1024", 2048, 1152) == "2048x1152"
    assert _resolve_gpt_image_2_size("auto", 3840, 2160) == "3840x2160"


def test_resolve_custom_rejects_edge_too_large():
    with pytest.raises(ValueError, match="3840"):
        _resolve_gpt_image_2_size("auto", 4096, 1024)


def test_resolve_custom_rejects_non_multiple_of_16():
    with pytest.raises(ValueError, match="multiple of 16"):
        _resolve_gpt_image_2_size("auto", 1025, 1024)


def test_resolve_custom_rejects_bad_ratio():
    with pytest.raises(ValueError, match="ratio"):
        _resolve_gpt_image_2_size("auto", 3840, 1024)  # 3.75:1 > 3:1


def test_resolve_custom_rejects_too_few_pixels():
    with pytest.raises(ValueError, match="Total pixels"):
        _resolve_gpt_image_2_size("auto", 16, 16)


def test_resolve_custom_rejects_too_many_pixels():
    # 3840x2176 exceeds 8,294,400
    with pytest.raises(ValueError, match="Total pixels"):
        _resolve_gpt_image_2_size("auto", 3840, 2176)


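Read together, the rejection tests above imply the validation rules for a custom size: both dimensions set, each a multiple of 16, no edge above 3840, an aspect ratio of at most 3:1, and a total pixel count within an allowed range whose upper bound is 3840x2160 = 8,294,400 (the lower bound is not pinned down by the tests). A rough sketch of logic consistent with those rules follows; the function name, error messages, and the assumed 1024x1024 minimum are illustrative, not the code under test.

# Illustrative sketch only; not the implementation of _resolve_gpt_image_2_size.
def _sketch_resolve_size(size_preset: str, custom_width: int, custom_height: int) -> str:
    if custom_width <= 0 or custom_height <= 0:
        return size_preset  # 0/0 (or only one dimension set): fall back to the preset
    if custom_width % 16 or custom_height % 16:
        raise ValueError("Custom dimensions must be a multiple of 16")
    if max(custom_width, custom_height) > 3840:
        raise ValueError("Custom dimensions must not exceed 3840 per edge")
    if max(custom_width, custom_height) / min(custom_width, custom_height) > 3:
        raise ValueError("Aspect ratio must not exceed 3:1")
    total = custom_width * custom_height
    if not (1024 * 1024 <= total <= 3840 * 2160):  # assumed lower bound
        raise ValueError(f"Total pixels {total} outside the supported range")
    return f"{custom_width}x{custom_height}"
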
# ---------------------------------------------------------------------------
# OpenAIGPTImage1 schema tests
# ---------------------------------------------------------------------------

class TestOpenAIGPTImage1Schema:
    def setup_method(self):
        self.schema = OpenAIGPTImage1.define_schema()

    def test_node_id(self):
        assert self.schema.node_id == "OpenAIGPTImage1"

    def test_display_name(self):
        assert self.schema.display_name == "OpenAI GPT Image 1 & 1.5"

    def test_model_options_exclude_gpt_image_2(self):
        model_input = next(i for i in self.schema.inputs if i.name == "model")
        assert "gpt-image-2" not in model_input.options

    def test_model_options_include_legacy_models(self):
        model_input = next(i for i in self.schema.inputs if i.name == "model")
        assert "gpt-image-1" in model_input.options
        assert "gpt-image-1.5" in model_input.options

    def test_has_background_with_transparent(self):
        bg_input = next(i for i in self.schema.inputs if i.name == "background")
        assert "transparent" in bg_input.options


# ---------------------------------------------------------------------------
# OpenAIGPTImage2 schema tests
# ---------------------------------------------------------------------------

class TestOpenAIGPTImage2Schema:
    def setup_method(self):
        self.schema = OpenAIGPTImage2.define_schema()

    def test_node_id(self):
        assert self.schema.node_id == "OpenAIGPTImage2"

    def test_display_name(self):
        assert self.schema.display_name == "OpenAI GPT Image 2"

    def test_category(self):
        assert "OpenAI" in self.schema.category

    def test_no_transparent_background(self):
        bg_input = next(i for i in self.schema.inputs if i.name == "background")
        assert "transparent" not in bg_input.options

    def test_background_options(self):
        bg_input = next(i for i in self.schema.inputs if i.name == "background")
        assert set(bg_input.options) == {"auto", "opaque"}

    def test_quality_options(self):
        quality_input = next(i for i in self.schema.inputs if i.name == "quality")
        assert set(quality_input.options) == {"auto", "low", "medium", "high"}

    def test_quality_default_is_auto(self):
        quality_input = next(i for i in self.schema.inputs if i.name == "quality")
        assert quality_input.default == "auto"

    def test_all_popular_sizes_present(self):
        size_input = next(i for i in self.schema.inputs if i.name == "size")
        for size in ["1024x1024", "1536x1024", "1024x1536", "2048x2048", "2048x1152", "3840x2160", "2160x3840"]:
            assert size in size_input.options, f"Missing size: {size}"

    def test_no_custom_size_option(self):
        size_input = next(i for i in self.schema.inputs if i.name == "size")
        assert "custom" not in size_input.options

    def test_size_default_is_auto(self):
        size_input = next(i for i in self.schema.inputs if i.name == "size")
        assert size_input.default == "auto"

    def test_custom_width_and_height_inputs_exist(self):
        input_names = [i.name for i in self.schema.inputs]
        assert "custom_width" in input_names
        assert "custom_height" in input_names

    def test_custom_width_height_default_zero(self):
        width_input = next(i for i in self.schema.inputs if i.name == "custom_width")
        height_input = next(i for i in self.schema.inputs if i.name == "custom_height")
        assert width_input.default == 0
        assert height_input.default == 0

    def test_custom_width_height_step_is_16(self):
        width_input = next(i for i in self.schema.inputs if i.name == "custom_width")
        height_input = next(i for i in self.schema.inputs if i.name == "custom_height")
        assert width_input.step == 16
        assert height_input.step == 16

    def test_custom_width_height_max_is_3840(self):
        width_input = next(i for i in self.schema.inputs if i.name == "custom_width")
        height_input = next(i for i in self.schema.inputs if i.name == "custom_height")
        assert width_input.max == 3840
        assert height_input.max == 3840

    def test_uses_num_images_not_n(self):
        input_names = [i.name for i in self.schema.inputs]
        assert "num_images" in input_names
        assert "n" not in input_names

    def test_model_input_shows_gpt_image_2(self):
        model_input = next(i for i in self.schema.inputs if i.name == "model")
        assert model_input.options == ["gpt-image-2"]
        assert model_input.default == "gpt-image-2"

    def test_has_image_and_mask_inputs(self):
        input_names = [i.name for i in self.schema.inputs]
        assert "image" in input_names
        assert "mask" in input_names

    def test_is_api_node(self):
        assert self.schema.is_api_node is True

    def test_sizes_match_constant(self):
        size_input = next(i for i in self.schema.inputs if i.name == "size")
        assert size_input.options == _GPT_IMAGE_2_SIZES


# ---------------------------------------------------------------------------
# OpenAIGPTImage2 execute validation tests
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_execute_raises_on_empty_prompt():
    with pytest.raises(Exception):
        await OpenAIGPTImage2.execute(prompt=" ")


@pytest.mark.asyncio
async def test_execute_raises_mask_without_image():
    import torch
    mask = torch.ones(1, 64, 64)
    with pytest.raises(ValueError, match="mask without an input image"):
        await OpenAIGPTImage2.execute(prompt="test", mask=mask)


@pytest.mark.asyncio
async def test_execute_raises_invalid_custom_size():
    with pytest.raises(ValueError):
        await OpenAIGPTImage2.execute(prompt="test", custom_width=4096, custom_height=1024)