Compare commits

...

15 Commits

Author SHA1 Message Date
699659c06e feat: add timestamp to default filename_prefix for cache-busting
Change default filename_prefix on all previewable save nodes (image, video,
audio, 3D, SVG) from 'ComfyUI' to 'ComfyUI_%year%%month%%day%-%hour%%minute%%second%'.

This leverages the existing compute_vars template system in
get_save_image_path — zero new backend code needed. Each output gets a
unique filename per second, preventing browser cache from showing stale
previews when files are overwritten.

Users can customize or remove the template from the node widget.
Existing workflows retain their saved prefix value (only new nodes
get the new default). Custom nodes are unaffected — they define their
own defaults independently.
2026-02-28 04:36:00 -08:00
95e1059661 fix(ace15): handle missing lm_metadata in memory estimation during checkpoint export #12669 (#12686) 2026-02-28 01:18:40 -05:00
80d49441e5 refactor: use AspectRatio enum members as ASPECT_RATIOS dict keys (#12689)
Amp-Thread-ID: https://ampcode.com/threads/T-019ca1cb-0150-7549-8b1b-6713060d3408

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
2026-02-27 20:53:46 -08:00
9d0e114ee3 PyOpenGL-accelerate is not necessary. (#12692) 2026-02-27 23:34:58 -05:00
ac4412d0fa Native LongCat-Image implementation (#12597) 2026-02-27 23:04:34 -05:00
94f1a1cc9d Limit overlap in image tile and combine nodes to prevent issues. (#12688) 2026-02-27 20:16:24 -05:00
e721e24136 ops: implement lora requanting for non QuantizedTensor fp8 (#12668)
Allow non QuantizedTensor layer to set want_requant to get the post lora
calculation stochastic cast down to the original input dtype.

This is then used by the legacy fp8 Linear implementation to set the
compute_dtype to the preferred lora dtype but then want_requant it back
down to fp8.

This fixes the issue when --fast fp8_matrix_mult is combined with
--fast dynamic_vram while applying a lora on an fp8 (non-QuantizedTensor) model.
2026-02-27 19:05:51 -05:00
25ec3d96a3 Class WanVAE, def encode, feat_map is using self.decoder instead of self.encoder (#12682) 2026-02-27 19:03:45 -05:00
1f1ec377ce feat: add ResolutionSelector node for aspect ratio and megapixel-based resolution calculation (#12199)
Amp-Thread-ID: https://ampcode.com/threads/T-019c179e-cd8c-768f-ae66-207c7a53c01d

Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
2026-02-27 09:13:57 -08:00
0a7f8e11b6 fix torch.cat requiring inputs to all be same dimensions (#12673) 2026-02-27 08:13:24 -08:00
35e9fce775 Enable Pytorch Attention for gfx950 (#12641) 2026-02-26 20:16:12 -05:00
c7f7d52b68 feat: Support SDPose-OOD (#12661) 2026-02-26 19:59:05 -05:00
08b26ed7c2 bug_report template: Push harder for logs (#12657)
We get a lot of bug reports without logs, especially for performance
issues.
2026-02-26 18:59:24 -05:00
b233dbe0bc feat(ace-step): add ACE-Step 1.5 lycoris key alias mapping for LoKR #12638 (#12665) 2026-02-26 18:19:19 -05:00
3811780e4f Portable with cu128 isn't useful anymore. (#12666)
Users should either use the cu126 one or the regular one (cu130 at the moment)

The cu128 portable is still included in the latest github release but I will stop including it as soon as it becomes slightly annoying to deal with. This might happen as soon as next week.
2026-02-26 17:12:29 -05:00
25 changed files with 1369 additions and 73 deletions

View File

@ -16,7 +16,7 @@ body:
## Very Important
Please make sure that you post ALL your ComfyUI logs in the bug report. A bug report without logs will likely be ignored.
Please make sure that you post ALL your ComfyUI logs in the bug report **even if there is no crash**. Just paste everything. The startup log (everything before "To see the GUI go to: ...") contains critical information to developers trying to help. For a performance issue or crash, paste everything from "got prompt" to the end, including the crash. More is better - always. A bug report without logs will likely be ignored.
- type: checkboxes
id: custom-nodes-test
attributes:

View File

@ -189,8 +189,6 @@ The portable above currently comes with python 3.13 and pytorch cuda 13.0. Updat
[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z).
[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
#### How do I share models between another UI and ComfyUI?

View File

@ -18,6 +18,8 @@ import comfy.patcher_extension
import comfy.ops
ops = comfy.ops.disable_weight_init
from ..sdpose import HeatmapHead
class TimestepBlock(nn.Module):
"""
Any module where forward() takes timestep embeddings as a second argument.
@ -441,6 +443,7 @@ class UNetModel(nn.Module):
disable_temporal_crossattention=False,
max_ddpm_temb_period=10000,
attn_precision=None,
heatmap_head=False,
device=None,
operations=ops,
):
@ -827,6 +830,9 @@ class UNetModel(nn.Module):
#nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
)
if heatmap_head:
self.heatmap_head = HeatmapHead(device=device, dtype=self.dtype, operations=operations)
def forward(self, x, timesteps=None, context=None, y=None, control=None, transformer_options={}, **kwargs):
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
self._forward,

130
comfy/ldm/modules/sdpose.py Normal file
View File

@ -0,0 +1,130 @@
import torch
import numpy as np
from scipy.ndimage import gaussian_filter
class HeatmapHead(torch.nn.Module):
    """Keypoint head for SDPose: predicts per-keypoint heatmaps from a feature
    map and decodes them into sub-pixel (x, y) coordinates.

    forward() returns ``(batch_keypoints, batch_scores)``: one float32 numpy
    array of shape (K, 2) and one of shape (K,) per batch item. Decoding runs
    on CPU in numpy (uses scipy's gaussian_filter), not on the GPU.
    """

    def __init__(
        self,
        in_channels=640,            # channels of the incoming feature map
        out_channels=133,           # number of keypoints K (133 — presumably COCO-WholeBody; confirm)
        input_size=(768, 1024),     # model input resolution the keypoints are mapped back to
        heatmap_scale=4,            # heatmaps are input_size / heatmap_scale
        deconv_out_channels=(640,),
        deconv_kernel_sizes=(4,),
        conv_out_channels=(640,),
        conv_kernel_sizes=(1,),
        final_layer_kernel_size=1,
        device=None, dtype=None, operations=None
    ):
        super().__init__()
        self.heatmap_size = (input_size[0] // heatmap_scale, input_size[1] // heatmap_scale)
        # UDP-style scale using (size - 1) ratios, used to map heatmap-space
        # keypoints back to input-image space.
        # NOTE(review): assumes input_size / heatmap_size axis order matches
        # the decoded (x, y) keypoint order — confirm against the caller.
        self.scale_factor = ((np.array(input_size) - 1) / (np.array(self.heatmap_size) - 1)).astype(np.float32)
        # Deconv layers: each stride-2 transposed conv upsamples 2x.
        if deconv_out_channels:
            deconv_layers = []
            for out_ch, kernel_size in zip(deconv_out_channels, deconv_kernel_sizes):
                # padding/output_padding chosen so the output is exactly 2x the input size.
                if kernel_size == 4:
                    padding, output_padding = 1, 0
                elif kernel_size == 3:
                    padding, output_padding = 1, 1
                elif kernel_size == 2:
                    padding, output_padding = 0, 0
                else:
                    raise ValueError(f'Unsupported kernel size {kernel_size}')
                deconv_layers.extend([
                    operations.ConvTranspose2d(in_channels, out_ch, kernel_size,
                        stride=2, padding=padding, output_padding=output_padding, bias=False, device=device, dtype=dtype),
                    torch.nn.InstanceNorm2d(out_ch, device=device, dtype=dtype),
                    torch.nn.SiLU(inplace=True)
                ])
                in_channels = out_ch
            self.deconv_layers = torch.nn.Sequential(*deconv_layers)
        else:
            self.deconv_layers = torch.nn.Identity()
        # Conv layers: stride-1 refinement convs ("same" padding).
        if conv_out_channels:
            conv_layers = []
            for out_ch, kernel_size in zip(conv_out_channels, conv_kernel_sizes):
                padding = (kernel_size - 1) // 2
                conv_layers.extend([
                    operations.Conv2d(in_channels, out_ch, kernel_size,
                        stride=1, padding=padding, device=device, dtype=dtype),
                    torch.nn.InstanceNorm2d(out_ch, device=device, dtype=dtype),
                    torch.nn.SiLU(inplace=True)
                ])
                in_channels = out_ch
            self.conv_layers = torch.nn.Sequential(*conv_layers)
        else:
            self.conv_layers = torch.nn.Identity()
        # 1x1 (by default) projection to one heatmap channel per keypoint.
        self.final_layer = operations.Conv2d(in_channels, out_channels, kernel_size=final_layer_kernel_size, padding=final_layer_kernel_size // 2, device=device, dtype=dtype)

    def forward(self, x):  # Decode heatmaps to keypoints
        heatmaps = self.final_layer(self.conv_layers(self.deconv_layers(x)))
        heatmaps_np = heatmaps.float().cpu().numpy()  # (B, K, H, W)
        B, K, H, W = heatmaps_np.shape
        batch_keypoints = []
        batch_scores = []
        for b in range(B):
            hm = heatmaps_np[b].copy()  # (K, H, W)
            # --- vectorised argmax ---
            # Peak location and value per keypoint channel.
            flat = hm.reshape(K, -1)
            idx = np.argmax(flat, axis=1)
            scores = flat[np.arange(K), idx].copy()
            y_locs, x_locs = np.unravel_index(idx, (H, W))
            keypoints = np.stack([x_locs, y_locs], axis=-1).astype(np.float32)  # (K, 2) in heatmap space
            # Non-positive peak => keypoint considered undetected.
            invalid = scores <= 0.
            keypoints[invalid] = -1
            # --- DARK sub-pixel refinement (UDP) ---
            # 1. Gaussian blur with max-preserving normalisation
            #    (pad first so the blur does not bleed across the border).
            border = 5  # (kernel-1)//2 for kernel=11
            for k in range(K):
                origin_max = np.max(hm[k])
                dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
                dr[border:-border, border:-border] = hm[k].copy()
                dr = gaussian_filter(dr, sigma=2.0)
                hm[k] = dr[border:-border, border:-border].copy()
                cur_max = np.max(hm[k])
                if cur_max > 0:
                    # Rescale so the blurred peak keeps its original height.
                    hm[k] *= origin_max / cur_max
            # 2. Log-space for Taylor expansion
            #    (clip keeps log() finite; in-place via the `out` argument).
            np.clip(hm, 1e-3, 50., hm)
            np.log(hm, hm)
            # 3. Hessian-based Newton step
            # Pad each channel by 1 px so all 3x3 neighbours of a peak exist,
            # then flatten; neighbour offsets below use the padded row width W+2.
            hm_pad = np.pad(hm, ((0, 0), (1, 1), (1, 1)), mode='edge').flatten()
            # Flat index of each peak in its padded (H+2, W+2) channel,
            # plus a per-channel offset into the flattened array.
            index = keypoints[:, 0] + 1 + (keypoints[:, 1] + 1) * (W + 2)
            index += (W + 2) * (H + 2) * np.arange(0, K)
            index = index.astype(int).reshape(-1, 1)
            i_ = hm_pad[index]              # value at the peak
            ix1 = hm_pad[index + 1]         # x+1 neighbour
            iy1 = hm_pad[index + W + 2]     # y+1 neighbour (one padded row down)
            ix1y1 = hm_pad[index + W + 3]   # (x+1, y+1)
            ix1_y1_ = hm_pad[index - W - 3] # (x-1, y-1)
            ix1_ = hm_pad[index - 1]        # x-1 neighbour
            iy1_ = hm_pad[index - 2 - W]    # y-1 neighbour (== index - (W+2))
            # First derivatives (central differences) and Hessian entries.
            dx = 0.5 * (ix1 - ix1_)
            dy = 0.5 * (iy1 - iy1_)
            derivative = np.concatenate([dx, dy], axis=1).reshape(K, 2, 1)
            dxx = ix1 - 2 * i_ + ix1_
            dyy = iy1 - 2 * i_ + iy1_
            dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)
            hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1).reshape(K, 2, 2)
            # eps * I regularises near-singular Hessians before inversion.
            hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))
            # Newton step: x <- x - H^-1 * grad (per keypoint).
            keypoints -= np.einsum('imn,ink->imk', hessian, derivative).squeeze(axis=-1)
            # --- restore to input image space ---
            keypoints = keypoints * self.scale_factor
            keypoints[invalid] = -1
            batch_keypoints.append(keypoints)
            batch_scores.append(scores)
        return batch_keypoints, batch_scores

View File

@ -485,7 +485,7 @@ class WanVAE(nn.Module):
iter_ = 1 + (t - 1) // 4
feat_map = None
if iter_ > 1:
feat_map = [None] * count_conv3d(self.decoder)
feat_map = [None] * count_conv3d(self.encoder)
## 对encode输入的x按时间拆分为1、4、4、4....
for i in range(iter_):
conv_idx = [0]

View File

@ -337,6 +337,7 @@ def model_lora_keys_unet(model, key_map={}):
if k.startswith("diffusion_model.decoder.") and k.endswith(".weight"):
key_lora = k[len("diffusion_model.decoder."):-len(".weight")]
key_map["base_model.model.{}".format(key_lora)] = k # Official base model loras
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k # LyCORIS/LoKR format
return key_map

View File

@ -925,6 +925,25 @@ class Flux(BaseModel):
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))])
return out
class LongCatImage(Flux):
    # LongCat-Image: a Flux-architecture model that differs from base Flux by
    # custom RoPE shift defaults, no ADM conditioning and no guidance input.
    def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        # Copy before mutating so the injected rope defaults never leak back
        # into the caller's transformer_options dict.
        transformer_options = transformer_options.copy()
        rope_opts = transformer_options.get("rope_options", {})
        rope_opts = dict(rope_opts)
        # Model-specific RoPE shifts; setdefault lets user-supplied values win.
        rope_opts.setdefault("shift_t", 1.0)
        rope_opts.setdefault("shift_y", 512.0)
        rope_opts.setdefault("shift_x", 512.0)
        transformer_options["rope_options"] = rope_opts
        return super()._apply_model(x, t, c_concat, c_crossattn, control, transformer_options, **kwargs)

    def encode_adm(self, **kwargs):
        # No ADM/vector conditioning for this model.
        return None

    def extra_conds(self, **kwargs):
        # Same conditioning as Flux, minus the guidance embedding
        # (this model is detected with guidance_embed=False).
        out = super().extra_conds(**kwargs)
        out.pop('guidance', None)
        return out
class Flux2(Flux):
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)

View File

@ -279,6 +279,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"])
if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model
dit_config["txt_ids_dims"] = [1, 2]
if dit_config.get("context_in_dim") == 3584 and dit_config["vec_in_dim"] is None: # LongCat-Image
dit_config["txt_ids_dims"] = [1, 2]
return dit_config
@ -795,6 +797,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
unet_config["use_temporal_resblock"] = False
unet_config["use_temporal_attention"] = False
heatmap_key = '{}heatmap_head.conv_layers.0.weight'.format(key_prefix)
if heatmap_key in state_dict_keys:
unet_config["heatmap_head"] = True
return unet_config
def model_config_from_unet_config(unet_config, state_dict=None):
@ -1015,7 +1021,7 @@ def unet_config_from_diffusers_unet(state_dict, dtype=None):
LotusD = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': 4,
'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, 'num_heads': 8,
'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64,
'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}

View File

@ -350,7 +350,7 @@ AMD_ENABLE_MIOPEN_ENV = 'COMFYUI_ENABLE_MIOPEN'
try:
if is_amd():
arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName.split(':')[0]
if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
if os.getenv(AMD_ENABLE_MIOPEN_ENV) != '1':
torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
@ -378,7 +378,7 @@ try:
if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
if aotriton_supported(arch): # AMD efficient attention implementation depends on aotriton.
if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much
if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950
if any((a in arch) for a in ["gfx90a", "gfx942", "gfx950", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950
ENABLE_PYTORCH_ATTENTION = True
if rocm_version >= (7, 0):
if any((a in arch) for a in ["gfx1200", "gfx1201"]):

View File

@ -167,17 +167,15 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
x = to_dequant(x, dtype)
if not resident and lowvram_fn is not None:
x = to_dequant(x, dtype if compute_dtype is None else compute_dtype)
#FIXME: this is not accurate, we need to be sensitive to the compute dtype
x = lowvram_fn(x)
if (isinstance(orig, QuantizedTensor) and
(want_requant and len(fns) == 0 or update_weight)):
if (want_requant and len(fns) == 0 or update_weight):
seed = comfy.utils.string_to_seed(s.seed_key)
y = QuantizedTensor.from_float(x, s.layout_type, scale="recalculate", stochastic_rounding=seed)
if want_requant and len(fns) == 0:
#The layer actually wants our freshly saved QT
x = y
elif update_weight:
y = comfy.float.stochastic_rounding(x, orig.dtype, seed = comfy.utils.string_to_seed(s.seed_key))
if isinstance(orig, QuantizedTensor):
y = QuantizedTensor.from_float(x, s.layout_type, scale="recalculate", stochastic_rounding=seed)
else:
y = comfy.float.stochastic_rounding(x, orig.dtype, seed=seed)
if want_requant and len(fns) == 0:
x = y
if update_weight:
orig.copy_(y)
for f in fns:
@ -617,7 +615,8 @@ def fp8_linear(self, input):
if input.ndim != 2:
return None
w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True)
lora_compute_dtype=comfy.model_management.lora_compute_dtype(input.device)
w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True, compute_dtype=lora_compute_dtype, want_requant=True)
scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
scale_input = torch.ones((), device=input.device, dtype=torch.float32)

View File

@ -60,6 +60,7 @@ import comfy.text_encoders.jina_clip_2
import comfy.text_encoders.newbie
import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
import comfy.model_patcher
import comfy.lora
@ -1160,6 +1161,7 @@ class CLIPType(Enum):
KANDINSKY5_IMAGE = 23
NEWBIE = 24
FLUX2 = 25
LONGCAT_IMAGE = 26
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@ -1372,6 +1374,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
if clip_type == CLIPType.HUNYUAN_IMAGE:
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
elif clip_type == CLIPType.LONGCAT_IMAGE:
clip_target.clip = comfy.text_encoders.longcat_image.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.longcat_image.LongCatImageTokenizer
else:
clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer

View File

@ -25,6 +25,7 @@ import comfy.text_encoders.kandinsky5
import comfy.text_encoders.z_image
import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
from . import supported_models_base
from . import latent_formats
@ -525,7 +526,8 @@ class LotusD(SD20):
}
unet_extra_config = {
"num_classes": 'sequential'
"num_classes": 'sequential',
"num_head_channels": 64,
}
def get_model(self, state_dict, prefix="", device=None):
@ -1677,6 +1679,37 @@ class ACEStep15(supported_models_base.BASE):
return supported_models_base.ClipTarget(comfy.text_encoders.ace15.ACE15Tokenizer, comfy.text_encoders.ace15.te(**detect))
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
class LongCatImage(supported_models_base.BASE):
    # Detection entry for LongCat-Image: a Flux-shaped DiT identified by its
    # 3584-dim text context (Qwen2.5-7B), absent vec_in and no guidance embed.
    unet_config = {
        "image_model": "flux",
        "guidance_embed": False,
        "vec_in_dim": None,
        "context_in_dim": 3584,
        "txt_ids_dims": [1, 2],
    }

    sampling_settings = {
    }

    unet_extra_config = {}
    latent_format = latent_formats.Flux

    # Relative VRAM estimate multiplier used by the memory planner.
    memory_usage_factor = 2.5

    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
        """Instantiate the LongCat-Image diffusion model wrapper."""
        out = model_base.LongCatImage(self, device=device)
        return out

    def clip_target(self, state_dict={}):
        """Build the tokenizer/text-encoder target, detecting the Qwen2.5-7B
        weights (dtype/quantization) from the checkpoint's text encoder keys."""
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.longcat_image.LongCatImageTokenizer, comfy.text_encoders.longcat_image.te(**hunyuan_detect))
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
models += [SVD_img2vid]

View File

@ -328,14 +328,14 @@ class ACE15TEModel(torch.nn.Module):
return getattr(self, self.lm_model).load_sd(sd)
def memory_estimation_function(self, token_weight_pairs, device=None):
lm_metadata = token_weight_pairs["lm_metadata"]
lm_metadata = token_weight_pairs.get("lm_metadata", {})
constant = self.constant
if comfy.model_management.should_use_bf16(device):
constant *= 0.5
token_weight_pairs = token_weight_pairs.get("lm_prompt", [])
num_tokens = sum(map(lambda a: len(a), token_weight_pairs))
num_tokens += lm_metadata['min_tokens']
num_tokens += lm_metadata.get("min_tokens", 0)
return num_tokens * constant * 1024 * 1024
def te(dtype_llama=None, llama_quantization_metadata=None, lm_model="qwen3_2b"):

View File

@ -0,0 +1,184 @@
import re
import numbers
import torch
from comfy import sd1_clip
from comfy.text_encoders.qwen_image import Qwen25_7BVLITokenizer, Qwen25_7BVLIModel
import logging
logger = logging.getLogger(__name__)
QUOTE_PAIRS = [("'", "'"), ('"', '"'), ("\u2018", "\u2019"), ("\u201c", "\u201d")]

# Alternation matching one complete quoted span for each supported quote
# style: straight single/double quotes and curly single/double quotes.
QUOTE_PATTERN = "|".join(
    [
        re.escape(q1) + r"[^" + re.escape(q1 + q2) + r"]*?" + re.escape(q2)
        for q1, q2 in QUOTE_PAIRS
    ]
)

# A word with an internal apostrophe ("don't") that must not be mistaken for
# a quotation delimiter.
WORD_INTERNAL_QUOTE_RE = re.compile(r"[a-zA-Z]+'[a-zA-Z]+")


def split_quotation(prompt):
    """Split *prompt* into ``(text, is_quoted)`` segments.

    Quoted spans (per QUOTE_PAIRS) become segments with ``is_quoted=True``;
    the text between them keeps ``is_quoted=False``. Words containing an
    internal apostrophe are temporarily masked with placeholders so the
    apostrophe is not treated as an opening/closing quote, then restored.
    Joining the returned segment texts reproduces *prompt* exactly.
    """
    matches = WORD_INTERNAL_QUOTE_RE.findall(prompt)
    mapping = []
    for i, word_src in enumerate(set(matches)):
        # Unique placeholder per word. A repeated placeholder such as
        # base * (i + 1) is unsafe: the shorter placeholder is a substring of
        # the longer ones, so restoring it first corrupts the longer targets
        # whenever two or more distinct words are masked.
        word_tgt = f"longcat_$#{i}#$_longcat"
        prompt = prompt.replace(word_src, word_tgt)
        mapping.append((word_src, word_tgt))
    parts = re.split(f"({QUOTE_PATTERN})", prompt)
    result = []
    for part in parts:
        # Restore masked words before classifying the segment.
        for word_src, word_tgt in mapping:
            part = part.replace(word_tgt, word_src)
        if not part:
            continue
        is_quoted = bool(re.match(QUOTE_PATTERN, part))
        result.append((part, is_quoted))
    return result
class LongCatImageBaseTokenizer(Qwen25_7BVLITokenizer):
    """Qwen2.5-VL tokenizer variant for LongCat-Image.

    Quoted spans of the prompt are tokenized one character at a time,
    everything else is tokenized normally; the resulting sequence is
    truncated and padded to a fixed length.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Fixed sequence length used for both truncation and padding.
        self.max_length = 512

    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
        encode = self.tokenizer
        token_ids = []
        for segment, quoted in split_quotation(text):
            # Quoted text is fed to the tokenizer character by character;
            # plain text is encoded as a single chunk.
            pieces = segment if quoted else (segment,)
            for piece in pieces:
                token_ids += encode(piece, add_special_tokens=False)["input_ids"]
        if len(token_ids) > self.max_length:
            token_ids = token_ids[:self.max_length]
            logger.warning(f"Truncated prompt to {self.max_length} tokens")
        weighted = [(token, 1.0) for token in token_ids]
        # Pad to max length
        self.pad_tokens(weighted, self.max_length - len(weighted))
        return [weighted]
class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
    # Top-level tokenizer: wraps plain prompts in LongCat's captioning chat
    # template; prompts that already start with chat-template markers are
    # passed through verbatim.
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            name="qwen25_7b",
            tokenizer=LongCatImageBaseTokenizer,
        )
        # System/user prefix and assistant suffix of the chat template;
        # tokenized separately below, each token with weight 1.0.
        self.longcat_template_prefix = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
        self.longcat_template_suffix = "<|im_end|>\n<|im_start|>assistant\n"

    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
        """Tokenize *text*, adding the chat template around plain prompts.

        Returns a dict like ``{"qwen25_7b": [[(token_id, weight), ...]]}``.
        """
        # Detect prompts that already carry their own template.
        skip_template = False
        if text.startswith("<|im_start|>"):
            skip_template = True
        if text.startswith("<|start_header_id|>"):
            skip_template = True
        if text == "":
            # Empty prompts still need at least one token to encode.
            text = " "
        # The named sub-tokenizer instance created by SD1Tokenizer.__init__.
        base_tok = getattr(self, "qwen25_7b")
        if skip_template:
            # Pre-templated prompt: tokenize as-is, weights disabled.
            tokens = super().tokenize_with_weights(
                text, return_word_ids=return_word_ids, disable_weights=True, **kwargs
            )
        else:
            # Tokenize template prefix/suffix and the user prompt separately,
            # then concatenate: prefix + prompt + suffix.
            prefix_ids = base_tok.tokenizer(
                self.longcat_template_prefix, add_special_tokens=False
            )["input_ids"]
            suffix_ids = base_tok.tokenizer(
                self.longcat_template_suffix, add_special_tokens=False
            )["input_ids"]
            prompt_tokens = base_tok.tokenize_with_weights(
                text, return_word_ids=return_word_ids, **kwargs
            )
            prompt_pairs = prompt_tokens[0]
            prefix_pairs = [(t, 1.0) for t in prefix_ids]
            suffix_pairs = [(t, 1.0) for t in suffix_ids]
            combined = prefix_pairs + prompt_pairs + suffix_pairs
            tokens = {"qwen25_7b": [combined]}
        return tokens
class LongCatImageTEModel(sd1_clip.SD1ClipModel):
    # Text-encoder wrapper that strips the chat-template prefix/suffix
    # embeddings from the encoder output, so only the user prompt portion
    # conditions the diffusion model.
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen25_7b",
            clip_model=Qwen25_7BVLIModel,
            model_options=model_options,
        )

    def encode_token_weights(self, token_weight_pairs, template_end=-1):
        """Encode tokens, then trim template embeddings from the output.

        template_end: index of the first user-prompt token; -1 (default)
        means "detect it from the token ids".
        """
        out, pooled, extra = super().encode_token_weights(token_weight_pairs)
        tok_pairs = token_weight_pairs["qwen25_7b"][0]
        count_im_start = 0
        if template_end == -1:
            # Locate the end of the template prefix: the second occurrence of
            # token 151644 (<|im_start|> in the Qwen vocab — confirm) opens
            # the user turn. Entries may be tensors (embeddings); only plain
            # integer ids are compared.
            for i, v in enumerate(tok_pairs):
                elem = v[0]
                if not torch.is_tensor(elem):
                    if isinstance(elem, numbers.Integral):
                        if elem == 151644 and count_im_start < 2:
                            template_end = i
                            count_im_start += 1
            # Also skip "user\n" right after <|im_start|>
            # (872 = "user", 198 = "\n" — presumably Qwen2 ids; confirm).
            if out.shape[1] > (template_end + 3):
                if tok_pairs[template_end + 1][0] == 872:
                    if tok_pairs[template_end + 2][0] == 198:
                        template_end += 3
        if template_end == -1:
            # No template marker found: keep the whole sequence.
            template_end = 0
        # Find the start of the template suffix: the last 151645
        # (<|im_end|> — confirm) token in the sequence.
        suffix_start = None
        for i in range(len(tok_pairs) - 1, -1, -1):
            elem = tok_pairs[i][0]
            if not torch.is_tensor(elem) and isinstance(elem, numbers.Integral):
                if elem == 151645:
                    suffix_start = i
                    break
        # Drop the prefix embeddings (and matching attention-mask columns).
        out = out[:, template_end:]
        if "attention_mask" in extra:
            extra["attention_mask"] = extra["attention_mask"][:, template_end:]
            if extra["attention_mask"].sum() == torch.numel(extra["attention_mask"]):
                # An all-ones mask carries no information; drop it.
                extra.pop("attention_mask")
        if suffix_start is not None:
            # Drop the suffix embeddings (everything from the last <|im_end|>).
            suffix_len = len(tok_pairs) - suffix_start
            if suffix_len > 0 and out.shape[1] > suffix_len:
                out = out[:, :-suffix_len]
                if "attention_mask" in extra:
                    extra["attention_mask"] = extra["attention_mask"][:, :-suffix_len]
                    if extra["attention_mask"].sum() == torch.numel(
                        extra["attention_mask"]
                    ):
                        extra.pop("attention_mask")
        return out, pooled, extra
def te(dtype_llama=None, llama_quantization_metadata=None):
    """Return a LongCatImageTEModel subclass with the detected text-encoder
    dtype and quantization metadata baked into its constructor."""
    class LongCatImageTEModel_(LongCatImageTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            # Force the detected dtype when one was provided by detection.
            effective_dtype = dtype if dtype_llama is None else dtype_llama
            opts = model_options
            if llama_quantization_metadata is not None:
                # Copy before mutating: never modify the caller's dict
                # (or the shared default argument).
                opts = opts.copy()
                opts["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=effective_dtype, model_options=opts)
    return LongCatImageTEModel_

View File

@ -1224,9 +1224,10 @@ class BoundingBox(ComfyTypeIO):
class Input(WidgetInput):
def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
socketless: bool=True, default: dict=None, component: str=None):
socketless: bool=True, default: dict=None, component: str=None, force_input: bool=None):
super().__init__(id, display_name, optional, tooltip, None, default, socketless)
self.component = component
self.force_input = force_input
if default is None:
self.default = {"x": 0, "y": 0, "width": 512, "height": 512}
@ -1234,6 +1235,8 @@ class BoundingBox(ComfyTypeIO):
d = super().as_dict()
if self.component:
d["component"] = self.component
if self.force_input is not None:
d["forceInput"] = self.force_input
return d

View File

@ -162,7 +162,7 @@ class SaveAudio(IO.ComfyNode):
essentials_category="Audio",
inputs=[
IO.Audio.Input("audio"),
IO.String.Input("filename_prefix", default="audio/ComfyUI"),
IO.String.Input("filename_prefix", default="audio/ComfyUI_%year%%month%%day%-%hour%%minute%%second%"),
],
hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
is_output_node=True,
@ -187,7 +187,7 @@ class SaveAudioMP3(IO.ComfyNode):
category="audio",
inputs=[
IO.Audio.Input("audio"),
IO.String.Input("filename_prefix", default="audio/ComfyUI"),
IO.String.Input("filename_prefix", default="audio/ComfyUI_%year%%month%%day%-%hour%%minute%%second%"),
IO.Combo.Input("quality", options=["V0", "128k", "320k"], default="V0"),
],
hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
@ -215,7 +215,7 @@ class SaveAudioOpus(IO.ComfyNode):
category="audio",
inputs=[
IO.Audio.Input("audio"),
IO.String.Input("filename_prefix", default="audio/ComfyUI"),
IO.String.Input("filename_prefix", default="audio/ComfyUI_%year%%month%%day%-%hour%%minute%%second%"),
IO.Combo.Input("quality", options=["64k", "96k", "128k", "192k", "320k"], default="128k"),
],
hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],

View File

@ -865,14 +865,15 @@ class GLSLShader(io.ComfyNode):
cls, image_list: list[torch.Tensor], output_batch: torch.Tensor
) -> dict[str, list]:
"""Build UI output with input and output images for client-side shader execution."""
combined_inputs = torch.cat(image_list, dim=0)
input_images_ui = ui.ImageSaveHelper.save_images(
combined_inputs,
filename_prefix="GLSLShader_input",
folder_type=io.FolderType.temp,
cls=None,
compress_level=1,
)
input_images_ui = []
for img in image_list:
input_images_ui.extend(ui.ImageSaveHelper.save_images(
img,
filename_prefix="GLSLShader_input",
folder_type=io.FolderType.temp,
cls=None,
compress_level=1,
))
output_images_ui = ui.ImageSaveHelper.save_images(
output_batch,

View File

@ -637,7 +637,7 @@ class SaveGLB(IO.ComfyNode):
],
tooltip="Mesh or 3D file to save",
),
IO.String.Input("filename_prefix", default="mesh/ComfyUI"),
IO.String.Input("filename_prefix", default="mesh/ComfyUI_%year%%month%%day%-%hour%%minute%%second%"),
],
hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo]
)

View File

@ -190,7 +190,7 @@ class SaveAnimatedWEBP(IO.ComfyNode):
category="image/animation",
inputs=[
IO.Image.Input("images"),
IO.String.Input("filename_prefix", default="ComfyUI"),
IO.String.Input("filename_prefix", default="ComfyUI_%year%%month%%day%-%hour%%minute%%second%"),
IO.Float.Input("fps", default=6.0, min=0.01, max=1000.0, step=0.01),
IO.Boolean.Input("lossless", default=True),
IO.Int.Input("quality", default=80, min=0, max=100),
@ -227,7 +227,7 @@ class SaveAnimatedPNG(IO.ComfyNode):
category="image/animation",
inputs=[
IO.Image.Input("images"),
IO.String.Input("filename_prefix", default="ComfyUI"),
IO.String.Input("filename_prefix", default="ComfyUI_%year%%month%%day%-%hour%%minute%%second%"),
IO.Float.Input("fps", default=6.0, min=0.01, max=1000.0, step=0.01),
IO.Int.Input("compress_level", default=4, min=0, max=9, advanced=True),
],
@ -489,7 +489,7 @@ class SaveSVGNode(IO.ComfyNode):
IO.SVG.Input("svg"),
IO.String.Input(
"filename_prefix",
default="svg/ComfyUI",
default="svg/ComfyUI_%year%%month%%day%-%hour%%minute%%second%",
tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes.",
),
],
@ -706,8 +706,8 @@ class SplitImageToTileList(IO.ComfyNode):
@staticmethod
def get_grid_coords(width, height, tile_width, tile_height, overlap):
coords = []
stride_x = max(1, tile_width - overlap)
stride_y = max(1, tile_height - overlap)
stride_x = round(max(tile_width * 0.25, tile_width - overlap))
stride_y = round(max(tile_width * 0.25, tile_height - overlap))
y = 0
while y < height:
@ -764,34 +764,6 @@ class ImageMergeTileList(IO.ComfyNode):
],
)
@staticmethod
def get_grid_coords(width, height, tile_width, tile_height, overlap):
coords = []
stride_x = max(1, tile_width - overlap)
stride_y = max(1, tile_height - overlap)
y = 0
while y < height:
x = 0
y_end = min(y + tile_height, height)
y_start = max(0, y_end - tile_height)
while x < width:
x_end = min(x + tile_width, width)
x_start = max(0, x_end - tile_width)
coords.append((x_start, y_start, x_end, y_end))
if x_end >= width:
break
x += stride_x
if y_end >= height:
break
y += stride_y
return coords
@classmethod
def execute(cls, image_list, final_width, final_height, overlap):
w = final_width[0]
@ -804,7 +776,7 @@ class ImageMergeTileList(IO.ComfyNode):
device = first_tile.device
dtype = first_tile.dtype
coords = cls.get_grid_coords(w, h, t_w, t_h, ovlp)
coords = SplitImageToTileList.get_grid_coords(w, h, t_w, t_h, ovlp)
canvas = torch.zeros((b, h, w, c), device=device, dtype=dtype)
weights = torch.zeros((b, h, w, 1), device=device, dtype=dtype)

View File

@ -0,0 +1,86 @@
from __future__ import annotations
import math
from enum import Enum
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
class AspectRatio(str, Enum):
    """Human-readable aspect-ratio choices shown in the node's combo widget.

    Mixes in ``str`` so each member compares and hashes equal to its label,
    which lets the raw widget string index ``ASPECT_RATIOS`` directly.
    """
    # Landscape (horizontal) ratios.
    SQUARE = "1:1 (Square)"
    PHOTO_H = "3:2 (Photo)"
    STANDARD_H = "4:3 (Standard)"
    WIDESCREEN_H = "16:9 (Widescreen)"
    ULTRAWIDE_H = "21:9 (Ultrawide)"
    # Portrait (vertical) ratios.
    PHOTO_V = "2:3 (Portrait Photo)"
    STANDARD_V = "3:4 (Portrait Standard)"
    WIDESCREEN_V = "9:16 (Portrait Widescreen)"
# Width/height ratio components for each AspectRatio member.  Because
# AspectRatio is a str-mixin Enum, this dict can also be indexed with the
# plain label string received from the combo widget.
ASPECT_RATIOS: dict[AspectRatio, tuple[int, int]] = {
    AspectRatio.SQUARE: (1, 1),
    AspectRatio.PHOTO_H: (3, 2),
    AspectRatio.STANDARD_H: (4, 3),
    AspectRatio.WIDESCREEN_H: (16, 9),
    AspectRatio.ULTRAWIDE_H: (21, 9),
    AspectRatio.PHOTO_V: (2, 3),
    AspectRatio.STANDARD_V: (3, 4),
    AspectRatio.WIDESCREEN_V: (9, 16),
}
class ResolutionSelector(io.ComfyNode):
    """Compute (width, height) for a given aspect ratio and megapixel budget."""

    @classmethod
    def define_schema(cls):
        ratio_input = io.Combo.Input(
            "aspect_ratio",
            options=AspectRatio,
            default=AspectRatio.SQUARE,
            tooltip="The aspect ratio for the output dimensions.",
        )
        mp_input = io.Float.Input(
            "megapixels",
            default=1.0,
            min=0.1,
            max=16.0,
            step=0.1,
            tooltip="Target total megapixels. 1.0 MP ≈ 1024×1024 for square.",
        )
        return io.Schema(
            node_id="ResolutionSelector",
            display_name="Resolution Selector",
            category="utils",
            description="Calculate width and height from aspect ratio and megapixel target. Useful for setting up Empty Latent Image dimensions.",
            inputs=[ratio_input, mp_input],
            outputs=[
                io.Int.Output("width", tooltip="Calculated width in pixels (multiple of 8)."),
                io.Int.Output("height", tooltip="Calculated height in pixels (multiple of 8)."),
            ],
        )

    @classmethod
    def execute(cls, aspect_ratio: str, megapixels: float) -> io.NodeOutput:
        rw, rh = ASPECT_RATIOS[aspect_ratio]
        # Solve (rw * s) * (rh * s) == megapixels * 1024^2 for the scale s.
        s = math.sqrt(megapixels * 1024 * 1024 / (rw * rh))

        def snap(px):
            # Snap a pixel count to the nearest multiple of 8.
            return round(px / 8) * 8

        return io.NodeOutput(snap(rw * s), snap(rh * s))
class ResolutionExtension(ComfyExtension):
    """Extension entry point that registers the resolution utility node."""
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        # Single-node extension.
        return [
            ResolutionSelector,
        ]
async def comfy_entrypoint() -> ResolutionExtension:
    """Module entry point discovered by the ComfyUI extension loader."""
    return ResolutionExtension()

View File

@ -0,0 +1,740 @@
import colorsys
import math

import numpy as np
import torch
from tqdm import tqdm
from typing_extensions import override

import comfy.sample
import comfy.utils
from comfy_api.latest import ComfyExtension, io
from comfy_extras.nodes_lotus import LotusConditioning
def _preprocess_keypoints(kp_raw, sc_raw):
"""Insert neck keypoint and remap from MMPose to OpenPose ordering.
Returns (kp, sc) where kp has shape (134, 2) and sc has shape (134,).
Layout:
0-17 body (18 kp, OpenPose order)
18-23 feet (6 kp)
24-91 face (68 kp)
92-112 right hand (21 kp)
113-133 left hand (21 kp)
"""
kp = np.array(kp_raw, dtype=np.float32)
sc = np.array(sc_raw, dtype=np.float32)
if len(kp) >= 17:
neck = (kp[5] + kp[6]) / 2
neck_score = min(sc[5], sc[6]) if sc[5] > 0.3 and sc[6] > 0.3 else 0
kp = np.insert(kp, 17, neck, axis=0)
sc = np.insert(sc, 17, neck_score)
mmpose_idx = np.array([17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3])
openpose_idx = np.array([ 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17])
tmp_kp, tmp_sc = kp.copy(), sc.copy()
tmp_kp[openpose_idx] = kp[mmpose_idx]
tmp_sc[openpose_idx] = sc[mmpose_idx]
kp, sc = tmp_kp, tmp_sc
return kp, sc
def _to_openpose_frames(all_keypoints, all_scores, height, width):
"""Convert raw keypoint lists to a list of OpenPose-style frame dicts.
Each frame dict contains:
canvas_width, canvas_height, people: list of person dicts with keys:
pose_keypoints_2d - 18 body kp as flat [x,y,score,...] (absolute pixels)
foot_keypoints_2d - 6 foot kp as flat [x,y,score,...] (absolute pixels)
face_keypoints_2d - 70 face kp as flat [x,y,score,...] (absolute pixels)
indices 0-67: 68 face landmarks
index 68: right eye (body[14])
index 69: left eye (body[15])
hand_right_keypoints_2d - 21 right-hand kp (absolute pixels)
hand_left_keypoints_2d - 21 left-hand kp (absolute pixels)
"""
def _flatten(kp_slice, sc_slice):
return np.stack([kp_slice[:, 0], kp_slice[:, 1], sc_slice], axis=1).flatten().tolist()
frames = []
for img_idx in range(len(all_keypoints)):
people = []
for kp_raw, sc_raw in zip(all_keypoints[img_idx], all_scores[img_idx]):
kp, sc = _preprocess_keypoints(kp_raw, sc_raw)
# 70 face kp = 68 face landmarks + REye (body[14]) + LEye (body[15])
face_kp = np.concatenate([kp[24:92], kp[[14, 15]]], axis=0)
face_sc = np.concatenate([sc[24:92], sc[[14, 15]]], axis=0)
people.append({
"pose_keypoints_2d": _flatten(kp[0:18], sc[0:18]),
"foot_keypoints_2d": _flatten(kp[18:24], sc[18:24]),
"face_keypoints_2d": _flatten(face_kp, face_sc),
"hand_right_keypoints_2d": _flatten(kp[92:113], sc[92:113]),
"hand_left_keypoints_2d": _flatten(kp[113:134], sc[113:134]),
})
frames.append({"canvas_width": width, "canvas_height": height, "people": people})
return frames
class KeypointDraw:
    """
    Pose keypoint drawing class that supports both numpy and cv2 backends.

    When cv2 is importable its C-accelerated primitives are used; otherwise
    the NumPy fallbacks defined on this class (same call signatures, extra
    cv2-only kwargs absorbed by ``**kwargs``) draw directly into the array.
    """
    def __init__(self):
        # Backend selection: cv2 module if available, else this instance
        # (whose staticmethods mirror cv2.circle/line/fillConvexPoly/ellipse2Poly).
        try:
            import cv2
            self.draw = cv2
        except ImportError:
            self.draw = self
        # Hand connections (same for both hands)
        self.hand_edges = [
            [0, 1], [1, 2], [2, 3], [3, 4], # thumb
            [0, 5], [5, 6], [6, 7], [7, 8], # index
            [0, 9], [9, 10], [10, 11], [11, 12], # middle
            [0, 13], [13, 14], [14, 15], [15, 16], # ring
            [0, 17], [17, 18], [18, 19], [19, 20], # pinky
        ]
        # Body connections - matching DWPose limbSeq (1-indexed, converted to 0-indexed)
        self.body_limbSeq = [
            [2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
            [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
            [1, 16], [16, 18]
        ]
        # Colors matching DWPose
        self.colors = [
            [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
            [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
            [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255],
            [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]
        ]
    @staticmethod
    def circle(canvas_np, center, radius, color, **kwargs):
        """Draw a filled circle using NumPy vectorized operations."""
        # **kwargs absorbs cv2-only arguments such as thickness.
        cx, cy = center
        h, w = canvas_np.shape[:2]
        radius_int = int(np.ceil(radius))
        # Clip the bounding box of the circle to the canvas.
        y_min, y_max = max(0, cy - radius_int), min(h, cy + radius_int + 1)
        x_min, x_max = max(0, cx - radius_int), min(w, cx + radius_int + 1)
        if y_max <= y_min or x_max <= x_min:
            return
        y, x = np.ogrid[y_min:y_max, x_min:x_max]
        mask = (x - cx)**2 + (y - cy)**2 <= radius**2
        canvas_np[y_min:y_max, x_min:x_max][mask] = color
    @staticmethod
    def line(canvas_np, pt1, pt2, color, thickness=1, **kwargs):
        """Draw line using Bresenham's algorithm with NumPy operations."""
        x0, y0, x1, y1 = *pt1, *pt2
        h, w = canvas_np.shape[:2]
        dx, dy = abs(x1 - x0), abs(y1 - y0)
        sx, sy = (1 if x0 < x1 else -1), (1 if y0 < y1 else -1)
        err, x, y, line_points = dx - dy, x0, y0, []
        # Collect every pixel on the line first, then rasterize in one pass.
        while True:
            line_points.append((x, y))
            if x == x1 and y == y1:
                break
            e2 = 2 * err
            if e2 > -dy:
                err, x = err - dy, x + sx
            if e2 < dx:
                err, y = err + dx, y + sy
        if thickness > 1:
            # Thick line: stamp a small disc at every line pixel.
            radius, radius_int = (thickness / 2.0) + 0.5, int(np.ceil((thickness / 2.0) + 0.5))
            for px, py in line_points:
                y_min, y_max, x_min, x_max = max(0, py - radius_int), min(h, py + radius_int + 1), max(0, px - radius_int), min(w, px + radius_int + 1)
                if y_max > y_min and x_max > x_min:
                    yy, xx = np.ogrid[y_min:y_max, x_min:x_max]
                    canvas_np[y_min:y_max, x_min:x_max][(xx - px)**2 + (yy - py)**2 <= radius**2] = color
        else:
            # Thin line: write only the in-bounds pixels at once.
            line_points = np.array(line_points)
            valid = (line_points[:, 1] >= 0) & (line_points[:, 1] < h) & (line_points[:, 0] >= 0) & (line_points[:, 0] < w)
            if (valid_points := line_points[valid]).size:
                canvas_np[valid_points[:, 1], valid_points[:, 0]] = color
    @staticmethod
    def fillConvexPoly(canvas_np, pts, color, **kwargs):
        """Fill polygon using vectorized scanline algorithm."""
        # Even-odd fill via XOR of half-plane masks, one per polygon edge.
        if len(pts) < 3:
            return
        pts = np.array(pts, dtype=np.int32)
        h, w = canvas_np.shape[:2]
        y_min, y_max, x_min, x_max = max(0, pts[:, 1].min()), min(h, pts[:, 1].max() + 1), max(0, pts[:, 0].min()), min(w, pts[:, 0].max() + 1)
        if y_max <= y_min or x_max <= x_min:
            return
        yy, xx = np.mgrid[y_min:y_max, x_min:x_max]
        mask = np.zeros((y_max - y_min, x_max - x_min), dtype=bool)
        for i in range(len(pts)):
            p1, p2 = pts[i], pts[(i + 1) % len(pts)]
            y1, y2 = p1[1], p2[1]
            if y1 == y2:
                # Horizontal edges never toggle the scanline parity.
                continue
            if y1 > y2:
                # Orient the edge top-to-bottom.
                p1, p2, y1, y2 = p2, p1, p2[1], p1[1]
            if not (edge_mask := (yy >= y1) & (yy < y2)).any():
                continue
            # Toggle all pixels right of this edge within its y-span.
            mask ^= edge_mask & (xx >= p1[0] + (yy - y1) * (p2[0] - p1[0]) / (y2 - y1))
        canvas_np[y_min:y_max, x_min:x_max][mask] = color
    @staticmethod
    def ellipse2Poly(center, axes, angle, arc_start, arc_end, delta=1, **kwargs):
        """Python implementation of cv2.ellipse2Poly."""
        # NOTE(review): under the NumPy fallback this returns a list of
        # [x, y] pairs while cv2 returns an ndarray; both are consumed by the
        # matching backend's fillConvexPoly, so the pairing stays consistent.
        axes = (axes[0] + 0.5, axes[1] + 0.5) # to better match cv2 output
        angle = angle % 360
        if arc_start > arc_end:
            arc_start, arc_end = arc_end, arc_start
        # Normalize the arc to lie within [0, 360].
        while arc_start < 0:
            arc_start, arc_end = arc_start + 360, arc_end + 360
        while arc_end > 360:
            arc_end, arc_start = arc_end - 360, arc_start - 360
        if arc_end - arc_start > 360:
            arc_start, arc_end = 0, 360
        angle_rad = math.radians(angle)
        alpha, beta = math.cos(angle_rad), math.sin(angle_rad)
        pts = []
        for i in range(arc_start, arc_end + delta, delta):
            theta_rad = math.radians(min(i, arc_end))
            # Point on axis-aligned ellipse, then rotated by `angle`.
            x, y = axes[0] * math.cos(theta_rad), axes[1] * math.sin(theta_rad)
            pts.append([int(round(center[0] + x * alpha - y * beta)), int(round(center[1] + x * beta + y * alpha))])
        # Drop consecutive duplicates created by integer rounding.
        unique_pts, prev_pt = [], (float('inf'), float('inf'))
        for pt in pts:
            if (pt_tuple := tuple(pt)) != prev_pt:
                unique_pts.append(pt)
                prev_pt = pt_tuple
        return unique_pts if len(unique_pts) > 1 else [[center[0], center[1]], [center[0], center[1]]]
    def draw_wholebody_keypoints(self, canvas, keypoints, scores=None, threshold=0.3,
                                 draw_body=True, draw_feet=True, draw_face=True, draw_hands=True, stick_width=4, face_point_size=3):
        """
        Draw wholebody keypoints (134 keypoints after processing) in DWPose style.
        Expected keypoint format (after neck insertion and remapping):
        - Body: 0-17 (18 keypoints in OpenPose format, neck at index 1)
        - Foot: 18-23 (6 keypoints)
        - Face: 24-91 (68 landmarks)
        - Right hand: 92-112 (21 keypoints)
        - Left hand: 113-133 (21 keypoints)
        Args:
            canvas: The canvas to draw on (numpy array)
            keypoints: Array of keypoint coordinates
            scores: Optional confidence scores for each keypoint
            threshold: Minimum confidence threshold for drawing keypoints
        Returns:
            canvas: The canvas with keypoints drawn
        """
        H, W, C = canvas.shape
        # Draw body limbs
        if draw_body and len(keypoints) >= 18:
            for i, limb in enumerate(self.body_limbSeq):
                # Convert from 1-indexed to 0-indexed
                idx1, idx2 = limb[0] - 1, limb[1] - 1
                if idx1 >= 18 or idx2 >= 18:
                    continue
                if scores is not None:
                    if scores[idx1] < threshold or scores[idx2] < threshold:
                        continue
                # DWPose convention: Y holds x-coordinates, X holds y-coordinates.
                Y = [keypoints[idx1][0], keypoints[idx2][0]]
                X = [keypoints[idx1][1], keypoints[idx2][1]]
                mX, mY = (X[0] + X[1]) / 2, (Y[0] + Y[1]) / 2
                length = math.sqrt((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2)
                if length < 1:
                    continue
                angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
                # Each limb is a rotated, stick_width-thick ellipse polygon.
                polygon = self.draw.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stick_width), int(angle), 0, 360, 1)
                self.draw.fillConvexPoly(canvas, polygon, self.colors[i % len(self.colors)])
        # Draw body keypoints
        if draw_body and len(keypoints) >= 18:
            for i in range(18):
                if scores is not None and scores[i] < threshold:
                    continue
                x, y = int(keypoints[i][0]), int(keypoints[i][1])
                if 0 <= x < W and 0 <= y < H:
                    self.draw.circle(canvas, (x, y), 4, self.colors[i % len(self.colors)], thickness=-1)
        # Draw foot keypoints (18-23, 6 keypoints)
        if draw_feet and len(keypoints) >= 24:
            for i in range(18, 24):
                if scores is not None and scores[i] < threshold:
                    continue
                x, y = int(keypoints[i][0]), int(keypoints[i][1])
                if 0 <= x < W and 0 <= y < H:
                    self.draw.circle(canvas, (x, y), 4, self.colors[i % len(self.colors)], thickness=-1)
        # Draw right hand (92-112)
        if draw_hands and len(keypoints) >= 113:
            eps = 0.01  # treat near-origin coordinates as "missing" keypoints
            for ie, edge in enumerate(self.hand_edges):
                idx1, idx2 = 92 + edge[0], 92 + edge[1]
                if scores is not None:
                    if scores[idx1] < threshold or scores[idx2] < threshold:
                        continue
                x1, y1 = int(keypoints[idx1][0]), int(keypoints[idx1][1])
                x2, y2 = int(keypoints[idx2][0]), int(keypoints[idx2][1])
                if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
                    if 0 <= x1 < W and 0 <= y1 < H and 0 <= x2 < W and 0 <= y2 < H:
                        # HSV to RGB conversion for rainbow colors
                        r, g, b = colorsys.hsv_to_rgb(ie / float(len(self.hand_edges)), 1.0, 1.0)
                        color = (int(r * 255), int(g * 255), int(b * 255))
                        self.draw.line(canvas, (x1, y1), (x2, y2), color, thickness=2)
            # Draw right hand keypoints
            for i in range(92, 113):
                if scores is not None and scores[i] < threshold:
                    continue
                x, y = int(keypoints[i][0]), int(keypoints[i][1])
                if x > eps and y > eps and 0 <= x < W and 0 <= y < H:
                    self.draw.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
        # Draw left hand (113-133)
        if draw_hands and len(keypoints) >= 134:
            eps = 0.01
            for ie, edge in enumerate(self.hand_edges):
                idx1, idx2 = 113 + edge[0], 113 + edge[1]
                if scores is not None:
                    if scores[idx1] < threshold or scores[idx2] < threshold:
                        continue
                x1, y1 = int(keypoints[idx1][0]), int(keypoints[idx1][1])
                x2, y2 = int(keypoints[idx2][0]), int(keypoints[idx2][1])
                if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
                    if 0 <= x1 < W and 0 <= y1 < H and 0 <= x2 < W and 0 <= y2 < H:
                        # HSV to RGB conversion for rainbow colors
                        r, g, b = colorsys.hsv_to_rgb(ie / float(len(self.hand_edges)), 1.0, 1.0)
                        color = (int(r * 255), int(g * 255), int(b * 255))
                        self.draw.line(canvas, (x1, y1), (x2, y2), color, thickness=2)
            # Draw left hand keypoints
            for i in range(113, 134):
                if scores is not None and i < len(scores) and scores[i] < threshold:
                    continue
                x, y = int(keypoints[i][0]), int(keypoints[i][1])
                if x > eps and y > eps and 0 <= x < W and 0 <= y < H:
                    self.draw.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
        # Draw face keypoints (24-91) - white dots only, no lines
        if draw_face and len(keypoints) >= 92:
            eps = 0.01
            for i in range(24, 92):
                if scores is not None and scores[i] < threshold:
                    continue
                x, y = int(keypoints[i][0]), int(keypoints[i][1])
                if x > eps and y > eps and 0 <= x < W and 0 <= y < H:
                    self.draw.circle(canvas, (x, y), face_point_size, (255, 255, 255), thickness=-1)
        return canvas
class SDPoseDrawKeypoints(io.ComfyNode):
    """Render OpenPose-format keypoint frames into DWPose-style pose images."""
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="SDPoseDrawKeypoints",
            category="image/preprocessors",
            search_aliases=["openpose", "pose detection", "preprocessor", "keypoints", "pose"],
            inputs=[
                io.Custom("POSE_KEYPOINT").Input("keypoints"),
                io.Boolean.Input("draw_body", default=True),
                io.Boolean.Input("draw_hands", default=True),
                io.Boolean.Input("draw_face", default=True),
                io.Boolean.Input("draw_feet", default=False),
                io.Int.Input("stick_width", default=4, min=1, max=10, step=1),
                io.Int.Input("face_point_size", default=3, min=1, max=10, step=1),
                io.Float.Input("score_threshold", default=0.3, min=0.0, max=1.0, step=0.01),
            ],
            outputs=[
                io.Image.Output(),
            ],
        )
    @classmethod
    def execute(cls, keypoints, draw_body, draw_hands, draw_face, draw_feet, stick_width, face_point_size, score_threshold) -> io.NodeOutput:
        # No frames at all: return a small black placeholder image.
        if not keypoints:
            return io.NodeOutput(torch.zeros((1, 64, 64, 3), dtype=torch.float32))
        # Canvas size is taken from the first frame for the whole batch.
        height = keypoints[0]["canvas_height"]
        width = keypoints[0]["canvas_width"]
        def _parse(flat, n):
            # Flat [x, y, score, ...] list -> ((n, 2) coords, (n,) scores).
            arr = np.array(flat, dtype=np.float32).reshape(n, 3)
            return arr[:, :2], arr[:, 2]
        def _zeros(n):
            # Zeroed coords/scores stand in for a missing keypoint group.
            return np.zeros((n, 2), dtype=np.float32), np.zeros(n, dtype=np.float32)
        pose_outputs = []
        drawer = KeypointDraw()
        for frame in tqdm(keypoints, desc="Drawing keypoints on frames"):
            canvas = np.zeros((height, width, 3), dtype=np.uint8)
            for person in frame["people"]:
                body_kp, body_sc = _parse(person["pose_keypoints_2d"], 18)
                foot_raw = person.get("foot_keypoints_2d")
                foot_kp, foot_sc = _parse(foot_raw, 6) if foot_raw else _zeros(6)
                face_kp, face_sc = _parse(person["face_keypoints_2d"], 70)
                face_kp, face_sc = face_kp[:68], face_sc[:68] # drop appended eye kp; body already draws them
                rhand_kp, rhand_sc = _parse(person["hand_right_keypoints_2d"], 21)
                lhand_kp, lhand_sc = _parse(person["hand_left_keypoints_2d"], 21)
                # Reassemble the 134-keypoint wholebody layout the drawer expects.
                kp = np.concatenate([body_kp, foot_kp, face_kp, rhand_kp, lhand_kp], axis=0)
                sc = np.concatenate([body_sc, foot_sc, face_sc, rhand_sc, lhand_sc], axis=0)
                canvas = drawer.draw_wholebody_keypoints(
                    canvas, kp, sc,
                    threshold=score_threshold,
                    draw_body=draw_body, draw_feet=draw_feet,
                    draw_face=draw_face, draw_hands=draw_hands,
                    stick_width=stick_width, face_point_size=face_point_size,
                )
            pose_outputs.append(canvas)
        pose_outputs_np = np.stack(pose_outputs) if len(pose_outputs) > 1 else np.expand_dims(pose_outputs[0], 0)
        # uint8 [0, 255] canvases -> float tensor batch in [0, 1] (BHWC).
        final_pose_output = torch.from_numpy(pose_outputs_np).float() / 255.0
        return io.NodeOutput(final_pose_output)
class SDPoseKeypointExtractor(io.ComfyNode):
    """Extract wholebody pose keypoints from images using an SDPose diffusion model."""
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="SDPoseKeypointExtractor",
            category="image/preprocessors",
            search_aliases=["openpose", "pose detection", "preprocessor", "keypoints", "sdpose"],
            description="Extract pose keypoints from images using the SDPose model: https://huggingface.co/Comfy-Org/SDPose/tree/main/checkpoints",
            inputs=[
                io.Model.Input("model"),
                io.Vae.Input("vae"),
                io.Image.Input("image"),
                io.Int.Input("batch_size", default=16, min=1, max=10000, step=1),
                io.BoundingBox.Input("bboxes", optional=True, force_input=True, tooltip="Optional bounding boxes for more accurate detections. Required for multi-person detection."),
            ],
            outputs=[
                io.Custom("POSE_KEYPOINT").Output("keypoints", tooltip="Keypoints in OpenPose frame format (canvas_width, canvas_height, people)"),
            ],
        )
    @classmethod
    def execute(cls, model, vae, image, batch_size, bboxes=None) -> io.NodeOutput:
        """Run a 1-step sampling pass per image (or per bbox crop), capture the
        intermediate UNet feature via an output-block patch, and decode it with
        the model's heatmap head into keypoints + confidence scores."""
        height, width = image.shape[-3], image.shape[-2]
        context = LotusConditioning().execute().result[0]
        # Use output_block_patch to capture the last 640-channel feature
        def output_patch(h, hsp, transformer_options):
            nonlocal captured_feat
            if h.shape[1] == 640: # Capture the features for wholebody
                captured_feat = h.clone()
            return h, hsp
        model_clone = model.clone()
        model_clone.model_options["transformer_options"] = {"patches": {"output_block_patch": [output_patch]}}
        if not hasattr(model.model.diffusion_model, 'heatmap_head'):
            raise ValueError("The provided model does not have a heatmap_head. Please use SDPose model from here https://huggingface.co/Comfy-Org/SDPose/tree/main/checkpoints.")
        head = model.model.diffusion_model.heatmap_head
        total_images = image.shape[0]
        captured_feat = None
        # Model-native input resolution derived from the heatmap head.
        model_h = int(head.heatmap_size[0]) * 4 # e.g. 192 * 4 = 768
        model_w = int(head.heatmap_size[1]) * 4 # e.g. 256 * 4 = 1024
        def _run_on_latent(latent_batch):
            """Run one forward pass and return (keypoints_list, scores_list) for the batch."""
            nonlocal captured_feat
            captured_feat = None
            # Single deterministic step; the sampler output itself is discarded —
            # only the feature captured by output_patch is used.
            _ = comfy.sample.sample(
                model_clone,
                noise=torch.zeros_like(latent_batch),
                steps=1, cfg=1.0,
                sampler_name="euler", scheduler="simple",
                positive=context, negative=context,
                latent_image=latent_batch, disable_noise=True, disable_pbar=True,
            )
            return head(captured_feat) # keypoints_batch, scores_batch
        # all_keypoints / all_scores are lists-of-lists:
        # outer index = input image index
        # inner index = detected person (one per bbox, or one for full-image)
        all_keypoints = [] # shape: [n_images][n_persons]
        all_scores = [] # shape: [n_images][n_persons]
        pbar = comfy.utils.ProgressBar(total_images)
        if bboxes is not None:
            # Normalize the bboxes input: a single dict becomes one per-image
            # list; an empty list means "no bboxes for any image".
            if not isinstance(bboxes, list):
                bboxes = [[bboxes]]
            elif len(bboxes) == 0:
                bboxes = [None] * total_images
            # --- bbox-crop mode: one forward pass per crop -------------------------
            for img_idx in tqdm(range(total_images), desc="Extracting keypoints from crops"):
                img = image[img_idx:img_idx + 1] # (1, H, W, C)
                # Broadcasting: if fewer bbox lists than images, repeat the last one.
                img_bboxes = bboxes[min(img_idx, len(bboxes) - 1)] if bboxes else None
                img_keypoints = []
                img_scores = []
                if img_bboxes:
                    for bbox in img_bboxes:
                        # Clamp the bbox to the image bounds.
                        x1 = max(0, int(bbox["x"]))
                        y1 = max(0, int(bbox["y"]))
                        x2 = min(width, int(bbox["x"] + bbox["width"]))
                        y2 = min(height, int(bbox["y"] + bbox["height"]))
                        if x2 <= x1 or y2 <= y1:
                            continue
                        crop_h_px, crop_w_px = y2 - y1, x2 - x1
                        crop = img[:, y1:y2, x1:x2, :] # (1, crop_h, crop_w, C)
                        # scale to fit inside (model_h, model_w) while preserving aspect ratio, then pad to exact model size.
                        scale = min(model_h / crop_h_px, model_w / crop_w_px)
                        scaled_h, scaled_w = int(round(crop_h_px * scale)), int(round(crop_w_px * scale))
                        pad_top, pad_left = (model_h - scaled_h) // 2, (model_w - scaled_w) // 2
                        crop_chw = crop.permute(0, 3, 1, 2).float() # BHWC → BCHW
                        scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
                        padded = torch.zeros(1, scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
                        padded[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
                        crop_resized = padded.permute(0, 2, 3, 1) # BCHW → BHWC
                        latent_crop = vae.encode(crop_resized)
                        kp_batch, sc_batch = _run_on_latent(latent_crop)
                        kp, sc = kp_batch[0], sc_batch[0] # (K, 2), coords in model pixel space
                        # remove padding offset, undo scale, offset to full-image coordinates.
                        kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
                        kp[..., 0] = (kp[..., 0] - pad_left) / scale + x1
                        kp[..., 1] = (kp[..., 1] - pad_top) / scale + y1
                        img_keypoints.append(kp)
                        img_scores.append(sc)
                else:
                    # No bboxes for this image — run on the full image.
                    latent_img = vae.encode(img)
                    kp_batch, sc_batch = _run_on_latent(latent_img)
                    img_keypoints.append(kp_batch[0])
                    img_scores.append(sc_batch[0])
                all_keypoints.append(img_keypoints)
                all_scores.append(img_scores)
                pbar.update(1)
        else: # full-image mode, batched
            tqdm_pbar = tqdm(total=total_images, desc="Extracting keypoints")
            for batch_start in range(0, total_images, batch_size):
                batch_end = min(batch_start + batch_size, total_images)
                latent_batch = vae.encode(image[batch_start:batch_end])
                kp_batch, sc_batch = _run_on_latent(latent_batch)
                for kp, sc in zip(kp_batch, sc_batch):
                    all_keypoints.append([kp])
                    all_scores.append([sc])
                tqdm_pbar.update(1)
                pbar.update(batch_end - batch_start)
        openpose_frames = _to_openpose_frames(all_keypoints, all_scores, height, width)
        return io.NodeOutput(openpose_frames)
def get_face_bboxes(kp2ds, scale, image_shape):
h, w = image_shape
kp2ds_face = kp2ds.copy()[1:] * (w, h)
min_x, min_y = np.min(kp2ds_face, axis=0)
max_x, max_y = np.max(kp2ds_face, axis=0)
initial_width = max_x - min_x
initial_height = max_y - min_y
if initial_width <= 0 or initial_height <= 0:
return [0, 0, 0, 0]
initial_area = initial_width * initial_height
expanded_area = initial_area * scale
new_width = np.sqrt(expanded_area * (initial_width / initial_height))
new_height = np.sqrt(expanded_area * (initial_height / initial_width))
delta_width = (new_width - initial_width) / 2
delta_height = (new_height - initial_height) / 4
expanded_min_x = max(min_x - delta_width, 0)
expanded_max_x = min(max_x + delta_width, w)
expanded_min_y = max(min_y - 3 * delta_height, 0)
expanded_max_y = min(max_y + delta_height, h)
return [int(expanded_min_x), int(expanded_max_x), int(expanded_min_y), int(expanded_max_y)]
class SDPoseFaceBBoxes(io.ComfyNode):
    """Derive per-frame face bounding boxes from OpenPose keypoint frames."""
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="SDPoseFaceBBoxes",
            category="image/preprocessors",
            search_aliases=["face bbox", "face bounding box", "pose", "keypoints"],
            inputs=[
                io.Custom("POSE_KEYPOINT").Input("keypoints"),
                io.Float.Input("scale", default=1.5, min=1.0, max=10.0, step=0.1, tooltip="Multiplier for the bounding box area around each detected face."),
                io.Boolean.Input("force_square", default=True, tooltip="Expand the shorter bbox axis so the crop region is always square."),
            ],
            outputs=[
                io.BoundingBox.Output("bboxes", tooltip="Face bounding boxes per frame, compatible with SDPoseKeypointExtractor bboxes input."),
            ],
        )
    @classmethod
    def execute(cls, keypoints, scale, force_square) -> io.NodeOutput:
        all_bboxes = []
        for frame in keypoints:
            h = frame["canvas_height"]
            w = frame["canvas_width"]
            frame_bboxes = []
            for person in frame["people"]:
                face_flat = person.get("face_keypoints_2d", [])
                if not face_flat:
                    continue
                # Parse absolute-pixel face keypoints (70 kp: 68 landmarks + REye + LEye)
                face_arr = np.array(face_flat, dtype=np.float32).reshape(-1, 3)
                face_xy = face_arr[:, :2] # (70, 2) in absolute pixels
                # get_face_bboxes expects normalized coords with a padding row
                # at index 0 (it drops row 0 internally).
                kp_norm = face_xy / np.array([w, h], dtype=np.float32)
                kp_padded = np.vstack([np.zeros((1, 2), dtype=np.float32), kp_norm]) # (71, 2)
                x1, x2, y1, y2 = get_face_bboxes(kp_padded, scale, (h, w))
                if x2 > x1 and y2 > y1:
                    if force_square:
                        bw, bh = x2 - x1, y2 - y1
                        if bw != bh:
                            # Grow the shorter side around the box center.
                            side = max(bw, bh)
                            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
                            half = side // 2
                            x1 = max(0, cx - half)
                            y1 = max(0, cy - half)
                            x2 = min(w, x1 + side)
                            y2 = min(h, y1 + side)
                            # Re-anchor if clamped
                            x1 = max(0, x2 - side)
                            y1 = max(0, y2 - side)
                    frame_bboxes.append({"x": x1, "y": y1, "width": x2 - x1, "height": y2 - y1})
            all_bboxes.append(frame_bboxes)
        return io.NodeOutput(all_bboxes)
class CropByBBoxes(io.ComfyNode):
    """Crop one region per frame (the union of its bboxes) and resize it."""
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="CropByBBoxes",
            category="image/preprocessors",
            search_aliases=["crop", "face crop", "bbox crop", "pose", "bounding box"],
            description="Crop and resize regions from the input image batch based on provided bounding boxes.",
            inputs=[
                io.Image.Input("image"),
                io.BoundingBox.Input("bboxes", force_input=True),
                io.Int.Input("output_width", default=512, min=64, max=4096, step=8, tooltip="Width each crop is resized to."),
                io.Int.Input("output_height", default=512, min=64, max=4096, step=8, tooltip="Height each crop is resized to."),
                io.Int.Input("padding", default=0, min=0, max=1024, step=1, tooltip="Extra padding in pixels added on each side of the bbox before cropping."),
            ],
            outputs=[
                io.Image.Output(tooltip="All crops stacked into a single image batch."),
            ],
        )
    @classmethod
    def execute(cls, image, bboxes, output_width, output_height, padding) -> io.NodeOutput:
        total_frames = image.shape[0]
        img_h = image.shape[1]
        img_w = image.shape[2]
        num_ch = image.shape[3]
        # Normalize the bboxes input: a single dict becomes one per-frame list;
        # an empty list passes the input batch through unchanged.
        if not isinstance(bboxes, list):
            bboxes = [[bboxes]]
        elif len(bboxes) == 0:
            return io.NodeOutput(image)
        crops = []
        for frame_idx in range(total_frames):
            # Broadcasting: if fewer bbox lists than frames, reuse the last one.
            frame_bboxes = bboxes[min(frame_idx, len(bboxes) - 1)]
            if not frame_bboxes:
                # Frames with no bboxes contribute no crop to the output batch.
                continue
            frame_chw = image[frame_idx].permute(2, 0, 1).unsqueeze(0) # BHWC → BCHW (1, C, H, W)
            # Union all bboxes for this frame into a single crop region
            x1 = min(int(b["x"]) for b in frame_bboxes)
            y1 = min(int(b["y"]) for b in frame_bboxes)
            x2 = max(int(b["x"] + b["width"]) for b in frame_bboxes)
            y2 = max(int(b["y"] + b["height"]) for b in frame_bboxes)
            if padding > 0:
                x1 = max(0, x1 - padding)
                y1 = max(0, y1 - padding)
                x2 = min(img_w, x2 + padding)
                y2 = min(img_h, y2 + padding)
            # Clamp to the image bounds regardless of padding.
            x1, x2 = max(0, x1), min(img_w, x2)
            y1, y2 = max(0, y1), min(img_h, y2)
            # Fallback for empty/degenerate crops
            if x2 <= x1 or y2 <= y1:
                # Center a crop of 30% of the shorter side near the top of the
                # frame; emit an all-black tile if even that degenerates.
                fallback_size = int(min(img_h, img_w) * 0.3)
                fb_x1 = max(0, (img_w - fallback_size) // 2)
                fb_y1 = max(0, int(img_h * 0.1))
                fb_x2 = min(img_w, fb_x1 + fallback_size)
                fb_y2 = min(img_h, fb_y1 + fallback_size)
                if fb_x2 <= fb_x1 or fb_y2 <= fb_y1:
                    crops.append(torch.zeros(1, num_ch, output_height, output_width, dtype=image.dtype, device=image.device))
                    continue
                x1, y1, x2, y2 = fb_x1, fb_y1, fb_x2, fb_y2
            crop_chw = frame_chw[:, :, y1:y2, x1:x2] # (1, C, crop_h, crop_w)
            resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
            crops.append(resized)
        if not crops:
            # No frame produced a crop; pass the input through unchanged.
            return io.NodeOutput(image)
        out_images = torch.cat(crops, dim=0).permute(0, 2, 3, 1) # (N, H, W, C)
        return io.NodeOutput(out_images)
class SDPoseExtension(ComfyExtension):
    """Extension entry point that registers the SDPose preprocessor nodes."""
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
            SDPoseKeypointExtractor,
            SDPoseDrawKeypoints,
            SDPoseFaceBBoxes,
            CropByBBoxes,
        ]
async def comfy_entrypoint() -> SDPoseExtension:
    """Module entry point discovered by the ComfyUI extension loader."""
    return SDPoseExtension()

View File

@ -21,7 +21,7 @@ class SaveWEBM(io.ComfyNode):
is_experimental=True,
inputs=[
io.Image.Input("images"),
io.String.Input("filename_prefix", default="ComfyUI"),
io.String.Input("filename_prefix", default="ComfyUI_%year%%month%%day%-%hour%%minute%%second%"),
io.Combo.Input("codec", options=["vp9", "av1"]),
io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01),
io.Float.Input("crf", default=32.0, min=0, max=63.0, step=1, tooltip="Higher crf means lower quality with a smaller file size, lower crf means higher quality higher filesize."),
@ -77,7 +77,7 @@ class SaveVideo(io.ComfyNode):
description="Saves the input images to your ComfyUI output directory.",
inputs=[
io.Video.Input("video", tooltip="The video to save."),
io.String.Input("filename_prefix", default="video/ComfyUI", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."),
io.String.Input("filename_prefix", default="video/ComfyUI_%year%%month%%day%-%hour%%minute%%second%", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."),
io.Combo.Input("format", options=Types.VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."),
io.Combo.Input("codec", options=Types.VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."),
],

View File

@ -976,7 +976,7 @@ class CLIPLoader:
@classmethod
def INPUT_TYPES(s):
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis"], ),
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
},
"optional": {
"device": (["default", "cpu"], {"advanced": True}),
@ -1638,7 +1638,7 @@ class SaveImage:
return {
"required": {
"images": ("IMAGE", {"tooltip": "The images to save."}),
"filename_prefix": ("STRING", {"default": "ComfyUI", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."})
"filename_prefix": ("STRING", {"default": "ComfyUI_%year%%month%%day%-%hour%%minute%%second%", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."})
},
"hidden": {
"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"
@ -2435,6 +2435,7 @@ async def init_builtin_extra_nodes():
"nodes_audio_encoder.py",
"nodes_rope.py",
"nodes_logic.py",
"nodes_resolution.py",
"nodes_nop.py",
"nodes_kandinsky5.py",
"nodes_wanmove.py",
@ -2447,6 +2448,7 @@ async def init_builtin_extra_nodes():
"nodes_toolkit.py",
"nodes_replacements.py",
"nodes_nag.py",
"nodes_sdpose.py",
]
import_failed = []

View File

@ -31,5 +31,4 @@ spandrel
pydantic~=2.0
pydantic-settings~=2.0
PyOpenGL
PyOpenGL-accelerate
glfw

View File

@ -0,0 +1,112 @@
import torch
from comfy.model_detection import detect_unet_config, model_config_from_unet_config
import comfy.supported_models
def _make_longcat_comfyui_sd():
"""Minimal ComfyUI-format state dict for pre-converted LongCat-Image weights."""
sd = {}
H = 32 # Reduce hidden state dimension to reduce memory usage
C_IN = 16
C_CTX = 3584
sd["img_in.weight"] = torch.empty(H, C_IN * 4)
sd["img_in.bias"] = torch.empty(H)
sd["txt_in.weight"] = torch.empty(H, C_CTX)
sd["txt_in.bias"] = torch.empty(H)
sd["time_in.in_layer.weight"] = torch.empty(H, 256)
sd["time_in.in_layer.bias"] = torch.empty(H)
sd["time_in.out_layer.weight"] = torch.empty(H, H)
sd["time_in.out_layer.bias"] = torch.empty(H)
sd["final_layer.adaLN_modulation.1.weight"] = torch.empty(2 * H, H)
sd["final_layer.adaLN_modulation.1.bias"] = torch.empty(2 * H)
sd["final_layer.linear.weight"] = torch.empty(C_IN * 4, H)
sd["final_layer.linear.bias"] = torch.empty(C_IN * 4)
for i in range(19):
sd[f"double_blocks.{i}.img_attn.norm.key_norm.weight"] = torch.empty(128)
sd[f"double_blocks.{i}.img_attn.qkv.weight"] = torch.empty(3 * H, H)
sd[f"double_blocks.{i}.img_mod.lin.weight"] = torch.empty(H, H)
for i in range(38):
sd[f"single_blocks.{i}.modulation.lin.weight"] = torch.empty(H, H)
return sd
def _make_flux_schnell_comfyui_sd():
"""Minimal ComfyUI-format state dict for standard Flux Schnell."""
sd = {}
H = 32 # Reduce hidden state dimension to reduce memory usage
C_IN = 16
sd["img_in.weight"] = torch.empty(H, C_IN * 4)
sd["img_in.bias"] = torch.empty(H)
sd["txt_in.weight"] = torch.empty(H, 4096)
sd["txt_in.bias"] = torch.empty(H)
sd["double_blocks.0.img_attn.norm.key_norm.weight"] = torch.empty(128)
sd["double_blocks.0.img_attn.qkv.weight"] = torch.empty(3 * H, H)
sd["double_blocks.0.img_mod.lin.weight"] = torch.empty(H, H)
for i in range(19):
sd[f"double_blocks.{i}.img_attn.norm.key_norm.weight"] = torch.empty(128)
for i in range(38):
sd[f"single_blocks.{i}.modulation.lin.weight"] = torch.empty(H, H)
return sd
class TestModelDetection:
"""Verify that first-match model detection selects the correct model
based on list ordering and unet_config specificity."""
def test_longcat_before_schnell_in_models_list(self):
    """LongCatImage must appear before FluxSchnell in the models list."""
    model_list = comfy.supported_models.models

    def position(class_name):
        # First index whose class bears the given name (detection is first-match).
        return next(idx for idx, cls in enumerate(model_list) if cls.__name__ == class_name)

    pos_longcat = position("LongCatImage")
    pos_schnell = position("FluxSchnell")
    assert pos_longcat < pos_schnell, (
        f"LongCatImage (index {pos_longcat}) must come before "
        f"FluxSchnell (index {pos_schnell}) in the models list"
    )
def test_longcat_comfyui_detected_as_longcat(self):
    """LongCat fixture must detect as a flux-family config that maps to LongCatImage."""
    state_dict = _make_longcat_comfyui_sd()
    cfg = detect_unet_config(state_dict, "")
    assert cfg is not None
    assert cfg["image_model"] == "flux"
    assert cfg["context_in_dim"] == 3584  # LongCat context width (standard Flux is 4096)
    assert cfg["vec_in_dim"] is None
    assert cfg["guidance_embed"] is False
    assert cfg["txt_ids_dims"] == [1, 2]
    resolved = model_config_from_unet_config(cfg, state_dict)
    assert resolved is not None
    assert type(resolved).__name__ == "LongCatImage"
def test_longcat_comfyui_keys_pass_through_unchanged(self):
    """Pre-converted weights should not be transformed by process_unet_state_dict."""
    state_dict = _make_longcat_comfyui_sd()
    cfg = detect_unet_config(state_dict, "")
    resolved = model_config_from_unet_config(cfg, state_dict)
    processed = resolved.process_unet_state_dict(dict(state_dict))
    # Representative ComfyUI-format keys from each section must survive untouched.
    for key in (
        "img_in.weight",
        "txt_in.weight",
        "time_in.in_layer.weight",
        "final_layer.linear.weight",
    ):
        assert key in processed
def test_flux_schnell_comfyui_detected_as_flux_schnell(self):
    """Standard Schnell fixture must still resolve to FluxSchnell, not LongCatImage."""
    sd = _make_flux_schnell_comfyui_sd()
    unet_config = detect_unet_config(sd, "")
    assert unet_config is not None
    assert unet_config["image_model"] == "flux"
    # 4096 context dim and empty txt_ids_dims are what keep this out of the
    # LongCat match despite both being flux-family configs.
    assert unet_config["context_in_dim"] == 4096
    assert unet_config["txt_ids_dims"] == []
    model_config = model_config_from_unet_config(unet_config, sd)
    assert model_config is not None
    assert type(model_config).__name__ == "FluxSchnell"