mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-11 09:56:57 +08:00
Compare commits
16 Commits
feat/load-
...
expose-dep
| Author | SHA1 | Date | |
|---|---|---|---|
| e411e7d574 | |||
| 9fc6f5f6dd | |||
| 184009c2f6 | |||
| 07c53f8f0f | |||
| 1639dc7a70 | |||
| 8ed7f458d0 | |||
| f89999289a | |||
| cb9f639416 | |||
| 00b633f368 | |||
| a0a055bc4e | |||
| a1c434eb65 | |||
| fc258b10e5 | |||
| 38f750d80e | |||
| 7863cf0e53 | |||
| 739061dd4c | |||
| 2cdaaf4a25 |
@ -1,27 +1,27 @@
|
||||
As of the time of writing this you need a recent driver. Updating to the latest driver is recommended.
|
||||
|
||||
HOW TO RUN:
|
||||
|
||||
If you have a AMD gpu:
|
||||
|
||||
run_amd_gpu.bat
|
||||
|
||||
If you have memory issues you can try enabling the new dynamic memory management by running comfyui with:
|
||||
|
||||
run_amd_gpu_enable_dynamic_vram.bat
|
||||
|
||||
IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints
|
||||
|
||||
You can download the stable diffusion XL one from: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors
|
||||
|
||||
|
||||
RECOMMENDED WAY TO UPDATE:
|
||||
To update the ComfyUI code: update\update_comfyui.bat
|
||||
|
||||
|
||||
TO SHARE MODELS BETWEEN COMFYUI AND ANOTHER UI:
|
||||
In the ComfyUI directory you will find a file: extra_model_paths.yaml.example
|
||||
Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor.
|
||||
|
||||
|
||||
|
||||
As of the time of writing this you need a recent driver. Updating to the latest driver is recommended.
|
||||
|
||||
HOW TO RUN:
|
||||
|
||||
If you have a AMD gpu:
|
||||
|
||||
run_amd_gpu.bat
|
||||
|
||||
If you have memory issues you can try enabling the new dynamic memory management by running comfyui with:
|
||||
|
||||
run_amd_gpu_enable_dynamic_vram.bat
|
||||
|
||||
IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints
|
||||
|
||||
You can download the stable diffusion XL one from: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors
|
||||
|
||||
|
||||
RECOMMENDED WAY TO UPDATE:
|
||||
To update the ComfyUI code: update\update_comfyui.bat
|
||||
|
||||
|
||||
TO SHARE MODELS BETWEEN COMFYUI AND ANOTHER UI:
|
||||
In the ComfyUI directory you will find a file: extra_model_paths.yaml.example
|
||||
Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor.
|
||||
|
||||
|
||||
|
||||
|
||||
2
.github/workflows/check-line-endings.yml
vendored
2
.github/workflows/check-line-endings.yml
vendored
@ -17,7 +17,7 @@ jobs:
|
||||
- name: Check for Windows line endings (CRLF)
|
||||
run: |
|
||||
# Get the list of changed files in the PR
|
||||
CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }})
|
||||
CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} -- ':!.ci')
|
||||
|
||||
# Flag to track if CRLF is found
|
||||
CRLF_FOUND=false
|
||||
|
||||
@ -33,6 +33,7 @@ from app.assets.services.file_utils import (
|
||||
verify_file_unchanged,
|
||||
)
|
||||
from app.assets.services.hashing import HashCheckpoint, compute_blake3_hash
|
||||
from app.assets.services.image_dimensions import extract_image_dimensions
|
||||
from app.assets.services.metadata_extract import extract_file_metadata
|
||||
from app.assets.services.path_utils import (
|
||||
compute_relative_filename,
|
||||
@ -506,6 +507,10 @@ def enrich_asset(
|
||||
|
||||
if extract_metadata and metadata:
|
||||
system_metadata = metadata.to_user_metadata()
|
||||
if mime_type and mime_type.startswith("image/"):
|
||||
dims = extract_image_dimensions(file_path, mime_type=mime_type)
|
||||
if dims:
|
||||
system_metadata.update(dims)
|
||||
set_reference_system_metadata(session, reference_id, system_metadata)
|
||||
|
||||
if full_hash:
|
||||
|
||||
63
app/assets/services/image_dimensions.py
Normal file
63
app/assets/services/image_dimensions.py
Normal file
@ -0,0 +1,63 @@
|
||||
"""Image dimension extraction for asset ingest.
|
||||
|
||||
Reads only the image header via Pillow to capture width/height cheaply,
|
||||
without a full pixel decode. Returns a metadata dict suitable for merging
|
||||
into ``AssetReference.system_metadata``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_image_dimensions(
|
||||
file_path: str, mime_type: str | None = None
|
||||
) -> dict[str, Any] | None:
|
||||
"""Extract image dimensions for the file at ``file_path``.
|
||||
|
||||
Args:
|
||||
file_path: Absolute path to a file on disk.
|
||||
mime_type: Optional MIME type hint. When provided and not prefixed
|
||||
with ``image/``, extraction is skipped without touching the file.
|
||||
|
||||
Returns:
|
||||
``{"kind": "image", "width": W, "height": H}`` when the file is a
|
||||
recognizable image with positive dimensions, otherwise ``None``.
|
||||
|
||||
The dict shape is intended to be merged into ``system_metadata`` so the
|
||||
asset response surfaces ``metadata.kind`` plus dimension fields for image
|
||||
assets. Forward-compatible: future media kinds (e.g. ``"video"`` with
|
||||
duration/fps) can extend this shape without schema changes.
|
||||
"""
|
||||
if mime_type is not None and not mime_type.startswith("image/"):
|
||||
return None
|
||||
|
||||
try:
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
except ImportError:
|
||||
logger.debug(
|
||||
"Pillow not available; skipping image dimension extraction for %s",
|
||||
file_path,
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
with Image.open(file_path) as img:
|
||||
width, height = img.size
|
||||
except (OSError, UnidentifiedImageError, ValueError) as exc:
|
||||
logger.debug(
|
||||
"Failed to read image dimensions from %s: %s", file_path, exc
|
||||
)
|
||||
return None
|
||||
|
||||
if (
|
||||
not isinstance(width, int)
|
||||
or not isinstance(height, int)
|
||||
or width <= 0
|
||||
or height <= 0
|
||||
):
|
||||
return None
|
||||
|
||||
return {"kind": "image", "width": width, "height": height}
|
||||
@ -17,9 +17,11 @@ from app.assets.database.queries import (
|
||||
get_reference_by_file_path,
|
||||
get_reference_tags,
|
||||
get_or_create_reference,
|
||||
list_references_by_asset_id,
|
||||
reference_exists,
|
||||
remove_missing_tag_for_asset_id,
|
||||
set_reference_metadata,
|
||||
set_reference_system_metadata,
|
||||
set_reference_tags,
|
||||
update_asset_hash_and_mime,
|
||||
upsert_asset,
|
||||
@ -29,6 +31,7 @@ from app.assets.database.queries import (
|
||||
from app.assets.helpers import get_utc_now, normalize_tags
|
||||
from app.assets.services.bulk_ingest import batch_insert_seed_assets
|
||||
from app.assets.services.file_utils import get_size_and_mtime_ns
|
||||
from app.assets.services.image_dimensions import extract_image_dimensions
|
||||
from app.assets.services.path_utils import (
|
||||
compute_relative_filename,
|
||||
get_name_and_tags_from_asset_path,
|
||||
@ -118,6 +121,14 @@ def _ingest_file_from_path(
|
||||
user_metadata=user_metadata,
|
||||
)
|
||||
|
||||
_maybe_store_image_dimensions(
|
||||
session,
|
||||
reference_id=reference_id,
|
||||
file_path=locator,
|
||||
mime_type=mime_type,
|
||||
current_system_metadata=ref.system_metadata,
|
||||
)
|
||||
|
||||
try:
|
||||
remove_missing_tag_for_asset_id(session, asset_id=asset.id)
|
||||
except Exception:
|
||||
@ -288,6 +299,13 @@ def _register_existing_asset(
|
||||
user_metadata=new_meta,
|
||||
)
|
||||
|
||||
_backfill_image_dimensions_from_siblings(
|
||||
session,
|
||||
asset_id=asset.id,
|
||||
new_reference_id=ref.id,
|
||||
current_system_metadata=ref.system_metadata,
|
||||
)
|
||||
|
||||
if tags is not None:
|
||||
set_reference_tags(
|
||||
session,
|
||||
@ -334,6 +352,87 @@ def _update_metadata_with_filename(
|
||||
)
|
||||
|
||||
|
||||
_IMAGE_DIMENSION_KEYS = ("kind", "width", "height")
|
||||
|
||||
|
||||
def _maybe_store_image_dimensions(
|
||||
session: Session,
|
||||
reference_id: str,
|
||||
file_path: str,
|
||||
mime_type: str | None,
|
||||
current_system_metadata: dict | None,
|
||||
) -> None:
|
||||
"""Populate ``kind``/``width``/``height`` on system_metadata for image refs.
|
||||
|
||||
Non-image MIME types are a no-op. Pre-existing keys (e.g. enricher-written
|
||||
safetensors metadata, download provenance) are preserved by merge.
|
||||
"""
|
||||
if not mime_type or not mime_type.startswith("image/"):
|
||||
return
|
||||
|
||||
dims = extract_image_dimensions(file_path, mime_type=mime_type)
|
||||
if not dims:
|
||||
return
|
||||
|
||||
current = current_system_metadata or {}
|
||||
merged = dict(current)
|
||||
merged.update(dims)
|
||||
if merged != current:
|
||||
set_reference_system_metadata(
|
||||
session,
|
||||
reference_id=reference_id,
|
||||
system_metadata=merged,
|
||||
)
|
||||
|
||||
|
||||
def _backfill_image_dimensions_from_siblings(
|
||||
session: Session,
|
||||
asset_id: str,
|
||||
new_reference_id: str,
|
||||
current_system_metadata: dict | None,
|
||||
) -> None:
|
||||
"""Copy image dimension keys from any sibling reference of the same asset.
|
||||
|
||||
The from-hash path doesn't read the file bytes, so dimensions can't be
|
||||
extracted there directly. When another reference of the same asset already
|
||||
carries image dimensions, copy them onto the new reference so consumers
|
||||
see consistent metadata regardless of how the asset was registered.
|
||||
|
||||
Best-effort: missing siblings, non-image siblings, or absent dimension
|
||||
keys leave the target reference unchanged.
|
||||
"""
|
||||
current = current_system_metadata or {}
|
||||
if current.get("kind") == "image" and "width" in current and "height" in current:
|
||||
return
|
||||
|
||||
for sibling in list_references_by_asset_id(session, asset_id):
|
||||
if sibling.id == new_reference_id:
|
||||
continue
|
||||
meta = sibling.system_metadata or {}
|
||||
if meta.get("kind") != "image":
|
||||
continue
|
||||
width = meta.get("width")
|
||||
height = meta.get("height")
|
||||
if (
|
||||
type(width) is not int
|
||||
or type(height) is not int
|
||||
or width <= 0
|
||||
or height <= 0
|
||||
):
|
||||
continue
|
||||
merged = dict(current)
|
||||
merged["kind"] = "image"
|
||||
merged["width"] = width
|
||||
merged["height"] = height
|
||||
if merged != current:
|
||||
set_reference_system_metadata(
|
||||
session,
|
||||
reference_id=new_reference_id,
|
||||
system_metadata=merged,
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
def _sanitize_filename(name: str | None, fallback: str) -> str:
|
||||
n = os.path.basename((name or "").strip() or fallback)
|
||||
return n if n else fallback
|
||||
|
||||
@ -166,6 +166,8 @@ class PerformanceFeature(enum.Enum):
|
||||
|
||||
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))
|
||||
|
||||
parser.add_argument("--debug-hang", action="store_true", help="Enable stack trace dumps on Ctrl-C for debugging hangs.")
|
||||
|
||||
parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.")
|
||||
|
||||
parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.")
|
||||
|
||||
@ -51,6 +51,18 @@ class FeedForward(nn.Module):
|
||||
return hidden_states
|
||||
|
||||
|
||||
# Addin this back because Nunchaku custom nodes rely on it, see comment here:
|
||||
# https://github.com/Comfy-Org/ComfyUI/pull/14178#issuecomment-4640475161
|
||||
# TODO: Eventually remove this once we natively support SVDQuants
|
||||
def apply_rotary_emb(x, freqs_cis):
|
||||
if x.shape[1] == 0:
|
||||
return x
|
||||
|
||||
t_ = x.reshape(*x.shape[:-1], -1, 1, 2)
|
||||
t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
|
||||
return t_out.reshape(*x.shape)
|
||||
|
||||
|
||||
class QwenTimestepProjEmbeddings(nn.Module):
|
||||
def __init__(self, embedding_dim, pooled_projection_dim, use_additional_t_cond=False, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
|
||||
@ -1631,13 +1631,15 @@ class SCAILWanModel(WanModel):
|
||||
|
||||
self.patch_embedding_pose = operations.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=torch.float32)
|
||||
|
||||
def forward_orig(self, x, t, context, clip_fea=None, freqs=None, transformer_options={}, pose_latents=None, reference_latent=None, **kwargs):
|
||||
def forward_orig(self, x, t, context, clip_fea=None, freqs=None, transformer_options={}, pose_latents=None, reference_latent=None, ref_mask_latents=None, sam_latents=None, **kwargs):
|
||||
|
||||
if reference_latent is not None:
|
||||
x = torch.cat((reference_latent, x), dim=2)
|
||||
|
||||
# embeddings
|
||||
x = self.patch_embedding(x.float()).to(x.dtype)
|
||||
if ref_mask_latents is not None: # SCAIL-2 additive mask stream
|
||||
x = x + self.patch_embedding_mask(ref_mask_latents.float()).to(x.dtype)
|
||||
grid_sizes = x.shape[2:]
|
||||
transformer_options["grid_sizes"] = grid_sizes
|
||||
x = x.flatten(2).transpose(1, 2)
|
||||
@ -1645,6 +1647,8 @@ class SCAILWanModel(WanModel):
|
||||
scail_pose_seq_len = 0
|
||||
if pose_latents is not None:
|
||||
scail_x = self.patch_embedding_pose(pose_latents.float()).to(x.dtype)
|
||||
if sam_latents is not None: # SCAIL-2 additive mask stream
|
||||
scail_x = scail_x + self.patch_embedding_mask(sam_latents.float()).to(x.dtype)
|
||||
scail_x = scail_x.flatten(2).transpose(1, 2)
|
||||
scail_pose_seq_len = scail_x.shape[1]
|
||||
x = torch.cat([x, scail_x], dim=1)
|
||||
@ -1695,7 +1699,36 @@ class SCAILWanModel(WanModel):
|
||||
|
||||
return x
|
||||
|
||||
def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, pose_latents=None, reference_latent=None, transformer_options={}):
|
||||
# ref_mask_flag is a scalar bool (CONDConstant, SCAIL-2 only). False => replacement mode,
|
||||
# which places ref/pose via H/W rope shifts instead of the animation-mode temporal offset.
|
||||
def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, pose_latents=None, reference_latent=None, ref_mask_flag=None, transformer_options={}):
|
||||
if ref_mask_flag is not None and not bool(ref_mask_flag):
|
||||
REF_ROPE_H = 120.0
|
||||
POSE_ROPE_W = 120.0
|
||||
|
||||
ref_t_patches = 0
|
||||
if reference_latent is not None:
|
||||
ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
|
||||
main_t_patches = t - ref_t_patches
|
||||
|
||||
parts = []
|
||||
if ref_t_patches > 0:
|
||||
ref_tf = {"rope_options": {"shift_y": REF_ROPE_H, "shift_x": 0.0, "scale_y": 1.0, "scale_x": 1.0}}
|
||||
parts.append(super().rope_encode(ref_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=ref_tf))
|
||||
if main_t_patches > 0:
|
||||
parts.append(super().rope_encode(main_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=transformer_options))
|
||||
|
||||
if pose_latents is not None:
|
||||
F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
|
||||
h_scale = h / H_pose
|
||||
w_scale = w / W_pose
|
||||
h_shift = (h_scale - 1) / 2
|
||||
w_shift = (w_scale - 1) / 2
|
||||
pose_tf = {"rope_options": {"shift_y": h_shift, "shift_x": POSE_ROPE_W + w_shift, "scale_y": h_scale, "scale_x": w_scale}}
|
||||
parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=0, device=device, dtype=dtype, transformer_options=pose_tf))
|
||||
|
||||
return torch.cat(parts, dim=1)
|
||||
|
||||
main_freqs = super().rope_encode(t, h, w, t_start=t_start, steps_t=steps_t, steps_h=steps_h, steps_w=steps_w, device=device, dtype=dtype, transformer_options=transformer_options)
|
||||
|
||||
if pose_latents is None:
|
||||
@ -1719,12 +1752,16 @@ class SCAILWanModel(WanModel):
|
||||
|
||||
return torch.cat([main_freqs, pose_freqs], dim=1)
|
||||
|
||||
def _forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, pose_latents=None, **kwargs):
|
||||
def _forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, pose_latents=None, ref_mask_latents=None, sam_latents=None, **kwargs):
|
||||
bs, c, t, h, w = x.shape
|
||||
x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
|
||||
|
||||
if pose_latents is not None:
|
||||
pose_latents = comfy.ldm.common_dit.pad_to_patch_size(pose_latents, self.patch_size)
|
||||
if ref_mask_latents is not None: # SCAIL-2
|
||||
ref_mask_latents = comfy.ldm.common_dit.pad_to_patch_size(ref_mask_latents, self.patch_size)
|
||||
if sam_latents is not None: # SCAIL-2
|
||||
sam_latents = comfy.ldm.common_dit.pad_to_patch_size(sam_latents, self.patch_size)
|
||||
|
||||
t_len = t
|
||||
if time_dim_concat is not None:
|
||||
@ -1737,5 +1774,15 @@ class SCAILWanModel(WanModel):
|
||||
reference_latent = comfy.ldm.common_dit.pad_to_patch_size(kwargs.pop("reference_latent"), self.patch_size)
|
||||
t_len += reference_latent.shape[2]
|
||||
|
||||
freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options, pose_latents=pose_latents, reference_latent=reference_latent)
|
||||
return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, pose_latents=pose_latents, reference_latent=reference_latent, **kwargs)[:, :, :t, :h, :w]
|
||||
ref_mask_flag = kwargs.pop("ref_mask_flag", None) # SCAIL-2
|
||||
|
||||
freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options, pose_latents=pose_latents, reference_latent=reference_latent, ref_mask_flag=ref_mask_flag)
|
||||
return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, pose_latents=pose_latents, reference_latent=reference_latent, ref_mask_latents=ref_mask_latents, sam_latents=sam_latents, **kwargs)[:, :, :t, :h, :w]
|
||||
|
||||
|
||||
class SCAIL2WanModel(SCAILWanModel):
|
||||
"""SCAIL-2: SCAIL-Preview + an additive binary multi-identity mask stream."""
|
||||
|
||||
def __init__(self, model_type="scail2", patch_size=(1, 2, 2), in_dim=20, mask_in_dim=28, dim=5120, operations=None, device=None, dtype=None, **kwargs):
|
||||
super().__init__(model_type=model_type, patch_size=patch_size, in_dim=in_dim, dim=dim, operations=operations, device=device, dtype=dtype, **kwargs)
|
||||
self.patch_embedding_mask = operations.Conv3d(mask_in_dim, dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=torch.float32)
|
||||
|
||||
@ -357,6 +357,12 @@ def model_lora_keys_unet(model, key_map={}):
|
||||
key_lora = k[len("diffusion_model."):-len(".weight")]
|
||||
key_map["transformer.{}".format(key_lora)] = k
|
||||
|
||||
if isinstance(model, (comfy.model_base.LTXV, comfy.model_base.LTXAV)):
|
||||
for k in sdk:
|
||||
if k.startswith("diffusion_model.") and k.endswith(".weight"):
|
||||
key_lora = k[len("diffusion_model."):-len(".weight")]
|
||||
key_map["{}".format(key_lora)] = k
|
||||
|
||||
return key_map
|
||||
|
||||
|
||||
|
||||
@ -1754,6 +1754,80 @@ class WAN21_SCAIL(WAN21):
|
||||
|
||||
return out
|
||||
|
||||
class WAN21_SCAIL2(WAN21_SCAIL):
|
||||
"""SCAIL-2: SCAIL-Preview + an additive binary multi-identity mask stream."""
|
||||
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
|
||||
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.SCAIL2WanModel)
|
||||
self.memory_usage_factor_conds = ("reference_latent", "pose_latents", "ref_mask_latents", "sam_latents")
|
||||
self.memory_usage_shape_process = {
|
||||
"pose_latents": lambda shape: [shape[0], shape[1], 1.5, shape[-2], shape[-1]],
|
||||
"sam_latents": lambda shape: [shape[0], shape[1], 1.5, shape[-2], shape[-1]],
|
||||
}
|
||||
self.image_to_video = image_to_video
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
|
||||
driving_mask_28ch = kwargs.get("driving_mask_28ch", None)
|
||||
if driving_mask_28ch is not None:
|
||||
out['sam_latents'] = comfy.conds.CONDRegular(driving_mask_28ch.movedim(1, 2).contiguous())
|
||||
|
||||
ref_mask_28ch = kwargs.get("ref_mask_28ch", None)
|
||||
if ref_mask_28ch is not None:
|
||||
out['ref_mask_latents'] = comfy.conds.CONDRegular(ref_mask_28ch.movedim(1, 2).contiguous())
|
||||
|
||||
ref_mask_flag = kwargs.get("ref_mask_flag", None)
|
||||
if ref_mask_flag is not None:
|
||||
out['ref_mask_flag'] = comfy.conds.CONDConstant(ref_mask_flag)
|
||||
|
||||
return out
|
||||
|
||||
def extra_conds_shapes(self, **kwargs):
|
||||
out = super().extra_conds_shapes(**kwargs)
|
||||
driving_mask_28ch = kwargs.get("driving_mask_28ch", None)
|
||||
if driving_mask_28ch is not None:
|
||||
s = driving_mask_28ch.shape
|
||||
out['sam_latents'] = [s[0], 28, s[1], s[3], s[4]]
|
||||
ref_mask_28ch = kwargs.get("ref_mask_28ch", None)
|
||||
if ref_mask_28ch is not None:
|
||||
s = ref_mask_28ch.shape
|
||||
out['ref_mask_latents'] = [s[0], 28, s[1], s[3], s[4]]
|
||||
return out
|
||||
|
||||
def resize_cond_for_context_window(self, cond_key, cond_value, window, x_in, device, retain_index_list=[]):
|
||||
if cond_key in ("sam_latents", "pose_latents"):
|
||||
return comfy.context_windows.slice_cond(cond_value, window, x_in, device, temporal_dim=2, temporal_offset=1)
|
||||
return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list)
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
# The 4 extra channels are the history_mask (1 at clean-anchor frames).
|
||||
noise = kwargs.get("noise", None)
|
||||
extra_channels = self.diffusion_model.patch_embedding.weight.shape[1] - noise.shape[1]
|
||||
if extra_channels != 4:
|
||||
return super().concat_cond(**kwargs)
|
||||
|
||||
mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
|
||||
if mask is None:
|
||||
return torch.zeros_like(noise)[:, :4]
|
||||
|
||||
device = kwargs["device"]
|
||||
if mask.shape[1] != 4:
|
||||
mask = torch.mean(mask, dim=1, keepdim=True)
|
||||
mask = 1.0 - mask
|
||||
mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
|
||||
if mask.shape[-3] < noise.shape[-3]:
|
||||
mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
|
||||
if mask.shape[1] == 1:
|
||||
mask = mask.repeat(1, 4, 1, 1, 1)
|
||||
mask = utils.resize_to_batch_size(mask, noise.shape[0])
|
||||
return mask
|
||||
|
||||
def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
|
||||
# Hold anchor constant across all sigmas instead of base sigma*noise + (1-sigma)*latent_image.
|
||||
return latent_image
|
||||
|
||||
|
||||
class WAN22_WanDancer(WAN21):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=True, device=None):
|
||||
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model_wandancer.WanDancerModel)
|
||||
|
||||
@ -630,6 +630,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["model_type"] = "humo"
|
||||
elif '{}face_adapter.fuser_blocks.0.k_norm.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "animate"
|
||||
elif '{}patch_embedding_mask.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "scail2"
|
||||
elif '{}patch_embedding_pose.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "scail"
|
||||
elif '{}patch_embedding_global.weight'.format(key_prefix) in state_dict_keys:
|
||||
|
||||
@ -1450,6 +1450,17 @@ class WAN21_SCAIL(WAN21_T2V):
|
||||
out = model_base.WAN21_SCAIL(self, image_to_video=False, device=device)
|
||||
return out
|
||||
|
||||
|
||||
class WAN21_SCAIL2(WAN21_T2V):
|
||||
unet_config = {
|
||||
"image_model": "wan2.1",
|
||||
"model_type": "scail2",
|
||||
}
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.WAN21_SCAIL2(self, image_to_video=False, device=device)
|
||||
return out
|
||||
|
||||
class WAN22_WanDancer(WAN21_T2V):
|
||||
unet_config = {
|
||||
"image_model": "wan2.1",
|
||||
@ -2259,6 +2270,7 @@ models = [
|
||||
WAN22_Animate,
|
||||
WAN21_FlowRVS,
|
||||
WAN21_SCAIL,
|
||||
WAN21_SCAIL2,
|
||||
WAN22_WanDancer,
|
||||
Hunyuan3Dv2mini,
|
||||
Hunyuan3Dv2,
|
||||
|
||||
@ -32,7 +32,9 @@ class Ideogram4Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
|
||||
if llama_template is None:
|
||||
if text.startswith('<|im_start|>'):
|
||||
llama_text = text
|
||||
elif llama_template is None:
|
||||
llama_text = self.llama_template.format(text)
|
||||
else:
|
||||
llama_text = llama_template.format(text)
|
||||
|
||||
@ -36,15 +36,15 @@ class RemoveBackground(IO.ComfyNode):
|
||||
category="image/background removal",
|
||||
description="Generates a foreground mask to remove the background from an image using a background removal model.",
|
||||
inputs=[
|
||||
IO.Image.Input("image", tooltip="Input image to remove the background from"),
|
||||
IO.BackgroundRemoval.Input("bg_removal_model", tooltip="Background removal model used to generate the mask")
|
||||
IO.BackgroundRemoval.Input("bg_removal_model", tooltip="Background removal model used to generate the mask"),
|
||||
IO.Image.Input("image", tooltip="Input image to remove the background from")
|
||||
],
|
||||
outputs=[
|
||||
IO.Mask.Output("mask", tooltip="Generated foreground mask")
|
||||
]
|
||||
)
|
||||
@classmethod
|
||||
def execute(cls, image, bg_removal_model):
|
||||
def execute(cls, bg_removal_model, image):
|
||||
mask = bg_removal_model.encode_image(image)
|
||||
return IO.NodeOutput(mask)
|
||||
|
||||
|
||||
@ -7,29 +7,29 @@ class ColorToRGBInt(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="ColorToRGBInt",
|
||||
display_name="Color to RGB Int",
|
||||
display_name="Color Picker",
|
||||
category="utilities",
|
||||
description="Convert a color to a RGB integer value.",
|
||||
description="Return a color RGB integer value and hexadecimal representation.",
|
||||
inputs=[
|
||||
io.Color.Input("color"),
|
||||
],
|
||||
outputs=[
|
||||
io.Int.Output(display_name="rgb_int"),
|
||||
io.Color.Output(display_name="hex")
|
||||
],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(
|
||||
cls,
|
||||
color: str,
|
||||
) -> io.NodeOutput:
|
||||
def execute(cls, color: str) -> io.NodeOutput:
|
||||
# expect format #RRGGBB
|
||||
if len(color) != 7 or color[0] != "#":
|
||||
raise ValueError("Color must be in format #RRGGBB")
|
||||
r = int(color[1:3], 16)
|
||||
g = int(color[3:5], 16)
|
||||
b = int(color[5:7], 16)
|
||||
return io.NodeOutput(r * 256 * 256 + g * 256 + b)
|
||||
|
||||
rgb_int = r * 256 * 256 + g * 256 + b
|
||||
return io.NodeOutput(rgb_int, color)
|
||||
|
||||
|
||||
class ColorExtension(ComfyExtension):
|
||||
|
||||
@ -317,71 +317,11 @@ class PreviewPointCloud(IO.ComfyNode):
|
||||
)
|
||||
|
||||
|
||||
MESH_EXTENSIONS = {'.gltf', '.glb', '.obj', '.fbx', '.stl'}
|
||||
|
||||
|
||||
class Load3DAdvanced(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
input_dir = folder_paths.get_input_directory()
|
||||
os.makedirs(input_dir, exist_ok=True)
|
||||
|
||||
files = [
|
||||
f for f in os.listdir(input_dir)
|
||||
if os.path.isfile(os.path.join(input_dir, f))
|
||||
and os.path.splitext(f)[1].lower() in MESH_EXTENSIONS
|
||||
]
|
||||
return IO.Schema(
|
||||
node_id="Load3DAdvanced",
|
||||
display_name="Load 3D (Advanced)",
|
||||
category="3d",
|
||||
search_aliases=[
|
||||
"load mesh",
|
||||
"load gltf",
|
||||
"load glb",
|
||||
"load obj",
|
||||
"load fbx",
|
||||
"load stl",
|
||||
],
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
IO.Combo.Input("model_file", options=["none"] + sorted(files), upload=IO.UploadType.model),
|
||||
IO.Load3D.Input("viewport_state"),
|
||||
IO.Int.Input("width", default=1024, min=1, max=4096, step=1),
|
||||
IO.Int.Input("height", default=1024, min=1, max=4096, step=1),
|
||||
],
|
||||
outputs=[
|
||||
IO.File3DAny.Output(display_name="model_3d"),
|
||||
IO.Load3DModelInfo.Output(display_name="model_3d_info"),
|
||||
IO.Load3DCamera.Output(display_name="camera_info"),
|
||||
IO.Int.Output(display_name="width"),
|
||||
IO.Int.Output(display_name="height"),
|
||||
],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def validate_inputs(cls, model_file, **kwargs) -> bool | str:
|
||||
if not model_file or model_file == "none":
|
||||
return True
|
||||
if not folder_paths.exists_annotated_filepath(model_file):
|
||||
return f"Invalid 3D model file: {model_file}"
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def execute(cls, model_file, viewport_state, width: int, height: int, **kwargs) -> IO.NodeOutput:
|
||||
file_3d = None
|
||||
if model_file and model_file != "none":
|
||||
file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file))
|
||||
model_3d_info = viewport_state.get('model_3d_info', [])
|
||||
return IO.NodeOutput(file_3d, model_3d_info, viewport_state['camera_info'], width, height)
|
||||
|
||||
|
||||
class Load3DExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
return [
|
||||
Load3D,
|
||||
Load3DAdvanced,
|
||||
Preview3D,
|
||||
Preview3DAdvanced,
|
||||
PreviewGaussianSplat,
|
||||
|
||||
@ -6,24 +6,24 @@ from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
class AspectRatio(str, Enum):
|
||||
SQUARE = "1:1 (Square)"
|
||||
PHOTO_V = "2:3 (Portrait Photo)"
|
||||
PHOTO_H = "3:2 (Photo)"
|
||||
STANDARD_V = "3:4 (Portrait Standard)"
|
||||
STANDARD_H = "4:3 (Standard)"
|
||||
WIDESCREEN_V = "9:16 (Portrait Widescreen)"
|
||||
WIDESCREEN_H = "16:9 (Widescreen)"
|
||||
ULTRAWIDE_H = "21:9 (Ultrawide)"
|
||||
PHOTO_V = "2:3 (Portrait Photo)"
|
||||
STANDARD_V = "3:4 (Portrait Standard)"
|
||||
WIDESCREEN_V = "9:16 (Portrait Widescreen)"
|
||||
|
||||
|
||||
ASPECT_RATIOS: dict[AspectRatio, tuple[int, int]] = {
|
||||
AspectRatio.SQUARE: (1, 1),
|
||||
AspectRatio.PHOTO_V: (2, 3),
|
||||
AspectRatio.PHOTO_H: (3, 2),
|
||||
AspectRatio.STANDARD_V: (3, 4),
|
||||
AspectRatio.STANDARD_H: (4, 3),
|
||||
AspectRatio.WIDESCREEN_V: (9, 16),
|
||||
AspectRatio.WIDESCREEN_H: (16, 9),
|
||||
AspectRatio.ULTRAWIDE_H: (21, 9),
|
||||
AspectRatio.PHOTO_V: (2, 3),
|
||||
AspectRatio.STANDARD_V: (3, 4),
|
||||
AspectRatio.WIDESCREEN_V: (9, 16),
|
||||
}
|
||||
|
||||
|
||||
@ -50,26 +50,35 @@ class ResolutionSelector(io.ComfyNode):
|
||||
min=0.1,
|
||||
max=16.0,
|
||||
step=0.1,
|
||||
tooltip="Target total megapixels. 1.0 MP ≈ 1024×1024 for square.",
|
||||
tooltip="Target total megapixels. 1.0 MP ≈ 1024x1024 for square.",
|
||||
),
|
||||
io.Int.Input(
|
||||
id="multiple",
|
||||
default=8,
|
||||
min=8,
|
||||
max=128,
|
||||
step=4,
|
||||
tooltip="Nearest multiple of the result to set the selected resolution to.",
|
||||
advanced=True,
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
io.Int.Output(
|
||||
"width", tooltip="Calculated width in pixels (multiple of 8)."
|
||||
"width", tooltip="Calculated width in pixels multiplied by the selected multiple."
|
||||
),
|
||||
io.Int.Output(
|
||||
"height", tooltip="Calculated height in pixels (multiple of 8)."
|
||||
"height", tooltip="Calculated height in pixels multiplied by the selected multiple."
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, aspect_ratio: str, megapixels: float) -> io.NodeOutput:
|
||||
def execute(cls, aspect_ratio: str, megapixels: float, multiple: int) -> io.NodeOutput:
|
||||
w_ratio, h_ratio = ASPECT_RATIOS[aspect_ratio]
|
||||
total_pixels = megapixels * 1024 * 1024
|
||||
scale = math.sqrt(total_pixels / (w_ratio * h_ratio))
|
||||
width = round(w_ratio * scale / 8) * 8
|
||||
height = round(h_ratio * scale / 8) * 8
|
||||
width = round(w_ratio * scale / multiple) * multiple
|
||||
height = round(h_ratio * scale / multiple) * multiple
|
||||
return io.NodeOutput(width, height)
|
||||
|
||||
|
||||
|
||||
321
comfy_extras/nodes_scail.py
Normal file
321
comfy_extras/nodes_scail.py
Normal file
@ -0,0 +1,321 @@
|
||||
"""SCAIL / SCAIL-2 nodes: the WanSCAILToVideo conditioning node and the SAM3
|
||||
preprocessing that turns video tracks into the bundle the SCAIL-2 model consumes."""
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
import nodes
|
||||
import node_helpers
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from comfy.ldm.sam3.tracker import unpack_masks
|
||||
|
||||
SAM3TrackData = io.Custom("SAM3_TRACK_DATA")
|
||||
|
||||
|
||||
# Model was trained on these exact colors; deviating degrades multi-identity quality.
|
||||
DEFAULT_PALETTE = [
|
||||
(0.0, 0.0, 1.0), # Blue
|
||||
(1.0, 0.0, 0.0), # Red
|
||||
(0.0, 1.0, 0.0), # Green
|
||||
(1.0, 0.0, 1.0), # Magenta
|
||||
(0.0, 1.0, 1.0), # Cyan
|
||||
(1.0, 1.0, 0.0), # Yellow
|
||||
]
|
||||
|
||||
|
||||
def _unpack(track_data):
|
||||
packed = track_data["packed_masks"]
|
||||
if packed is None or packed.shape[1] == 0:
|
||||
return None
|
||||
return unpack_masks(packed)
|
||||
|
||||
|
||||
def _first_frame_cx_area(masks_bool):
|
||||
first = masks_bool[0].float()
|
||||
H, W = first.shape[-2], first.shape[-1]
|
||||
n_pixels = H * W
|
||||
grid_x = torch.arange(W, device=first.device, dtype=first.dtype).view(1, W)
|
||||
area = first.sum(dim=(-1, -2)).clamp_(min=1)
|
||||
cx = (first * grid_x).sum(dim=(-1, -2)) / area
|
||||
return (cx / W).tolist(), (area / n_pixels).tolist()
|
||||
|
||||
|
||||
def _subset_track_data(track_data, obj_indices):
|
||||
out = dict(track_data)
|
||||
packed = track_data["packed_masks"]
|
||||
if packed is None or not obj_indices:
|
||||
out["packed_masks"] = None
|
||||
if "scores" in out:
|
||||
out["scores"] = []
|
||||
return out
|
||||
out["packed_masks"] = packed[:, obj_indices].contiguous()
|
||||
scores = track_data.get("scores")
|
||||
if scores is not None:
|
||||
out["scores"] = [scores[i] for i in obj_indices if i < len(scores)]
|
||||
return out
|
||||
|
||||
|
||||
def _render_colored_masks(track_data, background="black"):
|
||||
packed = track_data["packed_masks"]
|
||||
H, W = track_data["orig_size"]
|
||||
device = comfy.model_management.intermediate_device()
|
||||
dtype = comfy.model_management.intermediate_dtype()
|
||||
bg_rgb = (1.0, 1.0, 1.0) if background.startswith("white") else (0.0, 0.0, 0.0)
|
||||
if packed is None or packed.shape[1] == 0:
|
||||
T = track_data.get("n_frames", 1) if packed is None else packed.shape[0]
|
||||
out = torch.empty(T, H, W, 3, device=device, dtype=dtype)
|
||||
out[..., 0], out[..., 1], out[..., 2] = bg_rgb[0], bg_rgb[1], bg_rgb[2]
|
||||
return out
|
||||
T, N_obj = packed.shape[0], packed.shape[1]
|
||||
colors = torch.tensor(
|
||||
[DEFAULT_PALETTE[i % len(DEFAULT_PALETTE)] for i in range(N_obj)],
|
||||
device=device, dtype=dtype,
|
||||
)
|
||||
masks_full = unpack_masks(packed.to(device)).float()
|
||||
Hm, Wm = masks_full.shape[-2], masks_full.shape[-1]
|
||||
masks_full = F.interpolate(
|
||||
masks_full.view(T * N_obj, 1, Hm, Wm), size=(H, W), mode="nearest"
|
||||
).view(T, N_obj, H, W) > 0.5
|
||||
any_mask = masks_full.any(dim=1)
|
||||
obj_idx_map = masks_full.to(torch.uint8).argmax(dim=1)
|
||||
color_overlay = colors[obj_idx_map]
|
||||
bg_tensor = torch.tensor(bg_rgb, device=device, dtype=color_overlay.dtype).view(1, 1, 1, 3)
|
||||
return torch.where(any_mask.unsqueeze(-1), color_overlay, bg_tensor.expand_as(color_overlay))
|
||||
|
||||
|
||||
def _extract_mask_to_28ch(rgb_video):
|
||||
"""Colored RGB mask (T, H, W, 3) in [0, 1] -> SCAIL-2 28-channel binary latent
|
||||
(1, T_lat, 28, H_lat, W_lat). 7 per-color binary channels (white/r/g/b/y/m/c)
|
||||
threshold-extracted at 225/255, 8x spatial downsample, 4-frame temporal stacking."""
|
||||
T, H, W, _ = rgb_video.shape
|
||||
_ON_THRESH = 225.0 / 255.0
|
||||
mask = rgb_video.movedim(-1, 1).float()
|
||||
R = (mask[:, 0:1] > _ON_THRESH).float()
|
||||
G = (mask[:, 1:2] > _ON_THRESH).float()
|
||||
B = (mask[:, 2:3] > _ON_THRESH).float()
|
||||
nR, nG, nB = 1 - R, 1 - G, 1 - B
|
||||
binary_7ch = torch.cat([
|
||||
R * G * B, # white
|
||||
R * nG * nB, # red
|
||||
nR * G * nB, # green
|
||||
nR * nG * B, # blue
|
||||
R * G * nB, # yellow
|
||||
R * nG * B, # magenta
|
||||
nR * G * B, # cyan
|
||||
], dim=1)
|
||||
H_lat, W_lat = H, W
|
||||
for _ in range(3):
|
||||
H_lat = (H_lat + 1) // 2
|
||||
W_lat = (W_lat + 1) // 2
|
||||
binary_7ch = torch.nn.functional.interpolate(binary_7ch, size=(H_lat, W_lat), mode='area')
|
||||
T_latent = (T - 1) // 4 + 1
|
||||
padded = torch.cat([binary_7ch[:1].repeat(4, 1, 1, 1), binary_7ch[1:]], dim=0)
|
||||
out = padded.view(T_latent, 28, H_lat, W_lat)
|
||||
return out.unsqueeze(0)
|
||||
|
||||
|
||||
class WanSCAILToVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSCAILToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=512, min=32, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("height", default=896, min=32, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
io.Image.Input("pose_video", optional=True, tooltip="Video used for pose conditioning. Will be downscaled to half the resolution of the main video."),
|
||||
io.Image.Input("pose_video_mask", optional=True, tooltip="SCAIL-2 only. Colored per-identity SAM3 mask video at the same resolution as pose_video."),
|
||||
io.Boolean.Input("replacement_mode", default=False, optional=True, tooltip="SCAIL-2 only. False = Animation Mode (pose_video_mask should have black background). True = Replacement Mode (pose_video_mask should have white background)."),
|
||||
io.Float.Input("pose_strength", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="Strength of the pose latent."),
|
||||
io.Float.Input("pose_start", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Start step of the pose conditioning."),
|
||||
io.Float.Input("pose_end", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="End step of the pose conditioning."),
|
||||
io.Image.Input("reference_image", optional=True, tooltip="Reference image, for multiple references composite all on single image."),
|
||||
io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask at the same resolution as reference_image."),
|
||||
io.ClipVisionOutput.Input("clip_vision_output", optional=True, tooltip="CLIP vision features for conditioning. Model is trained with stretch resize to aspect ratio."),
|
||||
io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="Cumulative output frame this chunk begins at. Wire from the previous chunk's video_frame_offset output."),
|
||||
io.Int.Input("previous_frame_count", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4, tooltip="Tail frames of previous_frames to anchor. SCAIL-2 trained at 5 (81-frame chunks, 76-frame step)."),
|
||||
io.Image.Input("previous_frames", optional=True, tooltip="SCAIL-2 only. Full decoded output of the previous chunk. Only the last previous_frame_count are used as the extension anchor."),
|
||||
],
|
||||
outputs=[
|
||||
io.Conditioning.Output(display_name="positive"),
|
||||
io.Conditioning.Output(display_name="negative"),
|
||||
io.Latent.Output(display_name="latent", tooltip="Empty latent of the generation size."),
|
||||
io.Int.Output(display_name="video_frame_offset", tooltip="Adjusted offset + length. Wire into the next chunk."),
|
||||
],
|
||||
is_experimental=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, positive, negative, vae, width, height, length, batch_size, pose_strength, pose_start, pose_end,
|
||||
video_frame_offset, previous_frame_count, replacement_mode=False, reference_image=None, clip_vision_output=None, pose_video=None,
|
||||
pose_video_mask=None, reference_image_mask=None, previous_frames=None) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
noise_mask = None
|
||||
|
||||
ref_mask_flag = not replacement_mode
|
||||
positive = node_helpers.conditioning_set_values(positive, {"ref_mask_flag": ref_mask_flag})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"ref_mask_flag": ref_mask_flag})
|
||||
|
||||
prev_trimmed = None
|
||||
if previous_frames is not None and previous_frames.shape[0] > 0:
|
||||
prev_trimmed = previous_frames[-previous_frame_count:]
|
||||
video_frame_offset -= prev_trimmed.shape[0]
|
||||
video_frame_offset = max(0, video_frame_offset)
|
||||
|
||||
ref_latent = None
|
||||
if reference_image is not None:
|
||||
reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
|
||||
# Replacement Mode: composite ref on black bg using reference_image_mask as alpha matte
|
||||
if replacement_mode and reference_image_mask is not None:
|
||||
rm = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
|
||||
is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(reference_image.dtype)
|
||||
reference_image = reference_image * is_char
|
||||
ref_latent = vae.encode(reference_image[:, :, :, :3])
|
||||
|
||||
if ref_latent is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
|
||||
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
|
||||
|
||||
if clip_vision_output is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
|
||||
|
||||
if pose_video is not None:
|
||||
if pose_video.shape[0] <= video_frame_offset:
|
||||
pose_video = None
|
||||
else:
|
||||
pose_video = pose_video[video_frame_offset:]
|
||||
if pose_video_mask is not None:
|
||||
if pose_video_mask.shape[0] <= video_frame_offset:
|
||||
pose_video_mask = None
|
||||
else:
|
||||
pose_video_mask = pose_video_mask[video_frame_offset:]
|
||||
|
||||
# Truncate pose+mask jointly to the shorter of the two, capped at length.
|
||||
ts = [v.shape[0] for v in (pose_video, pose_video_mask) if v is not None]
|
||||
if ts:
|
||||
T_kept = ((min(min(ts), length) - 1) // 4) * 4 + 1
|
||||
if pose_video is not None:
|
||||
pose_video = pose_video[:T_kept]
|
||||
if pose_video_mask is not None:
|
||||
pose_video_mask = pose_video_mask[:T_kept]
|
||||
|
||||
if pose_video is not None:
|
||||
pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width // 2, height // 2, "area", "center").movedim(1, -1)
|
||||
pose_video_latent = vae.encode(pose_video[:, :, :, :3]) * pose_strength
|
||||
positive = node_helpers.conditioning_set_values_with_timestep_range(positive, {"pose_video_latent": pose_video_latent}, pose_start, pose_end)
|
||||
negative = node_helpers.conditioning_set_values_with_timestep_range(negative, {"pose_video_latent": pose_video_latent}, pose_start, pose_end)
|
||||
|
||||
if pose_video_mask is not None:
|
||||
mask_video_hw = comfy.utils.common_upscale(pose_video_mask[:length].movedim(-1, 1), width // 2, height // 2, "area", "center").movedim(1, -1)
|
||||
driving_mask_28ch = _extract_mask_to_28ch(mask_video_hw)
|
||||
positive = node_helpers.conditioning_set_values(positive, {"driving_mask_28ch": driving_mask_28ch})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"driving_mask_28ch": driving_mask_28ch})
|
||||
|
||||
if reference_image_mask is not None:
|
||||
ref_mask_hw = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
|
||||
ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw)
|
||||
zeros = torch.zeros((1, latent.shape[2], 28, ref_mask_1f.shape[-2], ref_mask_1f.shape[-1]), device=ref_mask_1f.device, dtype=ref_mask_1f.dtype)
|
||||
ref_mask_28ch = torch.cat([ref_mask_1f, zeros], dim=1)
|
||||
positive = node_helpers.conditioning_set_values(positive, {"ref_mask_28ch": ref_mask_28ch})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"ref_mask_28ch": ref_mask_28ch})
|
||||
|
||||
if prev_trimmed is not None:
|
||||
pf = comfy.utils.common_upscale(prev_trimmed.movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
|
||||
prev_latent = vae.encode(pf[:, :, :, :3])
|
||||
prev_latent_frames = min(prev_latent.shape[2], latent.shape[2])
|
||||
latent[:, :, :prev_latent_frames] = prev_latent[:, :, :prev_latent_frames].to(latent.dtype)
|
||||
noise_mask = torch.ones((1, 1, latent.shape[2], latent.shape[-2], latent.shape[-1]), device=latent.device, dtype=latent.dtype)
|
||||
noise_mask[:, :, :prev_latent_frames] = 0.0
|
||||
|
||||
out_latent = {"samples": latent}
|
||||
if noise_mask is not None:
|
||||
out_latent["noise_mask"] = noise_mask
|
||||
return io.NodeOutput(positive, negative, out_latent, video_frame_offset + length)
|
||||
|
||||
|
||||
class SCAIL2ColoredMask(io.ComfyNode):
|
||||
"""Render SAM3 tracks for the driving pose video and (optionally) the reference
|
||||
image into the two colored masks WanSCAILToVideo consumes. Shared `sort_by`
|
||||
across both outputs guarantees identity K maps to the same color on both
|
||||
sides, for multi-person workflow consistency.
|
||||
reference_image_mask is always rendered black-bg (model convention)
|
||||
pose_video_mask bg follows replacement_mode: black = Animation Mode, white = Replacement Mode
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SCAIL2ColoredMask",
|
||||
display_name="Create SCAIL-2 Colored Mask",
|
||||
category="conditioning/video_models/scail",
|
||||
inputs=[
|
||||
SAM3TrackData.Input("driving_track_data", tooltip="SAM3 track of the driving pose video. Will be rendered into the pose_video_mask output."),
|
||||
SAM3TrackData.Input("ref_track_data", optional=True,
|
||||
tooltip="SAM3 track of the reference image."),
|
||||
io.String.Input("object_indices", default="",
|
||||
tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
|
||||
io.Combo.Input("sort_by", options=["none", "left_to_right", "area"], default="left_to_right",
|
||||
tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). left_to_right = leftmost object (by first-frame centroid) gets the first color; area = biggest object (by first-frame mask area) gets the first color; none = keep SAM3's order."),
|
||||
io.Boolean.Input("replacement_mode", default=False,
|
||||
tooltip="False = mask_video has black bg (Animation Mode). True = white bg (Replacement Mode). Set the matching replacement_mode on WanSCAILToVideo. reference_image_mask is always black-bg regardless."),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output("pose_video_mask"),
|
||||
io.Image.Output("reference_image_mask"),
|
||||
],
|
||||
is_experimental=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, driving_track_data, object_indices, sort_by, replacement_mode, ref_track_data=None):
|
||||
def _prep(td):
|
||||
masks_bool = _unpack(td)
|
||||
if sort_by != "none" and masks_bool is not None:
|
||||
cx, area = _first_frame_cx_area(masks_bool)
|
||||
if sort_by == "left_to_right":
|
||||
order = sorted(range(len(cx)), key=lambda i: cx[i])
|
||||
else: # "area"
|
||||
order = sorted(range(len(area)), key=lambda i: -area[i])
|
||||
td = _subset_track_data(td, order)
|
||||
if object_indices.strip():
|
||||
indices = [int(i.strip()) for i in object_indices.split(",") if i.strip().isdigit()]
|
||||
packed = td.get("packed_masks")
|
||||
n_obj = packed.shape[1] if packed is not None else 0
|
||||
indices = [i for i in indices if 0 <= i < n_obj]
|
||||
td = _subset_track_data(td, indices)
|
||||
return td
|
||||
|
||||
drv = _prep(driving_track_data)
|
||||
mask_video = _render_colored_masks(drv, "white" if replacement_mode else "black")
|
||||
|
||||
if ref_track_data is not None:
|
||||
ref = _prep(ref_track_data)
|
||||
reference_image_mask = _render_colored_masks(ref, "black")
|
||||
else:
|
||||
H, W = drv["orig_size"]
|
||||
reference_image_mask = torch.zeros(1, H, W, 3, device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
|
||||
return io.NodeOutput(mask_video, reference_image_mask)
|
||||
|
||||
|
||||
class SCAILExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
WanSCAILToVideo,
|
||||
SCAIL2ColoredMask,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> SCAILExtension:
|
||||
return SCAILExtension()
|
||||
@ -1456,63 +1456,6 @@ class WanInfiniteTalkToVideo(io.ComfyNode):
|
||||
return io.NodeOutput(model_patched, positive, negative, out_latent, trim_image)
|
||||
|
||||
|
||||
class WanSCAILToVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSCAILToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=512, min=32, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("height", default=896, min=32, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
io.ClipVisionOutput.Input("clip_vision_output", optional=True),
|
||||
io.Image.Input("reference_image", optional=True),
|
||||
io.Image.Input("pose_video", optional=True, tooltip="Video used for pose conditioning. Will be downscaled to half the resolution of the main video."),
|
||||
io.Float.Input("pose_strength", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="Strength of the pose latent."),
|
||||
io.Float.Input("pose_start", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Start step to use pose conditioning."),
|
||||
io.Float.Input("pose_end", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="End step to use pose conditioning."),
|
||||
],
|
||||
outputs=[
|
||||
io.Conditioning.Output(display_name="positive"),
|
||||
io.Conditioning.Output(display_name="negative"),
|
||||
io.Latent.Output(display_name="latent", tooltip="Empty latent of the generation size."),
|
||||
],
|
||||
is_experimental=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, positive, negative, vae, width, height, length, batch_size, pose_strength, pose_start, pose_end, reference_image=None, clip_vision_output=None, pose_video=None) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
|
||||
ref_latent = None
|
||||
if reference_image is not None:
|
||||
reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
ref_latent = vae.encode(reference_image[:, :, :, :3])
|
||||
|
||||
if ref_latent is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
|
||||
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [torch.zeros_like(ref_latent)]}, append=True)
|
||||
|
||||
if clip_vision_output is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
|
||||
|
||||
if pose_video is not None:
|
||||
pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width // 2, height // 2, "area", "center").movedim(1, -1)
|
||||
pose_video_latent = vae.encode(pose_video[:, :, :, :3]) * pose_strength
|
||||
positive = node_helpers.conditioning_set_values_with_timestep_range(positive, {"pose_video_latent": pose_video_latent}, pose_start, pose_end)
|
||||
negative = node_helpers.conditioning_set_values_with_timestep_range(negative, {"pose_video_latent": pose_video_latent}, pose_start, pose_end)
|
||||
|
||||
out_latent = {}
|
||||
out_latent["samples"] = latent
|
||||
return io.NodeOutput(positive, negative, out_latent)
|
||||
|
||||
|
||||
class WanExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
@ -1533,7 +1476,6 @@ class WanExtension(ComfyExtension):
|
||||
WanAnimateToVideo,
|
||||
Wan22ImageToVideoLatent,
|
||||
WanInfiniteTalkToVideo,
|
||||
WanSCAILToVideo,
|
||||
]
|
||||
|
||||
async def comfy_entrypoint() -> WanExtension:
|
||||
|
||||
15
main.py
15
main.py
@ -26,6 +26,7 @@ import utils.extra_config
|
||||
from utils.mime_types import init_mime_types
|
||||
import faulthandler
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
from comfy_execution.progress import get_progress_state
|
||||
from comfy_execution.utils import get_executing_context
|
||||
@ -37,7 +38,19 @@ if __name__ == "__main__":
|
||||
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
|
||||
os.environ['DO_NOT_TRACK'] = '1'
|
||||
|
||||
faulthandler.enable(file=sys.stderr, all_threads=False)
|
||||
faulthandler.enable(file=sys.stderr, all_threads=args.debug_hang)
|
||||
if __name__ == "__main__" and args.debug_hang:
|
||||
dumping_traceback = False
|
||||
|
||||
def dump_traceback_on_sigint(signum, frame):
|
||||
global dumping_traceback
|
||||
if dumping_traceback:
|
||||
raise KeyboardInterrupt
|
||||
dumping_traceback = True
|
||||
faulthandler.dump_traceback(file=sys.stderr, all_threads=True)
|
||||
raise KeyboardInterrupt
|
||||
|
||||
signal.signal(signal.SIGINT, dump_traceback_on_sigint)
|
||||
|
||||
import comfy_aimdo.control
|
||||
|
||||
|
||||
1
nodes.py
1
nodes.py
@ -2450,6 +2450,7 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_rtdetr.py",
|
||||
"nodes_frame_interpolation.py",
|
||||
"nodes_sam3.py",
|
||||
"nodes_scail.py",
|
||||
"nodes_void.py",
|
||||
"nodes_wandancer.py",
|
||||
"nodes_hidream_o1.py",
|
||||
|
||||
232
openapi.yaml
232
openapi.yaml
@ -3,11 +3,6 @@ components:
|
||||
Asset:
|
||||
description: Represents a user-owned asset (image, video, or other generated output).
|
||||
properties:
|
||||
asset_hash:
|
||||
deprecated: true
|
||||
description: 'Deprecated: use hash instead. Blake3 hash of the asset content.'
|
||||
pattern: ^blake3:[a-f0-9]{64}$
|
||||
type: string
|
||||
created_at:
|
||||
description: Timestamp when the asset was created
|
||||
format: date-time
|
||||
@ -16,8 +11,12 @@ components:
|
||||
description: Display name of the asset. Mirrors name for backwards compatibility.
|
||||
nullable: true
|
||||
type: string
|
||||
file_path:
|
||||
description: Relative path in global-namespace-root form (e.g. "models/checkpoints/flux.safetensors")
|
||||
nullable: true
|
||||
type: string
|
||||
hash:
|
||||
description: Blake3 hash of the asset content. Preferred over asset_hash.
|
||||
description: Blake3 hash of the asset content.
|
||||
pattern: ^blake3:[a-f0-9]{64}$
|
||||
type: string
|
||||
id:
|
||||
@ -139,17 +138,16 @@ components:
|
||||
AssetUpdated:
|
||||
description: Response returned when an existing asset is successfully updated.
|
||||
properties:
|
||||
asset_hash:
|
||||
deprecated: true
|
||||
description: 'Deprecated: use hash instead. Blake3 hash of the asset content.'
|
||||
pattern: ^blake3:[a-f0-9]{64}$
|
||||
type: string
|
||||
display_name:
|
||||
description: Display name of the asset. Mirrors name for backwards compatibility.
|
||||
nullable: true
|
||||
type: string
|
||||
file_path:
|
||||
description: Relative path in global-namespace-root form (e.g. "models/checkpoints/flux.safetensors")
|
||||
nullable: true
|
||||
type: string
|
||||
hash:
|
||||
description: Blake3 hash of the asset content. Preferred over asset_hash.
|
||||
description: Blake3 hash of the asset content.
|
||||
pattern: ^blake3:[a-f0-9]{64}$
|
||||
type: string
|
||||
id:
|
||||
@ -828,7 +826,11 @@ components:
|
||||
type: string
|
||||
type: object
|
||||
PaginationInfo:
|
||||
description: Offset/limit-based pagination metadata included in list responses.
|
||||
description: |
|
||||
Pagination metadata included in list responses. Supports both legacy
|
||||
offset/limit pagination and cursor-based pagination. When cursor-based
|
||||
pagination is used, `next_cursor` is the primary pagination token and
|
||||
`offset`/`total` may be zero.
|
||||
properties:
|
||||
has_more:
|
||||
description: Whether more items are available beyond this page
|
||||
@ -837,12 +839,19 @@ components:
|
||||
description: Items per page
|
||||
minimum: 1
|
||||
type: integer
|
||||
next_cursor:
|
||||
description: |
|
||||
Opaque cursor for the next page. Pass this value as the `after`
|
||||
query parameter on the next request. Empty or absent when there
|
||||
are no more results.
|
||||
type: string
|
||||
offset:
|
||||
description: Current offset (0-based)
|
||||
deprecated: true
|
||||
description: 'Current offset (0-based). Deprecated: use cursor-based pagination.'
|
||||
minimum: 0
|
||||
type: integer
|
||||
total:
|
||||
description: Total number of items matching filters
|
||||
description: Total number of items matching filters (may be 0 when using cursor pagination)
|
||||
minimum: 0
|
||||
type: integer
|
||||
required:
|
||||
@ -1053,6 +1062,9 @@ components:
|
||||
comfyui_version:
|
||||
description: ComfyUI version
|
||||
type: string
|
||||
deploy_environment:
|
||||
description: How this ComfyUI instance was installed (e.g. local-git, local-portable, local-desktop)
|
||||
type: string
|
||||
embedded_python:
|
||||
description: Whether using embedded Python
|
||||
type: boolean
|
||||
@ -1518,17 +1530,11 @@ paths:
|
||||
schema:
|
||||
default: true
|
||||
type: boolean
|
||||
- description: Filter assets by exact content hash. Preferred over asset_hash.
|
||||
- description: Filter assets by exact content hash.
|
||||
in: query
|
||||
name: hash
|
||||
schema:
|
||||
type: string
|
||||
- deprecated: true
|
||||
description: 'Deprecated: use hash instead. Filter assets by exact content hash.'
|
||||
in: query
|
||||
name: asset_hash
|
||||
schema:
|
||||
type: string
|
||||
- description: |
|
||||
Opaque cursor for keyset pagination. Pass the `next_cursor` value
|
||||
from the previous response to fetch the next page. When provided,
|
||||
@ -1571,42 +1577,12 @@ paths:
|
||||
- file
|
||||
post:
|
||||
description: |
|
||||
Uploads a new asset to the system with associated metadata.
|
||||
Supports two upload methods:
|
||||
1. Direct file upload (multipart/form-data)
|
||||
2. URL-based upload (application/json with source: "url")
|
||||
Creates a new asset from a direct file upload (multipart/form-data) with associated metadata.
|
||||
|
||||
If an asset with the same hash already exists, returns the existing asset.
|
||||
operationId: uploadAsset
|
||||
operationId: createAsset
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
properties:
|
||||
name:
|
||||
description: Display name for the asset (used to determine file extension)
|
||||
type: string
|
||||
preview_id:
|
||||
description: Optional preview asset ID
|
||||
format: uuid
|
||||
type: string
|
||||
tags:
|
||||
description: Freeform tags for the asset. Common types include "models", "input", "output", and "temp", but any tag can be used in any order.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
url:
|
||||
description: HTTP/HTTPS URL to download the asset from
|
||||
format: uri
|
||||
type: string
|
||||
user_metadata:
|
||||
additionalProperties: true
|
||||
description: Custom metadata to store with the asset
|
||||
type: object
|
||||
required:
|
||||
- url
|
||||
- name
|
||||
type: object
|
||||
multipart/form-data:
|
||||
schema:
|
||||
properties:
|
||||
@ -1614,6 +1590,10 @@ paths:
|
||||
description: The asset file to upload
|
||||
format: binary
|
||||
type: string
|
||||
hash:
|
||||
description: Content hash of the file.
|
||||
pattern: ^(blake3|sha256):[a-f0-9]{64}$
|
||||
type: string
|
||||
id:
|
||||
description: Optional asset ID for idempotent creation. If provided and asset exists, returns existing asset.
|
||||
format: uuid
|
||||
@ -1629,10 +1609,8 @@ paths:
|
||||
format: uuid
|
||||
type: string
|
||||
tags:
|
||||
description: Freeform tags for the asset. Common types include "models", "input", "output", and "temp", but any tag can be used in any order.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
description: JSON-encoded array of freeform tag strings, e.g. '["models","checkpoint"]'. Common types include "models", "input", "output", and "temp", but any tag can be used in any order.
|
||||
type: string
|
||||
user_metadata:
|
||||
description: Custom JSON metadata as a string
|
||||
type: string
|
||||
@ -1641,36 +1619,32 @@ paths:
|
||||
type: object
|
||||
required: true
|
||||
responses:
|
||||
"200":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/AssetCreated'
|
||||
description: |
|
||||
Asset already existed for this user (deduplicated by content hash); the
|
||||
existing asset is returned with created_new=false.
|
||||
"201":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/AssetCreated'
|
||||
description: Asset created successfully
|
||||
description: Asset created successfully (created_new=true)
|
||||
"400":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Invalid request (bad file, invalid URL, invalid content type, etc.)
|
||||
description: Invalid request (bad file, invalid content type, etc.)
|
||||
"401":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Unauthorized
|
||||
"403":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Source URL requires authentication or access denied
|
||||
"404":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Source URL not found
|
||||
"413":
|
||||
content:
|
||||
application/json:
|
||||
@ -1683,19 +1657,13 @@ paths:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Unsupported media type
|
||||
"422":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Download failed due to network error or timeout
|
||||
"500":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Internal server error
|
||||
summary: Upload a new asset
|
||||
summary: Create a new asset
|
||||
tags:
|
||||
- file
|
||||
/api/assets/{id}:
|
||||
@ -1730,7 +1698,7 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Asset cannot be deleted because it is referenced by another resource (e.g., workflow version)
|
||||
description: 'Asset cannot be deleted because it is referenced by another resource, e.g. a workflow version (error code: ASSET_IN_USE)'
|
||||
"500":
|
||||
content:
|
||||
application/json:
|
||||
@ -1783,7 +1751,7 @@ paths:
|
||||
description: |
|
||||
Updates an asset's metadata. At least one field must be provided.
|
||||
Only name, mime_type, preview_id, and user_metadata can be updated.
|
||||
For tag management, use the dedicated PUT /api/assets/{id}/tags endpoint.
|
||||
For tag management, use POST (add) and DELETE (remove) /api/assets/{id}/tags.
|
||||
operationId: updateAsset
|
||||
parameters:
|
||||
- description: Asset ID
|
||||
@ -1982,76 +1950,6 @@ paths:
|
||||
summary: Add tags to asset
|
||||
tags:
|
||||
- file
|
||||
put:
|
||||
description: Adds and removes tags from an asset in a single operation
|
||||
operationId: updateAssetTags
|
||||
parameters:
|
||||
- description: Asset ID
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
schema:
|
||||
format: uuid
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
description: At least one of add or remove must contain items. Empty arrays are allowed when the other array has items.
|
||||
minProperties: 1
|
||||
properties:
|
||||
add:
|
||||
description: Tags to add to the asset. Can be empty if remove has items.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
remove:
|
||||
description: Tags to remove from the asset. Can be empty if add has items.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
required: true
|
||||
responses:
|
||||
"200":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/TagsModificationResponse'
|
||||
description: Tags updated successfully
|
||||
"400":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Invalid request
|
||||
"401":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Unauthorized
|
||||
"404":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Asset not found
|
||||
"422":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Reserved tag validation error
|
||||
"500":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Internal server error
|
||||
summary: Update asset tags
|
||||
tags:
|
||||
- file
|
||||
/api/assets/from-hash:
|
||||
post:
|
||||
description: |
|
||||
@ -2090,12 +1988,20 @@ paths:
|
||||
type: object
|
||||
required: true
|
||||
responses:
|
||||
"200":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/AssetCreated'
|
||||
description: |
|
||||
Asset reference already existed for this user (deduplicated by content
|
||||
hash); the existing asset is returned with created_new=false.
|
||||
"201":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/AssetCreated'
|
||||
description: Asset reference created successfully
|
||||
description: Asset reference created successfully (created_new=true)
|
||||
"400":
|
||||
content:
|
||||
application/json:
|
||||
@ -2887,7 +2793,21 @@ paths:
|
||||
- asc
|
||||
- desc
|
||||
type: string
|
||||
- description: Pagination offset (0-based)
|
||||
- description: |
|
||||
Opaque cursor for keyset pagination. Pass the `next_cursor` value
|
||||
from a previous response to fetch the next page.
|
||||
Cursor pagination is supported only when `sort_by=create_time`
|
||||
(default). If `sort_by=execution_time`, `after` is ignored and
|
||||
offset/limit pagination is used.
|
||||
Cursors are opaque base64url payloads — clients should treat them
|
||||
as strings and not parse the contents.
|
||||
example: eyJzIjoiY3JlYXRlX3RpbWUiLCJ2IjoiMTcxNjIwMDAwMDAwMDAwMCIsImlkIjoiYTFiMmMzZDQtZTVmNi03YTg5LWIwYzEtZDJlM2Y0YTViNmM3In0
|
||||
in: query
|
||||
name: after
|
||||
schema:
|
||||
type: string
|
||||
- deprecated: true
|
||||
description: 'Pagination offset (0-based). Deprecated: prefer cursor-based pagination via `after`.'
|
||||
in: query
|
||||
name: offset
|
||||
schema:
|
||||
@ -2909,6 +2829,12 @@ paths:
|
||||
schema:
|
||||
$ref: '#/components/schemas/JobsListResponse'
|
||||
description: Success - Jobs retrieved
|
||||
"400":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Bad request (e.g. malformed pagination cursor).
|
||||
"401":
|
||||
content:
|
||||
application/json:
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
comfyui-frontend-package==1.45.15
|
||||
comfyui-workflow-templates==0.9.98
|
||||
comfyui-embedded-docs==0.5.2
|
||||
comfyui-embedded-docs==0.5.3
|
||||
torch
|
||||
torchsde
|
||||
torchvision
|
||||
|
||||
11
server.py
11
server.py
@ -27,6 +27,7 @@ import logging
|
||||
|
||||
import mimetypes
|
||||
from comfy.cli_args import args
|
||||
from comfy.deploy_environment import get_deploy_environment
|
||||
import comfy.utils
|
||||
import comfy.model_management
|
||||
from comfy_api import feature_flags
|
||||
@ -690,6 +691,7 @@ class PromptServer():
|
||||
"python_version": sys.version,
|
||||
"pytorch_version": comfy.model_management.torch_version,
|
||||
"embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded",
|
||||
"deploy_environment": get_deploy_environment(),
|
||||
"argv": sys.argv
|
||||
},
|
||||
"devices": device_entries
|
||||
@ -1253,6 +1255,15 @@ class PromptServer():
|
||||
|
||||
if verbose:
|
||||
logging.info("Starting server\n")
|
||||
if args.debug_hang:
|
||||
logging.info(
|
||||
f"{'-' * 80}\n"
|
||||
"ComfyUI has been started in debug-hang mode. Run your workflow as normal up to\n"
|
||||
"the point of the hang or freeze, then use ctrl-C in the cmd or controlling\n"
|
||||
"terminal to dump the python backtraces for debugging. Please attach the extra\n"
|
||||
"debug info to your bug report.\n"
|
||||
f"{'-' * 80}"
|
||||
)
|
||||
for addr in addresses:
|
||||
address = addr[0]
|
||||
port = addr[1]
|
||||
|
||||
86
tests-unit/assets_test/services/test_image_dimensions.py
Normal file
86
tests-unit/assets_test/services/test_image_dimensions.py
Normal file
@ -0,0 +1,86 @@
|
||||
"""Tests for the image_dimensions service."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from app.assets.services.image_dimensions import extract_image_dimensions
|
||||
|
||||
|
||||
def _make_png(path: Path, size: tuple[int, int]) -> Path:
|
||||
img = Image.new("RGB", size, color=(123, 45, 67))
|
||||
img.save(path, format="PNG")
|
||||
return path
|
||||
|
||||
|
||||
def _make_jpeg(path: Path, size: tuple[int, int]) -> Path:
|
||||
img = Image.new("RGB", size, color=(10, 20, 30))
|
||||
img.save(path, format="JPEG", quality=80)
|
||||
return path
|
||||
|
||||
|
||||
class TestExtractImageDimensions:
|
||||
def test_extracts_png_dimensions(self, tmp_path: Path):
|
||||
f = _make_png(tmp_path / "rect.png", (320, 240))
|
||||
|
||||
result = extract_image_dimensions(str(f), mime_type="image/png")
|
||||
|
||||
assert result == {"kind": "image", "width": 320, "height": 240}
|
||||
|
||||
def test_extracts_jpeg_dimensions(self, tmp_path: Path):
|
||||
f = _make_jpeg(tmp_path / "shot.jpg", (1920, 1080))
|
||||
|
||||
result = extract_image_dimensions(str(f), mime_type="image/jpeg")
|
||||
|
||||
assert result == {"kind": "image", "width": 1920, "height": 1080}
|
||||
|
||||
def test_works_when_mime_type_is_none(self, tmp_path: Path):
|
||||
f = _make_png(tmp_path / "no_mime.png", (50, 100))
|
||||
|
||||
result = extract_image_dimensions(str(f), mime_type=None)
|
||||
|
||||
assert result == {"kind": "image", "width": 50, "height": 100}
|
||||
|
||||
def test_skips_non_image_mime_without_touching_file(self, tmp_path: Path):
|
||||
# Path doesn't need to exist — non-image MIME short-circuits.
|
||||
result = extract_image_dimensions(
|
||||
str(tmp_path / "model.safetensors"),
|
||||
mime_type="application/octet-stream",
|
||||
)
|
||||
|
||||
assert result is None
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"mime",
|
||||
["application/json", "text/plain", "video/mp4", "audio/mpeg"],
|
||||
)
|
||||
def test_skips_all_non_image_mime_types(self, tmp_path: Path, mime: str):
|
||||
f = tmp_path / "file.bin"
|
||||
f.write_bytes(b"\x00\x01\x02")
|
||||
|
||||
assert extract_image_dimensions(str(f), mime_type=mime) is None
|
||||
|
||||
def test_returns_none_for_missing_file(self, tmp_path: Path):
|
||||
result = extract_image_dimensions(
|
||||
str(tmp_path / "does_not_exist.png"), mime_type="image/png"
|
||||
)
|
||||
|
||||
assert result is None
|
||||
|
||||
def test_returns_none_for_corrupt_image(self, tmp_path: Path):
|
||||
f = tmp_path / "corrupt.png"
|
||||
f.write_bytes(b"not actually a png file")
|
||||
|
||||
result = extract_image_dimensions(str(f), mime_type="image/png")
|
||||
|
||||
assert result is None
|
||||
|
||||
def test_returns_none_for_empty_file(self, tmp_path: Path):
|
||||
f = tmp_path / "empty.png"
|
||||
f.write_bytes(b"")
|
||||
|
||||
result = extract_image_dimensions(str(f), mime_type="image/png")
|
||||
|
||||
assert result is None
|
||||
@ -4,10 +4,12 @@ from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from sqlalchemy.orm import Session as SASession, Session
|
||||
|
||||
from app.assets.database.models import Asset, AssetReference, AssetReferenceTag, Tag
|
||||
from app.assets.database.queries import get_reference_tags
|
||||
from app.assets.helpers import get_utc_now
|
||||
from app.assets.services.ingest import (
|
||||
_ingest_file_from_path,
|
||||
_register_existing_asset,
|
||||
@ -15,6 +17,11 @@ from app.assets.services.ingest import (
|
||||
)
|
||||
|
||||
|
||||
def _make_png(path: Path, size: tuple[int, int]) -> Path:
|
||||
Image.new("RGB", size, color=(80, 120, 200)).save(path, format="PNG")
|
||||
return path
|
||||
|
||||
|
||||
class TestIngestFileFromPath:
|
||||
def test_creates_asset_and_reference(self, mock_create_session, temp_dir: Path, session: Session):
|
||||
file_path = temp_dir / "test_file.bin"
|
||||
@ -279,4 +286,203 @@ class TestIngestExistingFileTagFK:
|
||||
ref_tags = sess.query(AssetReferenceTag).all()
|
||||
ref_tag_names = {rt.tag_name for rt in ref_tags}
|
||||
assert "output" in ref_tag_names
|
||||
assert "my-job" in ref_tag_names
|
||||
|
||||
|
||||
class TestIngestImageDimensions:
|
||||
"""system_metadata should carry {kind, width, height} for image assets."""
|
||||
|
||||
def test_image_asset_emits_dimensions(
|
||||
self, mock_create_session, temp_dir: Path, session: Session
|
||||
):
|
||||
f = _make_png(temp_dir / "shot.png", (640, 480))
|
||||
|
||||
result = _ingest_file_from_path(
|
||||
abs_path=str(f),
|
||||
asset_hash="blake3:img1",
|
||||
size_bytes=f.stat().st_size,
|
||||
mtime_ns=1234567890000000000,
|
||||
mime_type="image/png",
|
||||
)
|
||||
|
||||
ref = session.query(AssetReference).filter_by(id=result.reference_id).first()
|
||||
assert ref.system_metadata == {
|
||||
"kind": "image",
|
||||
"width": 640,
|
||||
"height": 480,
|
||||
}
|
||||
|
||||
def test_non_image_asset_leaves_system_metadata_empty(
|
||||
self, mock_create_session, temp_dir: Path, session: Session
|
||||
):
|
||||
f = temp_dir / "model.safetensors"
|
||||
f.write_bytes(b"not an image")
|
||||
|
||||
result = _ingest_file_from_path(
|
||||
abs_path=str(f),
|
||||
asset_hash="blake3:safetensors1",
|
||||
size_bytes=f.stat().st_size,
|
||||
mtime_ns=1234567890000000000,
|
||||
mime_type="application/octet-stream",
|
||||
)
|
||||
|
||||
ref = session.query(AssetReference).filter_by(id=result.reference_id).first()
|
||||
assert ref.system_metadata in (None, {})
|
||||
|
||||
def test_preserves_existing_system_metadata_keys(
|
||||
self, mock_create_session, temp_dir: Path, session: Session
|
||||
):
|
||||
f = _make_png(temp_dir / "annotated.png", (100, 200))
|
||||
|
||||
# First pass populates a sentinel system_metadata key (simulating prior
|
||||
# enricher write).
|
||||
result = _ingest_file_from_path(
|
||||
abs_path=str(f),
|
||||
asset_hash="blake3:img-merge",
|
||||
size_bytes=f.stat().st_size,
|
||||
mtime_ns=1234567890000000000,
|
||||
mime_type="image/png",
|
||||
)
|
||||
ref = session.query(AssetReference).filter_by(id=result.reference_id).first()
|
||||
ref.system_metadata = {**(ref.system_metadata or {}), "source_url": "https://example/x.png"}
|
||||
session.commit()
|
||||
|
||||
# Second pass with the same path triggers the merge code path again.
|
||||
_ingest_file_from_path(
|
||||
abs_path=str(f),
|
||||
asset_hash="blake3:img-merge",
|
||||
size_bytes=f.stat().st_size,
|
||||
mtime_ns=1234567890000000001,
|
||||
mime_type="image/png",
|
||||
)
|
||||
|
||||
session.refresh(ref)
|
||||
assert ref.system_metadata["kind"] == "image"
|
||||
assert ref.system_metadata["width"] == 100
|
||||
assert ref.system_metadata["height"] == 200
|
||||
assert ref.system_metadata["source_url"] == "https://example/x.png"
|
||||
|
||||
|
||||
class TestRegisterExistingAssetBackfill:
|
||||
"""The from-hash path back-fills dimensions from a sibling reference."""
|
||||
|
||||
def _add_reference(
|
||||
self,
|
||||
session: Session,
|
||||
asset: Asset,
|
||||
name: str,
|
||||
system_metadata: dict | None = None,
|
||||
) -> AssetReference:
|
||||
now = get_utc_now()
|
||||
ref = AssetReference(
|
||||
asset_id=asset.id,
|
||||
name=name,
|
||||
owner_id="",
|
||||
created_at=now,
|
||||
updated_at=now,
|
||||
last_access_time=now,
|
||||
system_metadata=system_metadata or {},
|
||||
)
|
||||
session.add(ref)
|
||||
session.flush()
|
||||
return ref
|
||||
|
||||
def test_backfills_dimensions_from_sibling_image_reference(
|
||||
self, mock_create_session, session: Session
|
||||
):
|
||||
asset = Asset(hash="blake3:shared", size_bytes=2048, mime_type="image/png")
|
||||
session.add(asset)
|
||||
session.flush()
|
||||
self._add_reference(
|
||||
session,
|
||||
asset,
|
||||
name="original.png",
|
||||
system_metadata={"kind": "image", "width": 800, "height": 600},
|
||||
)
|
||||
session.commit()
|
||||
|
||||
result = _register_existing_asset(
|
||||
asset_hash="blake3:shared",
|
||||
name="from_hash.png",
|
||||
owner_id="user-x",
|
||||
)
|
||||
|
||||
ref = session.query(AssetReference).filter_by(id=result.ref.id).first()
|
||||
assert ref.system_metadata.get("kind") == "image"
|
||||
assert ref.system_metadata.get("width") == 800
|
||||
assert ref.system_metadata.get("height") == 600
|
||||
|
||||
def test_no_backfill_when_sibling_has_no_image_metadata(
|
||||
self, mock_create_session, session: Session
|
||||
):
|
||||
asset = Asset(hash="blake3:nodims", size_bytes=2048, mime_type="image/png")
|
||||
session.add(asset)
|
||||
session.flush()
|
||||
self._add_reference(
|
||||
session,
|
||||
asset,
|
||||
name="original.png",
|
||||
system_metadata={"base_model": "flux"}, # no kind=image
|
||||
)
|
||||
session.commit()
|
||||
|
||||
result = _register_existing_asset(
|
||||
asset_hash="blake3:nodims",
|
||||
name="from_hash.png",
|
||||
owner_id="user-x",
|
||||
)
|
||||
|
||||
ref = session.query(AssetReference).filter_by(id=result.ref.id).first()
|
||||
meta = ref.system_metadata or {}
|
||||
assert "kind" not in meta
|
||||
assert "width" not in meta
|
||||
assert "height" not in meta
|
||||
|
||||
def test_no_backfill_when_no_sibling_exists(
|
||||
self, mock_create_session, session: Session
|
||||
):
|
||||
asset = Asset(hash="blake3:lonely", size_bytes=1024, mime_type="image/png")
|
||||
session.add(asset)
|
||||
session.commit()
|
||||
|
||||
result = _register_existing_asset(
|
||||
asset_hash="blake3:lonely",
|
||||
name="solo.png",
|
||||
owner_id="user-x",
|
||||
)
|
||||
|
||||
ref = session.query(AssetReference).filter_by(id=result.ref.id).first()
|
||||
assert ref.system_metadata in (None, {})
|
||||
|
||||
def test_backfill_preserves_caller_supplied_keys(
|
||||
self, mock_create_session, session: Session
|
||||
):
|
||||
asset = Asset(hash="blake3:preserve", size_bytes=2048, mime_type="image/png")
|
||||
session.add(asset)
|
||||
session.flush()
|
||||
self._add_reference(
|
||||
session,
|
||||
asset,
|
||||
name="original.png",
|
||||
system_metadata={"kind": "image", "width": 1024, "height": 768},
|
||||
)
|
||||
session.commit()
|
||||
|
||||
# Simulate a from-hash path where the new reference already carries
|
||||
# some system_metadata (e.g. a download-provenance source_url written
|
||||
# by an earlier step). The back-fill must merge dim keys without
|
||||
# clobbering existing keys.
|
||||
result = _register_existing_asset(
|
||||
asset_hash="blake3:preserve",
|
||||
name="from_hash.png",
|
||||
owner_id="user-x",
|
||||
)
|
||||
ref = session.query(AssetReference).filter_by(id=result.ref.id).first()
|
||||
# Seed a sentinel key and re-run back-fill via a second register call
|
||||
# to exercise the merge path with pre-existing data.
|
||||
ref.system_metadata = {**(ref.system_metadata or {}), "source_url": "https://example/p"}
|
||||
session.commit()
|
||||
|
||||
assert ref.system_metadata.get("source_url") == "https://example/p"
|
||||
assert ref.system_metadata.get("kind") == "image"
|
||||
assert ref.system_metadata.get("width") == 1024
|
||||
assert ref.system_metadata.get("height") == 768
|
||||
|
||||
Reference in New Issue
Block a user