Compare commits

..

1 Commits

Author SHA1 Message Date
77d7b6126e Reformat models variable into multiline array CORE-59 2026-05-01 11:03:02 +02:00
259 changed files with 2669 additions and 42659 deletions

View File

@ -1,2 +1,2 @@
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --enable-dynamic-vram
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
pause

View File

@ -1,31 +0,0 @@
name: OpenAPI Lint
on:
pull_request:
paths:
- 'openapi.yaml'
- '.spectral.yaml'
- '.github/workflows/openapi-lint.yml'
permissions:
contents: read
jobs:
spectral:
name: Run Spectral
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install Spectral
run: npm install -g @stoplight/spectral-cli@6
- name: Lint openapi.yaml
run: spectral lint openapi.yaml --ruleset .spectral.yaml --fail-severity=error

View File

@ -145,8 +145,6 @@ jobs:
cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./
cp ../update_comfyui_and_python_dependencies.bat ./update/
echo 'local-portable' > ComfyUI/.comfy_environment
cd ..
"C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable

1
.gitignore vendored
View File

@ -23,4 +23,3 @@ web_custom_versions/
.DS_Store
filtered-openapi.yaml
uv.lock
.comfy_environment

View File

@ -1,100 +0,0 @@
extends:
- spectral:oas
# Severity levels: error, warn, info, hint, off
# Rules from the built-in "spectral:oas" ruleset are active by default.
# Below we tune severity and add custom rules for our conventions.
#
# This ruleset mirrors Comfy-Org/cloud/.spectral.yaml so specs across the
# organization are linted against a single consistent standard.
rules:
# -----------------------------------------------------------------------
# Built-in rule severity overrides
# -----------------------------------------------------------------------
operation-operationId: error
operation-description: warn
operation-tag-defined: error
info-contact: off
info-description: warn
no-eval-in-markdown: error
no-$ref-siblings: error
# -----------------------------------------------------------------------
# Custom rules: naming conventions
# -----------------------------------------------------------------------
# Property names should be snake_case
property-name-snake-case:
description: Property names must be snake_case
severity: warn
given: "$.components.schemas.*.properties[*]~"
then:
function: pattern
functionOptions:
match: "^[a-z][a-z0-9]*(_[a-z0-9]+)*$"
# Operation IDs should be camelCase
operation-id-camel-case:
description: Operation IDs must be camelCase
severity: warn
given: "$.paths.*.*.operationId"
then:
function: pattern
functionOptions:
match: "^[a-z][a-zA-Z0-9]*$"
# -----------------------------------------------------------------------
# Custom rules: response conventions
# -----------------------------------------------------------------------
# Error responses (4xx, 5xx) should use a consistent shape
error-response-schema:
description: Error responses should reference a standard error schema
severity: hint
given: "$.paths.*.*.responses[?(@property >= '400' && @property < '600')].content['application/json'].schema"
then:
field: "$ref"
function: truthy
# All 2xx responses with JSON body should have a schema
response-schema-defined:
description: Success responses with JSON content should define a schema
severity: warn
given: "$.paths.*.*.responses[?(@property >= '200' && @property < '300')].content['application/json']"
then:
field: schema
function: truthy
# -----------------------------------------------------------------------
# Custom rules: best practices
# -----------------------------------------------------------------------
# Path parameters must have a description
path-param-description:
description: Path parameters should have a description
severity: warn
given:
- "$.paths.*.parameters[?(@.in == 'path')]"
- "$.paths.*.*.parameters[?(@.in == 'path')]"
then:
field: description
function: truthy
# Schemas should have a description
schema-description:
description: Component schemas should have a description
severity: hint
given: "$.components.schemas.*"
then:
field: description
function: truthy
overrides:
# /ws uses HTTP 101 (Switching Protocols) — a legitimate response for a
# WebSocket upgrade, but not a 2xx, so operation-success-response fires
# as a false positive. OpenAPI 3.x has no native WebSocket support.
- files:
- "openapi.yaml#/paths/~1ws"
rules:
operation-success-response: off

View File

@ -1,5 +1,2 @@
* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai
/CODEOWNERS @comfyanonymous
/.ci/ @comfyanonymous
/.github/ @comfyanonymous
# Admins
* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128

View File

@ -1,7 +1,7 @@
<div align="center">
# ComfyUI
**The most powerful and modular AI engine for content creation.**
**The most powerful and modular visual AI engine and application.**
[![Website][website-shield]][website-url]
@ -31,16 +31,10 @@
[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/36e065e0-bfae-4456-8c7f-8369d5ea48a2" />
<br>
![ComfyUI Screenshot](https://github.com/user-attachments/assets/7ccaf2c1-9b72-41ae-9a89-5688c94b7abe)
</div>
ComfyUI is the AI creation engine for visual professionals who demand control over every model, every parameter, and every output. Its powerful and modular node graph interface empowers creatives to generate images, videos, 3D models, audio, and more...
- ComfyUI natively supports the latest open-source state of the art models.
- API nodes provide access to the best closed source models such as Nano Banana, Seedance, Hunyuan3D, etc.
- It is available on Windows, Linux, and macOS, locally with our [desktop application](https://www.comfy.org/download), our [portable install](#installing) or on our [cloud](https://www.comfy.org/cloud).
- The most sophisticated workflows can be exposed through a simple UI thanks to App Mode.
- It integrates seamlessly into production pipelines with our API endpoints.
ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.
## Get Started
@ -83,7 +77,6 @@ See what ComfyUI can do with the [newer template workflows](https://comfy.org/wo
- [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
- [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
- [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
- Ernie Image
- Image Editing Models
- [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
- [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@ -133,7 +126,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
- Releases a new major stable version (e.g., v0.7.0) roughly every 2 weeks.
- Releases a new stable version (e.g., v0.7.0) roughly every week.
- Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
- Minor versions will be used for releases off the master branch.
- Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
@ -200,15 +193,13 @@ If you have trouble extracting it, right click the file -> properties -> unblock
The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.
#### All Official Portable Downloads:
#### Alternative Downloads:
[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
[Portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
[Portable for Nvidia GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) (supports 20 series and above).
[Portable for Nvidia GPUs with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
#### How do I share models between another UI and ComfyUI?
@ -429,8 +420,6 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
See also: [https://www.comfy.org/](https://www.comfy.org/)
> _psst — we're hiring!_ Help build ComfyUI: [comfy.org/careers](https://www.comfy.org/careers)
## Frontend Development
As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.

View File

@ -1,44 +0,0 @@
# Security Policy
## Scope
ComfyUI is designed to run locally. By default, the server binds to `127.0.0.1`, meaning only the user's own machine can reach it. Our threat model assumes:
- The user installed ComfyUI through a supported channel: the desktop application, the portable build, or a manual install following the README.
- The user has not installed untrusted custom nodes. Custom nodes are arbitrary Python code and are trusted as much as any other software the user chooses to install.
- Anyone with access to the ComfyUI URL is trusted (a direct consequence of the localhost-only default).
- PyTorch and other dependencies are at the versions we ship or recommend in the README.
A report is in scope only if it affects a user operating within this threat model.
## What We Consider a Vulnerability
We want to hear about issues where a **reasonable user** — someone who does not install random untrusted nodes and who reads UI prompts and warnings before clicking through them — can be harmed by ComfyUI itself.
The clearest example: a workflow file that such a user might plausibly load and run, using only built-in nodes, that results in **untrusted code execution, arbitrary file read/write outside expected directories, or credential/data exfiltration**.
When submitting a report, please include a clear description of *why this is a problem for a typical local ComfyUI user*. Reports without this context are difficult to act on.
## What We Do Not Consider a Security Vulnerability
Please report the following through our regular [GitHub issues](https://github.com/comfyanonymous/ComfyUI/issues) instead. Filing them as security reports will likely cause them to be deprioritized or closed.
- **Issues requiring `--listen` or any non-default network exposure.** ComfyUI binds to localhost by default. If a remote attacker needs to reach the server for the attack to work, the user has chosen to expose it and is responsible for securing that deployment (firewall, reverse proxy, authentication, etc.). These are bugs, not vulnerabilities.
- **`torch.load` and related deserialization issues in old PyTorch versions.** These are upstream PyTorch issues. Our distributions ship with — and our documentation recommends — recent PyTorch versions where these are addressed.
- **Vulnerabilities that depend on outdated library versions** that we neither ship nor recommend (e.g., requiring PyTorch 2.6 or older).
- **Issues that require a specific custom node to be installed.** Custom nodes are third-party code. Report these to the maintainer of that node.
- **Crashes, hangs, or resource exhaustion from a loaded workflow.** Annoying, but not a security issue in our model. File a regular bug.
- **Social-engineering scenarios** where the user is expected to ignore an explicit UI warning or prompt.
## Reporting
If you believe you have found an issue that falls within the scope above, please report it privately via GitHub's [Report a vulnerability](https://github.com/comfyanonymous/ComfyUI/security/advisories/new) feature rather than opening a public issue.
Please include:
1. A description of the vulnerability and the affected component.
2. Reproduction steps, ideally with a minimal workflow file or proof-of-concept.
3. The ComfyUI version, install method (desktop / portable / manual), and OS.
4. An explanation of how this affects a typical local user as described in the threat model.
We will acknowledge valid reports and coordinate a fix and disclosure timeline with you.

View File

@ -27,7 +27,7 @@ def frontend_install_warning_message():
return f"""
{get_missing_requirements_message()}
The ComfyUI frontend is shipped in a pip package so it needs to be updated separately from the ComfyUI code.
This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
""".strip()
def parse_version(version: str) -> tuple[int, int, int]:
@ -38,54 +38,40 @@ def is_valid_version(version: str) -> bool:
pattern = r"^(\d+)\.(\d+)\.(\d+)$"
return bool(re.match(pattern, version))
def get_installed_frontend_version():
"""Get the currently installed frontend package version."""
frontend_version_str = version("comfyui-frontend-package")
return frontend_version_str
def get_required_frontend_version():
return get_required_packages_versions().get("comfyui-frontend-package", None)
COMFY_PACKAGE_VERSIONS = []
def get_comfy_package_versions():
"""List installed/required versions for every comfy* package in requirements.txt."""
if COMFY_PACKAGE_VERSIONS:
return COMFY_PACKAGE_VERSIONS.copy()
out = COMFY_PACKAGE_VERSIONS
for name, required in (get_required_packages_versions() or {}).items():
if not name.startswith("comfy"):
continue
try:
installed = version(name)
except Exception:
installed = None
out.append({"name": name, "installed": installed, "required": required})
return out.copy()
def check_frontend_version():
"""Check if the frontend version is up to date."""
def check_comfy_packages_versions():
"""Warn for every comfy* package whose installed version is below requirements.txt."""
from packaging.version import InvalidVersion, parse as parse_pep440
for pkg in get_comfy_package_versions():
installed_str = pkg["installed"]
required_str = pkg["required"]
if not installed_str or not required_str:
continue
try:
outdated = parse_pep440(installed_str) < parse_pep440(required_str)
except InvalidVersion as e:
logging.error(f"Failed to check {pkg['name']} version: {e}")
continue
if outdated:
try:
frontend_version_str = get_installed_frontend_version()
frontend_version = parse_version(frontend_version_str)
required_frontend_str = get_required_frontend_version()
required_frontend = parse_version(required_frontend_str)
if frontend_version < required_frontend:
app.logger.log_startup_warning(
f"""
________________________________________________________________________
WARNING WARNING WARNING WARNING WARNING
Installed {pkg["name"]} version {installed_str} is lower than the recommended version {required_str}.
Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.
{get_missing_requirements_message()}
{frontend_install_warning_message()}
________________________________________________________________________
""".strip()
)
else:
logging.info("{} version: {}".format(pkg["name"], installed_str))
logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
except Exception as e:
logging.error(f"Failed to check frontend version: {e}")
REQUEST_TIMEOUT = 10 # seconds
@ -215,11 +201,6 @@ class FrontendManager:
def get_required_templates_version(cls) -> str:
return get_required_packages_versions().get("comfyui-workflow-templates", None)
@classmethod
def get_comfy_package_versions(cls):
"""List installed/required versions for every comfy* package in requirements.txt."""
return get_comfy_package_versions()
@classmethod
def default_frontend_path(cls) -> str:
try:
@ -360,7 +341,7 @@ comfyui-workflow-templates is not installed.
main error source might be request timeout or invalid URL.
"""
if version_string == DEFAULT_VERSION_STRING:
check_comfy_packages_versions()
check_frontend_version()
return cls.default_frontend_path()
repo_owner, repo_name, version = cls.parse_version_string(version_string)
@ -422,7 +403,7 @@ comfyui-workflow-templates is not installed.
except Exception as e:
logging.error("Failed to initialize frontend: %s", e)
logging.info("Falling back to the default frontend.")
check_comfy_packages_versions()
check_frontend_version()
return cls.default_frontend_path()
@classmethod
def template_asset_handler(cls):

View File

@ -1,7 +1,5 @@
from __future__ import annotations
import logging
from aiohttp import web
from typing import TYPE_CHECKING, TypedDict
@ -33,22 +31,8 @@ class NodeReplaceManager:
self._replacements: dict[str, list[NodeReplace]] = {}
def register(self, node_replace: NodeReplace):
"""Register a node replacement mapping.
Idempotent: if a replacement with the same (old_node_id, new_node_id)
is already registered, the duplicate is ignored. This prevents stale
entries from accumulating when custom nodes are reloaded in the same
process (e.g. via ComfyUI-Manager).
"""
existing = self._replacements.setdefault(node_replace.old_node_id, [])
for entry in existing:
if entry.new_node_id == node_replace.new_node_id:
logging.debug(
"Node replacement %s -> %s already registered, ignoring duplicate.",
node_replace.old_node_id, node_replace.new_node_id,
)
return
existing.append(node_replace)
"""Register a node replacement mapping."""
self._replacements.setdefault(node_replace.old_node_id, []).append(node_replace)
def get_replacement(self, old_node_id: str) -> list[NodeReplace] | None:
"""Get replacements for an old node ID."""

View File

@ -28,8 +28,8 @@ def get_file_info(path: str, relative_to: str) -> FileInfo:
return {
"path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
"size": os.path.getsize(path),
"modified": int(os.path.getmtime(path) * 1000),
"created": int(os.path.getctime(path) * 1000),
"modified": os.path.getmtime(path),
"created": os.path.getctime(path)
}

View File

@ -431,10 +431,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Adjusts image brightness and contrast using a real-time GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
},
"extra": {}
}
}

View File

@ -162,7 +162,7 @@
},
"revision": 0,
"config": {},
"name": "Canny to Image (Z-Image-Turbo)",
"name": "local-Canny to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@ -1553,8 +1553,7 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"category": "Image generation and editing/Canny to image",
"description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning."
"category": "Image generation and editing/Canny to image"
}
]
},
@ -1575,4 +1574,4 @@
}
},
"version": 0.4
}
}

View File

@ -192,7 +192,7 @@
},
"revision": 0,
"config": {},
"name": "Canny to Video (LTX 2.0)",
"name": "local-Canny to Video (LTX 2.0)",
"inputNode": {
"id": -10,
"bounding": [
@ -3600,8 +3600,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Canny to video",
"description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio."
"category": "Video generation and editing/Canny to video"
}
]
},
@ -3617,4 +3616,4 @@
}
},
"version": 0.4
}
}

View File

@ -377,9 +377,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Adds lens-style chromatic aberration (color fringing) using a real-time GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
}
}
}

View File

@ -596,8 +596,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Adjusts saturation, temperature, tint, and vibrance using a real-time GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
}

View File

@ -1129,8 +1129,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Balances colors across shadows, midtones, and highlights using a real-time GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
}

View File

@ -608,8 +608,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Fine-tunes tone and color with per-channel curve adjustments using a real-time GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
}

File diff suppressed because it is too large Load Diff

View File

@ -1609,8 +1609,7 @@
}
],
"extra": {},
"category": "Image Tools/Crop",
"description": "Splits an image into a 2×2 grid of four equal tiles."
"category": "Image Tools/Crop"
}
]
},

View File

@ -2946,8 +2946,7 @@
}
],
"extra": {},
"category": "Image Tools/Crop",
"description": "Splits an image into a 3×3 grid of nine equal tiles."
"category": "Image Tools/Crop"
}
]
},

View File

@ -1579,8 +1579,7 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"category": "Image generation and editing/Depth to image",
"description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning."
"category": "Image generation and editing/Depth to image"
},
{
"id": "458bdf3c-4b58-421c-af50-c9c663a4d74c",
@ -2462,8 +2461,7 @@
]
},
"workflowRendererVersion": "LG"
},
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
}
}
]
},

View File

@ -4233,8 +4233,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Depth to video",
"description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
"category": "Video generation and editing/Depth to video"
},
{
"id": "38b60539-50a7-42f9-a5fe-bdeca26272e2",
@ -5193,8 +5192,7 @@
],
"extra": {
"workflowRendererVersion": "LG"
},
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
}
}
]
},

View File

@ -450,10 +450,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Blur",
"description": "Applies bilateral (edge-preserving) blur to soften images while retaining detail."
"category": "Image Tools/Blur"
}
]
},
"extra": {}
}
}

View File

@ -580,9 +580,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Adds procedural film grain texture for a cinematic look via GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
}
}
}

View File

@ -3350,8 +3350,7 @@
}
],
"extra": {},
"category": "Video generation and editing/First-Last-Frame to Video",
"description": "Generates a video interpolating between first and last keyframes using LTX-2.3."
"category": "Video generation and editing/First-Last-Frame to Video"
}
]
},

File diff suppressed because it is too large Load Diff

View File

@ -1,858 +0,0 @@
{
"revision": 0,
"last_node_id": 16,
"last_link_id": 0,
"nodes": [
{
"id": 16,
"type": "022693be-2baa-4009-870a-28921508a7ef",
"pos": [
-2990,
-3240
],
"size": [
410,
200
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "video",
"name": "video",
"type": "VIDEO",
"link": null
},
{
"label": "multiplier",
"name": "value",
"type": "INT",
"widget": {
"name": "value"
},
"link": null
},
{
"label": "enable_fps_multiplier",
"name": "value_1",
"type": "BOOLEAN",
"widget": {
"name": "value_1"
},
"link": null
},
{
"name": "model_name",
"type": "COMBO",
"widget": {
"name": "model_name"
},
"link": null
}
],
"outputs": [
{
"label": "VIDEO",
"name": "VIDEO_1",
"type": "VIDEO",
"links": []
},
{
"name": "IMAGE",
"type": "IMAGE",
"links": null
}
],
"properties": {
"proxyWidgets": [
[
"9",
"value"
],
[
"13",
"value"
],
[
"1",
"model_name"
]
],
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
},
"widgets_values": [],
"title": "Frame Interpolation"
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "022693be-2baa-4009-870a-28921508a7ef",
"version": 1,
"state": {
"lastGroupId": 0,
"lastNodeId": 17,
"lastLinkId": 28,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Frame Interpolation",
"inputNode": {
"id": -10,
"bounding": [
-2810,
-3070,
159.7421875,
120
]
},
"outputNode": {
"id": -20,
"bounding": [
-1270,
-3075,
120,
80
]
},
"inputs": [
{
"id": "05e31c51-dcb6-4a1e-9651-1b9ad4f7a287",
"name": "video",
"type": "VIDEO",
"linkIds": [
2
],
"localized_name": "video",
"pos": [
-2670.2578125,
-3050
]
},
{
"id": "feecb409-7d1c-4a99-9c63-50c5fecdd3c9",
"name": "value",
"type": "INT",
"linkIds": [
22
],
"label": "multiplier",
"pos": [
-2670.2578125,
-3030
]
},
{
"id": "0b8a861b-b581-4068-9e8c-f8d15daf1ca6",
"name": "value_1",
"type": "BOOLEAN",
"linkIds": [
23
],
"label": "enable_fps_multiplier",
"pos": [
-2670.2578125,
-3010
]
},
{
"id": "a22b101e-8773-4e17-a297-7ee3aae09162",
"name": "model_name",
"type": "COMBO",
"linkIds": [
24
],
"pos": [
-2670.2578125,
-2990
]
}
],
"outputs": [
{
"id": "ef2ada05-d5aa-492a-9394-6c3e71e39ebb",
"name": "VIDEO_1",
"type": "VIDEO",
"linkIds": [
26
],
"label": "VIDEO",
"pos": [
-1250,
-3055
]
},
{
"id": "5aacc622-2a07-4983-b31c-e04461f7f953",
"name": "IMAGE",
"type": "IMAGE",
"linkIds": [
28
],
"pos": [
-1250,
-3035
]
}
],
"widgets": [],
"nodes": [
{
"id": 1,
"type": "FrameInterpolationModelLoader",
"pos": [
-2510,
-3370
],
"size": [
370,
90
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"localized_name": "model_name",
"name": "model_name",
"type": "COMBO",
"widget": {
"name": "model_name"
},
"link": 24
}
],
"outputs": [
{
"localized_name": "INTERP_MODEL",
"name": "INTERP_MODEL",
"type": "INTERP_MODEL",
"links": [
1
]
}
],
"properties": {
"Node name for S&R": "FrameInterpolationModelLoader",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3",
"models": [
{
"name": "film_net_fp16.safetensors",
"url": "https://huggingface.co/Comfy-Org/frame_interpolation/resolve/main/frame_interpolation/film_net_fp16.safetensors",
"directory": "frame_interpolation"
}
]
},
"widgets_values": [
"film_net_fp16.safetensors"
]
},
{
"id": 2,
"type": "FrameInterpolate",
"pos": [
-2040,
-3370
],
"size": [
270,
110
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "interp_model",
"name": "interp_model",
"type": "INTERP_MODEL",
"link": 1
},
{
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"link": 3
},
{
"localized_name": "multiplier",
"name": "multiplier",
"type": "INT",
"widget": {
"name": "multiplier"
},
"link": 8
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": [
4,
28
]
}
],
"properties": {
"Node name for S&R": "FrameInterpolate",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
},
"widgets_values": [
2
]
},
{
"id": 5,
"type": "CreateVideo",
"pos": [
-1600,
-3370
],
"size": [
270,
110
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"link": 4
},
{
"localized_name": "audio",
"name": "audio",
"shape": 7,
"type": "AUDIO",
"link": 5
},
{
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"widget": {
"name": "fps"
},
"link": 12
}
],
"outputs": [
{
"localized_name": "VIDEO",
"name": "VIDEO",
"type": "VIDEO",
"links": [
26
]
}
],
"properties": {
"Node name for S&R": "CreateVideo",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
},
"widgets_values": [
30
]
},
{
"id": 9,
"type": "PrimitiveInt",
"pos": [
-2500,
-2970
],
"size": [
270,
90
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"localized_name": "value",
"name": "value",
"type": "INT",
"widget": {
"name": "value"
},
"link": 22
}
],
"outputs": [
{
"localized_name": "INT",
"name": "INT",
"type": "INT",
"links": [
8,
19
]
}
],
"title": "Int (Multiplier)",
"properties": {
"Node name for S&R": "PrimitiveInt",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
},
"widgets_values": [
2,
"fixed"
]
},
{
"id": 10,
"type": "ComfySwitchNode",
"pos": [
-1610,
-3120
],
"size": [
270,
130
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"localized_name": "on_false",
"name": "on_false",
"type": "*",
"link": 11
},
{
"localized_name": "on_true",
"name": "on_true",
"type": "*",
"link": 13
},
{
"localized_name": "switch",
"name": "switch",
"type": "BOOLEAN",
"widget": {
"name": "switch"
},
"link": 15
}
],
"outputs": [
{
"localized_name": "output",
"name": "output",
"type": "*",
"links": [
12
]
}
],
"properties": {
"Node name for S&R": "ComfySwitchNode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
},
"widgets_values": [
true
]
},
{
"id": 13,
"type": "PrimitiveBoolean",
"pos": [
-2500,
-2770
],
"size": [
310,
90
],
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"localized_name": "value",
"name": "value",
"type": "BOOLEAN",
"widget": {
"name": "value"
},
"link": 23
}
],
"outputs": [
{
"localized_name": "BOOLEAN",
"name": "BOOLEAN",
"type": "BOOLEAN",
"links": [
15
]
}
],
"title": "Boolean (Apply multiplier to FPS?)",
"properties": {
"Node name for S&R": "PrimitiveBoolean",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
},
"widgets_values": [
true
]
},
{
"id": 3,
"type": "GetVideoComponents",
"pos": [
-2500,
-3170
],
"size": [
230,
100
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "video",
"name": "video",
"type": "VIDEO",
"link": 2
}
],
"outputs": [
{
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"links": [
3
]
},
{
"localized_name": "audio",
"name": "audio",
"type": "AUDIO",
"links": [
5
]
},
{
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"links": [
11,
18
]
}
],
"properties": {
"Node name for S&R": "GetVideoComponents",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
}
},
{
"id": 11,
"type": "ComfyMathExpression",
"pos": [
-2090,
-3070
],
"size": [
400,
210
],
"flags": {
"collapsed": false
},
"order": 6,
"mode": 0,
"inputs": [
{
"label": "a",
"localized_name": "values.a",
"name": "values.a",
"type": "FLOAT,INT",
"link": 18
},
{
"label": "b",
"localized_name": "values.b",
"name": "values.b",
"shape": 7,
"type": "FLOAT,INT",
"link": 19
},
{
"label": "c",
"localized_name": "values.c",
"name": "values.c",
"shape": 7,
"type": "FLOAT,INT",
"link": null
},
{
"localized_name": "expression",
"name": "expression",
"type": "STRING",
"widget": {
"name": "expression"
},
"link": null
}
],
"outputs": [
{
"localized_name": "FLOAT",
"name": "FLOAT",
"type": "FLOAT",
"links": [
13
]
},
{
"localized_name": "INT",
"name": "INT",
"type": "INT",
"links": null
}
],
"properties": {
"Node name for S&R": "ComfyMathExpression",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"cnr_id": "comfy-core",
"ver": "0.19.3"
},
"widgets_values": [
"min(abs(b), 16) * a"
]
}
],
"groups": [],
"links": [
{
"id": 1,
"origin_id": 1,
"origin_slot": 0,
"target_id": 2,
"target_slot": 0,
"type": "INTERP_MODEL"
},
{
"id": 3,
"origin_id": 3,
"origin_slot": 0,
"target_id": 2,
"target_slot": 1,
"type": "IMAGE"
},
{
"id": 8,
"origin_id": 9,
"origin_slot": 0,
"target_id": 2,
"target_slot": 2,
"type": "INT"
},
{
"id": 4,
"origin_id": 2,
"origin_slot": 0,
"target_id": 5,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 5,
"origin_id": 3,
"origin_slot": 1,
"target_id": 5,
"target_slot": 1,
"type": "AUDIO"
},
{
"id": 12,
"origin_id": 10,
"origin_slot": 0,
"target_id": 5,
"target_slot": 2,
"type": "FLOAT"
},
{
"id": 11,
"origin_id": 3,
"origin_slot": 2,
"target_id": 10,
"target_slot": 0,
"type": "FLOAT"
},
{
"id": 13,
"origin_id": 11,
"origin_slot": 0,
"target_id": 10,
"target_slot": 1,
"type": "FLOAT"
},
{
"id": 15,
"origin_id": 13,
"origin_slot": 0,
"target_id": 10,
"target_slot": 2,
"type": "BOOLEAN"
},
{
"id": 18,
"origin_id": 3,
"origin_slot": 2,
"target_id": 11,
"target_slot": 0,
"type": "FLOAT"
},
{
"id": 19,
"origin_id": 9,
"origin_slot": 0,
"target_id": 11,
"target_slot": 1,
"type": "INT"
},
{
"id": 2,
"origin_id": -10,
"origin_slot": 0,
"target_id": 3,
"target_slot": 0,
"type": "VIDEO"
},
{
"id": 22,
"origin_id": -10,
"origin_slot": 1,
"target_id": 9,
"target_slot": 0,
"type": "INT"
},
{
"id": 23,
"origin_id": -10,
"origin_slot": 2,
"target_id": 13,
"target_slot": 0,
"type": "BOOLEAN"
},
{
"id": 24,
"origin_id": -10,
"origin_slot": 3,
"target_id": 1,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 26,
"origin_id": 5,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "VIDEO"
},
{
"id": 28,
"origin_id": 2,
"origin_slot": 0,
"target_id": -20,
"target_slot": 1,
"type": "IMAGE"
}
],
"extra": {},
"category": "Video Tools",
"description": "Increases video frame rate by synthesizing intermediate frames with a frame interpolation model."
}
]
},
"extra": {}
}

View File

@ -1,485 +0,0 @@
{
"revision": 0,
"last_node_id": 98,
"last_link_id": 0,
"nodes": [
{
"id": 98,
"type": "dca6e78d-fb06-421e-97f7-6ce17a665260",
"pos": [
-410,
-2230
],
"size": [
270,
104
],
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "video",
"type": "VIDEO",
"link": null
},
{
"label": "frame_index",
"name": "value",
"type": "INT",
"widget": {
"name": "value"
},
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": []
}
],
"title": "Get Any Video Frame",
"properties": {
"proxyWidgets": [
[
"100",
"value"
]
]
},
"widgets_values": []
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "dca6e78d-fb06-421e-97f7-6ce17a665260",
"version": 1,
"state": {
"lastGroupId": 1,
"lastNodeId": 136,
"lastLinkId": 302,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Get Any Video Frame",
"inputNode": {
"id": -10,
"bounding": [
380,
-57,
120,
80
]
},
"outputNode": {
"id": -20,
"bounding": [
1460,
-57,
120,
60
]
},
"inputs": [
{
"id": "2ceec378-8dcf-4340-8570-155967f59a93",
"name": "video",
"type": "VIDEO",
"linkIds": [
4
],
"pos": [
480,
-37
]
},
{
"id": "819955f6-c686-4896-8032-ff2d0059109a",
"name": "value",
"type": "INT",
"linkIds": [
283
],
"label": "frame_index",
"pos": [
480,
-17
]
}
],
"outputs": [
{
"id": "1ab0684d-6a44-45b6-8aa4-a0b971a1d41e",
"name": "IMAGE",
"type": "IMAGE",
"linkIds": [
5
],
"pos": [
1480,
-37
]
}
],
"widgets": [],
"nodes": [
{
"id": 1,
"type": "GetVideoComponents",
"pos": [
560,
-150
],
"size": [
230,
120
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"localized_name": "video",
"name": "video",
"type": "VIDEO",
"link": 4
}
],
"outputs": [
{
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"links": [
1,
2
]
},
{
"localized_name": "audio",
"name": "audio",
"type": "AUDIO",
"links": null
},
{
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"links": null
}
],
"properties": {
"Node name for S&R": "GetVideoComponents"
}
},
{
"id": 2,
"type": "GetImageSize",
"pos": [
560,
50
],
"size": [
230,
120
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 1
}
],
"outputs": [
{
"localized_name": "width",
"name": "width",
"type": "INT",
"links": null
},
{
"localized_name": "height",
"name": "height",
"type": "INT",
"links": null
},
{
"localized_name": "batch_size",
"name": "batch_size",
"type": "INT",
"links": [
285
]
}
],
"properties": {
"Node name for S&R": "GetImageSize"
}
},
{
"id": 3,
"type": "ImageFromBatch",
"pos": [
1130,
-150
],
"size": [
270,
140
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 2
},
{
"localized_name": "batch_index",
"name": "batch_index",
"type": "INT",
"widget": {
"name": "batch_index"
},
"link": 286
},
{
"localized_name": "length",
"name": "length",
"type": "INT",
"widget": {
"name": "length"
},
"link": null
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": [
5
]
}
],
"properties": {
"Node name for S&R": "ImageFromBatch"
},
"widgets_values": [
0,
1
]
},
{
"id": 99,
"type": "ComfyMathExpression",
"pos": [
910,
100
],
"size": [
400,
200
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"label": "a",
"localized_name": "values.a",
"name": "values.a",
"type": "FLOAT,INT",
"link": 284
},
{
"label": "b",
"localized_name": "values.b",
"name": "values.b",
"shape": 7,
"type": "FLOAT,INT",
"link": 285
},
{
"label": "c",
"localized_name": "values.c",
"name": "values.c",
"shape": 7,
"type": "FLOAT,INT",
"link": null
},
{
"localized_name": "expression",
"name": "expression",
"type": "STRING",
"widget": {
"name": "expression"
},
"link": null
}
],
"outputs": [
{
"localized_name": "FLOAT",
"name": "FLOAT",
"type": "FLOAT",
"links": null
},
{
"localized_name": "INT",
"name": "INT",
"type": "INT",
"links": [
286
]
}
],
"properties": {
"Node name for S&R": "ComfyMathExpression"
},
"widgets_values": [
"min(max(int(a if a >= 0 else b + a), 0), b - 1)"
]
},
{
"id": 100,
"type": "PrimitiveInt",
"pos": [
560,
250
],
"size": [
270,
110
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"localized_name": "value",
"name": "value",
"type": "INT",
"widget": {
"name": "value"
},
"link": 283
}
],
"outputs": [
{
"localized_name": "INT",
"name": "INT",
"type": "INT",
"links": [
284
]
}
],
"properties": {
"Node name for S&R": "PrimitiveInt"
},
"widgets_values": [
0,
"fixed"
]
}
],
"groups": [],
"links": [
{
"id": 1,
"origin_id": 1,
"origin_slot": 0,
"target_id": 2,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 2,
"origin_id": 1,
"origin_slot": 0,
"target_id": 3,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 4,
"origin_id": -10,
"origin_slot": 0,
"target_id": 1,
"target_slot": 0,
"type": "VIDEO"
},
{
"id": 5,
"origin_id": 3,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 283,
"origin_id": -10,
"origin_slot": 1,
"target_id": 100,
"target_slot": 0,
"type": "INT"
},
{
"id": 284,
"origin_id": 100,
"origin_slot": 0,
"target_id": 99,
"target_slot": 0,
"type": "INT"
},
{
"id": 285,
"origin_id": 2,
"origin_slot": 2,
"target_id": 99,
"target_slot": 1,
"type": "INT"
},
{
"id": 286,
"origin_id": 99,
"origin_slot": 1,
"target_id": 3,
"target_slot": 1,
"type": "INT"
}
],
"extra": {},
"category": "Video Tools",
"description": "Extracts one image frame from a video at a chosen index, with optional trim and FPS control."
}
]
},
"extra": {
"ds": {
"scale": 1.197015527856339,
"offset": [
-168.76833554248222,
540.6638955283997
]
},
"frontendVersion": "1.42.8"
}
}

View File

@ -575,9 +575,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Adds a glow/bloom effect around bright image areas via GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
}
}
}

View File

@ -752,9 +752,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Adjusts hue, saturation, and lightness of an image using a real-time GPU fragment shader."
"category": "Image Tools/Color adjust"
}
]
}
}
}

View File

@ -374,8 +374,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Blur",
"description": "Applies Gaussian, Box, or Radial blur to soften images and create stylized depth or motion effects."
"category": "Image Tools/Blur"
}
]
}

View File

@ -310,8 +310,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Text generation/Image Captioning",
"description": "Generates descriptive captions for images using Google's Gemini multimodal LLM."
"category": "Text generation/Image Captioning"
}
]
}

View File

@ -315,9 +315,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Manipulates individual RGBA channels for masking, compositing, and channel effects."
"category": "Image Tools/Color adjust"
}
]
}
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1472,8 +1472,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Edit image",
"description": "Edits an input image via text instructions using FLUX.2 [klein] 4B."
"category": "Image generation and editing/Edit image"
},
{
"id": "6007e698-2ebd-4917-84d8-299b35d7b7ab",
@ -1822,8 +1821,7 @@
],
"extra": {
"workflowRendererVersion": "LG"
},
"description": "Applies reference image conditioning for style/identity transfer (Flux.2 Klein 4B)."
}
}
]
},
@ -1839,4 +1837,4 @@
}
},
"version": 0.4
}
}

View File

@ -1417,8 +1417,7 @@
}
],
"extra": {},
"category": "Image generation and editing/Edit image",
"description": "Edits images via text instructions using LongCat Image Edit, an instruction-following image editing diffusion model."
"category": "Image generation and editing/Edit image"
}
]
},

File diff suppressed because it is too large Load Diff

View File

@ -132,7 +132,7 @@
},
"revision": 0,
"config": {},
"name": "Image Edit (Qwen 2511)",
"name": "local-Image Edit (Qwen 2511)",
"inputNode": {
"id": -10,
"bounding": [
@ -1468,8 +1468,7 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"category": "Image generation and editing/Edit image",
"description": "Edits images via text instructions using Qwen-Image-Edit-2511 with improved character consistency and integrated LoRA."
"category": "Image generation and editing/Edit image"
}
]
},
@ -1490,4 +1489,4 @@
}
},
"version": 0.4
}
}

View File

@ -1188,8 +1188,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Inpaint image",
"description": "Inpaints masked image regions using Flux.1 fill [dev], Black Forest Labs' inpainting/outpainting model."
"category": "Image generation and editing/Inpaint image"
}
]
},
@ -1203,4 +1202,4 @@
},
"ue_links": []
}
}
}

View File

@ -1548,8 +1548,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Inpaint image",
"description": "Inpaints masked regions using Qwen-Image, extending its multilingual text rendering to inpainting tasks."
"category": "Image generation and editing/Inpaint image"
},
{
"id": "56a1f603-fbd2-40ed-94ef-c9ecbd96aca8",
@ -1908,8 +1907,7 @@
],
"extra": {
"workflowRendererVersion": "LG"
},
"description": "Expands and softens mask edges to reduce visible seams after image processing."
}
}
]
},

View File

@ -742,10 +742,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Color adjust",
"description": "Adjusts black point, white point, and gamma for tonal range control via GPU shader."
"category": "Image Tools/Color adjust"
}
]
},
"extra": {}
}
}

View File

@ -1919,8 +1919,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Outpaint image",
"description": "Outpaints beyond image boundaries using Qwen-Image's outpainting capabilities."
"category": "Image generation and editing/Outpaint image"
},
{
"id": "f93c215e-c393-460e-9534-ed2c3d8a652e",
@ -2279,8 +2278,7 @@
],
"extra": {
"workflowRendererVersion": "LG"
},
"description": "Expands and softens mask edges to reduce visible seams after image processing."
}
},
{
"id": "2a4b2cc0-db37-4302-a067-da392f38f06b",
@ -2735,8 +2733,7 @@
],
"extra": {
"workflowRendererVersion": "LG"
},
"description": "Scales both image and mask together while preserving alignment for editing workflows."
}
}
]
},

View File

@ -1,714 +0,0 @@
{
"revision": 0,
"last_node_id": 99,
"last_link_id": 0,
"nodes": [
{
"id": 99,
"type": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
"pos": [
-1630,
-3270
],
"size": [
290,
370
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"label": "image",
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": null
},
{
"label": "object",
"name": "text",
"type": "STRING",
"widget": {
"name": "text"
},
"link": null
},
{
"name": "bboxes",
"type": "BOUNDING_BOX",
"link": null
},
{
"name": "positive_coords",
"type": "STRING",
"link": null
},
{
"name": "negative_coords",
"type": "STRING",
"link": null
},
{
"name": "threshold",
"type": "FLOAT",
"widget": {
"name": "threshold"
},
"link": null
},
{
"name": "refine_iterations",
"type": "INT",
"widget": {
"name": "refine_iterations"
},
"link": null
},
{
"name": "individual_masks",
"type": "BOOLEAN",
"widget": {
"name": "individual_masks"
},
"link": null
},
{
"name": "ckpt_name",
"type": "COMBO",
"widget": {
"name": "ckpt_name"
},
"link": null
}
],
"outputs": [
{
"localized_name": "masks",
"name": "masks",
"type": "MASK",
"links": []
},
{
"localized_name": "bboxes",
"name": "bboxes",
"type": "BOUNDING_BOX",
"links": []
}
],
"properties": {
"proxyWidgets": [
[
"78",
"text"
],
[
"75",
"threshold"
],
[
"75",
"refine_iterations"
],
[
"75",
"individual_masks"
],
[
"77",
"ckpt_name"
]
],
"ue_properties": {
"widget_ue_connectable": {
"text": true
},
"version": "7.7",
"input_ue_unconnectable": {}
},
"cnr_id": "comfy-core",
"ver": "0.19.3",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [],
"title": "Image Segmentation (SAM3)"
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
"version": 1,
"state": {
"lastGroupId": 0,
"lastNodeId": 113,
"lastLinkId": 283,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Image Segmentation (SAM3)",
"inputNode": {
"id": -10,
"bounding": [
-2260,
-3450,
136.369140625,
220
]
},
"outputNode": {
"id": -20,
"bounding": [
-1130,
-3305,
120,
80
]
},
"inputs": [
{
"id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6",
"name": "image",
"type": "IMAGE",
"linkIds": [
264
],
"localized_name": "image",
"label": "image",
"pos": [
-2143.630859375,
-3430
]
},
{
"id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745",
"name": "text",
"type": "STRING",
"linkIds": [
265
],
"label": "object",
"pos": [
-2143.630859375,
-3410
]
},
{
"id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed",
"name": "bboxes",
"type": "BOUNDING_BOX",
"linkIds": [
266
],
"pos": [
-2143.630859375,
-3390
]
},
{
"id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899",
"name": "positive_coords",
"type": "STRING",
"linkIds": [
267
],
"pos": [
-2143.630859375,
-3370
]
},
{
"id": "c65f8b87-9bd7-48be-9fc2-823431e95019",
"name": "negative_coords",
"type": "STRING",
"linkIds": [
268
],
"pos": [
-2143.630859375,
-3350
]
},
{
"id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb",
"name": "threshold",
"type": "FLOAT",
"linkIds": [
269
],
"pos": [
-2143.630859375,
-3330
]
},
{
"id": "b1439668-b050-490b-a5dc-fc4052c55666",
"name": "refine_iterations",
"type": "INT",
"linkIds": [
270
],
"pos": [
-2143.630859375,
-3310
]
},
{
"id": "86e239e5-c098-4302-b54d-d42a38bc0f89",
"name": "individual_masks",
"type": "BOOLEAN",
"linkIds": [
271
],
"pos": [
-2143.630859375,
-3290
]
},
{
"id": "f9e0b9d4-b2f1-4907-a4a5-305656576706",
"name": "ckpt_name",
"type": "COMBO",
"linkIds": [
272
],
"pos": [
-2143.630859375,
-3270
]
}
],
"outputs": [
{
"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
"name": "masks",
"type": "MASK",
"linkIds": [
231
],
"localized_name": "masks",
"pos": [
-1110,
-3285
]
},
{
"id": "8f622e40-8528-4078-b7d3-147e9f872194",
"name": "bboxes",
"type": "BOUNDING_BOX",
"linkIds": [
232
],
"localized_name": "bboxes",
"pos": [
-1110,
-3265
]
}
],
"widgets": [],
"nodes": [
{
"id": 75,
"type": "SAM3_Detect",
"pos": [
-1470,
-3460
],
"size": [
270,
260
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"label": "model",
"localized_name": "model",
"name": "model",
"type": "MODEL",
"link": 237
},
{
"label": "image",
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 264
},
{
"label": "conditioning",
"localized_name": "conditioning",
"name": "conditioning",
"shape": 7,
"type": "CONDITIONING",
"link": 200
},
{
"label": "bboxes",
"localized_name": "bboxes",
"name": "bboxes",
"shape": 7,
"type": "BOUNDING_BOX",
"link": 266
},
{
"label": "positive_coords",
"localized_name": "positive_coords",
"name": "positive_coords",
"shape": 7,
"type": "STRING",
"link": 267
},
{
"label": "negative_coords",
"localized_name": "negative_coords",
"name": "negative_coords",
"shape": 7,
"type": "STRING",
"link": 268
},
{
"localized_name": "threshold",
"name": "threshold",
"type": "FLOAT",
"widget": {
"name": "threshold"
},
"link": 269
},
{
"localized_name": "refine_iterations",
"name": "refine_iterations",
"type": "INT",
"widget": {
"name": "refine_iterations"
},
"link": 270
},
{
"localized_name": "individual_masks",
"name": "individual_masks",
"type": "BOOLEAN",
"widget": {
"name": "individual_masks"
},
"link": 271
}
],
"outputs": [
{
"localized_name": "masks",
"name": "masks",
"type": "MASK",
"links": [
231
]
},
{
"localized_name": "bboxes",
"name": "bboxes",
"type": "BOUNDING_BOX",
"links": [
232
]
}
],
"properties": {
"ue_properties": {
"widget_ue_connectable": {},
"version": "7.7",
"input_ue_unconnectable": {}
},
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "SAM3_Detect",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
0.5,
2,
false
]
},
{
"id": 77,
"type": "CheckpointLoaderSimple",
"pos": [
-1970,
-3200
],
"size": [
330,
140
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "ckpt_name",
"name": "ckpt_name",
"type": "COMBO",
"widget": {
"name": "ckpt_name"
},
"link": 272
}
],
"outputs": [
{
"localized_name": "MODEL",
"name": "MODEL",
"type": "MODEL",
"links": [
237
]
},
{
"localized_name": "CLIP",
"name": "CLIP",
"type": "CLIP",
"links": [
240
]
},
{
"localized_name": "VAE",
"name": "VAE",
"type": "VAE",
"links": null
}
],
"properties": {
"ue_properties": {
"widget_ue_connectable": {},
"version": "7.7",
"input_ue_unconnectable": {}
},
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CheckpointLoaderSimple",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"models": [
{
"name": "sam3.1_multiplex_fp16.safetensors",
"url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
"directory": "checkpoints"
}
]
},
"widgets_values": [
"sam3.1_multiplex_fp16.safetensors"
]
},
{
"id": 78,
"type": "CLIPTextEncode",
"pos": [
-2000,
-3000
],
"size": [
400,
200
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "clip",
"name": "clip",
"type": "CLIP",
"link": 240
},
{
"localized_name": "text",
"name": "text",
"type": "STRING",
"widget": {
"name": "text"
},
"link": 265
}
],
"outputs": [
{
"localized_name": "CONDITIONING",
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
200
]
}
],
"properties": {
"ue_properties": {
"widget_ue_connectable": {},
"version": "7.7",
"input_ue_unconnectable": {}
},
"cnr_id": "comfy-core",
"ver": "0.19.3",
"Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
""
]
}
],
"groups": [],
"links": [
{
"id": 237,
"origin_id": 77,
"origin_slot": 0,
"target_id": 75,
"target_slot": 0,
"type": "MODEL"
},
{
"id": 200,
"origin_id": 78,
"origin_slot": 0,
"target_id": 75,
"target_slot": 2,
"type": "CONDITIONING"
},
{
"id": 240,
"origin_id": 77,
"origin_slot": 1,
"target_id": 78,
"target_slot": 0,
"type": "CLIP"
},
{
"id": 231,
"origin_id": 75,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "MASK"
},
{
"id": 232,
"origin_id": 75,
"origin_slot": 1,
"target_id": -20,
"target_slot": 1,
"type": "BOUNDING_BOX"
},
{
"id": 264,
"origin_id": -10,
"origin_slot": 0,
"target_id": 75,
"target_slot": 1,
"type": "IMAGE"
},
{
"id": 265,
"origin_id": -10,
"origin_slot": 1,
"target_id": 78,
"target_slot": 1,
"type": "STRING"
},
{
"id": 266,
"origin_id": -10,
"origin_slot": 2,
"target_id": 75,
"target_slot": 3,
"type": "BOUNDING_BOX"
},
{
"id": 267,
"origin_id": -10,
"origin_slot": 3,
"target_id": 75,
"target_slot": 4,
"type": "STRING"
},
{
"id": 268,
"origin_id": -10,
"origin_slot": 4,
"target_id": 75,
"target_slot": 5,
"type": "STRING"
},
{
"id": 269,
"origin_id": -10,
"origin_slot": 5,
"target_id": 75,
"target_slot": 6,
"type": "FLOAT"
},
{
"id": 270,
"origin_id": -10,
"origin_slot": 6,
"target_id": 75,
"target_slot": 7,
"type": "INT"
},
{
"id": 271,
"origin_id": -10,
"origin_slot": 7,
"target_id": 75,
"target_slot": 8,
"type": "BOOLEAN"
},
{
"id": 272,
"origin_id": -10,
"origin_slot": 8,
"target_id": 77,
"target_slot": 0,
"type": "COMBO"
}
],
"extra": {},
"category": "Image Tools/Image Segmentation",
"description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
}
]
},
"extra": {
"ue_links": []
}
}

View File

@ -141,7 +141,7 @@
},
"revision": 0,
"config": {},
"name": "Image Upscale (Z-image-Turbo)",
"name": "local-Image Upscale(Z-image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@ -1302,8 +1302,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Enhance",
"description": "Upscales images to higher resolution using Z-Image-Turbo."
"category": "Image generation and editing/Enhance"
}
]
},

View File

@ -99,7 +99,7 @@
},
"revision": 0,
"config": {},
"name": "Image to Depth Map (Lotus)",
"name": "local-Image to Depth Map (Lotus)",
"inputNode": {
"id": -10,
"bounding": [
@ -948,8 +948,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Depth to image",
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
"category": "Image generation and editing/Depth to image"
}
]
},
@ -965,4 +964,4 @@
"workflowRendererVersion": "LG"
},
"version": 0.4
}
}

View File

@ -1586,8 +1586,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Image to layers",
"description": "Decomposes an image into variable-resolution RGBA layers for independent editing using Qwen-Image-Layered."
"category": "Image generation and editing/Image to layers"
}
]
},

View File

@ -72,7 +72,7 @@
},
"revision": 0,
"config": {},
"name": "Image to 3D Model (Hunyuan3d 2.1)",
"name": "local-Image to Model (Hunyuan3d 2.1)",
"inputNode": {
"id": -10,
"bounding": [
@ -765,8 +765,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "3D/Image to 3D Model",
"description": "Generates 3D mesh models from a single input image using Hunyuan3D 2.0/2.1."
"category": "3D/Image to 3D Model"
}
]
},

View File

@ -4223,8 +4223,7 @@
"extra": {
"workflowRendererVersion": "Vue-corrected"
},
"category": "Video generation and editing/Image to video",
"description": "Generates video from a single input image using LTX-2.3."
"category": "Video generation and editing/Image to video"
}
]
},

View File

@ -206,7 +206,7 @@
},
"revision": 0,
"config": {},
"name": "Image to Video (Wan 2.2)",
"name": "local-Image to Video (Wan 2.2)",
"inputNode": {
"id": -10,
"bounding": [
@ -2027,8 +2027,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Image to video",
"description": "Image-to-video with Wan 2.2 using a start image plus text prompt to extend motion from the still frame."
"category": "Video generation and editing/Image to video"
}
]
},

View File

@ -134,7 +134,7 @@
},
"revision": 0,
"config": {},
"name": "Pose to Image (Z-Image-Turbo)",
"name": "local-Pose to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@ -1298,8 +1298,7 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"category": "Image generation and editing/Pose to image",
"description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning."
"category": "Image generation and editing/Pose to image"
}
]
},
@ -1320,4 +1319,4 @@
}
},
"version": 0.4
}
}

View File

@ -3870,8 +3870,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Pose to video",
"description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio."
"category": "Video generation and editing/Pose to video"
}
]
},

View File

@ -270,10 +270,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Text generation/Prompt enhance",
"description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality."
"category": "Text generation/Prompt enhance"
}
]
},
"extra": {}
}
}

View File

@ -1,397 +0,0 @@
{
"revision": 0,
"last_node_id": 19,
"last_link_id": 0,
"nodes": [
{
"id": 19,
"type": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
"pos": [
-6411.330578108367,
1940.2638932730042
],
"size": [
349.609375,
145.9375
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": null
},
{
"name": "bg_removal_name",
"type": "COMBO",
"widget": {
"name": "bg_removal_name"
},
"link": null
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": []
},
{
"name": "mask",
"type": "MASK",
"links": []
}
],
"properties": {
"proxyWidgets": [
[
"14",
"bg_removal_name"
]
]
},
"widgets_values": [],
"title": "Remove Background (BiRefNet)"
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
"version": 1,
"state": {
"lastGroupId": 0,
"lastNodeId": 21,
"lastLinkId": 16,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Remove Background (BiRefNet)",
"description": "Removes or replaces image backgrounds using BiRefNet segmentation and alpha compositing.",
"inputNode": {
"id": -10,
"bounding": [
-6728.534070722246,
1475.2619799128663,
150.9140625,
88
]
},
"outputNode": {
"id": -20,
"bounding": [
-6169.049695722246,
1475.2619799128663,
128,
88
]
},
"inputs": [
{
"id": "7bc321cd-df31-4c39-aaf7-7f0d01326189",
"name": "image",
"type": "IMAGE",
"linkIds": [
5,
7
],
"localized_name": "image",
"pos": [
-6601.620008222246,
1499.2619799128663
]
},
{
"id": "e89d2cd8-daa3-4e29-8a69-851db85072cb",
"name": "bg_removal_name",
"type": "COMBO",
"linkIds": [
12
],
"pos": [
-6601.620008222246,
1519.2619799128663
]
}
],
"outputs": [
{
"id": "16e7863c-4c38-46c2-aa74-e82991fbfe8d",
"name": "IMAGE",
"type": "IMAGE",
"linkIds": [
8
],
"localized_name": "IMAGE",
"pos": [
-6145.049695722246,
1499.2619799128663
]
},
{
"id": "f7240c19-5b80-406e-a8e2-9b12440ee2d6",
"name": "mask",
"type": "MASK",
"linkIds": [
11
],
"pos": [
-6145.049695722246,
1519.2619799128663
]
}
],
"widgets": [],
"nodes": [
{
"id": 13,
"type": "RemoveBackground",
"pos": [
-6536.764823982709,
1444.9963409012412
],
"size": [
302.25,
72
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 5
},
{
"localized_name": "bg_removal_model",
"name": "bg_removal_model",
"type": "BACKGROUND_REMOVAL",
"link": 3
}
],
"outputs": [
{
"localized_name": "mask",
"name": "mask",
"type": "MASK",
"links": [
4,
11
]
}
],
"properties": {
"Node name for S&R": "RemoveBackground"
}
},
{
"id": 14,
"type": "LoadBackgroundRemovalModel",
"pos": [
-6540.534070722246,
1302.223464635445
],
"size": [
311.484375,
85.515625
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "bg_removal_name",
"name": "bg_removal_name",
"type": "COMBO",
"widget": {
"name": "bg_removal_name"
},
"link": 12
}
],
"outputs": [
{
"localized_name": "bg_model",
"name": "bg_model",
"type": "BACKGROUND_REMOVAL",
"links": [
3
]
}
],
"properties": {
"Node name for S&R": "LoadBackgroundRemovalModel",
"models": [
{
"name": "birefnet.safetensors",
"url": "https://huggingface.co/Comfy-Org/BiRefNet/resolve/main/background_removal/birefnet.safetensors",
"directory": "background_removal"
}
]
},
"widgets_values": [
"birefnet.safetensors"
]
},
{
"id": 15,
"type": "InvertMask",
"pos": [
-6532.446160529669,
1571.1111286839914
],
"size": [
285.984375,
48
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "mask",
"name": "mask",
"type": "MASK",
"link": 4
}
],
"outputs": [
{
"localized_name": "MASK",
"name": "MASK",
"type": "MASK",
"links": [
6
]
}
],
"properties": {
"Node name for S&R": "InvertMask"
}
},
{
"id": 16,
"type": "JoinImageWithAlpha",
"pos": [
-6527.4370171636665,
1674.3004951902876
],
"size": [
284.96875,
72
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 7
},
{
"localized_name": "alpha",
"name": "alpha",
"type": "MASK",
"link": 6
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": [
8
]
}
],
"properties": {
"Node name for S&R": "JoinImageWithAlpha"
}
}
],
"groups": [],
"links": [
{
"id": 3,
"origin_id": 14,
"origin_slot": 0,
"target_id": 13,
"target_slot": 1,
"type": "BACKGROUND_REMOVAL"
},
{
"id": 4,
"origin_id": 13,
"origin_slot": 0,
"target_id": 15,
"target_slot": 0,
"type": "MASK"
},
{
"id": 6,
"origin_id": 15,
"origin_slot": 0,
"target_id": 16,
"target_slot": 1,
"type": "MASK"
},
{
"id": 5,
"origin_id": -10,
"origin_slot": 0,
"target_id": 13,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 7,
"origin_id": -10,
"origin_slot": 0,
"target_id": 16,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 8,
"origin_id": 16,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 11,
"origin_id": 13,
"origin_slot": 0,
"target_id": -20,
"target_slot": 1,
"type": "MASK"
},
{
"id": 12,
"origin_id": -10,
"origin_slot": 1,
"target_id": 14,
"target_slot": 0,
"type": "COMBO"
}
],
"extra": {},
"category": "Image generation and editing/Background Removal"
}
]
},
"extra": {}
}

View File

@ -302,9 +302,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Sharpen",
"description": "Sharpens image details using a GPU fragment shader for enhanced clarity."
"category": "Image Tools/Sharpen"
}
]
}
}
}

View File

@ -222,7 +222,7 @@
},
"revision": 0,
"config": {},
"name": "Text to Audio (ACE-Step 1.5)",
"name": "local-Text to Audio (ACE-Step 1.5)",
"inputNode": {
"id": -10,
"bounding": [
@ -1502,8 +1502,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Audio/Music generation",
"description": "Generates audio/music from text prompts using ACE-Step 1.5, a diffusion-based audio generation model."
"category": "Audio/Music generation"
}
]
},
@ -1519,4 +1518,4 @@
}
},
"version": 0.4
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1029,8 +1029,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Text to image",
"description": "Generates images from prompts using FLUX.1 [dev]: a 12B rectified-flow MMDiT with dual CLIP plus T5-XXL text encoders and guidance-distilled sampling for sharp prompt following versus classic DDPM diffusion."
"category": "Image generation and editing/Text to image"
}
]
},
@ -1044,4 +1043,4 @@
},
"ue_links": []
}
}
}

View File

@ -1023,8 +1023,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Text to image",
"description": "FLUX.1 Krea [dev] (Black Forest Labs × Krea): open-weight 12B rectified-flow text-to-image drop-in alongside FLUX.1 [dev], tuned away from overcooked saturation toward more natural diversity in people, realism, and style while keeping ecosystem compatibility."
"category": "Image generation and editing/Text to image"
}
]
},
@ -1038,4 +1037,4 @@
},
"ue_links": []
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1104,8 +1104,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Text to image",
"description": "Generates images from text prompts using NetaYume Lumina, fine-tuned from Neta Lumina for anime-style and illustration generation."
"category": "Image generation and editing/Text to image"
},
{
"id": "a07fdf06-1bda-4dac-bdbd-63ee8ebca1c9",
@ -1459,12 +1458,11 @@
],
"extra": {
"workflowRendererVersion": "LG"
},
"description": "Encodes a negative text prompt via CLIP for classifier-free guidance in anime-style generation (NetaYume Lumina)."
}
}
]
},
"extra": {
"ue_links": []
}
}
}

View File

@ -1941,8 +1941,7 @@
"extra": {
"workflowRendererVersion": "Vue-corrected"
},
"category": "Image generation and editing/Text to image",
"description": "Generates images from text prompts using Qwen-Image-2512, with enhanced human realism and finer natural detail over the base version."
"category": "Image generation and editing/Text to image"
}
]
},

View File

@ -1873,8 +1873,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Text to image",
"description": "Generates images from text prompts using Qwen-Image, Alibaba's 20B MMDiT model with excellent multilingual text rendering."
"category": "Image generation and editing/Text to image"
}
]
},

File diff suppressed because it is too large Load Diff

View File

@ -1,21 +1,22 @@
{
"id": "1c3eaa76-5cfa-4dc7-8571-97a570324e01",
"revision": 0,
"last_node_id": 57,
"last_link_id": 0,
"last_node_id": 34,
"last_link_id": 40,
"nodes": [
{
"id": 57,
"type": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
"id": 5,
"type": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
"pos": [
130,
200
-2.5766491043910378e-05,
1229.999928629805
],
"size": [
400,
470
],
"flags": {},
"order": 1,
"order": 0,
"mode": 0,
"inputs": [
{
@ -43,22 +44,6 @@
},
"link": null
},
{
"name": "seed",
"type": "INT",
"widget": {
"name": "seed"
},
"link": null
},
{
"name": "steps",
"type": "INT",
"widget": {
"name": "steps"
},
"link": null
},
{
"name": "unet_name",
"type": "COMBO",
@ -95,15 +80,15 @@
"properties": {
"proxyWidgets": [
[
"27",
"-1",
"text"
],
[
"13",
"-1",
"width"
],
[
"13",
"-1",
"height"
],
[
@ -112,23 +97,19 @@
],
[
"3",
"steps"
"control_after_generate"
],
[
"28",
"-1",
"unet_name"
],
[
"30",
"-1",
"clip_name"
],
[
"29",
"-1",
"vae_name"
],
[
"3",
"control_after_generate"
]
],
"cnr_id": "comfy-core",
@ -141,40 +122,48 @@
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [],
"title": "Text to Image (Z-Image-Turbo)"
"widgets_values": [
"",
1024,
1024,
null,
null,
"z_image_turbo_bf16.safetensors",
"qwen_3_4b.safetensors",
"ae.safetensors"
]
}
],
"links": [],
"version": 0.4,
"groups": [],
"definitions": {
"subgraphs": [
{
"id": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
"id": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
"version": 1,
"state": {
"lastGroupId": 4,
"lastNodeId": 61,
"lastLinkId": 75,
"lastNodeId": 34,
"lastLinkId": 40,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Text to Image (Z-Image-Turbo)",
"name": "local-Text to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
-560,
480,
-80,
425,
120,
200
160
]
},
"outputNode": {
"id": -20,
"bounding": [
1670,
320,
1490,
415,
120,
60
]
@ -189,8 +178,8 @@
],
"label": "prompt",
"pos": [
-460,
500
20,
445
]
},
{
@ -201,8 +190,8 @@
35
],
"pos": [
-460,
520
20,
465
]
},
{
@ -213,68 +202,44 @@
36
],
"pos": [
-460,
540
20,
485
]
},
{
"id": "f77677f7-6bf6-4c19-a71f-c4a553d5981e",
"name": "seed",
"type": "INT",
"linkIds": [
71
],
"pos": [
-460,
560
]
},
{
"id": "ef9a9fb1-5983-4bc9-a60b-cf5aec48bff1",
"name": "steps",
"type": "INT",
"linkIds": [
72
],
"pos": [
-460,
580
]
},
{
"id": "a20a1b30-785f-4a04-bb6d-3d61adab9764",
"id": "23087d15-8412-4fbd-b71e-9b6d7ef76de1",
"name": "unet_name",
"type": "COMBO",
"linkIds": [
73
38
],
"pos": [
-460,
600
20,
505
]
},
{
"id": "4af8fc2b-4655-4086-8240-45f8cb38c6f6",
"id": "0677f5c3-2a3f-43d4-98ac-a4c56d5efdc0",
"name": "clip_name",
"type": "COMBO",
"linkIds": [
74
39
],
"pos": [
-460,
620
20,
525
]
},
{
"id": "4d518693-2807-439c-9cb6-cffd23ccba2c",
"id": "c85c0445-2641-48b1-bbca-95057edf2fcf",
"name": "vae_name",
"type": "COMBO",
"linkIds": [
75
40
],
"pos": [
-460,
640
20,
545
]
}
],
@ -288,8 +253,8 @@
],
"localized_name": "IMAGE",
"pos": [
1690,
340
1510,
435
]
}
],
@ -299,15 +264,15 @@
"id": 30,
"type": "CLIPLoader",
"pos": [
30,
420
109.99997264844609,
329.99999029608756
],
"size": [
270,
150
269.9869791666667,
106
],
"flags": {},
"order": 7,
"order": 0,
"mode": 0,
"inputs": [
{
@ -317,7 +282,7 @@
"widget": {
"name": "clip_name"
},
"link": 74
"link": 39
},
{
"localized_name": "type",
@ -350,9 +315,9 @@
}
],
"properties": {
"Node name for S&R": "CLIPLoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "CLIPLoader",
"models": [
{
"name": "qwen_3_4b.safetensors",
@ -378,15 +343,15 @@
"id": 29,
"type": "VAELoader",
"pos": [
30,
650
109.99997264844609,
479.9999847172637
],
"size": [
270,
110
269.9869791666667,
58
],
"flags": {},
"order": 6,
"order": 1,
"mode": 0,
"inputs": [
{
@ -396,7 +361,7 @@
"widget": {
"name": "vae_name"
},
"link": 75
"link": 40
}
],
"outputs": [
@ -410,9 +375,9 @@
}
],
"properties": {
"Node name for S&R": "VAELoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "VAELoader",
"models": [
{
"name": "ae.safetensors",
@ -436,12 +401,12 @@
"id": 33,
"type": "ConditioningZeroOut",
"pos": [
630,
960
639.9999103333332,
620.0000271257795
],
"size": [
230,
80
204.134765625,
26
],
"flags": {},
"order": 8,
@ -465,9 +430,9 @@
}
],
"properties": {
"Node name for S&R": "ConditioningZeroOut",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "ConditioningZeroOut",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -475,21 +440,22 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 8,
"type": "VAEDecode",
"pos": [
1320,
230
1219.9999088104782,
160.00009184959066
],
"size": [
230,
100
209.98697916666669,
46
],
"flags": {},
"order": 1,
"order": 5,
"mode": 0,
"inputs": [
{
@ -517,9 +483,9 @@
}
],
"properties": {
"Node name for S&R": "VAEDecode",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "VAEDecode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -527,21 +493,22 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 28,
"type": "UNETLoader",
"pos": [
30,
230
109.99997264844609,
200.0000502647102
],
"size": [
270,
110
269.9869791666667,
82
],
"flags": {},
"order": 5,
"order": 2,
"mode": 0,
"inputs": [
{
@ -551,7 +518,7 @@
"widget": {
"name": "unet_name"
},
"link": 73
"link": 38
},
{
"localized_name": "weight_dtype",
@ -574,9 +541,9 @@
}
],
"properties": {
"Node name for S&R": "UNETLoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "UNETLoader",
"models": [
{
"name": "z_image_turbo_bf16.safetensors",
@ -601,15 +568,15 @@
"id": 27,
"type": "CLIPTextEncode",
"pos": [
400,
230
429.99997828947767,
200.0000502647102
],
"size": [
450,
650
409.9869791666667,
319.9869791666667
],
"flags": {},
"order": 4,
"order": 7,
"mode": 0,
"inputs": [
{
@ -640,9 +607,9 @@
}
],
"properties": {
"Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -659,15 +626,15 @@
"id": 13,
"type": "EmptySD3LatentImage",
"pos": [
40,
890
109.99997264844609,
629.9999791384399
],
"size": [
260,
170
259.9869791666667,
106
],
"flags": {},
"order": 3,
"order": 6,
"mode": 0,
"inputs": [
{
@ -710,9 +677,9 @@
}
],
"properties": {
"Node name for S&R": "EmptySD3LatentImage",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "EmptySD3LatentImage",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -727,77 +694,19 @@
1
]
},
{
"id": 11,
"type": "ModelSamplingAuraFlow",
"pos": [
950,
230
],
"size": [
310,
110
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "model",
"name": "model",
"type": "MODEL",
"link": 26
},
{
"localized_name": "shift",
"name": "shift",
"type": "FLOAT",
"widget": {
"name": "shift"
},
"link": null
}
],
"outputs": [
{
"localized_name": "MODEL",
"name": "MODEL",
"type": "MODEL",
"slot_index": 0,
"links": [
13
]
}
],
"properties": {
"Node name for S&R": "ModelSamplingAuraFlow",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
3
]
},
{
"id": 3,
"type": "KSampler",
"pos": [
950,
400
879.9999615530063,
269.9999774911694
],
"size": [
320,
350
314.9869791666667,
262
],
"flags": {},
"order": 0,
"order": 4,
"mode": 0,
"inputs": [
{
@ -831,7 +740,7 @@
"widget": {
"name": "seed"
},
"link": 71
"link": null
},
{
"localized_name": "steps",
@ -840,7 +749,7 @@
"widget": {
"name": "steps"
},
"link": 72
"link": null
},
{
"localized_name": "cfg",
@ -891,9 +800,9 @@
}
],
"properties": {
"Node name for S&R": "KSampler",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "KSampler",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -905,23 +814,81 @@
"widgets_values": [
0,
"randomize",
8,
4,
1,
"res_multistep",
"simple",
1
]
},
{
"id": 11,
"type": "ModelSamplingAuraFlow",
"pos": [
879.9999615530063,
160.00009184959066
],
"size": [
309.9869791666667,
58
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "model",
"name": "model",
"type": "MODEL",
"link": 26
},
{
"localized_name": "shift",
"name": "shift",
"type": "FLOAT",
"widget": {
"name": "shift"
},
"link": null
}
],
"outputs": [
{
"localized_name": "MODEL",
"name": "MODEL",
"type": "MODEL",
"slot_index": 0,
"links": [
13
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "ModelSamplingAuraFlow",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
3
]
}
],
"groups": [
{
"id": 2,
"title": "Step2 - Image size",
"title": "Image size",
"bounding": [
10,
820,
320,
280
100,
560,
290,
200
],
"color": "#3f789e",
"font_size": 24,
@ -929,12 +896,12 @@
},
{
"id": 3,
"title": "Step3 - Prompt",
"title": "Prompt",
"bounding": [
360,
410,
130,
530,
970
450,
540
],
"color": "#3f789e",
"font_size": 24,
@ -942,12 +909,12 @@
},
{
"id": 4,
"title": "Step1 - Load models",
"title": "Models",
"bounding": [
0,
100,
130,
330,
660
290,
413.6
],
"color": "#3f789e",
"font_size": 24,
@ -1060,41 +1027,25 @@
"type": "INT"
},
{
"id": 71,
"id": 38,
"origin_id": -10,
"origin_slot": 3,
"target_id": 3,
"target_slot": 4,
"type": "INT"
},
{
"id": 72,
"origin_id": -10,
"origin_slot": 4,
"target_id": 3,
"target_slot": 5,
"type": "INT"
},
{
"id": 73,
"origin_id": -10,
"origin_slot": 5,
"target_id": 28,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 74,
"id": 39,
"origin_id": -10,
"origin_slot": 6,
"origin_slot": 4,
"target_id": 30,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 75,
"id": 40,
"origin_id": -10,
"origin_slot": 7,
"origin_slot": 5,
"target_id": 29,
"target_slot": 0,
"type": "COMBO"
@ -1103,10 +1054,25 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Text to image",
"description": "Generates images from text prompts using Z-Image-Turbo, Alibaba's distilled 6B DiT model."
"category": "Image generation and editing/Text to image"
}
]
},
"extra": {}
}
"config": {},
"extra": {
"frontendVersion": "1.37.10",
"workflowRendererVersion": "LG",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true,
"ds": {
"scale": 0.8401370345180755,
"offset": [
940.0587067393087,
-830.7121087564725
]
}
},
"version": 0.4
}

File diff suppressed because it is too large Load Diff

View File

@ -4286,8 +4286,7 @@
"extra": {
"workflowRendererVersion": "Vue-corrected"
},
"category": "Video generation and editing/Text to video",
"description": "Generates video from text prompts using LTX-2.3, Lightricks' video diffusion model."
"category": "Video generation and editing/Text to video"
}
]
},

View File

@ -1572,8 +1572,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Text to video",
"description": "Generates video from text prompts using Wan2.2, Alibaba's diffusion video model."
"category": "Video generation and editing/Text to video"
}
]
},
@ -1587,4 +1586,4 @@
"VHS_KeepIntermediate": true
},
"version": 0.4
}
}

View File

@ -434,9 +434,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Sharpen",
"description": "Enhances edge contrast via unsharp masking for a sharper image appearance."
"category": "Image Tools/Sharpen"
}
]
}
}
}

View File

@ -307,8 +307,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Text generation/Video Captioning",
"description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM."
"category": "Text generation/Video Captioning"
}
]
}

View File

@ -165,7 +165,7 @@
},
"revision": 0,
"config": {},
"name": "Video Inpaint (Wan 2.1 VACE)",
"name": "local-Video Inpaint(Wan2.1 VACE)",
"inputNode": {
"id": -10,
"bounding": [
@ -2368,8 +2368,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Inpaint video",
"description": "Inpaints masked regions in video frames using Wan 2.1 VACE."
"category": "Video generation and editing/Inpaint video"
}
]
},

View File

@ -1,827 +0,0 @@
{
"revision": 0,
"last_node_id": 130,
"last_link_id": 0,
"nodes": [
{
"id": 130,
"type": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
"pos": [
-1210,
-2780
],
"size": [
300,
370
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "video",
"type": "VIDEO",
"link": null
},
{
"name": "text",
"type": "STRING",
"widget": {
"name": "text"
},
"link": null
},
{
"name": "bboxes",
"type": "BOUNDING_BOX",
"link": null
},
{
"name": "positive_coords",
"type": "STRING",
"link": null
},
{
"name": "negative_coords",
"type": "STRING",
"link": null
},
{
"name": "threshold",
"type": "FLOAT",
"widget": {
"name": "threshold"
},
"link": null
},
{
"name": "refine_iterations",
"type": "INT",
"widget": {
"name": "refine_iterations"
},
"link": null
},
{
"name": "individual_masks",
"type": "BOOLEAN",
"widget": {
"name": "individual_masks"
},
"link": null
},
{
"name": "ckpt_name",
"type": "COMBO",
"widget": {
"name": "ckpt_name"
},
"link": null
}
],
"outputs": [
{
"localized_name": "masks",
"name": "masks",
"type": "MASK",
"links": []
},
{
"localized_name": "bboxes",
"name": "bboxes",
"type": "BOUNDING_BOX",
"links": []
},
{
"name": "audio",
"type": "AUDIO",
"links": null
},
{
"name": "fps",
"type": "FLOAT",
"links": null
}
],
"properties": {
"proxyWidgets": [
[
"125",
"text"
],
[
"126",
"threshold"
],
[
"126",
"refine_iterations"
],
[
"126",
"individual_masks"
],
[
"127",
"ckpt_name"
]
],
"cnr_id": "comfy-core",
"ver": "0.19.3",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [],
"title": "Video Segmentation (SAM3)"
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
"version": 1,
"state": {
"lastGroupId": 0,
"lastNodeId": 130,
"lastLinkId": 299,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Video Segmentation (SAM3)",
"inputNode": {
"id": -10,
"bounding": [
-2260,
-3450,
136.369140625,
220
]
},
"outputNode": {
"id": -20,
"bounding": [
-1050,
-3510,
120,
120
]
},
"inputs": [
{
"id": "680ffd88-32fe-48be-88d6-91ea44d5eaee",
"name": "video",
"type": "VIDEO",
"linkIds": [
252
],
"pos": [
-2143.630859375,
-3430
]
},
{
"id": "ceaf249c-32d7-4624-8bf6-e590e347ed90",
"name": "text",
"type": "STRING",
"linkIds": [
254
],
"pos": [
-2143.630859375,
-3410
]
},
{
"id": "1ffbff36-da0c-4854-8cb4-88ad31e64f99",
"name": "bboxes",
"type": "BOUNDING_BOX",
"linkIds": [
255
],
"pos": [
-2143.630859375,
-3390
]
},
{
"id": "67b7f4c7-cec0-4e00-b154-23cc1abf880e",
"name": "positive_coords",
"type": "STRING",
"linkIds": [
256
],
"pos": [
-2143.630859375,
-3370
]
},
{
"id": "b090a498-2bde-46b9-9554-18501401d687",
"name": "negative_coords",
"type": "STRING",
"linkIds": [
257
],
"pos": [
-2143.630859375,
-3350
]
},
{
"id": "1a76dfcf-ce95-46af-bba5-c42160c683dd",
"name": "threshold",
"type": "FLOAT",
"linkIds": [
261
],
"pos": [
-2143.630859375,
-3330
]
},
{
"id": "999523fa-c476-4c53-80c3-0a2f554d18ab",
"name": "refine_iterations",
"type": "INT",
"linkIds": [
262
],
"pos": [
-2143.630859375,
-3310
]
},
{
"id": "d2371011-7fe5-4a39-b0c1-df2e0bbd6ece",
"name": "individual_masks",
"type": "BOOLEAN",
"linkIds": [
263
],
"pos": [
-2143.630859375,
-3290
]
},
{
"id": "675a8b37-17db-48d1-853c-2fe5d6a74582",
"name": "ckpt_name",
"type": "COMBO",
"linkIds": [
273
],
"pos": [
-2143.630859375,
-3270
]
}
],
"outputs": [
{
"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
"name": "masks",
"type": "MASK",
"linkIds": [
231
],
"localized_name": "masks",
"pos": [
-1030,
-3490
]
},
{
"id": "8f622e40-8528-4078-b7d3-147e9f872194",
"name": "bboxes",
"type": "BOUNDING_BOX",
"linkIds": [
232
],
"localized_name": "bboxes",
"pos": [
-1030,
-3470
]
},
{
"id": "6c9924ec-f0fa-4509-83ea-8f97f5889bcc",
"name": "audio",
"type": "AUDIO",
"linkIds": [
259
],
"pos": [
-1030,
-3450
]
},
{
"id": "82c1cddc-ab11-44eb-9e2f-1a5c7ea5645b",
"name": "fps",
"type": "FLOAT",
"linkIds": [
260
],
"pos": [
-1030,
-3430
]
}
],
"widgets": [],
"nodes": [
{
"id": 125,
"type": "CLIPTextEncode",
"pos": [
-2010,
-3040
],
"size": [
400,
200
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "clip",
"name": "clip",
"type": "CLIP",
"link": 240
},
{
"localized_name": "text",
"name": "text",
"type": "STRING",
"widget": {
"name": "text"
},
"link": 254
}
],
"outputs": [
{
"localized_name": "CONDITIONING",
"name": "CONDITIONING",
"type": "CONDITIONING",
"links": [
200
]
}
],
"properties": {
"Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.19.3",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
""
]
},
{
"id": 126,
"type": "SAM3_Detect",
"pos": [
-1520,
-3520
],
"size": [
270,
290
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"label": "model",
"localized_name": "model",
"name": "model",
"type": "MODEL",
"link": 237
},
{
"label": "image",
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 253
},
{
"label": "conditioning",
"localized_name": "conditioning",
"name": "conditioning",
"shape": 7,
"type": "CONDITIONING",
"link": 200
},
{
"label": "bboxes",
"localized_name": "bboxes",
"name": "bboxes",
"shape": 7,
"type": "BOUNDING_BOX",
"link": 255
},
{
"label": "positive_coords",
"localized_name": "positive_coords",
"name": "positive_coords",
"shape": 7,
"type": "STRING",
"link": 256
},
{
"label": "negative_coords",
"localized_name": "negative_coords",
"name": "negative_coords",
"shape": 7,
"type": "STRING",
"link": 257
},
{
"localized_name": "threshold",
"name": "threshold",
"type": "FLOAT",
"widget": {
"name": "threshold"
},
"link": 261
},
{
"localized_name": "refine_iterations",
"name": "refine_iterations",
"type": "INT",
"widget": {
"name": "refine_iterations"
},
"link": 262
},
{
"localized_name": "individual_masks",
"name": "individual_masks",
"type": "BOOLEAN",
"widget": {
"name": "individual_masks"
},
"link": 263
}
],
"outputs": [
{
"localized_name": "masks",
"name": "masks",
"type": "MASK",
"links": [
231
]
},
{
"localized_name": "bboxes",
"name": "bboxes",
"type": "BOUNDING_BOX",
"links": [
232
]
}
],
"properties": {
"Node name for S&R": "SAM3_Detect",
"cnr_id": "comfy-core",
"ver": "0.19.3",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
0.5,
2,
false
]
},
{
"id": 127,
"type": "CheckpointLoaderSimple",
"pos": [
-1970,
-3310
],
"size": [
330,
160
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "ckpt_name",
"name": "ckpt_name",
"type": "COMBO",
"widget": {
"name": "ckpt_name"
},
"link": 273
}
],
"outputs": [
{
"localized_name": "MODEL",
"name": "MODEL",
"type": "MODEL",
"links": [
237
]
},
{
"localized_name": "CLIP",
"name": "CLIP",
"type": "CLIP",
"links": [
240
]
},
{
"localized_name": "VAE",
"name": "VAE",
"type": "VAE",
"links": null
}
],
"properties": {
"Node name for S&R": "CheckpointLoaderSimple",
"cnr_id": "comfy-core",
"ver": "0.19.3",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65,
"models": [
{
"name": "sam3.1_multiplex_fp16.safetensors",
"url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
"directory": "checkpoints"
}
]
},
"widgets_values": [
"sam3.1_multiplex_fp16.safetensors"
]
},
{
"id": 128,
"type": "GetVideoComponents",
"pos": [
-1910,
-3540
],
"size": [
230,
120
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"localized_name": "video",
"name": "video",
"type": "VIDEO",
"link": 252
}
],
"outputs": [
{
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"links": [
253
]
},
{
"localized_name": "audio",
"name": "audio",
"type": "AUDIO",
"links": [
259
]
},
{
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"links": [
260
]
}
],
"properties": {
"Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
"ver": "0.19.3",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
{
"id": 129,
"type": "Note",
"pos": [
-1980,
-2790
],
"size": [
370,
250
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [],
"title": "Note: Prompt format",
"properties": {},
"widgets_values": [
"Max tokens for this model is only 32, to separately prompt multiple subjects you can separate prompts with comma, and set the max amount of objects detected for each prompt with :N\n\nFor example above test prompt finds 2 cakes, one apron, 4 window panels"
],
"color": "#432",
"bgcolor": "#653"
}
],
"groups": [],
"links": [
{
"id": 237,
"origin_id": 127,
"origin_slot": 0,
"target_id": 126,
"target_slot": 0,
"type": "MODEL"
},
{
"id": 200,
"origin_id": 125,
"origin_slot": 0,
"target_id": 126,
"target_slot": 2,
"type": "CONDITIONING"
},
{
"id": 240,
"origin_id": 127,
"origin_slot": 1,
"target_id": 125,
"target_slot": 0,
"type": "CLIP"
},
{
"id": 231,
"origin_id": 126,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "MASK"
},
{
"id": 232,
"origin_id": 126,
"origin_slot": 1,
"target_id": -20,
"target_slot": 1,
"type": "BOUNDING_BOX"
},
{
"id": 252,
"origin_id": -10,
"origin_slot": 0,
"target_id": 128,
"target_slot": 0,
"type": "VIDEO"
},
{
"id": 253,
"origin_id": 128,
"origin_slot": 0,
"target_id": 126,
"target_slot": 1,
"type": "IMAGE"
},
{
"id": 254,
"origin_id": -10,
"origin_slot": 1,
"target_id": 125,
"target_slot": 1,
"type": "STRING"
},
{
"id": 255,
"origin_id": -10,
"origin_slot": 2,
"target_id": 126,
"target_slot": 3,
"type": "BOUNDING_BOX"
},
{
"id": 256,
"origin_id": -10,
"origin_slot": 3,
"target_id": 126,
"target_slot": 4,
"type": "STRING"
},
{
"id": 257,
"origin_id": -10,
"origin_slot": 4,
"target_id": 126,
"target_slot": 5,
"type": "STRING"
},
{
"id": 259,
"origin_id": 128,
"origin_slot": 1,
"target_id": -20,
"target_slot": 2,
"type": "AUDIO"
},
{
"id": 260,
"origin_id": 128,
"origin_slot": 2,
"target_id": -20,
"target_slot": 3,
"type": "FLOAT"
},
{
"id": 261,
"origin_id": -10,
"origin_slot": 5,
"target_id": 126,
"target_slot": 6,
"type": "FLOAT"
},
{
"id": 262,
"origin_id": -10,
"origin_slot": 6,
"target_id": 126,
"target_slot": 7,
"type": "INT"
},
{
"id": 263,
"origin_id": -10,
"origin_slot": 7,
"target_id": 126,
"target_slot": 8,
"type": "BOOLEAN"
},
{
"id": 273,
"origin_id": -10,
"origin_slot": 8,
"target_id": 127,
"target_slot": 0,
"type": "COMBO"
}
],
"extra": {},
"category": "Video Tools",
"description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
}
]
},
"extra": {}
}

View File

@ -1,21 +1,21 @@
{
"revision": 0,
"last_node_id": 85,
"last_node_id": 84,
"last_link_id": 0,
"nodes": [
{
"id": 85,
"type": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
"id": 84,
"type": "8e8aa94a-647e-436d-8440-8ee4691864de",
"pos": [
-880,
-2260
-6100,
2620
],
"size": [
290,
160
],
"flags": {},
"order": 2,
"order": 0,
"mode": 0,
"inputs": [
{
@ -76,26 +76,31 @@
"properties": {
"proxyWidgets": [
[
"79",
"-1",
"direction"
],
[
"79",
"-1",
"match_image_size"
],
[
"79",
"-1",
"spacing_width"
],
[
"79",
"-1",
"spacing_color"
]
],
"cnr_id": "comfy-core",
"ver": "0.13.0"
},
"widgets_values": [],
"widgets_values": [
"right",
true,
0,
"white"
],
"title": "Video Stitch"
}
],
@ -104,12 +109,12 @@
"definitions": {
"subgraphs": [
{
"id": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
"id": "8e8aa94a-647e-436d-8440-8ee4691864de",
"version": 1,
"state": {
"lastGroupId": 1,
"lastNodeId": 97,
"lastLinkId": 282,
"lastNodeId": 84,
"lastLinkId": 262,
"lastRerouteId": 0
},
"revision": 0,
@ -118,8 +123,8 @@
"inputNode": {
"id": -10,
"bounding": [
-6810,
2580,
-6580,
2649,
143.55859375,
160
]
@ -127,8 +132,8 @@
"outputNode": {
"id": -20,
"bounding": [
-4770,
2600,
-5720,
2659,
120,
60
]
@ -144,8 +149,8 @@
"localized_name": "video",
"label": "Before Video",
"pos": [
-6686.44140625,
2600
-6456.44140625,
2669
]
},
{
@ -158,8 +163,8 @@
"localized_name": "video_1",
"label": "After Video",
"pos": [
-6686.44140625,
2620
-6456.44140625,
2689
]
},
{
@ -170,8 +175,8 @@
259
],
"pos": [
-6686.44140625,
2640
-6456.44140625,
2709
]
},
{
@ -182,8 +187,8 @@
260
],
"pos": [
-6686.44140625,
2660
-6456.44140625,
2729
]
},
{
@ -194,8 +199,8 @@
261
],
"pos": [
-6686.44140625,
2680
-6456.44140625,
2749
]
},
{
@ -206,8 +211,8 @@
262
],
"pos": [
-6686.44140625,
2700
-6456.44140625,
2769
]
}
],
@ -221,8 +226,8 @@
],
"localized_name": "VIDEO",
"pos": [
-4750,
2620
-5700,
2679
]
}
],
@ -233,11 +238,11 @@
"type": "GetVideoComponents",
"pos": [
-6390,
2600
2560
],
"size": [
230,
120
193.530859375,
66
],
"flags": {},
"order": 1,
@ -273,9 +278,9 @@
}
],
"properties": {
"Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
"ver": "0.13.0"
"ver": "0.13.0",
"Node name for S&R": "GetVideoComponents"
}
},
{
@ -286,8 +291,8 @@
2420
],
"size": [
230,
120
193.530859375,
66
],
"flags": {},
"order": 0,
@ -327,254 +332,21 @@
}
],
"properties": {
"Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
"ver": "0.13.0"
"ver": "0.13.0",
"Node name for S&R": "GetVideoComponents"
}
},
{
"id": 90,
"type": "GetImageSize",
"pos": [
-6390,
3030
],
"size": [
230,
120
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 266
}
],
"outputs": [
{
"localized_name": "width",
"name": "width",
"type": "INT",
"links": [
274
]
},
{
"localized_name": "height",
"name": "height",
"type": "INT",
"links": [
276
]
},
{
"localized_name": "batch_size",
"name": "batch_size",
"type": "INT",
"links": null
}
],
"properties": {
"Node name for S&R": "GetImageSize"
}
},
{
"id": 80,
"type": "CreateVideo",
"pos": [
-5190,
2420
],
"size": [
270,
130
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"link": 282
},
{
"localized_name": "audio",
"name": "audio",
"shape": 7,
"type": "AUDIO",
"link": 251
},
{
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"widget": {
"name": "fps"
},
"link": 252
}
],
"outputs": [
{
"localized_name": "VIDEO",
"name": "VIDEO",
"type": "VIDEO",
"links": [
255
]
}
],
"properties": {
"Node name for S&R": "CreateVideo",
"cnr_id": "comfy-core",
"ver": "0.13.0"
},
"widgets_values": [
30
]
},
{
"id": 95,
"type": "ComfyMathExpression",
"pos": [
-6040,
3020
],
"size": [
400,
200
],
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"label": "a",
"localized_name": "values.a",
"name": "values.a",
"type": "FLOAT,INT",
"link": 274
},
{
"label": "b",
"localized_name": "values.b",
"name": "values.b",
"shape": 7,
"type": "FLOAT,INT",
"link": null
},
{
"localized_name": "expression",
"name": "expression",
"type": "STRING",
"widget": {
"name": "expression"
},
"link": null
}
],
"outputs": [
{
"localized_name": "FLOAT",
"name": "FLOAT",
"type": "FLOAT",
"links": null
},
{
"localized_name": "INT",
"name": "INT",
"type": "INT",
"links": [
279
]
}
],
"properties": {
"Node name for S&R": "ComfyMathExpression"
},
"widgets_values": [
"a & ~1"
]
},
{
"id": 96,
"type": "ComfyMathExpression",
"pos": [
-6040,
3290
],
"size": [
400,
200
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"label": "a",
"localized_name": "values.a",
"name": "values.a",
"type": "FLOAT,INT",
"link": 276
},
{
"label": "b",
"localized_name": "values.b",
"name": "values.b",
"shape": 7,
"type": "FLOAT,INT",
"link": null
},
{
"localized_name": "expression",
"name": "expression",
"type": "STRING",
"widget": {
"name": "expression"
},
"link": null
}
],
"outputs": [
{
"localized_name": "FLOAT",
"name": "FLOAT",
"type": "FLOAT",
"links": null
},
{
"localized_name": "INT",
"name": "INT",
"type": "INT",
"links": [
280
]
}
],
"properties": {
"Node name for S&R": "ComfyMathExpression"
},
"widgets_values": [
"a & ~1"
]
},
{
"id": 79,
"type": "ImageStitch",
"pos": [
-6390,
2780
2700
],
"size": [
270,
160
150
],
"flags": {},
"order": 2,
@ -636,15 +408,14 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
266,
281
250
]
}
],
"properties": {
"Node name for S&R": "ImageStitch",
"cnr_id": "comfy-core",
"ver": "0.13.0"
"ver": "0.13.0",
"Node name for S&R": "ImageStitch"
},
"widgets_values": [
"right",
@ -654,91 +425,60 @@
]
},
{
"id": 97,
"type": "ResizeImageMaskNode",
"id": 80,
"type": "CreateVideo",
"pos": [
-5560,
2790
-6040,
2610
],
"size": [
270,
160
78
],
"flags": {},
"order": 7,
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "input",
"name": "input",
"type": "IMAGE,MASK",
"link": 281
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"link": 250
},
{
"localized_name": "resize_type",
"name": "resize_type",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "resize_type"
},
"link": null
"localized_name": "audio",
"name": "audio",
"shape": 7,
"type": "AUDIO",
"link": 251
},
{
"localized_name": "width",
"name": "resize_type.width",
"type": "INT",
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"widget": {
"name": "resize_type.width"
"name": "fps"
},
"link": 279
},
{
"localized_name": "height",
"name": "resize_type.height",
"type": "INT",
"widget": {
"name": "resize_type.height"
},
"link": 280
},
{
"localized_name": "crop",
"name": "resize_type.crop",
"type": "COMBO",
"widget": {
"name": "resize_type.crop"
},
"link": null
},
{
"localized_name": "scale_method",
"name": "scale_method",
"type": "COMBO",
"widget": {
"name": "scale_method"
},
"link": null
"link": 252
}
],
"outputs": [
{
"localized_name": "resized",
"name": "resized",
"type": "*",
"localized_name": "VIDEO",
"name": "VIDEO",
"type": "VIDEO",
"links": [
282
255
]
}
],
"properties": {
"Node name for S&R": "ResizeImageMaskNode"
"cnr_id": "comfy-core",
"ver": "0.13.0",
"Node name for S&R": "CreateVideo"
},
"widgets_values": [
"scale dimensions",
512,
512,
"center",
"area"
30
]
}
],
@ -760,6 +500,14 @@
"target_slot": 1,
"type": "IMAGE"
},
{
"id": 250,
"origin_id": 79,
"origin_slot": 0,
"target_id": 80,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 251,
"origin_id": 77,
@ -831,71 +579,13 @@
"target_id": 79,
"target_slot": 5,
"type": "COMBO"
},
{
"id": 266,
"origin_id": 79,
"origin_slot": 0,
"target_id": 90,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 274,
"origin_id": 90,
"origin_slot": 0,
"target_id": 95,
"target_slot": 0,
"type": "INT"
},
{
"id": 276,
"origin_id": 90,
"origin_slot": 1,
"target_id": 96,
"target_slot": 0,
"type": "INT"
},
{
"id": 279,
"origin_id": 95,
"origin_slot": 1,
"target_id": 97,
"target_slot": 2,
"type": "INT"
},
{
"id": 280,
"origin_id": 96,
"origin_slot": 1,
"target_id": 97,
"target_slot": 3,
"type": "INT"
},
{
"id": 281,
"origin_id": 79,
"origin_slot": 0,
"target_id": 97,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 282,
"origin_id": 97,
"origin_slot": 0,
"target_id": 80,
"target_slot": 0,
"type": "IMAGE"
}
],
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video Tools/Stitch videos",
"description": "Stitches multiple video clips into a single sequential video file."
"category": "Video Tools/Stitch videos"
}
]
},
"extra": {}
}
}
}

View File

@ -412,10 +412,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Enhance video",
"description": "Upscales video to 4× resolution using a GAN-based upscaling model."
"category": "Video generation and editing/Enhance video"
}
]
},
"extra": {}
}
}

View File

@ -1,7 +0,0 @@
{
"model_type": "birefnet",
"image_std": [1.0, 1.0, 1.0],
"image_mean": [0.0, 0.0, 0.0],
"image_size": 1024,
"resize_to_original": true
}

View File

@ -1,689 +0,0 @@
import torch
import comfy.ops
import numpy as np
import torch.nn as nn
from functools import partial
import torch.nn.functional as F
from torchvision.ops import deform_conv2d
from comfy.ldm.modules.attention import optimized_attention_for_device
CXT = [3072, 1536, 768, 384][1:][::-1][-3:]
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, device=None, dtype=None, operations=None):
super().__init__()
self.dim = dim
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim ** -0.5
self.q = operations.Linear(dim, dim, bias=qkv_bias, device=device, dtype=dtype)
self.kv = operations.Linear(dim, dim * 2, bias=qkv_bias, device=device, dtype=dtype)
self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)
def forward(self, x):
B, N, C = x.shape
optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
k, v = kv[0], kv[1]
x = optimized_attention(
q, k, v, heads=self.num_heads, skip_output_reshape=True, skip_reshape=True
).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
return x
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, device=None, dtype=None, operations=None):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = operations.Linear(in_features, hidden_features, device=device, dtype=dtype)
self.act = nn.GELU()
self.fc2 = operations.Linear(hidden_features, out_features, device=device, dtype=dtype)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.fc2(x)
return x
def window_partition(x, window_size):
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows
def window_reverse(windows, window_size, H, W):
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
class WindowAttention(nn.Module):
def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, device=None, dtype=None, operations=None):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim ** -0.5
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads, device=device, dtype=dtype))
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij')) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.window_size[0] - 1
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer("relative_position_index", relative_position_index)
self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, device=device, dtype=dtype)
self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)
self.softmax = nn.Softmax(dim=-1)
def forward(self, x, mask=None):
B_, N, C = x.shape
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.long().view(-1)].view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
x = self.proj(x)
return x
class SwinTransformerBlock(nn.Module):
def __init__(self, dim, num_heads, window_size=7, shift_size=0,
mlp_ratio=4., qkv_bias=True, qk_scale=None,
norm_layer=nn.LayerNorm, device=None, dtype=None, operations=None):
super().__init__()
self.dim = dim
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
self.norm1 = norm_layer(dim, device=device, dtype=dtype)
self.attn = WindowAttention(
dim, window_size=(self.window_size, self.window_size), num_heads=num_heads,
qkv_bias=qkv_bias, qk_scale=qk_scale, device=device, dtype=dtype, operations=operations)
self.norm2 = norm_layer(dim, device=device, dtype=dtype)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, device=device, dtype=dtype, operations=operations)
self.H = None
self.W = None
def forward(self, x, mask_matrix):
B, L, C = x.shape
H, W = self.H, self.W
shortcut = x
x = self.norm1(x)
x = x.view(B, H, W, C)
pad_l = pad_t = 0
pad_r = (self.window_size - W % self.window_size) % self.window_size
pad_b = (self.window_size - H % self.window_size) % self.window_size
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
_, Hp, Wp, _ = x.shape
if self.shift_size > 0:
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
attn_mask = mask_matrix
else:
shifted_x = x
attn_mask = None
x_windows = window_partition(shifted_x, self.window_size)
x_windows = x_windows.view(-1, self.window_size * self.window_size, C)
attn_windows = self.attn(x_windows, mask=attn_mask)
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
if self.shift_size > 0:
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
x = shifted_x
if pad_r > 0 or pad_b > 0:
x = x[:, :H, :W, :].contiguous()
x = x.view(B, H * W, C)
x = shortcut + x
x = x + self.mlp(self.norm2(x))
return x
class PatchMerging(nn.Module):
def __init__(self, dim, device=None, dtype=None, operations=None):
super().__init__()
self.dim = dim
self.reduction = operations.Linear(4 * dim, 2 * dim, bias=False, device=device, dtype=dtype)
self.norm = operations.LayerNorm(4 * dim, device=device, dtype=dtype)
def forward(self, x, H, W):
B, L, C = x.shape
x = x.view(B, H, W, C)
# padding
pad_input = (H % 2 == 1) or (W % 2 == 1)
if pad_input:
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
class BasicLayer(nn.Module):
def __init__(self,
dim,
depth,
num_heads,
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
norm_layer=nn.LayerNorm,
downsample=None,
device=None, dtype=None, operations=None):
super().__init__()
self.window_size = window_size
self.shift_size = window_size // 2
self.depth = depth
# build blocks
self.blocks = nn.ModuleList([
SwinTransformerBlock(
dim=dim,
num_heads=num_heads,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
norm_layer=norm_layer,
device=device, dtype=dtype, operations=operations)
for i in range(depth)])
# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, device=device, dtype=dtype, operations=operations)
else:
self.downsample = None
def forward(self, x, H, W):
Hp = int(np.ceil(H / self.window_size)) * self.window_size
Wp = int(np.ceil(W / self.window_size)) * self.window_size
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(img_mask, self.window_size)
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
for blk in self.blocks:
blk.H, blk.W = H, W
x = blk(x, attn_mask)
if self.downsample is not None:
x_down = self.downsample(x, H, W)
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W
class PatchEmbed(nn.Module):
def __init__(self, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None, device=None, dtype=None, operations=None):
super().__init__()
patch_size = (patch_size, patch_size)
self.patch_size = patch_size
self.in_channels = in_channels
self.embed_dim = embed_dim
self.proj = operations.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype)
if norm_layer is not None:
self.norm = norm_layer(embed_dim, device=device, dtype=dtype)
else:
self.norm = None
def forward(self, x):
_, _, H, W = x.size()
if W % self.patch_size[1] != 0:
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
if H % self.patch_size[0] != 0:
x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
x = self.proj(x) # B C Wh Ww
if self.norm is not None:
Wh, Ww = x.size(2), x.size(3)
x = x.flatten(2).transpose(1, 2)
x = self.norm(x)
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
return x
class SwinTransformer(nn.Module):
def __init__(self,
pretrain_img_size=224,
patch_size=4,
in_channels=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
patch_norm=True,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
device=None, dtype=None, operations=None):
super().__init__()
norm_layer = partial(operations.LayerNorm, device=device, dtype=dtype)
self.pretrain_img_size = pretrain_img_size
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.patch_norm = patch_norm
self.out_indices = out_indices
self.frozen_stages = frozen_stages
self.patch_embed = PatchEmbed(
patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim,
device=device, dtype=dtype, operations=operations,
norm_layer=norm_layer if self.patch_norm else None)
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2 ** i_layer),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
norm_layer=norm_layer,
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
device=device, dtype=dtype, operations=operations)
self.layers.append(layer)
num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
self.num_features = num_features
for i_layer in out_indices:
layer = norm_layer(num_features[i_layer])
layer_name = f'norm{i_layer}'
self.add_module(layer_name, layer)
def forward(self, x):
x = self.patch_embed(x)
Wh, Ww = x.size(2), x.size(3)
outs = []
x = x.flatten(2).transpose(1, 2)
for i in range(self.num_layers):
layer = self.layers[i]
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
x_out = norm_layer(x_out)
out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
outs.append(out)
return tuple(outs)
class DeformableConv2d(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
bias=False, device=None, dtype=None, operations=None):
super(DeformableConv2d, self).__init__()
kernel_size = kernel_size if type(kernel_size) is tuple else (kernel_size, kernel_size)
self.stride = stride if type(stride) is tuple else (stride, stride)
self.padding = padding
self.offset_conv = operations.Conv2d(in_channels,
2 * kernel_size[0] * kernel_size[1],
kernel_size=kernel_size,
stride=stride,
padding=self.padding,
bias=True, device=device, dtype=dtype)
self.modulator_conv = operations.Conv2d(in_channels,
1 * kernel_size[0] * kernel_size[1],
kernel_size=kernel_size,
stride=stride,
padding=self.padding,
bias=True, device=device, dtype=dtype)
self.regular_conv = operations.Conv2d(in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=self.padding,
bias=bias, device=device, dtype=dtype)
def forward(self, x):
offset = self.offset_conv(x)
modulator = 2. * torch.sigmoid(self.modulator_conv(x))
weight, bias, offload_info = comfy.ops.cast_bias_weight(self.regular_conv, x, offloadable=True)
x = deform_conv2d(
input=x,
offset=offset,
weight=weight,
bias=None,
padding=self.padding,
mask=modulator,
stride=self.stride,
)
comfy.ops.uncast_bias_weight(self.regular_conv, weight, bias, offload_info)
return x
class BasicDecBlk(nn.Module):
def __init__(self, in_channels=64, out_channels=64, inter_channels=64, device=None, dtype=None, operations=None):
super(BasicDecBlk, self).__init__()
inter_channels = 64
self.conv_in = operations.Conv2d(in_channels, inter_channels, 3, 1, padding=1, device=device, dtype=dtype)
self.relu_in = nn.ReLU(inplace=True)
self.dec_att = ASPPDeformable(in_channels=inter_channels, device=device, dtype=dtype, operations=operations)
self.conv_out = operations.Conv2d(inter_channels, out_channels, 3, 1, padding=1, device=device, dtype=dtype)
self.bn_in = operations.BatchNorm2d(inter_channels, device=device, dtype=dtype)
self.bn_out = operations.BatchNorm2d(out_channels, device=device, dtype=dtype)
def forward(self, x):
x = self.conv_in(x)
x = self.bn_in(x)
x = self.relu_in(x)
x = self.dec_att(x)
x = self.conv_out(x)
x = self.bn_out(x)
return x
class BasicLatBlk(nn.Module):
def __init__(self, in_channels=64, out_channels=64, device=None, dtype=None, operations=None):
super(BasicLatBlk, self).__init__()
self.conv = operations.Conv2d(in_channels, out_channels, 1, 1, 0, device=device, dtype=dtype)
def forward(self, x):
x = self.conv(x)
return x
class _ASPPModuleDeformable(nn.Module):
def __init__(self, in_channels, planes, kernel_size, padding, device, dtype, operations):
super(_ASPPModuleDeformable, self).__init__()
self.atrous_conv = DeformableConv2d(in_channels, planes, kernel_size=kernel_size,
stride=1, padding=padding, bias=False, device=device, dtype=dtype, operations=operations)
self.bn = operations.BatchNorm2d(planes, device=device, dtype=dtype)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
x = self.atrous_conv(x)
x = self.bn(x)
return self.relu(x)
class ASPPDeformable(nn.Module):
def __init__(self, in_channels, out_channels=None, parallel_block_sizes=[1, 3, 7], device=None, dtype=None, operations=None):
super(ASPPDeformable, self).__init__()
self.down_scale = 1
if out_channels is None:
out_channels = in_channels
self.in_channelster = 256 // self.down_scale
self.aspp1 = _ASPPModuleDeformable(in_channels, self.in_channelster, 1, padding=0, device=device, dtype=dtype, operations=operations)
self.aspp_deforms = nn.ModuleList([
_ASPPModuleDeformable(in_channels, self.in_channelster, conv_size, padding=int(conv_size//2), device=device, dtype=dtype, operations=operations)
for conv_size in parallel_block_sizes
])
self.global_avg_pool = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
operations.Conv2d(in_channels, self.in_channelster, 1, stride=1, bias=False, device=device, dtype=dtype),
operations.BatchNorm2d(self.in_channelster, device=device, dtype=dtype),
nn.ReLU(inplace=True))
self.conv1 = operations.Conv2d(self.in_channelster * (2 + len(self.aspp_deforms)), out_channels, 1, bias=False, device=device, dtype=dtype)
self.bn1 = operations.BatchNorm2d(out_channels, device=device, dtype=dtype)
self.relu = nn.ReLU(inplace=True)
def forward(self, x):
x1 = self.aspp1(x)
x_aspp_deforms = [aspp_deform(x) for aspp_deform in self.aspp_deforms]
x5 = self.global_avg_pool(x)
x5 = F.interpolate(x5, size=x1.size()[2:], mode='bilinear', align_corners=True)
x = torch.cat((x1, *x_aspp_deforms, x5), dim=1)
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
return x
class BiRefNet(nn.Module):
def __init__(self, config=None, dtype=None, device=None, operations=None):
super(BiRefNet, self).__init__()
self.bb = SwinTransformer(embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12, device=device, dtype=dtype, operations=operations)
channels = [1536, 768, 384, 192]
channels = [c * 2 for c in channels]
self.cxt = channels[1:][::-1][-3:]
self.squeeze_module = nn.Sequential(*[
BasicDecBlk(channels[0]+sum(self.cxt), channels[0], device=device, dtype=dtype, operations=operations)
for _ in range(1)
])
self.decoder = Decoder(channels, device=device, dtype=dtype, operations=operations)
def forward_enc(self, x):
x1, x2, x3, x4 = self.bb(x)
B, C, H, W = x.shape
x1_, x2_, x3_, x4_ = self.bb(F.interpolate(x, size=(H//2, W//2), mode='bilinear', align_corners=True))
x1 = torch.cat([x1, F.interpolate(x1_, size=x1.shape[2:], mode='bilinear', align_corners=True)], dim=1)
x2 = torch.cat([x2, F.interpolate(x2_, size=x2.shape[2:], mode='bilinear', align_corners=True)], dim=1)
x3 = torch.cat([x3, F.interpolate(x3_, size=x3.shape[2:], mode='bilinear', align_corners=True)], dim=1)
x4 = torch.cat([x4, F.interpolate(x4_, size=x4.shape[2:], mode='bilinear', align_corners=True)], dim=1)
x4 = torch.cat(
(
*[
F.interpolate(x1, size=x4.shape[2:], mode='bilinear', align_corners=True),
F.interpolate(x2, size=x4.shape[2:], mode='bilinear', align_corners=True),
F.interpolate(x3, size=x4.shape[2:], mode='bilinear', align_corners=True),
][-len(CXT):],
x4
),
dim=1
)
return (x1, x2, x3, x4)
def forward_ori(self, x):
(x1, x2, x3, x4) = self.forward_enc(x)
x4 = self.squeeze_module(x4)
features = [x, x1, x2, x3, x4]
scaled_preds = self.decoder(features)
return scaled_preds
def forward(self, pixel_values, intermediate_output=None):
scaled_preds = self.forward_ori(pixel_values)
return scaled_preds
class Decoder(nn.Module):
def __init__(self, channels, device, dtype, operations):
super(Decoder, self).__init__()
# factory kwargs
fk = {"device":device, "dtype":dtype, "operations":operations}
DecoderBlock = partial(BasicDecBlk, **fk)
LateralBlock = partial(BasicLatBlk, **fk)
DBlock = partial(SimpleConvs, **fk)
self.split = True
N_dec_ipt = 64
ic = 64
ipt_cha_opt = 1
self.ipt_blk5 = DBlock(2**10*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
self.ipt_blk4 = DBlock(2**8*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
self.ipt_blk3 = DBlock(2**6*3 if self.split else 3, [N_dec_ipt, channels[1]//8][ipt_cha_opt], inter_channels=ic)
self.ipt_blk2 = DBlock(2**4*3 if self.split else 3, [N_dec_ipt, channels[2]//8][ipt_cha_opt], inter_channels=ic)
self.ipt_blk1 = DBlock(2**0*3 if self.split else 3, [N_dec_ipt, channels[3]//8][ipt_cha_opt], inter_channels=ic)
self.decoder_block4 = DecoderBlock(channels[0]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[1])
self.decoder_block3 = DecoderBlock(channels[1]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[2])
self.decoder_block2 = DecoderBlock(channels[2]+([N_dec_ipt, channels[1]//8][ipt_cha_opt]), channels[3])
self.decoder_block1 = DecoderBlock(channels[3]+([N_dec_ipt, channels[2]//8][ipt_cha_opt]), channels[3]//2)
fk = {"device":device, "dtype":dtype}
self.conv_out1 = nn.Sequential(operations.Conv2d(channels[3]//2+([N_dec_ipt, channels[3]//8][ipt_cha_opt]), 1, 1, 1, 0, **fk))
self.lateral_block4 = LateralBlock(channels[1], channels[1])
self.lateral_block3 = LateralBlock(channels[2], channels[2])
self.lateral_block2 = LateralBlock(channels[3], channels[3])
self.conv_ms_spvn_4 = operations.Conv2d(channels[1], 1, 1, 1, 0, **fk)
self.conv_ms_spvn_3 = operations.Conv2d(channels[2], 1, 1, 1, 0, **fk)
self.conv_ms_spvn_2 = operations.Conv2d(channels[3], 1, 1, 1, 0, **fk)
_N = 16
self.gdt_convs_4 = nn.Sequential(operations.Conv2d(channels[0] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
self.gdt_convs_3 = nn.Sequential(operations.Conv2d(channels[1] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
self.gdt_convs_2 = nn.Sequential(operations.Conv2d(channels[2] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
[setattr(self, f"gdt_convs_pred_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk))) for i in range(2, 5)]
[setattr(self, f"gdt_convs_attn_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk))) for i in range(2, 5)]
def get_patches_batch(self, x, p):
_size_h, _size_w = p.shape[2:]
patches_batch = []
for idx in range(x.shape[0]):
columns_x = torch.split(x[idx], split_size_or_sections=_size_w, dim=-1)
patches_x = []
for column_x in columns_x:
patches_x += [p.unsqueeze(0) for p in torch.split(column_x, split_size_or_sections=_size_h, dim=-2)]
patch_sample = torch.cat(patches_x, dim=1)
patches_batch.append(patch_sample)
return torch.cat(patches_batch, dim=0)
def forward(self, features):
x, x1, x2, x3, x4 = features
patches_batch = self.get_patches_batch(x, x4) if self.split else x
x4 = torch.cat((x4, self.ipt_blk5(F.interpolate(patches_batch, size=x4.shape[2:], mode='bilinear', align_corners=True))), 1)
p4 = self.decoder_block4(x4)
p4_gdt = self.gdt_convs_4(p4)
gdt_attn_4 = self.gdt_convs_attn_4(p4_gdt).sigmoid()
p4 = p4 * gdt_attn_4
_p4 = F.interpolate(p4, size=x3.shape[2:], mode='bilinear', align_corners=True)
_p3 = _p4 + self.lateral_block4(x3)
patches_batch = self.get_patches_batch(x, _p3) if self.split else x
_p3 = torch.cat((_p3, self.ipt_blk4(F.interpolate(patches_batch, size=x3.shape[2:], mode='bilinear', align_corners=True))), 1)
p3 = self.decoder_block3(_p3)
p3_gdt = self.gdt_convs_3(p3)
gdt_attn_3 = self.gdt_convs_attn_3(p3_gdt).sigmoid()
p3 = p3 * gdt_attn_3
_p3 = F.interpolate(p3, size=x2.shape[2:], mode='bilinear', align_corners=True)
_p2 = _p3 + self.lateral_block3(x2)
patches_batch = self.get_patches_batch(x, _p2) if self.split else x
_p2 = torch.cat((_p2, self.ipt_blk3(F.interpolate(patches_batch, size=x2.shape[2:], mode='bilinear', align_corners=True))), 1)
p2 = self.decoder_block2(_p2)
p2_gdt = self.gdt_convs_2(p2)
gdt_attn_2 = self.gdt_convs_attn_2(p2_gdt).sigmoid()
p2 = p2 * gdt_attn_2
_p2 = F.interpolate(p2, size=x1.shape[2:], mode='bilinear', align_corners=True)
_p1 = _p2 + self.lateral_block2(x1)
patches_batch = self.get_patches_batch(x, _p1) if self.split else x
_p1 = torch.cat((_p1, self.ipt_blk2(F.interpolate(patches_batch, size=x1.shape[2:], mode='bilinear', align_corners=True))), 1)
_p1 = self.decoder_block1(_p1)
_p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)
patches_batch = self.get_patches_batch(x, _p1) if self.split else x
_p1 = torch.cat((_p1, self.ipt_blk1(F.interpolate(patches_batch, size=x.shape[2:], mode='bilinear', align_corners=True))), 1)
p1_out = self.conv_out1(_p1)
return p1_out
class SimpleConvs(nn.Module):
def __init__(
self, in_channels: int, out_channels: int, inter_channels=64, device=None, dtype=None, operations=None
) -> None:
super().__init__()
self.conv1 = operations.Conv2d(in_channels, inter_channels, 3, 1, 1, device=device, dtype=dtype)
self.conv_out = operations.Conv2d(inter_channels, out_channels, 3, 1, 1, device=device, dtype=dtype)
def forward(self, x):
return self.conv_out(self.conv1(x))

View File

@ -1,85 +0,0 @@
from .utils import load_torch_file
import os
import json
import torch
import logging
import comfy.ops
import comfy.model_patcher
import comfy.model_management
import comfy.clip_model
import comfy.background_removal.birefnet
BG_REMOVAL_MODELS = {
"birefnet": comfy.background_removal.birefnet.BiRefNet
}
class BackgroundRemovalModel():
def __init__(self, json_config):
with open(json_config) as f:
config = json.load(f)
self.image_size = config.get("image_size", 1024)
self.image_mean = config.get("image_mean", [0.0, 0.0, 0.0])
self.image_std = config.get("image_std", [1.0, 1.0, 1.0])
self.model_type = config.get("model_type", "birefnet")
self.config = config.copy()
model_class = BG_REMOVAL_MODELS.get(self.model_type)
self.load_device = comfy.model_management.text_encoder_device()
offload_device = comfy.model_management.text_encoder_offload_device()
self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
self.model.eval()
self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
def load_sd(self, sd):
return self.model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())
def get_sd(self):
return self.model.state_dict()
def encode_image(self, image):
comfy.model_management.load_model_gpu(self.patcher)
H, W = image.shape[1], image.shape[2]
pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=False)
if pixel_values.shape[0] > 1:
out = torch.cat([
self.model(pixel_values=pixel_values[i:i+1])
for i in range(pixel_values.shape[0])
], dim=0)
else:
out = self.model(pixel_values=pixel_values)
out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False)
mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
if mask.ndim == 3:
mask = mask.unsqueeze(0)
if mask.shape[1] != 1:
mask = mask.movedim(-1, 1)
return mask
def load_background_removal_model(sd):
if "bb.layers.1.blocks.0.attn.relative_position_index" in sd:
json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "background_removal"), "birefnet.json")
else:
return None
bg_model = BackgroundRemovalModel(json_config)
m, u = bg_model.load_sd(sd)
if len(m) > 0:
logging.warning("missing background removal: {}".format(m))
u = set(u)
keys = list(sd.keys())
for k in keys:
if k not in u:
sd.pop(k)
return bg_model
def load(ckpt_path):
sd = load_torch_file(ckpt_path)
return load_background_removal_model(sd)

View File

@ -90,8 +90,8 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch
parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")
parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")
parser.add_argument("--enable-triton-backend", action="store_true", help="ComfyUI will enable the use of Triton backend in comfy-kitchen. Is disabled at launch by default.")
class LatentPreviewMethod(enum.Enum):
NoPreviews = "none"
@ -110,11 +110,13 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
CACHE_RAM_AUTO_GB = -1.0
cache_group = parser.add_mutually_exclusive_group()
cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")
attn_group = parser.add_mutually_exclusive_group()
attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@ -139,7 +141,8 @@ manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", he
vram_group = parser.add_mutually_exclusive_group()
vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
vram_group.add_argument("--lowvram", action="store_true", help="Doesn't do anything if dynamic vram is enabled. If dynamic vram isn't being used this option makes the text encoders run on the CPU.")
vram_group.add_argument("--normalvram", action="store_true", help="Used to force normal vram use if lowvram gets automatically enabled.")
vram_group.add_argument("--lowvram", action="store_true", help="Split the unet in parts to use less vram.")
vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
@ -235,17 +238,12 @@ database_default_path = os.path.abspath(
)
parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
parser.add_argument("--enable-assets", action="store_true", help="Enable the assets system (API routes, database synchronization, and background scanning).")
parser.add_argument("--feature-flag", type=str, action='append', default=[], metavar="KEY[=VALUE]", help="Set a server feature flag. Use KEY=VALUE to set an explicit value, or bare KEY to set it to true. Can be specified multiple times. Boolean values (true/false) and numbers are auto-converted. Examples: --feature-flag show_signin_button=true or --feature-flag show_signin_button")
parser.add_argument("--list-feature-flags", action="store_true", help="Print the registry of known CLI-settable feature flags as JSON and exit.")
if comfy.options.args_parsing:
args = parser.parse_args()
else:
args = parser.parse_args([])
if args.cache_ram is not None and len(args.cache_ram) > 2:
parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
if args.windows_standalone_build:
args.auto_launch = True

View File

@ -63,11 +63,7 @@ class IndexListContextWindow(ContextWindowABC):
dim = self.dim
if dim == 0 and full.shape[dim] == 1:
return full
indices = self.index_list
anchor_idx = getattr(self, 'causal_anchor_index', None)
if anchor_idx is not None and anchor_idx >= 0:
indices = [anchor_idx] + list(indices)
idx = tuple([slice(None)] * dim + [indices])
idx = tuple([slice(None)] * dim + [self.index_list])
window = full[idx]
if retain_index_list:
idx = tuple([slice(None)] * dim + [retain_index_list])
@ -117,14 +113,7 @@ def slice_cond(cond_value, window: IndexListContextWindow, x_in: torch.Tensor, d
# skip leading latent positions that have no corresponding conditioning (e.g. reference frames)
if temporal_offset > 0:
anchor_idx = getattr(window, 'causal_anchor_index', None)
if anchor_idx is not None and anchor_idx >= 0:
# anchor occupies one of the no-cond positions, so skip one fewer from window.index_list
skip_count = temporal_offset - 1
else:
skip_count = temporal_offset
indices = [i - temporal_offset for i in window.index_list[skip_count:]]
indices = [i - temporal_offset for i in window.index_list[temporal_offset:]]
indices = [i for i in indices if 0 <= i]
else:
indices = list(window.index_list)
@ -161,8 +150,7 @@ class ContextFuseMethod:
ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
class IndexListContextHandler(ContextHandlerABC):
def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1,
closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False,
causal_window_fix: bool=True):
closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False):
self.context_schedule = context_schedule
self.fuse_method = fuse_method
self.context_length = context_length
@ -174,7 +162,6 @@ class IndexListContextHandler(ContextHandlerABC):
self.freenoise = freenoise
self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
self.split_conds_to_windows = split_conds_to_windows
self.causal_window_fix = causal_window_fix
self.callbacks = {}
@ -331,14 +318,6 @@ class IndexListContextHandler(ContextHandlerABC):
# allow processing to end between context window executions for faster Cancel
comfy.model_management.throw_exception_if_processing_interrupted()
# causal_window_fix: prepend a pre-window frame that will be stripped post-forward
anchor_applied = False
if self.causal_window_fix:
anchor_idx = window.index_list[0] - 1
if 0 <= anchor_idx < x_in.size(self.dim):
window.causal_anchor_index = anchor_idx
anchor_applied = True
for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks):
callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device)
@ -353,12 +332,6 @@ class IndexListContextHandler(ContextHandlerABC):
if device is not None:
for i in range(len(sub_conds_out)):
sub_conds_out[i] = sub_conds_out[i].to(x_in.device)
# strip causal_window_fix anchor if applied
if anchor_applied:
for i in range(len(sub_conds_out)):
sub_conds_out[i] = sub_conds_out[i].narrow(self.dim, 1, sub_conds_out[i].shape[self.dim] - 1)
results.append(ContextResults(window_idx, sub_conds_out, sub_conds, window))
return results

View File

@ -1,34 +0,0 @@
import functools
import logging
import os
logger = logging.getLogger(__name__)
_DEFAULT_DEPLOY_ENV = "local-git"
_ENV_FILENAME = ".comfy_environment"
# Resolve the ComfyUI install directory (the parent of this `comfy/` package).
# We deliberately avoid `folder_paths.base_path` here because that is overridden
# by the `--base-directory` CLI arg to a user-supplied path, whereas the
# `.comfy_environment` marker is written by launchers/installers next to the
# ComfyUI install itself.
_COMFY_INSTALL_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
@functools.cache
def get_deploy_environment() -> str:
env_file = os.path.join(_COMFY_INSTALL_DIR, _ENV_FILENAME)
try:
with open(env_file, encoding="utf-8") as f:
# Cap the read so a malformed or maliciously crafted file (e.g.
# a single huge line with no newline) can't blow up memory.
first_line = f.readline(128).strip()
value = "".join(c for c in first_line if 32 <= ord(c) < 127)
if value:
return value
except FileNotFoundError:
pass
except Exception as e:
logger.error("Failed to read %s: %s", env_file, e)
return _DEFAULT_DEPLOY_ENV

View File

@ -93,7 +93,7 @@ class Hook:
self.hook_scope = hook_scope
'''Scope of where this hook should apply in terms of the conds used in sampling run.'''
self.custom_should_register = default_should_register
'''Can be overridden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
'''Can be overriden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
@property
def strength(self):

View File

@ -106,7 +106,6 @@ class Dino2Encoder(torch.nn.Module):
class Dino2PatchEmbeddings(torch.nn.Module):
def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
super().__init__()
self.patch_size = patch_size
self.projection = operations.Conv2d(
in_channels=num_channels,
out_channels=dim,
@ -126,37 +125,17 @@ class Dino2Embeddings(torch.nn.Module):
super().__init__()
patch_size = 14
image_size = 518
self.patch_size = patch_size
self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) # mask_token is a pre-training param, kept only so strict loading accepts the key.
self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
def interpolate_pos_encoding(self, x, h_pixels, w_pixels):
pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, torch.float32)
class_pos = pos_embed[:, 0:1]
patch_pos = pos_embed[:, 1:]
N = patch_pos.shape[1]
M = int(N ** 0.5)
h0 = h_pixels // self.patch_size
w0 = w_pixels // self.patch_size
scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M) # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
patch_pos = patch_pos.reshape(1, M, M, -1).permute(0, 3, 1, 2)
patch_pos = torch.nn.functional.interpolate(patch_pos, scale_factor=scale_factor, mode="bicubic", antialias=False)
patch_pos = patch_pos.permute(0, 2, 3, 1).flatten(1, 2)
return torch.cat((class_pos, patch_pos), dim=1).to(x.dtype)
def forward(self, pixel_values):
x = self.patch_embeddings(pixel_values)
# TODO: mask_token?
x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1)
if x.shape[1] - 1 == self.position_embeddings.shape[1] - 1:
x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
else:
h, w = pixel_values.shape[-2:]
x = x + self.interpolate_pos_encoding(x, h, w)
x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
return x
@ -179,21 +158,3 @@ class Dinov2Model(torch.nn.Module):
x = self.layernorm(x)
pooled_output = x[:, 0, :]
return x, i, pooled_output, None
def get_intermediate_layers(self, pixel_values, indices, apply_norm=True):
x = self.embeddings(pixel_values)
optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
n_layers = len(self.encoder.layer)
resolved = [(i if i >= 0 else n_layers + i) for i in indices]
target = set(resolved)
max_idx = max(resolved)
n_skip = 1 # skip cls token
cache = {}
for i, layer in enumerate(self.encoder.layer):
x = layer(x, optimized_attention)
if i in target:
normed = self.layernorm(x) if apply_norm else x
cache[i] = (normed[:, n_skip:], normed[:, 0])
if i >= max_idx:
break
return [cache[i] for i in resolved]

View File

@ -242,7 +242,6 @@ def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -374,7 +373,6 @@ def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -688,7 +686,6 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
@ -750,7 +747,6 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -836,7 +832,6 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
old_denoised = None
h, h_last = None, None
@ -894,7 +889,6 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
denoised_1, denoised_2 = None, None
h, h_1, h_2 = None, None, None
@ -1012,39 +1006,23 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
return generic_step_sampler(model, x, sigmas, extra_args, callback, disable, noise_sampler, DDPMSampler_step)
@torch.no_grad()
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, s_noise=1.0, s_noise_end=None, noise_clip_std=0.0):
# s_noise / s_noise_end: per-step noise multiplier, linearly interpolated across steps
# noise_clip_std: clamp injected noise to +/- N stddevs (0 disables).
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
n_steps = max(1, len(sigmas) - 1)
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
s_start = float(s_noise)
s_end = s_start if s_noise_end is None else float(s_noise_end)
for i in trange(n_steps, disable=disable):
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
x = denoised
if sigmas[i + 1] > 0:
noise = noise_sampler(sigmas[i], sigmas[i + 1])
if noise_clip_std > 0:
clip_val = noise_clip_std * noise.std()
noise = noise.clamp(min=-clip_val, max=clip_val)
t = (i / (n_steps - 1)) if n_steps > 1 else 0.0
s_noise_i = s_start + (s_end - s_start) * t
if s_noise_i != 1.0:
noise = noise * s_noise_i
x = model_sampling.noise_scaling(sigmas[i + 1], noise, x)
x = model.inner_model.inner_model.model_sampling.noise_scaling(sigmas[i + 1], noise_sampler(sigmas[i], sigmas[i + 1]), x)
return x
@torch.no_grad()
def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
# From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
@ -1271,7 +1249,6 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
uncond_denoised = None
@ -1319,7 +1296,6 @@ def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
temp = [0]
def post_cfg_function(args):
@ -1395,7 +1371,6 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda t: t.neg().exp()
t_fn = lambda sigma: sigma.log().neg()
@ -1529,7 +1504,6 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
def default_er_sde_noise_scaler(x):
@ -1600,10 +1574,9 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
inject_noise = eta > 0 and s_noise > 0
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
inject_noise = eta > 0 and s_noise > 0
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@ -1672,10 +1645,9 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
inject_noise = eta > 0 and s_noise > 0
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
inject_noise = eta > 0 and s_noise > 0
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@ -1741,7 +1713,6 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
s_in = x.new_ones([x.shape[0]])
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)
@ -1839,119 +1810,3 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False):
"""Stochastic Adams Solver with PECE (PredictEvaluateCorrectEvaluate) mode (NeurIPS 2023)."""
return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2)
@torch.no_grad()
def sample_ar_video(model, x, sigmas, extra_args=None, callback=None, disable=None,
num_frame_per_block=1):
"""
Autoregressive video sampler: block-by-block denoising with KV cache
and flow-match re-noising for Causal Forcing / Self-Forcing models.
Requires a Causal-WAN compatible model (diffusion_model must expose
init_kv_caches / init_crossattn_caches) and 5-D latents [B,C,T,H,W].
All AR-loop parameters are passed via the SamplerARVideo node, not read
from the checkpoint or transformer_options.
"""
extra_args = {} if extra_args is None else extra_args
model_options = extra_args.get("model_options", {})
transformer_options = model_options.get("transformer_options", {})
if x.ndim != 5:
raise ValueError(
f"ar_video sampler requires 5-D video latents [B,C,T,H,W], got {x.ndim}-D tensor with shape {x.shape}. "
"This sampler is only compatible with autoregressive video models (e.g. Causal-WAN)."
)
inner_model = model.inner_model.inner_model
causal_model = inner_model.diffusion_model
if not (hasattr(causal_model, "init_kv_caches") and hasattr(causal_model, "init_crossattn_caches")):
raise TypeError(
"ar_video sampler requires a Causal-WAN compatible model whose diffusion_model "
"exposes init_kv_caches() and init_crossattn_caches(). The loaded checkpoint "
"does not support this interface — choose a different sampler."
)
seed = extra_args.get("seed", 0)
bs, c, lat_t, lat_h, lat_w = x.shape
frame_seq_len = -(-lat_h // 2) * -(-lat_w // 2) # ceiling division
num_blocks = -(-lat_t // num_frame_per_block) # ceiling division
device = x.device
model_dtype = inner_model.get_dtype()
kv_caches = causal_model.init_kv_caches(bs, lat_t * frame_seq_len, device, model_dtype)
crossattn_caches = causal_model.init_crossattn_caches(bs, device, model_dtype)
output = torch.zeros_like(x)
s_in = x.new_ones([x.shape[0]])
current_start_frame = 0
# I2V: seed KV cache with the initial image latent before the denoising loop
initial_latent = transformer_options.get("ar_config", {}).get("initial_latent", None)
if initial_latent is not None:
initial_latent = inner_model.process_latent_in(initial_latent).to(device=device, dtype=model_dtype)
n_init = initial_latent.shape[2]
output[:, :, :n_init] = initial_latent
ar_state = {"start_frame": 0, "kv_caches": kv_caches, "crossattn_caches": crossattn_caches}
transformer_options["ar_state"] = ar_state
zero_sigma = sigmas.new_zeros([1])
_ = model(initial_latent, zero_sigma * s_in, **extra_args)
current_start_frame = n_init
remaining = lat_t - n_init
num_blocks = -(-remaining // num_frame_per_block)
num_sigma_steps = len(sigmas) - 1
total_real_steps = num_blocks * num_sigma_steps
step_count = 0
try:
for block_idx in trange(num_blocks, disable=disable):
bf = min(num_frame_per_block, lat_t - current_start_frame)
fs, fe = current_start_frame, current_start_frame + bf
noisy_input = x[:, :, fs:fe]
ar_state = {
"start_frame": current_start_frame,
"kv_caches": kv_caches,
"crossattn_caches": crossattn_caches,
}
transformer_options["ar_state"] = ar_state
for i in range(num_sigma_steps):
denoised = model(noisy_input, sigmas[i] * s_in, **extra_args)
if callback is not None:
scaled_i = step_count * num_sigma_steps // total_real_steps
callback({"x": noisy_input, "i": scaled_i, "sigma": sigmas[i],
"sigma_hat": sigmas[i], "denoised": denoised})
if sigmas[i + 1] == 0:
noisy_input = denoised
else:
sigma_next = sigmas[i + 1]
torch.manual_seed(seed + block_idx * 1000 + i)
fresh_noise = torch.randn_like(denoised)
noisy_input = (1.0 - sigma_next) * denoised + sigma_next * fresh_noise
for cache in kv_caches:
cache["end"] -= bf * frame_seq_len
step_count += 1
output[:, :, fs:fe] = noisy_input
for cache in kv_caches:
cache["end"] -= bf * frame_seq_len
zero_sigma = sigmas.new_zeros([1])
_ = model(noisy_input, zero_sigma * s_in, **extra_args)
current_start_frame += bf
finally:
transformer_options.pop("ar_state", None)
return output

View File

@ -9,7 +9,6 @@ class LatentFormat:
latent_rgb_factors_reshape = None
taesd_decoder_name = None
spacial_downscale_ratio = 8
temporal_downscale_ratio = 1
def process_in(self, latent):
return latent * self.scale_factor
@ -150,12 +149,6 @@ class SD3(LatentFormat):
class StableAudio1(LatentFormat):
latent_channels = 64
latent_dimensions = 1
temporal_downscale_ratio = 2048
class StableAudio3(LatentFormat):
latent_channels = 256
latent_dimensions = 1
temporal_downscale_ratio = 4096
class Flux(SD3):
latent_channels = 16
@ -242,7 +235,6 @@ class Flux2(LatentFormat):
class Mochi(LatentFormat):
latent_channels = 12
latent_dimensions = 3
temporal_downscale_ratio = 6
def __init__(self):
self.scale_factor = 1.0
@ -286,7 +278,6 @@ class LTXV(LatentFormat):
latent_channels = 128
latent_dimensions = 3
spacial_downscale_ratio = 32
temporal_downscale_ratio = 8
def __init__(self):
self.latent_rgb_factors = [
@ -430,7 +421,6 @@ class LTXAV(LTXV):
class HunyuanVideo(LatentFormat):
latent_channels = 16
latent_dimensions = 3
temporal_downscale_ratio = 4
scale_factor = 0.476986
latent_rgb_factors = [
[-0.0395, -0.0331, 0.0445],
@ -457,7 +447,6 @@ class HunyuanVideo(LatentFormat):
class Cosmos1CV8x8x8(LatentFormat):
latent_channels = 16
latent_dimensions = 3
temporal_downscale_ratio = 8
latent_rgb_factors = [
[ 0.1817, 0.2284, 0.2423],
@ -483,7 +472,6 @@ class Cosmos1CV8x8x8(LatentFormat):
class Wan21(LatentFormat):
latent_channels = 16
latent_dimensions = 3
temporal_downscale_ratio = 4
latent_rgb_factors = [
[-0.1299, -0.1692, 0.2932],
@ -746,7 +734,6 @@ class HunyuanVideo15(LatentFormat):
latent_channels = 32
latent_dimensions = 3
spacial_downscale_ratio = 16
temporal_downscale_ratio = 4
scale_factor = 1.03682
taesd_decoder_name = "lighttaehy1_5"
@ -772,7 +759,6 @@ class ACEAudio(LatentFormat):
class ACEAudio15(LatentFormat):
latent_channels = 64
latent_dimensions = 1
temporal_downscale_ratio = 1764
class ChromaRadiance(LatentFormat):
latent_channels = 3
@ -799,35 +785,9 @@ class ZImagePixelSpace(ChromaRadiance):
"""
pass
class HiDreamO1Pixel(ChromaRadiance):
"""Pixel-space latent format for HiDream-O1.
No VAE — model patches/unpatches raw RGB internally with patch_size=32.
"""
pass
class CogVideoX(LatentFormat):
"""Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
scale_factor matches the vae/config.json scaling_factor for the 2b variant.
The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*)
use a different value; see CogVideoX1_5 below.
"""
latent_channels = 16
latent_dimensions = 3
temporal_downscale_ratio = 4
def __init__(self):
self.scale_factor = 1.15258426
class CogVideoX1_5(CogVideoX):
"""Latent format for 5b-class CogVideoX checkpoints.
Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun
V1.5-5b family (including VOID inpainting). All of these have
scaling_factor=0.7 in their vae/config.json. Auto-selected in
supported_models.CogVideoX_T2V based on transformer hidden dim.
"""
def __init__(self):
self.scale_factor = 0.7

View File

@ -10,17 +10,6 @@ from torch import nn
from torch.nn import functional as F
import math
import comfy.ops
from .embedders import ExpoFourierFeatures
def _left_pad_to_match(emb, target_len):
emb_len = emb.shape[-2]
if emb_len < target_len:
return F.pad(emb, (0, 0, target_len - emb_len, 0), value=0.)
elif emb_len > target_len:
return emb[:, -target_len:, :]
return emb
class FourierFeatures(nn.Module):
def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@ -33,7 +22,6 @@ class FourierFeatures(nn.Module):
f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
return torch.cat([f.cos(), f.sin()], dim=-1)
# norms
class LayerNorm(nn.Module):
def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None):
@ -55,16 +43,6 @@ class LayerNorm(nn.Module):
beta = comfy.ops.cast_to_input(beta, x)
return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)
class RMSNorm(nn.Module):
def __init__(self, dim, dtype=None, device=None):
super().__init__()
self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
def forward(self, x):
return F.rms_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x))
class GLU(nn.Module):
def __init__(
self,
@ -258,6 +236,13 @@ class FeedForward(nn.Module):
linear_out = operations.Linear(inner_dim, dim_out, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(inner_dim, dim_out, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device)
# # init last linear layer to 0
# if zero_init_output:
# nn.init.zeros_(linear_out.weight)
# if not no_bias:
# nn.init.zeros_(linear_out.bias)
self.ff = nn.Sequential(
linear_in,
rearrange('b d n -> b n d') if use_conv else nn.Identity(),
@ -276,10 +261,8 @@ class Attention(nn.Module):
dim_context = None,
causal = False,
zero_init_output=True,
qk_norm = "none",
differential = False,
qk_norm = False,
natten_kernel_size = None,
feat_scale = False,
dtype=None,
device=None,
operations=None,
@ -288,7 +271,6 @@ class Attention(nn.Module):
self.dim = dim
self.dim_heads = dim_heads
self.causal = causal
self.differential = differential
dim_kv = dim_context if dim_context is not None else dim
@ -296,37 +278,18 @@ class Attention(nn.Module):
self.kv_heads = dim_kv // dim_heads
if dim_context is not None:
if differential:
self.to_q = operations.Linear(dim, dim * 2, bias=False, dtype=dtype, device=device)
self.to_kv = operations.Linear(dim_kv, dim_kv * 3, bias=False, dtype=dtype, device=device)
else:
self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
else:
if differential:
self.to_qkv = operations.Linear(dim, dim * 5, bias=False, dtype=dtype, device=device)
else:
self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
# Accept bool for backward compat
if isinstance(qk_norm, bool):
qk_norm = "l2" if qk_norm else "none"
# if zero_init_output:
# nn.init.zeros_(self.to_out.weight)
self.qk_norm = qk_norm
if self.qk_norm == "ln":
self.q_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
self.k_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
elif self.qk_norm == "rms":
self.q_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
self.k_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
self.feat_scale = feat_scale
if self.feat_scale:
self.lambda_dc = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
self.lambda_hf = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
def forward(
self,
@ -343,51 +306,22 @@ class Attention(nn.Module):
kv_input = context if has_context else x
if hasattr(self, 'to_q'):
if self.differential:
# cross-attention differential: to_q → (q, q_diff), to_kv → (k, k_diff, v)
q, q_diff = self.to_q(x).chunk(2, dim=-1)
q = rearrange(q, 'b n (h d) -> b h n d', h=h)
q_diff = rearrange(q_diff, 'b n (h d) -> b h n d', h=h)
q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D)
k, k_diff, v = self.to_kv(kv_input).chunk(3, dim=-1)
k = rearrange(k, 'b n (h d) -> b h n d', h=kv_h)
k_diff = rearrange(k_diff, 'b n (h d) -> b h n d', h=kv_h)
v = rearrange(v, 'b n (h d) -> b h n d', h=kv_h)
k = torch.stack([k, k_diff], dim=1) # (B, 2, H, M, D)
else:
# Use separate linear projections for q and k/v
q = self.to_q(x)
q = rearrange(q, 'b n (h d) -> b h n d', h = h)
# Use separate linear projections for q and k/v
q = self.to_q(x)
q = rearrange(q, 'b n (h d) -> b h n d', h = h)
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
else:
if self.differential:
# self-attention differential: to_qkv → (q, k, v, q_diff, k_diff)
q, k, v, q_diff, k_diff = self.to_qkv(x).chunk(5, dim=-1)
q, k, v, q_diff, k_diff = map(
lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h),
(q, k, v, q_diff, k_diff)
)
q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D)
k = torch.stack([k, k_diff], dim=1)
else:
# Use fused linear projection
q, k, v = self.to_qkv(x).chunk(3, dim=-1)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
# Use fused linear projection
q, k, v = self.to_qkv(x).chunk(3, dim=-1)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
# Normalize q and k for cosine sim attention
if self.qk_norm == "l2":
if self.qk_norm:
q = F.normalize(q, dim=-1)
k = F.normalize(k, dim=-1)
elif self.qk_norm == "rms":
q_type, k_type = q.dtype, k.dtype
q = self.q_norm(q).to(q_type)
k = self.k_norm(k).to(k_type)
elif self.qk_norm != 'none':
q = self.q_norm(q)
k = self.k_norm(k)
if rotary_pos_emb is not None and not has_context:
freqs, _ = rotary_pos_emb
@ -430,24 +364,9 @@ class Attention(nn.Module):
heads_per_kv_head = h // kv_h
k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v))
if self.differential:
q, q_diff = q.unbind(dim=1)
k, k_diff = k.unbind(dim=1)
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, transformer_options=transformer_options)
out = out - out_diff
else:
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
out = self.to_out(out)
if self.feat_scale:
out_dc = out.mean(dim=-2, keepdim=True)
out_hf = out - out_dc
# Selectively modulate DC and high frequency components
out = out + comfy.ops.cast_to_input(self.lambda_dc, out) * out_dc + comfy.ops.cast_to_input(self.lambda_hf, out) * out_hf
if mask is not None:
mask = rearrange(mask, 'b n -> b n 1')
out = out.masked_fill(~mask, 0.)
@ -498,14 +417,11 @@ class TransformerBlock(nn.Module):
cross_attend = False,
dim_context = None,
global_cond_dim = None,
global_cond_shared_embed = False,
local_add_cond_dim = None,
causal = False,
zero_init_branch_outputs = True,
conformer = False,
layer_ix = -1,
remove_norms = False,
norm_type = "layer_norm",
attn_kwargs = {},
ff_kwargs = {},
norm_kwargs = {},
@ -520,20 +436,8 @@ class TransformerBlock(nn.Module):
self.cross_attend = cross_attend
self.dim_context = dim_context
self.causal = causal
self.global_cond_shared_embed = global_cond_shared_embed
norm_layer_map = {
"layer_norm": LayerNorm,
"rms_norm": RMSNorm,
}
norm_cls = norm_layer_map.get(norm_type, LayerNorm)
def make_norm():
if remove_norms:
return nn.Identity()
return norm_cls(dim, dtype=dtype, device=device, **norm_kwargs)
self.pre_norm = make_norm()
self.pre_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
self.self_attn = Attention(
dim,
@ -547,7 +451,7 @@ class TransformerBlock(nn.Module):
)
if cross_attend:
self.cross_attend_norm = make_norm()
self.cross_attend_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
self.cross_attn = Attention(
dim,
dim_heads = dim_heads,
@ -560,56 +464,37 @@ class TransformerBlock(nn.Module):
**attn_kwargs
)
self.ff_norm = make_norm()
self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations, **ff_kwargs)
self.ff_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations,**ff_kwargs)
self.layer_ix = layer_ix
self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None
# Global conditioning
self.has_global_cond = (global_cond_dim is not None) or global_cond_shared_embed
self.global_cond_dim = global_cond_dim
if global_cond_shared_embed:
# SA3 style: learnable per-block additive bias; global_cond is pre-projected to (B, dim*6)
self.to_scale_shift_gate = nn.Parameter(torch.empty(dim * 6, device=device, dtype=dtype))
elif global_cond_dim is not None:
# SA1 style: per-block MLP projects global_cond → (B, dim*6)
if global_cond_dim is not None:
self.to_scale_shift_gate = nn.Sequential(
nn.SiLU(),
operations.Linear(global_cond_dim, dim * 6, bias=False, device=device, dtype=dtype)
nn.Linear(global_cond_dim, dim * 6, bias=False)
)
# Local additive conditioning (e.g. inpaint mask + masked latent)
self.local_add_cond_dim = local_add_cond_dim
if local_add_cond_dim is not None:
self.to_local_embed = nn.Sequential(
operations.Linear(local_add_cond_dim, dim, bias=True, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(dim, dim, bias=True, dtype=dtype, device=device),
)
else:
self.to_local_embed = None
nn.init.zeros_(self.to_scale_shift_gate[1].weight)
#nn.init.zeros_(self.to_scale_shift_gate_self[1].bias)
def forward(
self,
x,
context = None,
global_cond=None,
local_add_cond=None,
mask = None,
context_mask = None,
rotary_pos_emb = None,
transformer_options={}
):
if self.has_global_cond and global_cond is not None:
if self.global_cond_shared_embed:
# global_cond already has shape (B, dim*6)
ssg = (comfy.ops.cast_to_input(self.to_scale_shift_gate, global_cond) + global_cond).unsqueeze(1)
else:
ssg = self.to_scale_shift_gate(global_cond).unsqueeze(1)
if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:
scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = ssg.chunk(6, dim = -1)
scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(global_cond).unsqueeze(1).chunk(6, dim = -1)
# self-attention with adaLN
residual = x
@ -625,9 +510,6 @@ class TransformerBlock(nn.Module):
if self.conformer is not None:
x = x + self.conformer(x)
if local_add_cond is not None and self.to_local_embed is not None:
x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
# feedforward with adaLN
residual = x
x = self.ff_norm(x)
@ -645,9 +527,6 @@ class TransformerBlock(nn.Module):
if self.conformer is not None:
x = x + self.conformer(x)
if local_add_cond is not None and self.to_local_embed is not None:
x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
x = x + self.ff(self.ff_norm(x))
return x
@ -664,8 +543,6 @@ class ContinuousTransformer(nn.Module):
cross_attend=False,
cond_token_dim=None,
global_cond_dim=None,
global_cond_shared_embed=False,
local_add_cond_dim=None,
causal=False,
rotary_pos_emb=True,
zero_init_branch_outputs=True,
@ -673,7 +550,6 @@ class ContinuousTransformer(nn.Module):
use_sinusoidal_emb=False,
use_abs_pos_emb=False,
abs_pos_emb_max_length=10000,
num_memory_tokens=0,
dtype=None,
device=None,
operations=None,
@ -686,8 +562,6 @@ class ContinuousTransformer(nn.Module):
self.depth = depth
self.causal = causal
self.layers = nn.ModuleList([])
self.num_memory_tokens = num_memory_tokens
self.global_cond_shared_embed = global_cond_shared_embed
self.project_in = operations.Linear(dim_in, dim, bias=False, dtype=dtype, device=device) if dim_in is not None else nn.Identity()
self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
@ -703,22 +577,7 @@ class ContinuousTransformer(nn.Module):
self.use_abs_pos_emb = use_abs_pos_emb
if use_abs_pos_emb:
self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length + num_memory_tokens)
if num_memory_tokens > 0:
self.memory_tokens = nn.Parameter(torch.empty(num_memory_tokens, dim, device=device, dtype=dtype))
# Shared global-cond embedder (SA3 style): projects (B, global_cond_dim) → (B, dim*6)
self.global_cond_embedder = None
if global_cond_shared_embed and global_cond_dim is not None:
self.global_cond_embedder = nn.Sequential(
operations.Linear(global_cond_dim, dim, bias=True, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(dim, dim * 6, bias=True, dtype=dtype, device=device),
)
# When using shared embed, TransformerBlocks use per-block Parameter (not per-block MLP)
block_global_cond_dim = None if global_cond_shared_embed else global_cond_dim
self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length)
for i in range(depth):
self.layers.append(
@ -727,9 +586,7 @@ class ContinuousTransformer(nn.Module):
dim_heads = dim_heads,
cross_attend = cross_attend,
dim_context = cond_token_dim,
global_cond_dim = block_global_cond_dim,
global_cond_shared_embed = global_cond_shared_embed,
local_add_cond_dim = local_add_cond_dim,
global_cond_dim = global_cond_dim,
causal = causal,
zero_init_branch_outputs = zero_init_branch_outputs,
conformer=conformer,
@ -748,7 +605,6 @@ class ContinuousTransformer(nn.Module):
prepend_embeds = None,
prepend_mask = None,
global_cond = None,
local_add_cond = None,
return_info = False,
**kwargs
):
@ -776,9 +632,7 @@ class ContinuousTransformer(nn.Module):
mask = torch.cat((prepend_mask, mask), dim = -1)
if self.num_memory_tokens > 0:
memory_tokens = comfy.ops.cast_to_input(self.memory_tokens, x).expand(batch, -1, -1)
x = torch.cat((memory_tokens, x), dim=1)
# Attention layers
if self.rotary_pos_emb is not None:
rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=torch.float, device=x.device)
@ -788,10 +642,6 @@ class ContinuousTransformer(nn.Module):
if self.use_sinusoidal_emb or self.use_abs_pos_emb:
x = x + self.pos_emb(x)
# Project global_cond once (SA3 shared-embed path)
if global_cond is not None and self.global_cond_embedder is not None:
global_cond = self.global_cond_embedder(global_cond)
blocks_replace = patches_replace.get("dit", {})
# Iterate over the transformer layers
for i, layer in enumerate(self.layers):
@ -804,17 +654,12 @@ class ContinuousTransformer(nn.Module):
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
x = layer(x, rotary_pos_emb=rotary_pos_emb, global_cond=global_cond,
local_add_cond=local_add_cond, context=context,
transformer_options=transformer_options)
x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options)
# x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
if return_info:
info["hidden_states"].append(x)
# Strip memory tokens before projecting out
if self.num_memory_tokens > 0:
x = x[:, self.num_memory_tokens:, :]
x = self.project_out(x)
if return_info:
@ -837,7 +682,6 @@ class AudioDiffusionTransformer(nn.Module):
num_heads=24,
transformer_type: tp.Literal["continuous_transformer"] = "continuous_transformer",
global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend",
timestep_features_type: str = "learned",
audio_model="",
dtype=None,
device=None,
@ -852,10 +696,7 @@ class AudioDiffusionTransformer(nn.Module):
# Timestep embeddings
timestep_features_dim = 256
if timestep_features_type == "expo":
self.timestep_features = ExpoFourierFeatures(timestep_features_dim, 0.5, 10000.0)
else:
self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
self.to_timestep_embed = nn.Sequential(
operations.Linear(timestep_features_dim, embed_dim, bias=True, dtype=dtype, device=device),
@ -940,7 +781,6 @@ class AudioDiffusionTransformer(nn.Module):
cross_attn_cond=None,
cross_attn_cond_mask=None,
input_concat_cond=None,
local_add_cond=None,
global_embed=None,
prepend_cond=None,
prepend_cond_mask=None,
@ -962,13 +802,9 @@ class AudioDiffusionTransformer(nn.Module):
prepend_cond = self.to_prepend_embed(prepend_cond)
prepend_inputs = prepend_cond
prepend_length = prepend_cond.shape[1]
if prepend_cond_mask is not None:
prepend_mask = prepend_cond_mask
if local_add_cond is not None and local_add_cond.dim() == 3:
local_add_cond = local_add_cond.permute(0, 2, 1)
if input_concat_cond is not None:
# Interpolate input_concat_cond to the same length as x
@ -1014,7 +850,7 @@ class AudioDiffusionTransformer(nn.Module):
if self.transformer_type == "x-transformers":
output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, **extra_args, **kwargs)
elif self.transformer_type == "continuous_transformer":
output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, local_add_cond=local_add_cond, **extra_args, **kwargs)
output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, **extra_args, **kwargs)
if return_info:
output, info = output
@ -1040,7 +876,6 @@ class AudioDiffusionTransformer(nn.Module):
context=None,
context_mask=None,
input_concat_cond=None,
local_add_cond=None,
global_embed=None,
negative_global_embed=None,
prepend_cond=None,
@ -1055,7 +890,6 @@ class AudioDiffusionTransformer(nn.Module):
cross_attn_cond=context,
cross_attn_cond_mask=context_mask,
input_concat_cond=input_concat_cond,
local_add_cond=local_add_cond,
global_embed=global_embed,
prepend_cond=prepend_cond,
prepend_cond_mask=prepend_cond_mask,

View File

@ -31,39 +31,15 @@ def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
)
class ExpoFourierFeatures(nn.Module):
"""Exponentially-spaced Fourier features (no learnable parameters)."""
def __init__(self, dim, min_freq=0.5, max_freq=10000.0):
super().__init__()
self.dim = dim
self.min_freq = min_freq
self.max_freq = max_freq
def forward(self, t):
in_dtype = t.dtype
t = t.float()
if t.dim() == 1:
t = t.unsqueeze(-1)
half_dim = self.dim // 2
ramp = torch.linspace(0, 1, half_dim, device=t.device, dtype=torch.float32)
freqs = torch.exp(ramp * (math.log(self.max_freq) - math.log(self.min_freq)) + math.log(self.min_freq))
args = t * freqs * 2 * math.pi
return torch.cat([args.cos(), args.sin()], dim=-1).to(in_dtype)
class NumberEmbedder(nn.Module):
def __init__(
self,
features: int,
dim: int = 256,
fourier_features_type="learned",
):
super().__init__()
self.features = features
if fourier_features_type == "expo":
self.embedding = nn.Sequential(ExpoFourierFeatures(dim=dim), comfy.ops.manual_cast.Linear(in_features=dim, out_features=features))
else:
self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
def forward(self, x: Union[List[float], Tensor]) -> Tensor:
if not torch.is_tensor(x):
@ -101,15 +77,14 @@ class NumberConditioner(Conditioner):
def __init__(self,
output_dim: int,
min_val: float=0,
max_val: float=1,
fourier_features_type: str = "learned",
max_val: float=1
):
super().__init__(output_dim, output_dim)
self.min_val = min_val
self.max_val = max_val
self.embedder = NumberEmbedder(features=output_dim, fourier_features_type=fourier_features_type)
self.embedder = NumberEmbedder(features=output_dim)
def forward(self, floats, device=None):
# Cast the inputs to floats

View File

@ -1,533 +0,0 @@
import torch
import torch.nn as nn
import comfy.ops
import comfy.model_management
from comfy.ldm.modules.attention import optimized_attention
from comfy.ldm.audio.autoencoder import WNConv1d
ops = comfy.ops.disable_weight_init
class Transpose(nn.Module):
def forward(self, x, **kwargs):
return x.transpose(-2, -1)
def _zero_pad_modulo_sequence(x, size, dim=-2):
input_len = x.shape[dim]
pad_len = (size - input_len % size) % size
if pad_len > 0:
pad_shape = list(x.shape)
pad_shape[dim] = pad_len
x = torch.cat([x, torch.zeros(pad_shape, device=x.device, dtype=x.dtype)], dim=dim)
return x
def _sliding_window_mask(seq_len, window, device, dtype):
"""Additive attention mask enforcing a ±window local window (matches flash_attn window_size)."""
i = torch.arange(seq_len, device=device).unsqueeze(1)
j = torch.arange(seq_len, device=device).unsqueeze(0)
out_of_window = (j - i).abs() > window
return torch.where(
out_of_window,
torch.full((1,), torch.finfo(dtype).min / 4, device=device, dtype=dtype),
torch.zeros(1, device=device, dtype=dtype),
)
class DynamicTanh(nn.Module):
def __init__(self, dim, init_alpha=4.0, dtype=None, device=None, **kwargs):
super().__init__()
self.alpha = nn.Parameter(torch.empty(1, dtype=dtype, device=device))
self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
self.beta = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
def forward(self, x):
alpha = comfy.ops.cast_to_input(self.alpha, x)
gamma = comfy.ops.cast_to_input(self.gamma, x)
beta = comfy.ops.cast_to_input(self.beta, x)
return gamma * torch.tanh(alpha * x) + beta
class RotaryEmbedding(nn.Module):
def __init__(self, dim, base=10000, base_rescale_factor=1., dtype=None, device=None):
super().__init__()
base = base * base_rescale_factor ** (dim / (dim - 2))
self.register_buffer("inv_freq", torch.empty(dim // 2, dtype=dtype, device=device))
def forward_from_seq_len(self, seq_len, device, dtype=None):
t = torch.arange(seq_len, device=device, dtype=torch.float32)
return self.forward(t)
def forward(self, t):
freqs = torch.outer(t.float(), comfy.model_management.cast_to(self.inv_freq, dtype=torch.float32, device=t.device))
freqs = torch.cat((freqs, freqs), dim=-1)
return freqs, 1.
def _rotate_half(x):
d = x.shape[-1] // 2
return torch.cat((-x[..., d:], x[..., :d]), dim=-1)
def _apply_rotary_pos_emb(t, freqs):
out_dtype = t.dtype
rot_dim = freqs.shape[-1]
seq_len = t.shape[-2]
freqs = freqs[-seq_len:]
t_rot, t_pass = t[..., :rot_dim], t[..., rot_dim:]
t_rot = t_rot * freqs.cos() + _rotate_half(t_rot) * freqs.sin()
return torch.cat((t_rot.to(out_dtype), t_pass.to(out_dtype)), dim=-1)
class Attention(nn.Module):
def __init__(self, dim, dim_heads=64, qk_norm="none", qk_norm_eps=1e-6,
differential=False, zero_init_output=True,
dtype=None, device=None, operations=None, **kwargs):
super().__init__()
self.num_heads = dim // dim_heads
self.differential = differential
self.qk_norm = qk_norm
self.to_qkv = operations.Linear(
dim, dim * (5 if differential else 3), bias=False, dtype=dtype, device=device)
self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
if qk_norm == "dyt":
self.q_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
self.k_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
elif qk_norm == "rms":
self.q_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)
self.k_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)
def forward(self, x, rotary_pos_emb=None, mask=None, **kwargs):
B, N, _ = x.shape
h = self.num_heads
qkv = self.to_qkv(x)
if self.differential:
q, k, v, q_diff, k_diff = qkv.chunk(5, dim=-1)
del qkv
q = q.view(B, N, h, -1).transpose(1, 2)
k = k.view(B, N, h, -1).transpose(1, 2)
v = v.view(B, N, h, -1).transpose(1, 2)
q_diff = q_diff.view(B, N, h, -1).transpose(1, 2)
k_diff = k_diff.view(B, N, h, -1).transpose(1, 2)
else:
q, k, v = qkv.chunk(3, dim=-1)
del qkv
q = q.view(B, N, h, -1).transpose(1, 2)
k = k.view(B, N, h, -1).transpose(1, 2)
v = v.view(B, N, h, -1).transpose(1, 2)
if self.qk_norm != "none":
q_dtype, k_dtype = q.dtype, k.dtype
q = self.q_norm(q).to(q_dtype)
k = self.k_norm(k).to(k_dtype)
if self.differential:
q_diff = self.q_norm(q_diff).to(q_dtype)
k_diff = self.k_norm(k_diff).to(k_dtype)
if rotary_pos_emb is not None:
freqs, _ = rotary_pos_emb
q_dtype, k_dtype = q.dtype, k.dtype
q = _apply_rotary_pos_emb(q.float(), freqs).to(q_dtype)
k = _apply_rotary_pos_emb(k.float(), freqs).to(k_dtype)
if self.differential:
q_diff = _apply_rotary_pos_emb(q_diff.float(), freqs).to(q_dtype)
k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype)
if self.differential:
out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
- optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True))
del q, k, v, q_diff, k_diff
else:
out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
del q, k, v
return self.to_out(out)
class _Sin(nn.Module):
def forward(self, x):
return torch.sin(3.14159265359 * x)
class _GLU(nn.Module):
def __init__(self, dim_in, dim_out, activation, dtype=None, device=None, operations=None):
super().__init__()
self.act = activation
self.proj = operations.Linear(dim_in, dim_out * 2, dtype=dtype, device=device)
def forward(self, x):
x = self.proj(x)
x, gate = x.chunk(2, dim=-1)
return x * self.act(gate)
class FeedForward(nn.Module):
def __init__(self, dim, mult=4, no_bias=False, zero_init_output=True,
sinusoidal=False, dtype=None, device=None, operations=None, **kwargs):
super().__init__()
inner_dim = int(dim * mult)
act = _Sin() if sinusoidal else nn.SiLU()
self.ff = nn.Sequential(
_GLU(dim, inner_dim, act, dtype=dtype, device=device, operations=operations),
nn.Identity(),
operations.Linear(inner_dim, dim, bias=not no_bias, dtype=dtype, device=device),
nn.Identity(),
)
def forward(self, x, **kwargs):
return self.ff(x)
class TransformerBlock(nn.Module):
def __init__(self, dim, dim_heads=64, causal=False, zero_init_branch_outputs=True,
norm_type="dyt", add_rope=False, attn_kwargs=None, ff_kwargs=None,
norm_kwargs=None, dtype=None, device=None, operations=None, **kwargs):
super().__init__()
if attn_kwargs is None:
attn_kwargs = {}
if ff_kwargs is None:
ff_kwargs = {}
if norm_kwargs is None:
norm_kwargs = {}
dim_heads = min(dim_heads, dim)
Norm = DynamicTanh if norm_type == "dyt" else operations.RMSNorm
norm_kw = {**norm_kwargs, "dtype": dtype, "device": device}
self.pre_norm = Norm(dim, **norm_kw)
self.self_attn = Attention(dim, dim_heads=dim_heads,
zero_init_output=zero_init_branch_outputs,
dtype=dtype, device=device, operations=operations,
**attn_kwargs)
self.ff_norm = Norm(dim, **norm_kw)
self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs,
dtype=dtype, device=device, operations=operations, **ff_kwargs)
self.rope = RotaryEmbedding(dim_heads // 2, dtype=dtype, device=device) if add_rope else None
def forward(self, x, mask=None, **kwargs):
rope = self.rope.forward_from_seq_len(x.shape[-2], device=x.device) \
if self.rope is not None else None
x = x + self.self_attn(self.pre_norm(x), rotary_pos_emb=rope, mask=mask)
x = x + self.ff(self.ff_norm(x))
return x
class TransformerResamplingBlock(nn.Module):
def __init__(self, in_channels, out_channels, stride, type="encoder",
transformer_depth=3, dim_heads=128, differential=True,
sliding_window=None, chunk_size=128, chunk_midpoint_shift=False,
dyt=True, ff_mult=3, mapping_bias=True, variable_stride=False,
sinusoidal_blocks=0, conv_mapping=False, dtype=None, device=None, operations=None, **kwargs):
super().__init__()
if type not in ("encoder", "decoder"):
raise ValueError(f"type must be 'encoder' or 'decoder', got {type!r}")
self.type = type
self.stride = stride
self.chunk_size = chunk_size
self.chunk_midpoint_shift = chunk_midpoint_shift
self.variable_stride = variable_stride
self.transformer_depth = transformer_depth
transformer_dim = out_channels if type == "encoder" else in_channels
self.mapping = (WNConv1d(in_channels, out_channels, 3 if conv_mapping else 1, padding="same", bias=mapping_bias)
if in_channels != out_channels else nn.Identity())
self.sliding_window_latents = sliding_window
self.sliding_window_seq = self._get_sliding_window_size(sliding_window, stride)
self.input_seg_size, self.output_seg_size, self.sub_chunk_size = self._get_seg_sizes(stride)
token_seq = 1 if variable_stride else self.output_seg_size
self.new_tokens = nn.Parameter(torch.empty(1, token_seq, transformer_dim, dtype=dtype, device=device))
norm_type = "dyt" if dyt else "rms_norm"
attn_kwargs = {"qk_norm": "dyt" if dyt else "rms", "qk_norm_eps": 1e-3,
"differential": differential}
norm_kwargs = {"eps": 1e-3}
transformers = []
for i in range(transformer_depth):
sinusoidal = (transformer_depth - i) < sinusoidal_blocks
transformers.append(TransformerBlock(
transformer_dim,
dim_heads=dim_heads,
causal=False,
zero_init_branch_outputs=True,
norm_type=norm_type,
add_rope=True,
attn_kwargs=attn_kwargs,
ff_kwargs={"mult": ff_mult, "no_bias": False, "sinusoidal": sinusoidal},
norm_kwargs=norm_kwargs,
dtype=dtype, device=device, operations=operations,
))
self.transformers = nn.ModuleList(transformers)
def _get_sliding_window_size(self, window, stride, prepend_cond_length=0):
if window is None:
return None
return [w * (stride + 1 + prepend_cond_length) for w in window]
def _get_seg_sizes(self, stride, prepend_cond_length=0):
sub_chunk_size = stride + 1 + prepend_cond_length
input_seg_size = stride if self.type == "encoder" else 1
output_seg_size = 1 if self.type == "encoder" else stride
return input_seg_size, output_seg_size, sub_chunk_size
def forward(self, x, stride=None, **kwargs):
B = x.shape[0]
if stride is None:
input_seg = self.input_seg_size
output_seg = self.output_seg_size
sub_chunk = self.sub_chunk_size
sliding_window = self.sliding_window_seq
else:
input_seg, output_seg, sub_chunk = self._get_seg_sizes(stride)
sliding_window = self._get_sliding_window_size(self.sliding_window_latents, stride)
if self.type == "encoder":
if self.transformer_depth > 0:
pad_mod = self.chunk_size if sliding_window is None else input_seg
x = _zero_pad_modulo_sequence(x, pad_mod, dim=-1)
x = self.mapping(x)
if self.transformer_depth > 0:
x = x.permute(0, 2, 1)
if self.type != "encoder":
pad_mod = 1 if sliding_window is not None else (
self.chunk_size // (stride if stride is not None else self.stride))
x = _zero_pad_modulo_sequence(x, pad_mod)
C = x.shape[2]
x = x.reshape(-1, input_seg, C)
new_tokens = self.new_tokens.expand(x.shape[0], output_seg, -1)
x = torch.cat([x, comfy.ops.cast_to_input(new_tokens, x)], dim=-2)
del new_tokens
x = x.reshape(B, -1, C)
if sliding_window is None:
eff_chunk = self.chunk_size + self.chunk_size // (stride if stride is not None else self.stride)
if sliding_window is None and self.chunk_midpoint_shift:
split = self.transformer_depth // 2
shift = eff_chunk // 2
x = x.reshape(-1, eff_chunk, C)
for layer in self.transformers[:split]:
x = layer(x)
x = x.reshape(B, -1, C)
shifted = torch.cat([x[:, :shift, :], x, x[:, -shift:, :]], dim=1)
del x
x = shifted.reshape(-1, eff_chunk, C)
del shifted
for layer in self.transformers[split:]:
x = layer(x)
x = x.reshape(B, -1, C)
x = x[:, shift:-shift, :]
elif sliding_window is None:
x = x.reshape(-1, eff_chunk, C)
for layer in self.transformers:
x = layer(x)
x = x.reshape(B, -1, C)
else:
attn_mask = _sliding_window_mask(x.shape[1], sliding_window[0], x.device, x.dtype)
for layer in self.transformers:
x = layer(x, mask=attn_mask)
x = x.reshape(-1, sub_chunk, C)
x = x[:, -output_seg:, :]
x = x.reshape(B, -1, C).transpose(1, 2)
if self.type == "decoder":
x = self.mapping(x)
return x
class SAMEEncoder(nn.Module):
def __init__(self, in_channels=2, channels=128, latent_dim=32,
c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
transformer_depths=(3, 3, 3, 3),
dtype=None, device=None, operations=None, **kwargs):
super().__init__()
channel_dims = [in_channels] + [channels * c for c in c_mults]
layers = []
for i in range(len(c_mults)):
layers.append(TransformerResamplingBlock(
in_channels=channel_dims[i], out_channels=channel_dims[i + 1],
stride=strides[i], type="encoder",
transformer_depth=transformer_depths[i],
dtype=dtype, device=device, operations=operations, **kwargs))
layers += [
Transpose(),
operations.Linear(channel_dims[-1], latent_dim, dtype=dtype, device=device),
Transpose(),
]
self.layers = nn.ModuleList(layers)
def forward(self, x, **kwargs):
for layer in self.layers:
x = layer(x)
return x
class SAMEDecoder(nn.Module):
def __init__(self, out_channels=2, channels=128, latent_dim=32,
c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
transformer_depths=(3, 3, 3, 3), sinusoidal_blocks=None,
dtype=None, device=None, operations=None, **kwargs):
super().__init__()
if sinusoidal_blocks is None:
sinusoidal_blocks = [0] * len(c_mults)
channel_dims = [out_channels] + [channels * c for c in c_mults]
layers = [
Transpose(),
operations.Linear(latent_dim, channel_dims[-1], dtype=dtype, device=device),
Transpose(),
]
for i in range(len(c_mults) - 1, -1, -1):
layers.append(TransformerResamplingBlock(
in_channels=channel_dims[i + 1], out_channels=channel_dims[i],
stride=strides[i], type="decoder",
transformer_depth=transformer_depths[i],
sinusoidal_blocks=sinusoidal_blocks[i],
dtype=dtype, device=device, operations=operations, **kwargs))
self.layers = nn.ModuleList(layers)
def forward(self, x, **kwargs):
for layer in self.layers:
x = layer(x)
return x
class SoftNormBottleneck(nn.Module):
def __init__(self, dim=32, noise_augment_dim=0, noise_regularize=False,
auto_scale=False, freeze=False, dtype=None, device=None, **kwargs):
super().__init__()
self.noise_augment_dim = noise_augment_dim
self.noise_regularize = noise_regularize
self.scaling_factor = nn.Parameter(torch.empty(1, dim, 1, dtype=dtype, device=device))
self.bias = nn.Parameter(torch.empty(1, dim, 1, dtype=dtype, device=device))
self.noise_scaling_factor = nn.Parameter(torch.empty(1, noise_augment_dim, 1, dtype=dtype, device=device))
if auto_scale:
self.register_parameter("running_std", nn.Parameter(
torch.empty(1, dtype=dtype, device=device), requires_grad=False))
if freeze:
for p in self.parameters():
p.requires_grad = False
def encode(self, x, return_info=False, **kwargs):
x = x * comfy.ops.cast_to_input(self.scaling_factor, x) \
+ comfy.ops.cast_to_input(self.bias, x)
if hasattr(self, "running_std"):
x = x / comfy.ops.cast_to_input(self.running_std, x)
if return_info:
return x, {}
return x
def decode(self, x, **kwargs):
if hasattr(self, "running_std"):
x = x * comfy.ops.cast_to_input(self.running_std, x)
if self.noise_regularize:
scaling = self.running_std if hasattr(self, "running_std") \
else x.std(dim=-1, keepdim=True)
noise = torch.randn_like(x) * comfy.ops.cast_to_input(scaling, x) * 1e-3
x = x + noise
if self.noise_augment_dim > 0:
noise = comfy.ops.cast_to_input(self.noise_scaling_factor, x) * torch.randn(
x.shape[0], self.noise_augment_dim, x.shape[-1], device=x.device, dtype=x.dtype)
x = torch.cat([x, noise], dim=1)
return x
class PatchedPretransform(nn.Module):
def __init__(self, channels, patch_size, **kwargs):
super().__init__()
self.channels = channels
self.patch_size = patch_size
self.enable_grad = False
def _pad(self, x):
pad_len = (self.patch_size - x.shape[-1] % self.patch_size) % self.patch_size
if pad_len > 0:
x = torch.cat([x, torch.zeros_like(x[:, :, :pad_len])], dim=-1)
return x
def encode(self, x):
x = self._pad(x)
B, C, T = x.shape
h = self.patch_size
L = T // h
# b c (l h) -> b (c h) l
return x.reshape(B, C, L, h).permute(0, 1, 3, 2).reshape(B, C * h, L)
def decode(self, x):
B, Ch, L = x.shape
h = self.patch_size
C = Ch // h
# b (c h) l -> b c (l h)
return x.reshape(B, C, h, L).permute(0, 1, 3, 2).reshape(B, C, L * h)
class SA3AudioVAE(nn.Module):
"""SA3 VAE. State dict keys match checkpoint after stripping 'pretransform.model.'"""
def __init__(self, channels=256, transformer_depths=12, sinusoidal_blocks=8,
sliding_window=None, decoder_conv_mapping=False,
chunk_size=128, chunk_midpoint_shift=False,
dtype=None, device=None, operations=None):
super().__init__()
if operations is None:
operations = ops
self.pretransform = PatchedPretransform(channels=2, patch_size=256)
common_kwargs = dict(
differential=True, dyt=True, dim_heads=64,
sliding_window=sliding_window, variable_stride=True,
chunk_size=chunk_size, chunk_midpoint_shift=chunk_midpoint_shift,
dtype=dtype, device=device, operations=operations,
)
self.encoder = SAMEEncoder(
in_channels=512, channels=channels, c_mults=[6], strides=[16],
latent_dim=256, transformer_depths=[transformer_depths],
conv_mapping=False, **common_kwargs,
)
self.decoder = SAMEDecoder(
out_channels=512, channels=channels, c_mults=[6], strides=[16],
latent_dim=256, transformer_depths=[transformer_depths], sinusoidal_blocks=[sinusoidal_blocks],
conv_mapping=decoder_conv_mapping, **common_kwargs,
)
self.bottleneck = SoftNormBottleneck(
dim=256, noise_augment_dim=0, noise_regularize=True,
auto_scale=True, freeze=True,
dtype=dtype, device=device,
)
@torch.no_grad()
def _pretransform_encode(self, x):
return self.pretransform.encode(x)
@torch.no_grad()
def _pretransform_decode(self, x):
return self.pretransform.decode(x)
def encode(self, x):
x = self._pretransform_encode(x)
x = self.encoder(x)
x = self.bottleneck.encode(x)
return x
def decode(self, x):
x = self.bottleneck.decode(x)
x = self.decoder(x)
x = self._pretransform_decode(x)
return x

View File

@ -1,41 +0,0 @@
"""HiDream-O1 two-pass attention: tokens [0, ar_len) are causal, [ar_len, T)
attend full K/V. Splitting Q at the boundary avoids the (B, 1, T, T) additive
mask the general-purpose path would build (~500 MB at T~16K) and lets the
gen half hit the user's preferred backend via optimized_attention.
"""
import torch
import comfy.ops
from comfy.ldm.modules.attention import optimized_attention
def make_two_pass_attention(ar_len: int, transformer_options=None):
"""Build a two-pass attention callable. AR pass uses SDPA-causal directly, gen pass routes through optimized_attention.
The AR pass goes through SDPA directand bypasses wrappers, it is only ~1% of T at typical edit sizes.
"""
def two_pass_attention(q, k, v, heads, **kwargs):
B, H, T, D = q.shape
if T < k.shape[2]: # KV-cache hot path: Q is shorter than K/V (cached AR prefix is in K/V only), all fresh Q positions are in the gen region, single full-attention call
out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
elif ar_len >= T:
out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
elif ar_len <= 0:
out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
else:
out_ar = comfy.ops.scaled_dot_product_attention(
q[:, :, :ar_len], k[:, :, :ar_len], v[:, :, :ar_len],
attn_mask=None, dropout_p=0.0, is_causal=True,
)
out_gen = optimized_attention(
q[:, :, ar_len:], k, v, heads,
mask=None, skip_reshape=True, skip_output_reshape=True,
transformer_options=transformer_options,
)
out = torch.cat([out_ar, out_gen], dim=2)
return out.transpose(1, 2).reshape(B, T, H * D)
return two_pass_attention

View File

@ -1,230 +0,0 @@
"""HiDream-O1 conditioning prep — ref-image dual path + extra_conds assembly.
Each ref image goes through two paths: a 32x32 patchified stream concatenated
to the noised target, and a Qwen3-VL ViT path producing tokens that scatter
into input_ids at <|image_pad|> positions.
"""
from typing import List
import torch
import comfy.utils
from comfy.text_encoders.qwen_vl import process_qwen2vl_images
from .utils import (PATCH_SIZE, calculate_dimensions, cond_image_size, ref_max_size, resize_tensor)
# Qwen3-VL ViT preprocessing constants (preprocessor_config.json).
VIT_PATCH = 16
VIT_MERGE = 2
VIT_IMAGE_MEAN = [0.5, 0.5, 0.5]
VIT_IMAGE_STD = [0.5, 0.5, 0.5]
def prepare_ref_images(
ref_images: List[torch.Tensor],
target_h: int,
target_w: int,
device: torch.device,
dtype: torch.dtype,
):
"""Build the dual-path tensors for K reference images at (target_h, target_w).
Returns None for K=0, else a dict with ref_patches, ref_pixel_values,
ref_image_grid_thw, per_ref_vit_tokens, per_ref_patch_grids.
"""
K = len(ref_images)
if K == 0:
return None
max_size = ref_max_size(max(target_h, target_w), K)
cis = cond_image_size(K)
refs_t = [img[0].clamp(0, 1).permute(2, 0, 1).unsqueeze(0).contiguous().float() for img in ref_images]
refs_t = [resize_tensor(t, max_size, PATCH_SIZE) for t in refs_t]
# 32-patch path.
ref_patches_per = []
per_ref_patch_grids = []
for t in refs_t:
t_norm = (t.squeeze(0) - 0.5) / 0.5 # (3, H, W) in [-1, 1]
h_p, w_p = t_norm.shape[-2] // PATCH_SIZE, t_norm.shape[-1] // PATCH_SIZE
per_ref_patch_grids.append((h_p, w_p))
patches = (
t_norm.reshape(3, h_p, PATCH_SIZE, w_p, PATCH_SIZE)
.permute(1, 3, 0, 2, 4)
.reshape(h_p * w_p, 3 * PATCH_SIZE * PATCH_SIZE)
)
ref_patches_per.append(patches)
ref_patches = torch.cat(ref_patches_per, dim=0).unsqueeze(0).to(device=device, dtype=dtype)
# ViT path.
refs_vlm_t = []
for t in refs_t:
_, _, h, w = t.shape
cond_w, cond_h = calculate_dimensions(cis, w / h)
cond_w = max(cond_w, VIT_PATCH * VIT_MERGE)
cond_h = max(cond_h, VIT_PATCH * VIT_MERGE)
refs_vlm_t.append(comfy.utils.common_upscale(t, cond_w, cond_h, "lanczos", "disabled"))
pv_list, grid_list, per_ref_vit_tokens = [], [], []
for t_v in refs_vlm_t:
pv, grid_thw = process_qwen2vl_images(
t_v.permute(0, 2, 3, 1),
min_pixels=0, max_pixels=10**12,
patch_size=VIT_PATCH, merge_size=VIT_MERGE,
image_mean=VIT_IMAGE_MEAN, image_std=VIT_IMAGE_STD,
)
grid_thw = grid_thw[0]
pv_list.append(pv.to(device=device, dtype=dtype))
grid_list.append(grid_thw.to(device=device))
# Post-merge token count = number of <|image_pad|> tokens this image expands to in input_ids.
gh, gw = int(grid_thw[1].item()), int(grid_thw[2].item())
per_ref_vit_tokens.append((gh // VIT_MERGE) * (gw // VIT_MERGE))
return {
"ref_patches": ref_patches,
"ref_pixel_values": torch.cat(pv_list, dim=0),
"ref_image_grid_thw": torch.stack(grid_list, dim=0),
"per_ref_vit_tokens": per_ref_vit_tokens,
"per_ref_patch_grids": per_ref_patch_grids,
}
def build_ref_input_ids(
text_input_ids: torch.Tensor,
per_ref_vit_tokens: List[int],
image_token_id: int,
vision_start_id: int,
vision_end_id: int,
):
"""Splice [vision_start, image_pad*N, vision_end] blocks into input_ids
after the [im_start, user, \\n] prefix (matches original chat template).
"""
ids = text_input_ids[0].tolist()
inserted = []
for n_pad in per_ref_vit_tokens:
inserted.extend([vision_start_id] + [image_token_id] * n_pad + [vision_end_id])
new_ids = ids[:3] + inserted + ids[3:] # 3 = len([im_start, user, \n])
return torch.tensor([new_ids], dtype=text_input_ids.dtype, device=text_input_ids.device)
def build_extra_conds(
text_input_ids: torch.Tensor,
noise: torch.Tensor,
ref_images: List[torch.Tensor] = None,
target_patch_size: int = 32,
):
"""Assemble all conditioning tensors for HiDreamO1Transformer.forward:
input_ids (with ref-vision tokens spliced in for the edit/IP path),
position_ids (MRoPE), token_types, vinput_mask, plus the ref
dual-path tensors when refs are provided.
"""
from .utils import get_rope_index_fix_point
from comfy.text_encoders.hidream_o1 import (
IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
)
if text_input_ids.dim() == 1:
text_input_ids = text_input_ids.unsqueeze(0)
text_input_ids = text_input_ids.long().to(noise.device)
B = noise.shape[0]
if text_input_ids.shape[0] == 1 and B > 1:
text_input_ids = text_input_ids.expand(B, -1)
H, W = noise.shape[-2], noise.shape[-1]
h_p, w_p = H // target_patch_size, W // target_patch_size
image_len = h_p * w_p
image_grid_thw_tgt = torch.tensor(
[[1, h_p, w_p]], dtype=torch.long, device=text_input_ids.device,
)
out = {}
if ref_images:
ref = prepare_ref_images(ref_images, H, W, device=noise.device, dtype=noise.dtype)
text_input_ids = build_ref_input_ids(
text_input_ids, ref["per_ref_vit_tokens"],
IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
)
new_txt_len = text_input_ids.shape[1]
# Each ref's patchified stream gets a [vision_start, image_pad*N-1]
# block in the position-id stream after the noised target.
ref_grid_lengths = [hp * wp for (hp, wp) in ref["per_ref_patch_grids"]]
tgt_vision = torch.full((1, image_len), IMAGE_TOKEN_ID,
dtype=text_input_ids.dtype, device=text_input_ids.device)
tgt_vision[:, 0] = VISION_START_ID
ref_vision_blocks = []
for rl in ref_grid_lengths:
blk = torch.full((1, rl), IMAGE_TOKEN_ID,
dtype=text_input_ids.dtype, device=text_input_ids.device)
blk[:, 0] = VISION_START_ID
ref_vision_blocks.append(blk)
ref_vision_cat = torch.cat([tgt_vision] + ref_vision_blocks, dim=1)
input_ids_pad = torch.cat([text_input_ids, ref_vision_cat], dim=-1)
total_ref_patches_len = sum(ref_grid_lengths)
total_len = new_txt_len + image_len + total_ref_patches_len
# K (ViT, post-merge) + 1 (target) + K (ref-patches) image grids.
K = len(ref_images)
igthw_cond = ref["ref_image_grid_thw"].clone()
igthw_cond[:, 1] //= 2
igthw_cond[:, 2] //= 2
image_grid_thw_ref = torch.tensor(
[[1, hp, wp] for (hp, wp) in ref["per_ref_patch_grids"]],
dtype=torch.long, device=text_input_ids.device,
)
igthw_all = torch.cat([
igthw_cond.to(text_input_ids.device),
image_grid_thw_tgt,
image_grid_thw_ref,
], dim=0)
position_ids, _ = get_rope_index_fix_point(
spatial_merge_size=1,
image_token_id=IMAGE_TOKEN_ID,
vision_start_token_id=VISION_START_ID,
input_ids=input_ids_pad, image_grid_thw=igthw_all,
attention_mask=None,
skip_vision_start_token=[0] * K + [1] + [1] * K,
fix_point=4096,
)
# tms + target_image + ref_patches are all gen.
tms_pos = new_txt_len - 1
ar_len = tms_pos
token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
token_types[:, tms_pos:] = 1
vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
vinput_mask[:, new_txt_len:] = True
# Leading batch dim sidesteps CONDRegular.process_cond's repeat_to_batch_size truncation
out["ref_pixel_values"] = ref["ref_pixel_values"].unsqueeze(0)
out["ref_image_grid_thw"] = ref["ref_image_grid_thw"].unsqueeze(0)
out["ref_patches"] = ref["ref_patches"]
else:
# T2I: text + noised target only, vision_start replaces the first image token
txt_len = text_input_ids.shape[1]
total_len = txt_len + image_len
vision_tokens = torch.full((B, image_len), IMAGE_TOKEN_ID,
dtype=text_input_ids.dtype, device=text_input_ids.device)
vision_tokens[:, 0] = VISION_START_ID
input_ids_pad = torch.cat([text_input_ids, vision_tokens], dim=-1)
position_ids, _ = get_rope_index_fix_point(
spatial_merge_size=1,
image_token_id=IMAGE_TOKEN_ID,
vision_start_token_id=VISION_START_ID,
input_ids=input_ids_pad, image_grid_thw=image_grid_thw_tgt,
attention_mask=None,
skip_vision_start_token=[1],
)
ar_len = txt_len - 1
token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
token_types[:, ar_len:] = 1
vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
vinput_mask[:, txt_len:] = True
out["input_ids"] = text_input_ids
out["position_ids"] = position_ids[:, 0].unsqueeze(0) # Collapse position_ids batch and add a leading dim so CONDRegular's batch-resize doesn't truncate the 3-axis MRoPE dim
out["token_types"] = token_types
out["vinput_mask"] = vinput_mask
out["ar_len"] = ar_len
return out

View File

@ -1,306 +0,0 @@
"""HiDream-O1-Image transformer.
Pixel-space DiT built on Qwen3-VL: the vision tower (Qwen35VisionModel)
encodes ref images, the Qwen3-VL-8B decoder (Llama2_ with interleaved MRoPE)
processes a unified text+image sequence, and 32x32 patch embed/unembed
shims map raw RGB in and out of LLM hidden space. The Qwen3-VL deepstack
mergers go unused — their weights are dropped at load.
"""
from dataclasses import dataclass, field
from typing import List, Optional
import einops
import torch
import torch.nn as nn
import comfy.patcher_extension
from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
from comfy.text_encoders.llama import Llama2_
from comfy.text_encoders.qwen35 import Qwen35VisionModel
from .attention import make_two_pass_attention
IMAGE_TOKEN_ID = 151655 # Qwen3-VL <|image_pad|>
TMS_TOKEN_ID = 151673 # HiDream-O1 <|tms_token|>
PATCH_SIZE = 32
@dataclass
class HiDreamO1TextConfig:
"""Qwen3-VL-8B text-decoder dims (matches public Qwen3-VL-8B-Instruct)."""
vocab_size: int = 151936
hidden_size: int = 4096
intermediate_size: int = 12288
num_hidden_layers: int = 36
num_attention_heads: int = 32
num_key_value_heads: int = 8
head_dim: int = 128
max_position_embeddings: int = 128000
rms_norm_eps: float = 1e-6
rope_theta: float = 5000000.0
rope_scale: Optional[float] = None
rope_dims: List[int] = field(default_factory=lambda: [24, 20, 20])
interleaved_mrope: bool = True
transformer_type: str = "llama"
rms_norm_add: bool = False
mlp_activation: str = "silu"
qkv_bias: bool = False
q_norm: str = "gemma3"
k_norm: str = "gemma3"
final_norm: bool = True
lm_head: bool = False
stop_tokens: List[int] = field(default_factory=lambda: [151643, 151645])
QWEN3VL_VISION_DEFAULTS = dict(
hidden_size=1152,
num_heads=16,
intermediate_size=4304,
depth=27,
patch_size=16,
temporal_patch_size=2,
in_channels=3,
spatial_merge_size=2,
num_position_embeddings=2304,
deepstack_visual_indexes=(8, 16, 24),
out_hidden_size=4096, # final merger projects directly into LLM hidden
)
class BottleneckPatchEmbed(nn.Module):
# 3072 -> 1024 -> 4096 (raw 32x32 RGB patch -> bottleneck -> LLM hidden).
def __init__(self, patch_size=32, in_chans=3, pca_dim=1024, embed_dim=4096, bias=True, device=None, dtype=None, ops=None):
super().__init__()
self.proj1 = ops.Linear(patch_size * patch_size * in_chans, pca_dim, bias=False, device=device, dtype=dtype)
self.proj2 = ops.Linear(pca_dim, embed_dim, bias=bias, device=device, dtype=dtype)
def forward(self, x):
return self.proj2(self.proj1(x))
class FinalLayer(nn.Module):
# 4096 -> 3072 (LLM hidden -> flat pixel patch).
def __init__(self, hidden_size, patch_size=32, out_channels=3, device=None, dtype=None, ops=None):
super().__init__()
self.linear = ops.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, device=device, dtype=dtype)
def forward(self, x):
return self.linear(x)
class HiDreamO1Transformer(nn.Module):
"""HiDream-O1 unified pixel-level transformer."""
def __init__(self, image_model=None, dtype=None, device=None, operations=None,
text_config_overrides=None, vision_config_overrides=None, **kwargs):
super().__init__()
self.dtype = dtype
text_cfg = HiDreamO1TextConfig(**(text_config_overrides or {}))
vision_cfg = dict(QWEN3VL_VISION_DEFAULTS)
if vision_config_overrides:
vision_cfg.update(vision_config_overrides)
vision_cfg["out_hidden_size"] = text_cfg.hidden_size
self.text_config = text_cfg
self.vision_config = vision_cfg
self.hidden_size = text_cfg.hidden_size
self.patch_size = PATCH_SIZE
self.in_channels = 3
self.tms_token_id = TMS_TOKEN_ID
self.visual = Qwen35VisionModel(vision_cfg, device=device, dtype=dtype, ops=operations)
self.language_model = Llama2_(text_cfg, device=device, dtype=dtype, ops=operations)
self.t_embedder1 = TimestepEmbedder(
text_cfg.hidden_size, device=device, dtype=dtype, operations=operations,
)
self.x_embedder = BottleneckPatchEmbed(
patch_size=self.patch_size, in_chans=self.in_channels,
pca_dim=text_cfg.hidden_size // 4, embed_dim=text_cfg.hidden_size,
bias=True, device=device, dtype=dtype, ops=operations,
)
self.final_layer2 = FinalLayer(
text_cfg.hidden_size, patch_size=self.patch_size,
out_channels=self.in_channels, device=device, dtype=dtype, ops=operations,
)
self._visual_cache = None
self._kv_cache_entries = []
def clear_kv_cache(self):
self._kv_cache_entries = []
self._visual_cache = None
def forward(self, x, timesteps, context=None, transformer_options={}, **kwargs):
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
self._forward,
self,
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
).execute(x, timesteps, context, transformer_options, **kwargs)
def _forward(self, x, timesteps, context=None, transformer_options={}, input_ids=None, attention_mask=None, position_ids=None,
vinput_mask=None, ar_len=None, ref_pixel_values=None, ref_image_grid_thw=None, ref_patches=None, **kwargs):
"""Returns flow-match velocity (x - x_pred) / sigma"""
if input_ids is None or position_ids is None:
raise ValueError("HiDreamO1Transformer requires input_ids and position_ids in conditioning")
B, _, H, W = x.shape
h_p, w_p = H // self.patch_size, W // self.patch_size
tgt_image_len = h_p * w_p
z = einops.rearrange(
x, 'B C (H p1) (W p2) -> B (H W) (C p1 p2)',
p1=self.patch_size, p2=self.patch_size,
)
vinputs = torch.cat([z, ref_patches.to(z.dtype)], dim=1) if ref_patches is not None else z
inputs_embeds = self.language_model.embed_tokens(input_ids).to(x.dtype)
if ref_pixel_values is not None and ref_image_grid_thw is not None:
# ViT output is constant across sampling steps within a generation
# identity-key by the input tensor so refs don't recompute every step.
cached = self._visual_cache
if cached is not None and cached[0] is ref_pixel_values:
image_embeds = cached[1]
else:
ref_pv = ref_pixel_values.to(inputs_embeds.device)
ref_grid = ref_image_grid_thw.to(inputs_embeds.device).long()
# extra_conds wraps with a leading batch dim; refs are model-level so [0] always recovers them.
if ref_pv.dim() == 3:
ref_pv = ref_pv[0]
if ref_grid.dim() == 3:
ref_grid = ref_grid[0]
image_embeds = self.visual(ref_pv, ref_grid).to(inputs_embeds.dtype)
self._visual_cache = (ref_pixel_values, image_embeds)
# image_pad positions identical across batch (input_ids shared cond/uncond).
image_idx = (input_ids[0] == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0]
if image_idx.shape[0] != image_embeds.shape[0]:
raise ValueError(
f"Image-token count {image_idx.shape[0]} != ViT output count "
f"{image_embeds.shape[0]}; check tokenizer/processor alignment."
)
inputs_embeds[:, image_idx] = image_embeds.unsqueeze(0).expand(B, -1, -1)
sigma = timesteps.float() / 1000.0
t_pixeldit = 1.0 - sigma
t_emb = self.t_embedder1(t_pixeldit * 1000, inputs_embeds.dtype)
tms_mask_3d = (input_ids == self.tms_token_id).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds = torch.where(tms_mask_3d, t_emb.unsqueeze(1).expand_as(inputs_embeds), inputs_embeds)
vinputs_embedded = self.x_embedder(vinputs.to(inputs_embeds.dtype))
inputs_embeds = torch.cat([inputs_embeds, vinputs_embedded], dim=1)
# extra_conds stores position_ids as (1, 3, T); process_cond repeats dim 0 to B. Take row 0.
freqs_cis = self.language_model.compute_freqs_cis(position_ids[0].to(x.device), x.device)
freqs_cis = tuple(t.to(x.dtype) for t in freqs_cis)
two_pass_attn = make_two_pass_attention(ar_len, transformer_options=transformer_options)
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
transformer_options["total_blocks"] = len(self.language_model.layers)
transformer_options["block_type"] = "double"
# Cache prefix K/V across steps. Key includes input_ids (prompt), ref_id
# (refs scatter into inputs_embeds), and position_ids (RoPE baked into cached K).
can_cache = not blocks_replace and ar_len > 0
cache_len = ar_len if can_cache else 0
ref_id = id(ref_pixel_values) if ref_pixel_values is not None else None
pos_ids_key = position_ids[..., :cache_len] if can_cache else position_ids
cache_entries = self._kv_cache_entries
# Drop stale entries from a previous device (model was unloaded and reloaded).
if cache_entries and cache_entries[0]["input_ids"].device != input_ids.device:
cache_entries = []
self._kv_cache_entries = []
kv_cache = None
if can_cache:
for entry in cache_entries:
ck = entry["input_ids"]
ep = entry["position_ids"]
if (entry["cache_len"] == cache_len
and ck.shape == input_ids.shape and torch.equal(ck, input_ids)
and entry["ref_id"] == ref_id
and ep.shape == pos_ids_key.shape and torch.equal(ep, pos_ids_key)):
kv_cache = entry
break
if kv_cache is not None:
# Hot path: project Q/K/V only for fresh positions; past_key_value prepends cached AR K/V.
hidden_states = inputs_embeds[:, cache_len:]
sliced_freqs = tuple(t[..., cache_len:, :] for t in freqs_cis)
for i, layer in enumerate(self.language_model.layers):
transformer_options["block_index"] = i
K_i, V_i = kv_cache["kv"][i]
hidden_states, _ = layer(
x=hidden_states, attention_mask=None, freqs_cis=sliced_freqs, optimized_attention=two_pass_attn,
past_key_value=(K_i, V_i, cache_len),
)
else:
# Cold path: run full sequence; if cacheable, snapshot K/V at AR positions.
snapshots = [] if can_cache else None
past_kv_cold = () if can_cache else None
hidden_states = inputs_embeds
for i, layer in enumerate(self.language_model.layers):
transformer_options["block_index"] = i
if ("double_block", i) in blocks_replace:
def block_wrap(args, _layer=layer):
out = {}
out["x"], _ = _layer(
x=args["x"], attention_mask=args.get("attention_mask"),
freqs_cis=args["freqs_cis"], optimized_attention=args["optimized_attention"],
past_key_value=None,
)
return out
out = blocks_replace[("double_block", i)](
{"x": hidden_states, "attention_mask": None,
"freqs_cis": freqs_cis, "optimized_attention": two_pass_attn,
"transformer_options": transformer_options},
{"original_block": block_wrap},
)
hidden_states = out["x"]
else:
hidden_states, present_kv = layer(
x=hidden_states, attention_mask=None,
freqs_cis=freqs_cis, optimized_attention=two_pass_attn,
past_key_value=past_kv_cold,
)
if snapshots is not None:
K, V, _ = present_kv
snapshots.append((K[:, :, :cache_len].contiguous(),
V[:, :, :cache_len].contiguous()))
if snapshots is not None:
# Cap at 2 entries (cond + uncond). Multi-cond workflows LRU-evict.
new_entry = {
"input_ids": input_ids.clone(),
"cache_len": cache_len,
"kv": snapshots,
"ref_id": ref_id,
"position_ids": pos_ids_key.clone(),
}
self._kv_cache_entries = (cache_entries + [new_entry])[-2:]
if self.language_model.norm is not None:
hidden_states = self.language_model.norm(hidden_states)
# Slice target-image positions before the final projection so the Linear only runs on tgt_image_len tokens.
# In the hot path hidden_states starts at original position cache_len, so masks/indices shift by cache_len.
sliced_offset = cache_len if kv_cache is not None else 0
if vinput_mask is not None:
vmask = vinput_mask.to(x.device).bool()
if sliced_offset > 0:
vmask = vmask[:, sliced_offset:]
target_hidden = hidden_states[vmask].view(B, -1, hidden_states.shape[-1])[:, :tgt_image_len]
else:
txt_seq_len = input_ids.shape[1]
start = txt_seq_len - sliced_offset
target_hidden = hidden_states[:, start:start + tgt_image_len]
x_pred_tgt = self.final_layer2(target_hidden)
# fp32 final subtraction, bf16 here noticeably degrades samples.
x_pred_img = einops.rearrange(
x_pred_tgt, 'B (H W) (C p1 p2) -> B C (H p1) (W p2)',
H=h_p, W=w_p, p1=self.patch_size, p2=self.patch_size,
)
return (x.float() - x_pred_img.float()) / sigma.view(B, 1, 1, 1).clamp_min(1e-3)

View File

@ -1,173 +0,0 @@
"""HiDream-O1 input-prep helpers: image/resolution math and unified-sequence
RoPE position-id assembly. The fix_point offset in get_rope_index_fix_point
lets the target image and patchified ref images share spatial RoPE positions
despite living at different sequence indices — same 2D image plane.
"""
import math
from typing import Optional
import torch
PATCH_SIZE = 32
CONDITION_IMAGE_SIZE = 384 # ViT-side base size for ref images
def resize_tensor(img_t, image_size, patch_size=16):
"""img_t: (1, 3, H, W) float [0, 1]. Fit to image_size**2 area, patch-aligned, center-cropped."""
while min(img_t.shape[-2], img_t.shape[-1]) >= 2 * image_size: # Pre-halves with 2x2 box averaging while the image is still very large
img_t = torch.nn.functional.avg_pool2d(img_t, kernel_size=2, stride=2)
_, _, height, width = img_t.shape
m = patch_size
s_max = image_size * image_size
scale = math.sqrt(s_max / (width * height))
candidates = [
(round(width * scale) // m * m, round(height * scale) // m * m),
(round(width * scale) // m * m, math.floor(height * scale) // m * m),
(math.floor(width * scale) // m * m, round(height * scale) // m * m),
(math.floor(width * scale) // m * m, math.floor(height * scale) // m * m),
]
candidates = sorted(candidates, key=lambda x: x[0] * x[1], reverse=True)
new_size = candidates[-1]
for c in candidates:
if c[0] * c[1] <= s_max:
new_size = c
break
new_w, new_h = new_size
s1 = width / new_w
s2 = height / new_h
if s1 < s2:
resize_w, resize_h = new_w, round(height / s1)
else:
resize_w, resize_h = round(width / s2), new_h
img_t = torch.nn.functional.interpolate(img_t, size=(resize_h, resize_w), mode="bicubic")
top = (resize_h - new_h) // 2
left = (resize_w - new_w) // 2
return img_t[..., top:top + new_h, left:left + new_w]
def calculate_dimensions(max_size, ratio):
"""(W, H) for an aspect ratio fitting in max_size**2 area, 32-aligned."""
width = math.sqrt(max_size * max_size * ratio)
height = width / ratio
width = int(width / 32) * 32
height = int(height / 32) * 32
return width, height
def ref_max_size(target_max_dim, k):
"""K-dependent ref-image max dim before patchifying."""
if k == 1:
return target_max_dim
if k == 2:
return target_max_dim * 48 // 64
if k <= 4:
return target_max_dim // 2
if k <= 8:
return target_max_dim * 24 // 64
return target_max_dim // 4
def cond_image_size(k):
"""K-dependent ViT-side image size."""
if k <= 4:
return CONDITION_IMAGE_SIZE
if k <= 8:
return CONDITION_IMAGE_SIZE * 48 // 64
return CONDITION_IMAGE_SIZE // 2
def get_rope_index_fix_point(
spatial_merge_size: int,
image_token_id: int,
vision_start_token_id: int,
input_ids: Optional[torch.LongTensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
skip_vision_start_token=None,
fix_point: int = 4096,
):
mrope_position_deltas = []
if input_ids is not None and image_grid_thw is not None:
total_input_ids = input_ids
if attention_mask is None:
attention_mask = torch.ones_like(total_input_ids)
position_ids = torch.ones(
3, input_ids.shape[0], input_ids.shape[1],
dtype=input_ids.dtype, device=input_ids.device,
)
attention_mask = attention_mask.to(total_input_ids.device)
for i, input_ids_b in enumerate(total_input_ids):
fp = fix_point
image_index = 0
input_ids_b = input_ids_b[attention_mask[i] == 1]
vision_start_indices = torch.argwhere(input_ids_b == vision_start_token_id).squeeze(1)
vision_tokens = input_ids_b[vision_start_indices + 1]
image_nums = (vision_tokens == image_token_id).sum()
input_tokens = input_ids_b.tolist()
llm_pos_ids_list = []
st = 0
remain_images = image_nums
for _ in range(image_nums):
if image_token_id in input_tokens and remain_images > 0:
ed = input_tokens.index(image_token_id, st)
else:
ed = len(input_tokens) + 1
t = image_grid_thw[image_index][0]
h = image_grid_thw[image_index][1]
w = image_grid_thw[image_index][2]
image_index += 1
remain_images -= 1
llm_grid_t = t.item()
llm_grid_h = h.item() // spatial_merge_size
llm_grid_w = w.item() // spatial_merge_size
text_len = ed - st
text_len -= skip_vision_start_token[image_index - 1]
text_len = max(0, text_len)
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
if skip_vision_start_token[image_index - 1]:
if fp > 0:
fp = fp - st_idx
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + fp + st_idx)
fp = 0
else:
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
st = ed + llm_grid_t * llm_grid_h * llm_grid_w
if st < len(input_tokens):
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
text_len = len(input_tokens) - st
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
return position_ids, mrope_position_deltas
if attention_mask is not None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
else:
position_ids = (
torch.arange(input_ids.shape[1], device=input_ids.device)
.view(1, 1, -1).expand(3, input_ids.shape[0], -1)
)
mrope_position_deltas = torch.zeros(
[input_ids.shape[0], 1], device=input_ids.device, dtype=input_ids.dtype,
)
return position_ids, mrope_position_deltas

View File

@ -328,7 +328,7 @@ class CrossAttention(nn.Module):
kv = torch.cat((k, v), dim=-1)
split_size = kv.shape[-1] // self.num_heads // 2
kv = kv.view(b, -1, self.num_heads, split_size * 2)
kv = kv.view(1, -1, self.num_heads, split_size * 2)
k, v = torch.split(kv, split_size, dim=-1)
q = q.view(b, s1, self.num_heads, self.head_dim)
@ -398,7 +398,7 @@ class Attention(nn.Module):
qkv_combined = torch.cat((query, key, value), dim=-1)
split_size = qkv_combined.shape[-1] // self.num_heads // 3
qkv = qkv_combined.view(B, -1, self.num_heads, split_size * 3)
qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3)
query, key, value = torch.split(qkv, split_size, dim=-1)
query = query.reshape(B, N, self.num_heads, self.head_dim)
@ -607,9 +607,9 @@ class HunYuanDiTPlain(nn.Module):
def forward(self, x, t, context, transformer_options = {}, **kwargs):
x = x.movedim(-1, -2)
if context.shape[0] >= 2:
uncond_emb, cond_emb = context.chunk(2, dim = 0)
context = torch.cat([cond_emb, uncond_emb], dim = 0)
uncond_emb, cond_emb = context.chunk(2, dim = 0)
context = torch.cat([cond_emb, uncond_emb], dim = 0)
main_condition = context
t = 1.0 - t
@ -657,8 +657,5 @@ class HunYuanDiTPlain(nn.Module):
output = self.final_layer(combined)
output = output.movedim(-2, -1) * (-1.0)
if output.shape[0] >= 2:
cond_emb, uncond_emb = output.chunk(2, dim = 0)
return torch.cat([uncond_emb, cond_emb])
else:
return output
cond_emb, uncond_emb = output.chunk(2, dim = 0)
return torch.cat([uncond_emb, cond_emb])

View File

@ -16,31 +16,31 @@ from comfy.ldm.lightricks.model import (
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
from comfy.ldm.lightricks.embeddings_connector import Embeddings1DConnector
import comfy.ldm.common_dit
import comfy.model_prefetch
class CompressedTimestep:
"""Store video timestep embeddings in compressed form using per-frame indexing."""
__slots__ = ('data', 'batch_size', 'num_frames', 'patches_per_frame', 'feature_dim')
def __init__(self, tensor: torch.Tensor, patches_per_frame: int, per_frame: bool = False):
def __init__(self, tensor: torch.Tensor, patches_per_frame: int):
"""
tensor: [batch, num_tokens, feature_dim] (per-token, default) or
[batch, num_frames, feature_dim] (per_frame=True, already compressed).
patches_per_frame: spatial patches per frame; pass None to disable compression.
tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame
patches_per_frame: Number of spatial patches per frame (height * width in latent space), or None to disable compression
"""
self.batch_size, n, self.feature_dim = tensor.shape
if per_frame:
self.batch_size, num_tokens, self.feature_dim = tensor.shape
# Check if compression is valid (num_tokens must be divisible by patches_per_frame)
if patches_per_frame is not None and num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
self.patches_per_frame = patches_per_frame
self.num_frames = n
self.data = tensor
elif patches_per_frame is not None and n >= patches_per_frame and n % patches_per_frame == 0:
self.patches_per_frame = patches_per_frame
self.num_frames = n // patches_per_frame
# All patches in a frame are identical — keep only the first.
self.data = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)[:, :, 0, :].contiguous()
self.num_frames = num_tokens // patches_per_frame
# Reshape to [batch, frames, patches_per_frame, feature_dim] and store one value per frame
# All patches in a frame are identical, so we only keep the first one
reshaped = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)
self.data = reshaped[:, :, 0, :].contiguous() # [batch, frames, feature_dim]
else:
# Not divisible or too small - store directly without compression
self.patches_per_frame = 1
self.num_frames = n
self.num_frames = num_tokens
self.data = tensor
def expand(self):
@ -715,35 +715,32 @@ class LTXAVModel(LTXVModel):
def _prepare_timestep(self, timestep, batch_size, hidden_dtype, **kwargs):
"""Prepare timestep embeddings."""
# TODO: some code reuse is needed here.
grid_mask = kwargs.get("grid_mask", None)
orig_shape = kwargs.get("orig_shape")
has_spatial_mask = kwargs.get("has_spatial_mask", None)
v_patches_per_frame = None
if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
v_patches_per_frame = orig_shape[3] * orig_shape[4]
if grid_mask is not None:
timestep = timestep[:, grid_mask]
# Used by compute_prompt_timestep and the audio cross-attention paths.
timestep_scaled = (timestep[:, grid_mask] if grid_mask is not None else timestep) * self.timestep_scale_multiplier
# When patches in a frame share a timestep (no spatial mask), project one row per frame instead of one per token
per_frame_path = v_patches_per_frame is not None and (timestep.numel() // batch_size) % v_patches_per_frame == 0
if per_frame_path:
per_frame = timestep.reshape(batch_size, -1, v_patches_per_frame)[:, :, 0]
if grid_mask is not None:
# All-or-nothing per frame when has_spatial_mask=False.
per_frame = per_frame[:, grid_mask[::v_patches_per_frame]]
ts_input = per_frame * self.timestep_scale_multiplier
else:
ts_input = timestep_scaled
timestep_scaled = timestep * self.timestep_scale_multiplier
v_timestep, v_embedded_timestep = self.adaln_single(
ts_input.flatten(),
timestep_scaled.flatten(),
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame, per_frame=per_frame_path)
v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame, per_frame=per_frame_path)
# Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width]
# Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width
orig_shape = kwargs.get("orig_shape")
has_spatial_mask = kwargs.get("has_spatial_mask", None)
v_patches_per_frame = None
if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
# orig_shape[3] = height, orig_shape[4] = width (in latent space)
v_patches_per_frame = orig_shape[3] * orig_shape[4]
# Reshape to [batch_size, num_tokens, dim] and compress for storage
v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame)
v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame)
v_prompt_timestep = compute_prompt_timestep(
self.prompt_adaln_single, timestep_scaled, batch_size, hidden_dtype
@ -910,11 +907,9 @@ class LTXAVModel(LTXVModel):
"""Process transformer blocks for LTXAV."""
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
prefetch_queue = comfy.model_prefetch.make_prefetch_queue(list(self.transformer_blocks), vx.device, transformer_options)
# Process transformer blocks
for i, block in enumerate(self.transformer_blocks):
comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, block)
if ("double_block", i) in blocks_replace:
def block_wrap(args):
@ -987,8 +982,6 @@ class LTXAVModel(LTXVModel):
a_prompt_timestep=a_prompt_timestep,
)
comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, None)
return [vx, ax]
def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):

View File

@ -358,61 +358,6 @@ def apply_split_rotary_emb(input_tensor, cos, sin):
return output.swapaxes(1, 2).reshape(B, T, -1) if needs_reshape else output
class GuideAttentionMask:
"""Holds the two per-group masks for LTXV guide self-attention.
_attention_with_guide_mask splits queries into noisy and tracked-guide
groups, so the largest mask is (1, 1, tracked_count, T).
"""
__slots__ = ("guide_start", "tracked_count", "noisy_mask", "tracked_mask")
def __init__(self, total_tokens, guide_start, tracked_count, tracked_weights):
device = tracked_weights.device
dtype = tracked_weights.dtype
finfo = torch.finfo(dtype)
pos = tracked_weights > 0
log_w = torch.full_like(tracked_weights, finfo.min)
log_w[pos] = torch.log(tracked_weights[pos].clamp(min=finfo.tiny))
self.guide_start = guide_start
self.tracked_count = tracked_count
self.noisy_mask = torch.zeros((1, 1, 1, total_tokens), device=device, dtype=dtype)
self.noisy_mask[:, :, :, guide_start:guide_start + tracked_count] = log_w.view(1, 1, 1, -1)
self.tracked_mask = torch.zeros((1, 1, tracked_count, total_tokens), device=device, dtype=dtype)
self.tracked_mask[:, :, :, :guide_start] = log_w.view(1, 1, -1, 1)
def _attention_with_guide_mask(q, k, v, heads, guide_mask, attn_precision, transformer_options):
"""Apply the guide mask by partitioning Q into noisy and tracked-guide
groups, so each group needs only its own sub-mask. Avoids materializing
the (1,1,T,T) dense mask.
"""
guide_start = guide_mask.guide_start
tracked_end = guide_start + guide_mask.tracked_count
out = torch.empty_like(q)
if guide_start > 0: # In practice currently guides are always after noise, guard for safety if this changes.
out[:, :guide_start, :] = comfy.ldm.modules.attention.optimized_attention(
q[:, :guide_start, :], k, v, heads, mask=guide_mask.noisy_mask,
attn_precision=attn_precision, transformer_options=transformer_options,
low_precision_attention=False, # sageattn mask support is unreliable
)
out[:, guide_start:tracked_end, :] = comfy.ldm.modules.attention.optimized_attention(
q[:, guide_start:tracked_end, :], k, v, heads, mask=guide_mask.tracked_mask,
attn_precision=attn_precision, transformer_options=transformer_options,
low_precision_attention=False,
)
if tracked_end < q.shape[1]: # Every guide token is tracked, and nothing comes after them, guard for safety if this changes.
out[:, tracked_end:, :] = comfy.ldm.modules.attention.optimized_attention(
q[:, tracked_end:, :], k, v, heads,
attn_precision=attn_precision, transformer_options=transformer_options,
)
return out
class CrossAttention(nn.Module):
def __init__(
self,
@ -467,10 +412,8 @@ class CrossAttention(nn.Module):
if mask is None:
out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
elif isinstance(mask, GuideAttentionMask):
out = _attention_with_guide_mask(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
else:
out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, mask=mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
# Apply per-head gating if enabled
if self.to_gate_logits is not None:
@ -1120,9 +1063,7 @@ class LTXVModel(LTXBaseModel):
additional_args["resolved_guide_entries"] = resolved_entries
keyframe_idxs = keyframe_idxs[..., kf_grid_mask, :]
if keyframe_idxs.shape[2] > 0: # Guard for the case of no keyframes surviving
pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs
pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs
# Total surviving guide tokens (all guides)
additional_args["num_guide_tokens"] = keyframe_idxs.shape[2]
@ -1158,12 +1099,12 @@ class LTXVModel(LTXBaseModel):
if not resolved_entries:
return None
# strength != 1.0 means we want to either attenuate (< 1) or amplify (> 1) guide attention.
needs_mask = any(
e["strength"] != 1.0 or e.get("pixel_mask") is not None
# Check if any attenuation is actually needed
needs_attenuation = any(
e["strength"] < 1.0 or e.get("pixel_mask") is not None
for e in resolved_entries
)
if not needs_mask:
if not needs_attenuation:
return None
# Build per-guide-token weights for all tracked guide tokens.
@ -1218,11 +1159,16 @@ class LTXVModel(LTXBaseModel):
# Concatenate per-token weights for all tracked guides
tracked_weights = torch.cat(all_weights, dim=1) # (1, total_tracked)
# Skip when every weight is exactly 1.0 (additive bias would be 0).
if (tracked_weights == 1.0).all():
# Check if any weight is actually < 1.0 (otherwise no attenuation needed)
if (tracked_weights >= 1.0).all():
return None
return GuideAttentionMask(total_tokens, guide_start, total_tracked, tracked_weights)
# Build the mask: guide tokens are at the end of the sequence.
# Tracked guides come first (in order), untracked follow.
return self._build_self_attention_mask(
total_tokens, num_guide_tokens, total_tracked,
tracked_weights, guide_start, device, dtype,
)
@staticmethod
def _downsample_mask_to_latent(mask, f_lat, h_lat, w_lat):
@ -1288,6 +1234,45 @@ class LTXVModel(LTXBaseModel):
return rearrange(latent_mask, "b 1 f h w -> b (f h w)")
@staticmethod
def _build_self_attention_mask(total_tokens, num_guide_tokens, tracked_count,
tracked_weights, guide_start, device, dtype):
"""Build a log-space additive self-attention bias mask.
Attenuates attention between noisy tokens and tracked guide tokens.
Untracked guide tokens (at the end of the guide portion) keep full attention.
Args:
total_tokens: Total sequence length.
num_guide_tokens: Total guide tokens (all guides) at end of sequence.
tracked_count: Number of tracked guide tokens (first in the guide portion).
tracked_weights: (1, tracked_count) tensor, values in [0, 1].
guide_start: Index where guide tokens begin in the sequence.
device: Target device.
dtype: Target dtype.
Returns:
(1, 1, total_tokens, total_tokens) additive bias mask.
0.0 = full attention, negative = attenuated, finfo.min = effectively fully masked.
"""
finfo = torch.finfo(dtype)
mask = torch.zeros((1, 1, total_tokens, total_tokens), device=device, dtype=dtype)
tracked_end = guide_start + tracked_count
# Convert weights to log-space bias
w = tracked_weights.to(device=device, dtype=dtype) # (1, tracked_count)
log_w = torch.full_like(w, finfo.min)
positive_mask = w > 0
if positive_mask.any():
log_w[positive_mask] = torch.log(w[positive_mask].clamp(min=finfo.tiny))
# noisy → tracked guides: each noisy row gets the same per-guide weight
mask[:, :, :guide_start, guide_start:tracked_end] = log_w.view(1, 1, 1, -1)
# tracked guides → noisy: each guide row broadcasts its weight across noisy cols
mask[:, :, guide_start:tracked_end, :guide_start] = log_w.view(1, 1, -1, 1)
return mask
def _process_transformer_blocks(self, x, context, attention_mask, timestep, pe, transformer_options={}, self_attention_mask=None, **kwargs):
"""Process transformer blocks for LTXV."""
patches_replace = transformer_options.get("patches_replace", {})

View File

@ -14,8 +14,6 @@ from .sub_quadratic_attention import efficient_dot_product_attention
from comfy import model_management
TORCH_HAS_GQA = model_management.torch_version_numeric >= (2, 5)
if model_management.xformers_enabled():
import xformers
import xformers.ops
@ -152,12 +150,7 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
b, _, dim_head = q.shape
dim_head //= heads
if kwargs.get("enable_gqa", False) and q.shape[-3] != k.shape[-3]:
n_rep = q.shape[-3] // k.shape[-3]
k = k.repeat_interleave(n_rep, dim=-3)
v = v.repeat_interleave(n_rep, dim=-3)
scale = kwargs.get("scale", dim_head ** -0.5)
scale = dim_head ** -0.5
h = heads
if skip_reshape:
@ -226,10 +219,6 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
b, _, dim_head = query.shape
dim_head //= heads
if "scale" in kwargs:
# Pre-scale query to match requested scale (cancels internal 1/sqrt(dim_head))
query = query * (kwargs["scale"] * dim_head ** 0.5)
if skip_reshape:
query = query.reshape(b * heads, -1, dim_head)
value = value.reshape(b * heads, -1, dim_head)
@ -301,7 +290,7 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
b, _, dim_head = q.shape
dim_head //= heads
scale = kwargs.get("scale", dim_head ** -0.5)
scale = dim_head ** -0.5
if skip_reshape:
q, k, v = map(
@ -511,13 +500,8 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
if mask.ndim == 3:
mask = mask.unsqueeze(1)
# Pass through extra SDPA kwargs (scale, enable_gqa) if provided
# enable_gqa requires PyTorch 2.5+; older versions use manual KV expansion above
sdpa_keys = ("scale", "enable_gqa") if TORCH_HAS_GQA else ("scale",)
sdpa_extra = {k: v for k, v in kwargs.items() if k in sdpa_keys}
if SDP_BATCH_LIMIT >= b:
out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False, **sdpa_extra)
out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
if not skip_output_reshape:
out = (
out.transpose(1, 2).reshape(b, -1, heads * dim_head)
@ -535,7 +519,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
k[i : i + SDP_BATCH_LIMIT],
v[i : i + SDP_BATCH_LIMIT],
attn_mask=m,
dropout_p=0.0, is_causal=False, **sdpa_extra
dropout_p=0.0, is_causal=False
).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head)
return out

View File

@ -140,7 +140,7 @@ def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
alphas = alphacums[ddim_timesteps]
alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
# according to the formula provided in https://arxiv.org/abs/2010.02502
# according the the formula provided in https://arxiv.org/abs/2010.02502
sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
if verbose:
logging.info(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')

View File

@ -1,189 +0,0 @@
"""Pure-torch + scipy geometry helpers for MoGe inference and mesh export."""
from __future__ import annotations
from typing import Optional, Tuple
import numpy as np
import torch
import torch.nn.functional as F
from scipy.optimize import least_squares
def normalized_view_plane_uv(width: int, height: int, aspect_ratio: Optional[float] = None,
dtype: Optional[torch.dtype] = None, device: Optional[torch.device] = None) -> torch.Tensor:
"""Normalized view-plane UV coordinates with corners at +/-(W, H)/diagonal."""
if aspect_ratio is None:
aspect_ratio = width / height
span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
span_y = 1.0 / (1 + aspect_ratio ** 2) ** 0.5
u = torch.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype, device=device)
v = torch.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype, device=device)
u, v = torch.meshgrid(u, v, indexing="xy")
return torch.stack([u, v], dim=-1)
def intrinsics_from_focal_center(fx: torch.Tensor, fy: torch.Tensor, cx: torch.Tensor, cy: torch.Tensor) -> torch.Tensor:
"""Assemble (..., 3, 3) intrinsics from broadcastable fx, fy, cx, cy."""
fx, fy, cx, cy = [torch.as_tensor(v) for v in (fx, fy, cx, cy)]
fx, fy, cx, cy = torch.broadcast_tensors(fx, fy, cx, cy)
zero = torch.zeros_like(fx)
one = torch.ones_like(fx)
return torch.stack([
torch.stack([fx, zero, cx], dim=-1),
torch.stack([zero, fy, cy], dim=-1),
torch.stack([zero, zero, one], dim=-1),
], dim=-2)
def depth_map_to_point_map(depth: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tensor:
"""Back-project a (..., H, W) depth map through K^-1 to (..., H, W, 3) camera-space points.
Intrinsics use normalized image coords (x in [0, 1] left->right, y in [0, 1] top->bottom).
"""
H, W = depth.shape[-2:]
device, dtype = depth.device, depth.dtype
u = (torch.arange(W, dtype=dtype, device=device) + 0.5) / W
v = (torch.arange(H, dtype=dtype, device=device) + 0.5) / H
grid_v, grid_u = torch.meshgrid(v, u, indexing="ij")
pix = torch.stack([grid_u, grid_v, torch.ones_like(grid_u)], dim=-1)
K_inv = torch.linalg.inv(intrinsics)
rays = torch.einsum("...ij,hwj->...hwi", K_inv, pix)
return rays * depth.unsqueeze(-1)
def _solve_optimal_shift(uv: np.ndarray, xyz: np.ndarray,
focal: Optional[float] = None) -> Tuple[float, float]:
"""LM-solve for z-shift; when focal is None, also recovers the optimal focal."""
uv = uv.reshape(-1, 2)
xy = xyz[..., :2].reshape(-1, 2)
z = xyz[..., 2].reshape(-1)
def fn(shift):
xy_proj = xy / (z + shift)[:, None]
f = focal if focal is not None else (xy_proj * uv).sum() / np.square(xy_proj).sum()
return (f * xy_proj - uv).ravel()
sol = least_squares(fn, x0=0.0, ftol=1e-3, method="lm")
shift = float(np.asarray(sol["x"]).squeeze())
if focal is None:
xy_proj = xy / (z + shift)[:, None]
focal = float((xy_proj * uv).sum() / np.square(xy_proj).sum())
return shift, focal
def recover_focal_shift(points: torch.Tensor, mask: Optional[torch.Tensor] = None,
focal: Optional[torch.Tensor] = None, downsample_size: Tuple[int, int] = (64, 64)
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Recover the focal length and z-shift that turn points into a metric point map.
Optical center is at the image center; returned focal is relative to half the image diagonal.
Returns (focal, shift) on the same device/dtype as points.
"""
shape = points.shape
H, W = shape[-3], shape[-2]
points_b = points.reshape(-1, H, W, 3)
mask_b = None if mask is None else mask.reshape(-1, H, W)
focal_b = None if focal is None else focal.reshape(-1)
uv = normalized_view_plane_uv(W, H, dtype=points.dtype, device=points.device)
points_lr = F.interpolate(points_b.permute(0, 3, 1, 2), downsample_size, mode="nearest").permute(0, 2, 3, 1)
uv_lr = F.interpolate(uv.unsqueeze(0).permute(0, 3, 1, 2), downsample_size, mode="nearest").squeeze(0).permute(1, 2, 0)
mask_lr = None
if mask_b is not None:
mask_lr = F.interpolate(mask_b.to(torch.float32).unsqueeze(1), downsample_size, mode="nearest").squeeze(1) > 0
uv_np = uv_lr.detach().cpu().numpy()
points_np = points_lr.detach().cpu().numpy()
mask_np = None if mask_lr is None else mask_lr.detach().cpu().numpy()
focal_np = None if focal_b is None else focal_b.detach().cpu().numpy()
out_focal: list = []
out_shift: list = []
for i in range(points_b.shape[0]):
if mask_np is None:
xyz_i = points_np[i].reshape(-1, 3)
uv_i = uv_np.reshape(-1, 2)
else:
sel = mask_np[i]
if sel.sum() < 2:
out_focal.append(1.0)
out_shift.append(0.0)
continue
xyz_i = points_np[i][sel]
uv_i = uv_np[sel]
if focal_np is None:
shift_i, focal_i = _solve_optimal_shift(uv_i, xyz_i)
out_focal.append(focal_i)
else:
shift_i, _ = _solve_optimal_shift(uv_i, xyz_i, focal=float(focal_np[i]))
out_shift.append(shift_i)
shift_t = torch.tensor(out_shift, device=points.device, dtype=points.dtype).reshape(shape[:-3])
if focal is None:
focal_t = torch.tensor(out_focal, device=points.device, dtype=points.dtype).reshape(shape[:-3])
else:
focal_t = focal.reshape(shape[:-3])
return focal_t, shift_t
def depth_map_edge(depth: torch.Tensor, atol: Optional[float] = None, rtol: Optional[float] = None, kernel_size: int = 3) -> torch.Tensor:
"""Per-pixel boolean: True where the local depth window's max-min span exceeds atol or rtol*depth."""
shape = depth.shape
d = depth.reshape(-1, 1, *shape[-2:])
pad = kernel_size // 2
diff = F.max_pool2d(d, kernel_size, stride=1, padding=pad) + F.max_pool2d(-d, kernel_size, stride=1, padding=pad)
edge = torch.zeros_like(d, dtype=torch.bool)
if atol is not None:
edge |= diff > atol
if rtol is not None:
edge |= (diff / d.clamp_min(1e-6)).nan_to_num_() > rtol
return edge.reshape(*shape)
def triangulate_grid_mesh(points: torch.Tensor, mask: Optional[torch.Tensor] = None, decimation: int = 1, discontinuity_threshold: float = 0.04,
depth: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Triangulate a (H, W, 3) point map into (vertices, faces, uvs) on CPU.
Vertices: pixels with finite coords (passing optional mask). Quads with four valid corners
become two triangles. depth overrides the scalar used for the rtol edge check; pass radial
depth for panoramas (the default points[..., 2] goes negative below the equator).
"""
points = points.detach().cpu()
finite = torch.isfinite(points).all(dim=-1)
if mask is None:
mask = finite
else:
mask = mask.detach().cpu().to(torch.bool) & finite
if discontinuity_threshold > 0:
d = depth.detach().cpu() if depth is not None else points[..., 2]
# Replace inf with 0 so max-pool doesn't poison neighbourhoods (mask above already excludes those pixels).
d_finite = torch.where(finite, d, torch.zeros_like(d))
edge = depth_map_edge(d_finite, rtol=discontinuity_threshold)
mask = mask & ~edge
if decimation > 1:
points = points[::decimation, ::decimation].contiguous()
mask = mask[::decimation, ::decimation].contiguous()
H, W = points.shape[:2]
flat_mask = mask.reshape(-1)
idx = torch.full((H * W,), -1, dtype=torch.long)
n_valid = int(flat_mask.sum().item())
idx[flat_mask] = torch.arange(n_valid, dtype=torch.long)
idx = idx.reshape(H, W)
vertices = points.reshape(-1, 3)[flat_mask].contiguous()
yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
u = xx.float() / max(W - 1, 1)
v = yy.float() / max(H - 1, 1)
uvs = torch.stack([u, v], dim=-1).reshape(-1, 2)[flat_mask].contiguous()
a, b, c, d = idx[:-1, :-1], idx[:-1, 1:], idx[1:, 1:], idx[1:, :-1]
quad_ok = (a >= 0) & (b >= 0) & (c >= 0) & (d >= 0)
a, b, c, d = a[quad_ok], b[quad_ok], c[quad_ok], d[quad_ok]
faces = torch.cat([torch.stack([a, b, c], dim=-1), torch.stack([a, c, d], dim=-1)], dim=0).contiguous()
return vertices, faces, uvs

View File

@ -1,347 +0,0 @@
"""MoGe v1 / v2 inference modules and a state-dict-driven builder.
V1: DINOv2 backbone + multi-output head (points, mask).
V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP).
"""
from __future__ import annotations
from numbers import Number
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import comfy.ops
import comfy.model_management
import comfy.model_patcher
from comfy.image_encoders.dino2 import Dinov2Model
from .geometry import depth_map_to_point_map, intrinsics_from_focal_center, recover_focal_shift
from .modules import ConvStack, DINOv2Encoder, HeadV1, MLP, _view_plane_uv_grid
def _remap_points(points: torch.Tensor) -> torch.Tensor:
"""Apply the exp remap: z -> exp(z), xy stays linear and gets scaled by the new z."""
xy, z = points.split([2, 1], dim=-1)
z = torch.exp(z)
return torch.cat([xy * z, z], dim=-1)
def _detect_dinov2(sd: dict, prefix: str) -> Dict[str, Any]:
# All shipped MoGe checkpoints use plain DINOv2
hidden = sd[prefix + "embeddings.cls_token"].shape[-1]
layer_prefix = prefix + "encoder.layer."
depth = 1 + max(int(k[len(layer_prefix):].split(".")[0]) for k in sd if k.startswith(layer_prefix))
return {
"hidden_size": hidden,
"num_attention_heads": hidden // 64,
"num_hidden_layers": depth,
"layer_norm_eps": 1e-6,
"use_swiglu_ffn": False,
}
class MoGeModelV1(nn.Module):
"""MoGe v1: DINOv2 backbone + HeadV1 (points, mask)."""
image_mean: torch.Tensor
image_std: torch.Tensor
intermediate_layers = 4
num_tokens_range: Tuple[Number, Number] = (1200, 2500)
mask_threshold = 0.5
def __init__(self, backbone: Dict[str, Any], dim_upsample: List[int] = (256, 128, 128),
num_res_blocks: int = 1, dim_times_res_block_hidden: int = 1,
dtype=None, device=None, operations=comfy.ops.manual_cast):
super().__init__()
self.backbone = Dinov2Model(backbone, dtype, device, operations)
self.head = HeadV1(dim_in=backbone["hidden_size"], dim_upsample=list(dim_upsample),
num_res_blocks=num_res_blocks, dim_times_res_block_hidden=dim_times_res_block_hidden,
dtype=dtype, device=device, operations=operations)
self.register_buffer("image_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
self.register_buffer("image_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]:
H, W = image.shape[-2:]
resize = ((num_tokens * 14 ** 2) / (H * W)) ** 0.5
rh, rw = int(H * resize), int(W * resize)
x = F.interpolate(image, (rh, rw), mode="bicubic", align_corners=False, antialias=True)
x = (x - self.image_mean) / self.image_std
x14 = F.interpolate(x, (rh // 14 * 14, rw // 14 * 14), mode="bilinear", align_corners=False, antialias=True)
n_layers = len(self.backbone.encoder.layer)
indices = list(range(n_layers - self.intermediate_layers, n_layers))
feats = self.backbone.get_intermediate_layers(x14, indices, apply_norm=True)
points, mask = self.head(feats, x)
points = F.interpolate(points.float(), (H, W), mode="bilinear", align_corners=False)
points = _remap_points(points.permute(0, 2, 3, 1))
mask = F.interpolate(mask.float(), (H, W), mode="bilinear", align_corners=False).squeeze(1)
return {"points": points, "mask": mask}
@classmethod
def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast):
"""Detect the v1 head config from sd, build a model, and load weights."""
n_up = 1 + max(int(k.split(".")[2]) for k in sd if k.startswith("head.upsample_blocks."))
dim_upsample = [sd[f"head.upsample_blocks.{i}.0.0.weight"].shape[1] for i in range(n_up)]
# Each upsample stage is Sequential[upsampler, *res_blocks]; count res blocks at level 0.
num_res_blocks = max({int(k.split(".")[3]) for k in sd if k.startswith("head.upsample_blocks.0.")})
hidden_out = sd["head.upsample_blocks.0.1.layers.2.weight"].shape[0]
dim_times = max(hidden_out // dim_upsample[0], 1)
model = cls(backbone=_detect_dinov2(sd, prefix="backbone."),
dim_upsample=dim_upsample, num_res_blocks=num_res_blocks, dim_times_res_block_hidden=dim_times,
dtype=dtype, device=device, operations=operations)
model.load_state_dict(sd, strict=True)
return model
class MoGeModelV2(nn.Module):
"""MoGe v2: DINOv2 encoder + neck + per-output heads (points/mask/normal/metric-scale)."""
intermediate_layers = 4
num_tokens_range: Tuple[Number, Number] = (1200, 3600)
def __init__(self,
encoder: Dict[str, Any],
neck: Dict[str, Any],
points_head: Dict[str, Any],
mask_head: Dict[str, Any],
scale_head: Dict[str, Any],
normal_head: Optional[Dict[str, Any]] = None,
dtype=None, device=None, operations=comfy.ops.manual_cast):
super().__init__()
self.encoder = DINOv2Encoder(**encoder, dtype=dtype, device=device, operations=operations)
self.neck = ConvStack(**neck, dtype=dtype, device=device, operations=operations)
self.points_head = ConvStack(**points_head, dtype=dtype, device=device, operations=operations)
self.mask_head = ConvStack(**mask_head, dtype=dtype, device=device, operations=operations)
self.scale_head = MLP(**scale_head, dtype=dtype, device=device, operations=operations)
if normal_head is not None:
self.normal_head = ConvStack(**normal_head, dtype=dtype, device=device, operations=operations)
def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]:
B, _, H, W = image.shape
device, dtype = image.device, image.dtype
aspect_ratio = W / H
base_h = round((num_tokens / aspect_ratio) ** 0.5)
base_w = round((num_tokens * aspect_ratio) ** 0.5)
feat_top, cls_token = self.encoder(image, base_h, base_w, return_class_token=True)
# 5-level pyramid: feat at level 0 concatenated with UV, other levels UV-only.
levels = [_view_plane_uv_grid(B, base_h * (2 ** L), base_w * (2 ** L), aspect_ratio, dtype, device)
for L in range(5)]
levels[0] = torch.cat([feat_top, levels[0]], dim=1)
feats = self.neck(levels)
def _resize(v):
return F.interpolate(v, (H, W), mode="bilinear", align_corners=False)
points = _remap_points(_resize(self.points_head(feats)[-1]).permute(0, 2, 3, 1))
mask = _resize(self.mask_head(feats)[-1]).squeeze(1).sigmoid()
metric_scale = self.scale_head(cls_token).squeeze(1).exp()
result = {"points": points, "mask": mask, "metric_scale": metric_scale}
if hasattr(self, "normal_head"):
normal = _resize(self.normal_head(feats)[-1])
result["normal"] = F.normalize(normal.permute(0, 2, 3, 1), dim=-1)
return result
@classmethod
def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast):
"""Detect the v2 encoder/neck/heads config from sd, build a model, and load weights."""
backbone = _detect_dinov2(sd, prefix="encoder.backbone.")
depth = backbone["num_hidden_layers"]
n = cls.intermediate_layers
encoder = {
"backbone": backbone,
"intermediate_layers": [(depth // n) * (i + 1) - 1 for i in range(n)],
"dim_out": sd["encoder.output_projections.0.weight"].shape[0],
}
# scale_head is an MLP: Sequential of [Linear, ReLU, ..., Linear]; Linear weight is (out, in).
scale_idxs = sorted({int(k.split(".")[1]) for k in sd if k.startswith("scale_head.")})
scale_first = sd[f"scale_head.{scale_idxs[0]}.weight"]
cfg: Dict[str, Any] = {
"encoder": encoder,
"neck": cls._detect_convstack(sd, "neck."),
"points_head": cls._detect_convstack(sd, "points_head."),
"mask_head": cls._detect_convstack(sd, "mask_head."),
"scale_head": {"dims": [scale_first.shape[1]] + [sd[f"scale_head.{i}.weight"].shape[0] for i in scale_idxs]},
}
if any(k.startswith("normal_head.") for k in sd):
cfg["normal_head"] = cls._detect_convstack(sd, "normal_head.")
model = cls(**cfg, dtype=dtype, device=device, operations=operations)
model.load_state_dict(sd, strict=True)
return model
@staticmethod
def _detect_convstack(sd: dict, prefix: str) -> Dict[str, Any]:
"""Reconstruct a ConvStack config from the keys under prefix"""
in_keys = [k for k in sd if k.startswith(f"{prefix}input_blocks.") and k.endswith(".weight")]
n = 1 + max(int(k[len(f"{prefix}input_blocks."):].split(".")[0]) for k in in_keys)
in_shapes = [sd[f"{prefix}input_blocks.{i}.weight"].shape for i in range(n)]
has_out = lambda i: f"{prefix}output_blocks.{i}.weight" in sd
has_norm = f"{prefix}res_blocks.0.0.layers.0.weight" in sd
def num_res_at(i):
rb_prefix = f"{prefix}res_blocks.{i}."
return len({int(k[len(rb_prefix):].split(".")[0]) for k in sd if k.startswith(rb_prefix)})
return {
"dim_in": [s[1] for s in in_shapes],
"dim_res_blocks": [s[0] for s in in_shapes],
"dim_out": [sd[f"{prefix}output_blocks.{i}.weight"].shape[0] if has_out(i) else None for i in range(n)],
"num_res_blocks": [num_res_at(i) for i in range(n)],
"resamplers": ["conv_transpose" if f"{prefix}resamplers.{i}.0.weight" in sd else "bilinear"
for i in range(n - 1)],
"res_block_in_norm": "layer_norm" if has_norm else "none",
"res_block_hidden_norm": "group_norm" if has_norm else "none",
}
# Translate the Meta-style DINOv2 keys MoGe ships to the naming ComfyUI DINOv2 port expects,
# and split each fused qkv tensor into Q/K/V.
_DINOV2_TOPLEVEL_RENAMES = {
"patch_embed.proj.weight": "embeddings.patch_embeddings.projection.weight",
"patch_embed.proj.bias": "embeddings.patch_embeddings.projection.bias",
"cls_token": "embeddings.cls_token",
"pos_embed": "embeddings.position_embeddings",
"register_tokens": "embeddings.register_tokens",
"mask_token": "embeddings.mask_token",
"norm.weight": "layernorm.weight",
"norm.bias": "layernorm.bias",
}
_DINOV2_BLOCK_RENAMES = [
("ls1.gamma", "layer_scale1.lambda1"),
("ls2.gamma", "layer_scale2.lambda1"),
("attn.proj.", "attention.output.dense."),
("mlp.w12.", "mlp.weights_in."),
("mlp.w3.", "mlp.weights_out."),
]
def _remap_state_dict(sd: dict) -> dict:
if "model" in sd and "model_config" in sd:
sd = sd["model"]
prefix = "encoder.backbone." if any(k.startswith("encoder.backbone.") for k in sd) else "backbone."
out: dict = {}
for k, v in sd.items():
if not k.startswith(prefix):
out[k] = v
continue
rel = k[len(prefix):]
if rel in _DINOV2_TOPLEVEL_RENAMES:
out[prefix + _DINOV2_TOPLEVEL_RENAMES[rel]] = v
continue
if not rel.startswith("blocks."):
out[k] = v
continue
_, idx, sub = rel.split(".", 2)
if sub in ("attn.qkv.weight", "attn.qkv.bias"):
tail = sub.rsplit(".", 1)[1]
q, kw, vw = v.chunk(3, dim=0)
base = f"{prefix}encoder.layer.{idx}.attention.attention"
out[f"{base}.query.{tail}"] = q
out[f"{base}.key.{tail}"] = kw
out[f"{base}.value.{tail}"] = vw
continue
for old, new in _DINOV2_BLOCK_RENAMES:
sub = sub.replace(old, new)
out[f"{prefix}encoder.layer.{idx}.{sub}"] = v
return out
def build_from_state_dict(sd: dict, dtype=None, device=None, operations=comfy.ops.manual_cast) -> nn.Module:
"""Dispatch to v1 or v2 based on the DINOv2 backbone prefix."""
sd = _remap_state_dict(sd)
cls = MoGeModelV2 if any(k.startswith("encoder.backbone.") for k in sd) else MoGeModelV1
return cls.from_state_dict(sd, dtype=dtype, device=device, operations=operations)
class MoGeModel:
"""Loaded MoGe model + ComfyUI memory management."""
def __init__(self, state_dict: dict):
# text encoder dtype closest match
self.load_device = comfy.model_management.text_encoder_device()
offload_device = comfy.model_management.text_encoder_offload_device()
self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
self.model = build_from_state_dict(state_dict, dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast).eval()
self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
self.version = "v2" if hasattr(self.model, "encoder") else "v1"
self.mask_threshold = float(getattr(self.model, "mask_threshold", 0.5))
nt = getattr(self.model, "num_tokens_range", (1200, 2500 if self.version == "v1" else 3600))
self.num_tokens_range = (int(nt[0]), int(nt[1]))
def infer(self, image: torch.Tensor, num_tokens: Optional[int] = None,
resolution_level: int = 9, fov_x: Optional[Union[Number, torch.Tensor]] = None,
force_projection: bool = True, apply_mask: bool = True,
apply_metric_scale: bool = True
) -> Dict[str, torch.Tensor]:
"""Run a single MoGe forward + post-process pass. image is (B, 3, H, W) in [0, 1]."""
comfy.model_management.load_model_gpu(self.patcher)
image = image.to(device=self.load_device, dtype=self.dtype)
H, W = image.shape[-2:]
aspect_ratio = W / H
if num_tokens is None:
lo, hi = self.num_tokens_range
num_tokens = int(lo + (resolution_level / 9) * (hi - lo))
out = self.model.forward(image, num_tokens=num_tokens)
points = out["points"].float() # recover_focal_shift goes through scipy on CPU; needs fp32.
mask_binary = out["mask"] > self.mask_threshold
normal = out.get("normal")
metric_scale = out.get("metric_scale")
diag = (1 + aspect_ratio ** 2) ** 0.5
def focal_from_fov_deg(deg):
fov = torch.as_tensor(deg, device=points.device, dtype=points.dtype)
return aspect_ratio / diag / torch.tan(torch.deg2rad(fov / 2))
if fov_x is None:
focal, shift = recover_focal_shift(points, mask_binary)
# Fall back to 60 deg FoV when the least-squares solver flips the focal sign.
bad = ~torch.isfinite(focal) | (focal <= 0)
if bool(bad.any()):
focal = torch.where(bad, focal_from_fov_deg(60.0), focal)
_, shift = recover_focal_shift(points, mask_binary, focal=focal)
else:
focal = focal_from_fov_deg(fov_x).expand(points.shape[0])
_, shift = recover_focal_shift(points, mask_binary, focal=focal)
f_diag = focal / 2 * diag
half = torch.tensor(0.5, device=points.device, dtype=points.dtype)
intrinsics = intrinsics_from_focal_center(f_diag / aspect_ratio, f_diag, half, half)
points[..., 2] = points[..., 2] + shift[..., None, None]
# v2 only: filter mask by depth>0 to drop metric-scale negative-depth artifacts.
if self.version == "v2":
mask_binary = mask_binary & (points[..., 2] > 0)
depth = points[..., 2].clone()
if force_projection:
points = depth_map_to_point_map(depth, intrinsics=intrinsics)
if apply_metric_scale and metric_scale is not None:
points = points * metric_scale[:, None, None, None]
depth = depth * metric_scale[:, None, None]
if apply_mask:
points = torch.where(mask_binary[..., None], points, torch.full_like(points, float("inf")))
depth = torch.where(mask_binary, depth, torch.full_like(depth, float("inf")))
if normal is not None:
normal = torch.where(mask_binary[..., None], normal, torch.zeros_like(normal))
result = {"points": points, "depth": depth, "intrinsics": intrinsics, "mask": mask_binary}
if normal is not None:
result["normal"] = normal
return result

Some files were not shown because too many files have changed in this diff Show More