Mirror of https://github.com/comfyanonymous/ComfyUI.git, synced 2026-05-15 14:17:51 +08:00

Compare commits: depth-anyt ... range-type (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 8ff57addaa | |
| | 4a8ada2d15 | |
| | 8822627a60 | |
```diff
@@ -1,2 +1,2 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --enable-dynamic-vram
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
 pause
```
.github/workflows/openapi-lint.yml (vendored): 31 changed lines
```diff
@@ -1,31 +0,0 @@
-name: OpenAPI Lint
-
-on:
-  pull_request:
-    paths:
-      - 'openapi.yaml'
-      - '.spectral.yaml'
-      - '.github/workflows/openapi-lint.yml'
-
-permissions:
-  contents: read
-
-jobs:
-  spectral:
-    name: Run Spectral
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '20'
-
-      - name: Install Spectral
-        run: npm install -g @stoplight/spectral-cli@6
-
-      - name: Lint openapi.yaml
-        run: spectral lint openapi.yaml --ruleset .spectral.yaml --fail-severity=error
```
.github/workflows/stable-release.yml (vendored): 2 changed lines
```diff
@@ -145,8 +145,6 @@ jobs:
         cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./
         cp ../update_comfyui_and_python_dependencies.bat ./update/

-        echo 'local-portable' > ComfyUI/.comfy_environment
-
         cd ..

         "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
```
.github/workflows/tag-dispatch-cloud.yml (vendored): 45 changed lines
```diff
@@ -1,45 +0,0 @@
-name: Tag Dispatch to Cloud
-
-on:
-  push:
-    tags:
-      - 'v*'
-
-jobs:
-  dispatch-cloud:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Send repository dispatch to cloud
-        env:
-          DISPATCH_TOKEN: ${{ secrets.CLOUD_REPO_DISPATCH_TOKEN }}
-          RELEASE_TAG: ${{ github.ref_name }}
-        run: |
-          set -euo pipefail
-
-          if [ -z "${DISPATCH_TOKEN:-}" ]; then
-            echo "::error::CLOUD_REPO_DISPATCH_TOKEN is required but not set."
-            exit 1
-          fi
-
-          RELEASE_URL="https://github.com/${{ github.repository }}/releases/tag/${RELEASE_TAG}"
-
-          PAYLOAD="$(jq -n \
-            --arg release_tag "$RELEASE_TAG" \
-            --arg release_url "$RELEASE_URL" \
-            '{
-              event_type: "comfyui_tag_pushed",
-              client_payload: {
-                release_tag: $release_tag,
-                release_url: $release_url
-              }
-            }')"
-
-          curl -fsSL \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "Content-Type: application/json" \
-            -H "Authorization: Bearer ${DISPATCH_TOKEN}" \
-            https://api.github.com/repos/Comfy-Org/cloud/dispatches \
-            -d "$PAYLOAD"
-
-          echo "✅ Dispatched ComfyUI tag ${RELEASE_TAG} to Comfy-Org/cloud"
```
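The deleted workflow's `jq`/`curl` pair is a standard GitHub `repository_dispatch` call. For reference, a minimal Python sketch of the same request, useful for testing it outside Actions (the token and tag values here are placeholders, not part of the workflow):

```python
# Sketch of the repository_dispatch call from the deleted workflow.
# DISPATCH_TOKEN and the example tag are placeholders for local testing.
import os
import requests

def dispatch_tag(token: str, release_tag: str) -> None:
    """Send the comfyui_tag_pushed repository_dispatch event to Comfy-Org/cloud."""
    payload = {
        "event_type": "comfyui_tag_pushed",
        "client_payload": {
            "release_tag": release_tag,
            "release_url": f"https://github.com/comfyanonymous/ComfyUI/releases/tag/{release_tag}",
        },
    }
    resp = requests.post(
        "https://api.github.com/repos/Comfy-Org/cloud/dispatches",
        json=payload,
        headers={
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {token}",
        },
        timeout=30,
    )
    resp.raise_for_status()  # GitHub answers 204 No Content on success

if __name__ == "__main__":
    dispatch_tag(os.environ["DISPATCH_TOKEN"], "v0.7.0")
```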
.gitignore (vendored): 2 changed lines
```diff
@@ -21,6 +21,6 @@ venv*/
 *.log
 web_custom_versions/
 .DS_Store
 openapi.yaml
 filtered-openapi.yaml
-uv.lock
+.comfy_environment
```
.spectral.yaml: 100 changed lines
```diff
@@ -1,100 +0,0 @@
-extends:
-  - spectral:oas
-
-# Severity levels: error, warn, info, hint, off
-# Rules from the built-in "spectral:oas" ruleset are active by default.
-# Below we tune severity and add custom rules for our conventions.
-#
-# This ruleset mirrors Comfy-Org/cloud/.spectral.yaml so specs across the
-# organization are linted against a single consistent standard.
-
-rules:
-  # -----------------------------------------------------------------------
-  # Built-in rule severity overrides
-  # -----------------------------------------------------------------------
-  operation-operationId: error
-  operation-description: warn
-  operation-tag-defined: error
-  info-contact: off
-  info-description: warn
-  no-eval-in-markdown: error
-  no-$ref-siblings: error
-
-  # -----------------------------------------------------------------------
-  # Custom rules: naming conventions
-  # -----------------------------------------------------------------------
-
-  # Property names should be snake_case
-  property-name-snake-case:
-    description: Property names must be snake_case
-    severity: warn
-    given: "$.components.schemas.*.properties[*]~"
-    then:
-      function: pattern
-      functionOptions:
-        match: "^[a-z][a-z0-9]*(_[a-z0-9]+)*$"
-
-  # Operation IDs should be camelCase
-  operation-id-camel-case:
-    description: Operation IDs must be camelCase
-    severity: warn
-    given: "$.paths.*.*.operationId"
-    then:
-      function: pattern
-      functionOptions:
-        match: "^[a-z][a-zA-Z0-9]*$"
-
-  # -----------------------------------------------------------------------
-  # Custom rules: response conventions
-  # -----------------------------------------------------------------------
-
-  # Error responses (4xx, 5xx) should use a consistent shape
-  error-response-schema:
-    description: Error responses should reference a standard error schema
-    severity: hint
-    given: "$.paths.*.*.responses[?(@property >= '400' && @property < '600')].content['application/json'].schema"
-    then:
-      field: "$ref"
-      function: truthy
-
-  # All 2xx responses with JSON body should have a schema
-  response-schema-defined:
-    description: Success responses with JSON content should define a schema
-    severity: warn
-    given: "$.paths.*.*.responses[?(@property >= '200' && @property < '300')].content['application/json']"
-    then:
-      field: schema
-      function: truthy
-
-  # -----------------------------------------------------------------------
-  # Custom rules: best practices
-  # -----------------------------------------------------------------------
-
-  # Path parameters must have a description
-  path-param-description:
-    description: Path parameters should have a description
-    severity: warn
-    given:
-      - "$.paths.*.parameters[?(@.in == 'path')]"
-      - "$.paths.*.*.parameters[?(@.in == 'path')]"
-    then:
-      field: description
-      function: truthy
-
-  # Schemas should have a description
-  schema-description:
-    description: Component schemas should have a description
-    severity: hint
-    given: "$.components.schemas.*"
-    then:
-      field: description
-      function: truthy
-
-overrides:
-  # /ws uses HTTP 101 (Switching Protocols) — a legitimate response for a
-  # WebSocket upgrade, but not a 2xx, so operation-success-response fires
-  # as a false positive. OpenAPI 3.x has no native WebSocket support.
-  - files:
-      - "openapi.yaml#/paths/~1ws"
-    rules:
-      operation-success-response: off
```
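The two naming-convention rules in this deleted ruleset are plain regexes, so their behavior is easy to spot-check outside Spectral. A small illustrative script (the property and operation names below are made up for the test):

```python
# Spot-check of the naming regexes from the deleted .spectral.yaml.
import re

SNAKE_CASE = re.compile(r"^[a-z][a-z0-9]*(_[a-z0-9]+)*$")  # property names
CAMEL_CASE = re.compile(r"^[a-z][a-zA-Z0-9]*$")            # operation IDs

for prop in ["node_id", "vram_used", "NodeId", "node__id", "_private"]:
    print(f"property {prop!r:12} snake_case: {bool(SNAKE_CASE.fullmatch(prop))}")

for op in ["getQueue", "get_queue", "GetQueue", "uploadImage2"]:
    print(f"operation {op!r:14} camelCase: {bool(CAMEL_CASE.fullmatch(op))}")
```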
```diff
@@ -1,2 +1,2 @@
 # Admins
-* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai
+* @comfyanonymous @kosinkadink @guill
```
README.md: 23 changed lines
```diff
@@ -1,7 +1,7 @@
 <div align="center">

 # ComfyUI
-**The most powerful and modular AI engine for content creation.**
+**The most powerful and modular visual AI engine and application.**


 [![Website][website-shield]][website-url]
@@ -31,16 +31,10 @@
 [github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
 [github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases

-<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/36e065e0-bfae-4456-8c7f-8369d5ea48a2" />
+<br>
+![ComfyUI Screenshot]()
 </div>

-ComfyUI is the AI creation engine for visual professionals who demand control over every model, every parameter, and every output. Its powerful and modular node graph interface empowers creatives to generate images, videos, 3D models, audio, and more...
-- ComfyUI natively supports the latest open-source state of the art models.
-- API nodes provide access to the best closed source models such as Nano Banana, Seedance, Hunyuan3D, etc.
-- It is available on Windows, Linux, and macOS, locally with our desktop application or on our cloud.
-- The most sophisticated workflows can be exposed through a simple UI thanks to App Mode.
-- It integrates seamlessly into production pipelines with our API endpoints.
+ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.

 ## Get Started

@@ -83,7 +77,6 @@ See what ComfyUI can do with the [newer template workflows](https://comfy.org/wo
 - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
 - [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
 - [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
-- Ernie Image
 - Image Editing Models
   - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
   - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@@ -133,7 +126,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
 ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:

 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
-   - Releases a new major stable version (e.g., v0.7.0) roughly every 2 weeks.
+   - Releases a new stable version (e.g., v0.7.0) roughly every week.
    - Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
    - Minor versions will be used for releases off the master branch.
    - Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
@@ -200,15 +193,13 @@ If you have trouble extracting it, right click the file -> properties -> unblock

 The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.

-#### All Official Portable Downloads:
+#### Alternative Downloads:

-[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
-
-[Portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
+[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)

 [Portable for Nvidia GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) (supports 20 series and above).

-[Portable for Nvidia GPUs with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
+[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).

 #### How do I share models between another UI and ComfyUI?
```
```diff
@@ -27,7 +27,7 @@ def frontend_install_warning_message():
     return f"""
 {get_missing_requirements_message()}

-The ComfyUI frontend is shipped in a pip package so it needs to be updated separately from the ComfyUI code.
+This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
 """.strip()

 def parse_version(version: str) -> tuple[int, int, int]:
```
```diff
@@ -1,7 +1,5 @@
 from __future__ import annotations

-import logging
-
 from aiohttp import web

 from typing import TYPE_CHECKING, TypedDict
@@ -33,22 +31,8 @@ class NodeReplaceManager:
         self._replacements: dict[str, list[NodeReplace]] = {}

     def register(self, node_replace: NodeReplace):
-        """Register a node replacement mapping.
-
-        Idempotent: if a replacement with the same (old_node_id, new_node_id)
-        is already registered, the duplicate is ignored. This prevents stale
-        entries from accumulating when custom nodes are reloaded in the same
-        process (e.g. via ComfyUI-Manager).
-        """
-        existing = self._replacements.setdefault(node_replace.old_node_id, [])
-        for entry in existing:
-            if entry.new_node_id == node_replace.new_node_id:
-                logging.debug(
-                    "Node replacement %s -> %s already registered, ignoring duplicate.",
-                    node_replace.old_node_id, node_replace.new_node_id,
-                )
-                return
-        existing.append(node_replace)
+        """Register a node replacement mapping."""
+        self._replacements.setdefault(node_replace.old_node_id, []).append(node_replace)

     def get_replacement(self, old_node_id: str) -> list[NodeReplace] | None:
         """Get replacements for an old node ID."""
```
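The idempotence the removed variant provided is easy to see in isolation. A minimal standalone sketch, with `NodeReplace` reduced to a two-field dataclass (a simplification of the real type, assumed here for illustration):

```python
# Standalone sketch of the idempotent register() that this hunk removes.
from dataclasses import dataclass

@dataclass(frozen=True)
class NodeReplace:          # simplified stand-in for the real NodeReplace type
    old_node_id: str
    new_node_id: str

class NodeReplaceManager:
    def __init__(self):
        self._replacements: dict[str, list[NodeReplace]] = {}

    def register(self, node_replace: NodeReplace):
        existing = self._replacements.setdefault(node_replace.old_node_id, [])
        if any(e.new_node_id == node_replace.new_node_id for e in existing):
            return  # duplicate (old_node_id, new_node_id): ignore re-registration
        existing.append(node_replace)

mgr = NodeReplaceManager()
mgr.register(NodeReplace("OldNode", "NewNode"))
mgr.register(NodeReplace("OldNode", "NewNode"))  # e.g. custom node reloaded
print(len(mgr._replacements["OldNode"]))  # 1 with dedup; 2 with a plain append
```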
```diff
@@ -28,8 +28,8 @@ def get_file_info(path: str, relative_to: str) -> FileInfo:
     return {
         "path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
         "size": os.path.getsize(path),
-        "modified": int(os.path.getmtime(path) * 1000),
-        "created": int(os.path.getctime(path) * 1000),
+        "modified": os.path.getmtime(path),
+        "created": os.path.getctime(path)
     }
```
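The only change here is the unit: `os.path.getmtime`/`os.path.getctime` return seconds as a float, and the removed `int(... * 1000)` converted that to integer milliseconds (the usual JavaScript `Date` convention). A tiny illustration of the two forms:

```python
# Illustration of the unit change: seconds (float) vs. milliseconds (int).
import os
import time

path = __file__
seconds = os.path.getmtime(path)  # new return value, float seconds
millis = int(seconds * 1000)      # old return value, int milliseconds
print(seconds, millis)
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds)))
```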
```diff
@@ -431,10 +431,9 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts image brightness and contrast using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   },
   "extra": {}
 }
 }
```
```diff
@@ -162,7 +162,7 @@
       },
       "revision": 0,
       "config": {},
-      "name": "Canny to Image (Z-Image-Turbo)",
+      "name": "local-Canny to Image (Z-Image-Turbo)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -1553,8 +1553,7 @@
         "VHS_MetadataImage": true,
         "VHS_KeepIntermediate": true
       },
-      "category": "Image generation and editing/Canny to image",
-      "description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning."
+      "category": "Image generation and editing/Canny to image"
     }
   ]
 },
@@ -1575,4 +1574,4 @@
   }
 },
 "version": 0.4
 }
 }
```
```diff
@@ -192,7 +192,7 @@
       },
       "revision": 0,
       "config": {},
-      "name": "Canny to Video (LTX 2.0)",
+      "name": "local-Canny to Video (LTX 2.0)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -3600,8 +3600,7 @@
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "category": "Video generation and editing/Canny to video",
-      "description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio."
+      "category": "Video generation and editing/Canny to video"
     }
   ]
 },
@@ -3617,4 +3616,4 @@
   }
 },
 "version": 0.4
 }
 }
```
```diff
@@ -377,9 +377,8 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Adds lens-style chromatic aberration (color fringing) using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
 }
 }
```
```diff
@@ -596,8 +596,7 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts saturation, temperature, tint, and vibrance using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
```
```diff
@@ -1129,8 +1129,7 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Balances colors across shadows, midtones, and highlights using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
```
```diff
@@ -608,8 +608,7 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Fine-tunes tone and color with per-channel curve adjustments using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
```
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
```diff
@@ -160,7 +160,7 @@
       },
       "revision": 0,
       "config": {},
-      "name": "Depth to Image (Z-Image-Turbo)",
+      "name": "local-Depth to Image (Z-Image-Turbo)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -1579,8 +1579,7 @@
         "VHS_MetadataImage": true,
         "VHS_KeepIntermediate": true
       },
-      "category": "Image generation and editing/Depth to image",
-      "description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning."
+      "category": "Image generation and editing/Depth to image"
     },
     {
       "id": "458bdf3c-4b58-421c-af50-c9c663a4d74c",
@@ -2462,8 +2461,7 @@
           ]
         },
         "workflowRendererVersion": "LG"
       },
-      "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
     }
   }
 ]
 },
@@ -2484,4 +2482,4 @@
   "VHS_KeepIntermediate": true
 },
 "version": 0.4
 }
 }
```
```diff
@@ -261,7 +261,7 @@
       },
       "revision": 0,
       "config": {},
-      "name": "Depth to Video (LTX 2.0)",
+      "name": "local-Depth to Video (LTX 2.0)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -4233,8 +4233,7 @@
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "category": "Video generation and editing/Depth to video",
-      "description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
+      "category": "Video generation and editing/Depth to video"
     },
     {
       "id": "38b60539-50a7-42f9-a5fe-bdeca26272e2",
@@ -5193,8 +5192,7 @@
       ],
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
     }
   }
 ]
 },
@@ -5210,4 +5208,4 @@
   "workflowRendererVersion": "LG"
 },
 "version": 0.4
 }
 }
```
```diff
@@ -450,10 +450,9 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Blur",
-        "description": "Applies bilateral (edge-preserving) blur to soften images while retaining detail."
+        "category": "Image Tools/Blur"
       }
     ]
   },
   "extra": {}
 }
 }
```
```diff
@@ -580,9 +580,8 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Adds procedural film grain texture for a cinematic look via GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
 }
 }
```
File diff suppressed because it is too large
File diff suppressed because it is too large
```diff
@@ -1,858 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 16,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 16,
-      "type": "022693be-2baa-4009-870a-28921508a7ef",
-      "pos": [-2990, -3240],
-      "size": [410, 200],
-      "flags": {},
-      "order": 2,
-      "mode": 0,
-      "inputs": [
-        {"localized_name": "video", "name": "video", "type": "VIDEO", "link": null},
-        {"label": "multiplier", "name": "value", "type": "INT", "widget": {"name": "value"}, "link": null},
-        {"label": "enable_fps_multiplier", "name": "value_1", "type": "BOOLEAN", "widget": {"name": "value_1"}, "link": null},
-        {"name": "model_name", "type": "COMBO", "widget": {"name": "model_name"}, "link": null}
-      ],
-      "outputs": [
-        {"label": "VIDEO", "name": "VIDEO_1", "type": "VIDEO", "links": []},
-        {"name": "IMAGE", "type": "IMAGE", "links": null}
-      ],
-      "properties": {
-        "proxyWidgets": [["9", "value"], ["13", "value"], ["1", "model_name"]],
-        "enableTabs": false, "tabWidth": 65, "tabXOffset": 10,
-        "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65,
-        "cnr_id": "comfy-core", "ver": "0.19.3"
-      },
-      "widgets_values": [],
-      "title": "Frame Interpolation"
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "022693be-2baa-4009-870a-28921508a7ef",
-        "version": 1,
-        "state": {"lastGroupId": 0, "lastNodeId": 17, "lastLinkId": 28, "lastRerouteId": 0},
-        "revision": 0,
-        "config": {},
-        "name": "Frame Interpolation",
-        "inputNode": {"id": -10, "bounding": [-2810, -3070, 159.7421875, 120]},
-        "outputNode": {"id": -20, "bounding": [-1270, -3075, 120, 80]},
-        "inputs": [
-          {"id": "05e31c51-dcb6-4a1e-9651-1b9ad4f7a287", "name": "video", "type": "VIDEO", "linkIds": [2], "localized_name": "video", "pos": [-2670.2578125, -3050]},
-          {"id": "feecb409-7d1c-4a99-9c63-50c5fecdd3c9", "name": "value", "type": "INT", "linkIds": [22], "label": "multiplier", "pos": [-2670.2578125, -3030]},
-          {"id": "0b8a861b-b581-4068-9e8c-f8d15daf1ca6", "name": "value_1", "type": "BOOLEAN", "linkIds": [23], "label": "enable_fps_multiplier", "pos": [-2670.2578125, -3010]},
-          {"id": "a22b101e-8773-4e17-a297-7ee3aae09162", "name": "model_name", "type": "COMBO", "linkIds": [24], "pos": [-2670.2578125, -2990]}
-        ],
-        "outputs": [
-          {"id": "ef2ada05-d5aa-492a-9394-6c3e71e39ebb", "name": "VIDEO_1", "type": "VIDEO", "linkIds": [26], "label": "VIDEO", "pos": [-1250, -3055]},
-          {"id": "5aacc622-2a07-4983-b31c-e04461f7f953", "name": "IMAGE", "type": "IMAGE", "linkIds": [28], "pos": [-1250, -3035]}
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 1,
-            "type": "FrameInterpolationModelLoader",
-            "pos": [-2510, -3370],
-            "size": [370, 90],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [{"localized_name": "model_name", "name": "model_name", "type": "COMBO", "widget": {"name": "model_name"}, "link": 24}],
-            "outputs": [{"localized_name": "INTERP_MODEL", "name": "INTERP_MODEL", "type": "INTERP_MODEL", "links": [1]}],
-            "properties": {
-              "Node name for S&R": "FrameInterpolationModelLoader",
-              "enableTabs": false, "tabWidth": 65, "tabXOffset": 10,
-              "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65,
-              "cnr_id": "comfy-core", "ver": "0.19.3",
-              "models": [{"name": "film_net_fp16.safetensors", "url": "https://huggingface.co/Comfy-Org/frame_interpolation/resolve/main/frame_interpolation/film_net_fp16.safetensors", "directory": "frame_interpolation"}]
-            },
-            "widgets_values": ["film_net_fp16.safetensors"]
-          },
-          {
-            "id": 2,
-            "type": "FrameInterpolate",
-            "pos": [-2040, -3370],
-            "size": [270, 110],
-            "flags": {},
-            "order": 1,
-            "mode": 0,
-            "inputs": [
-              {"localized_name": "interp_model", "name": "interp_model", "type": "INTERP_MODEL", "link": 1},
-              {"localized_name": "images", "name": "images", "type": "IMAGE", "link": 3},
-              {"localized_name": "multiplier", "name": "multiplier", "type": "INT", "widget": {"name": "multiplier"}, "link": 8}
-            ],
-            "outputs": [{"localized_name": "IMAGE", "name": "IMAGE", "type": "IMAGE", "links": [4, 28]}],
-            "properties": {"Node name for S&R": "FrameInterpolate", "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65, "cnr_id": "comfy-core", "ver": "0.19.3"},
-            "widgets_values": [2]
-          },
-          {
-            "id": 5,
-            "type": "CreateVideo",
-            "pos": [-1600, -3370],
-            "size": [270, 110],
-            "flags": {},
-            "order": 3,
-            "mode": 0,
-            "inputs": [
-              {"localized_name": "images", "name": "images", "type": "IMAGE", "link": 4},
-              {"localized_name": "audio", "name": "audio", "shape": 7, "type": "AUDIO", "link": 5},
-              {"localized_name": "fps", "name": "fps", "type": "FLOAT", "widget": {"name": "fps"}, "link": 12}
-            ],
-            "outputs": [{"localized_name": "VIDEO", "name": "VIDEO", "type": "VIDEO", "links": [26]}],
-            "properties": {"Node name for S&R": "CreateVideo", "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65, "cnr_id": "comfy-core", "ver": "0.19.3"},
-            "widgets_values": [30]
-          },
-          {
-            "id": 9,
-            "type": "PrimitiveInt",
-            "pos": [-2500, -2970],
-            "size": [270, 90],
-            "flags": {},
-            "order": 4,
-            "mode": 0,
-            "inputs": [{"localized_name": "value", "name": "value", "type": "INT", "widget": {"name": "value"}, "link": 22}],
-            "outputs": [{"localized_name": "INT", "name": "INT", "type": "INT", "links": [8, 19]}],
-            "title": "Int (Multiplier)",
-            "properties": {"Node name for S&R": "PrimitiveInt", "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65, "cnr_id": "comfy-core", "ver": "0.19.3"},
-            "widgets_values": [2, "fixed"]
-          },
-          {
-            "id": 10,
-            "type": "ComfySwitchNode",
-            "pos": [-1610, -3120],
-            "size": [270, 130],
-            "flags": {},
-            "order": 5,
-            "mode": 0,
-            "inputs": [
-              {"localized_name": "on_false", "name": "on_false", "type": "*", "link": 11},
-              {"localized_name": "on_true", "name": "on_true", "type": "*", "link": 13},
-              {"localized_name": "switch", "name": "switch", "type": "BOOLEAN", "widget": {"name": "switch"}, "link": 15}
-            ],
-            "outputs": [{"localized_name": "output", "name": "output", "type": "*", "links": [12]}],
-            "properties": {"Node name for S&R": "ComfySwitchNode", "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65, "cnr_id": "comfy-core", "ver": "0.19.3"},
-            "widgets_values": [true]
-          },
-          {
-            "id": 13,
-            "type": "PrimitiveBoolean",
-            "pos": [-2500, -2770],
-            "size": [310, 90],
-            "flags": {},
-            "order": 7,
-            "mode": 0,
-            "inputs": [{"localized_name": "value", "name": "value", "type": "BOOLEAN", "widget": {"name": "value"}, "link": 23}],
-            "outputs": [{"localized_name": "BOOLEAN", "name": "BOOLEAN", "type": "BOOLEAN", "links": [15]}],
-            "title": "Boolean (Apply multiplier to FPS?)",
-            "properties": {"Node name for S&R": "PrimitiveBoolean", "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65, "cnr_id": "comfy-core", "ver": "0.19.3"},
-            "widgets_values": [true]
-          },
-          {
-            "id": 3,
-            "type": "GetVideoComponents",
-            "pos": [-2500, -3170],
-            "size": [230, 100],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [{"localized_name": "video", "name": "video", "type": "VIDEO", "link": 2}],
-            "outputs": [
-              {"localized_name": "images", "name": "images", "type": "IMAGE", "links": [3]},
-              {"localized_name": "audio", "name": "audio", "type": "AUDIO", "links": [5]},
-              {"localized_name": "fps", "name": "fps", "type": "FLOAT", "links": [11, 18]}
-            ],
-            "properties": {"Node name for S&R": "GetVideoComponents", "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65, "cnr_id": "comfy-core", "ver": "0.19.3"}
-          },
-          {
-            "id": 11,
-            "type": "ComfyMathExpression",
-            "pos": [-2090, -3070],
-            "size": [400, 210],
-            "flags": {"collapsed": false},
-            "order": 6,
-            "mode": 0,
-            "inputs": [
-              {"label": "a", "localized_name": "values.a", "name": "values.a", "type": "FLOAT,INT", "link": 18},
-              {"label": "b", "localized_name": "values.b", "name": "values.b", "shape": 7, "type": "FLOAT,INT", "link": 19},
-              {"label": "c", "localized_name": "values.c", "name": "values.c", "shape": 7, "type": "FLOAT,INT", "link": null},
-              {"localized_name": "expression", "name": "expression", "type": "STRING", "widget": {"name": "expression"}, "link": null}
-            ],
-            "outputs": [
-              {"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [13]},
-              {"localized_name": "INT", "name": "INT", "type": "INT", "links": null}
-            ],
-            "properties": {"Node name for S&R": "ComfyMathExpression", "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65, "cnr_id": "comfy-core", "ver": "0.19.3"},
-            "widgets_values": ["min(abs(b), 16) * a"]
-          }
-        ],
-        "groups": [],
-        "links": [
-          {"id": 1, "origin_id": 1, "origin_slot": 0, "target_id": 2, "target_slot": 0, "type": "INTERP_MODEL"},
-          {"id": 3, "origin_id": 3, "origin_slot": 0, "target_id": 2, "target_slot": 1, "type": "IMAGE"},
-          {"id": 8, "origin_id": 9, "origin_slot": 0, "target_id": 2, "target_slot": 2, "type": "INT"},
-          {"id": 4, "origin_id": 2, "origin_slot": 0, "target_id": 5, "target_slot": 0, "type": "IMAGE"},
-          {"id": 5, "origin_id": 3, "origin_slot": 1, "target_id": 5, "target_slot": 1, "type": "AUDIO"},
-          {"id": 12, "origin_id": 10, "origin_slot": 0, "target_id": 5, "target_slot": 2, "type": "FLOAT"},
-          {"id": 11, "origin_id": 3, "origin_slot": 2, "target_id": 10, "target_slot": 0, "type": "FLOAT"},
-          {"id": 13, "origin_id": 11, "origin_slot": 0, "target_id": 10, "target_slot": 1, "type": "FLOAT"},
-          {"id": 15, "origin_id": 13, "origin_slot": 0, "target_id": 10, "target_slot": 2, "type": "BOOLEAN"},
-          {"id": 18, "origin_id": 3, "origin_slot": 2, "target_id": 11, "target_slot": 0, "type": "FLOAT"},
-          {"id": 19, "origin_id": 9, "origin_slot": 0, "target_id": 11, "target_slot": 1, "type": "INT"},
-          {"id": 2, "origin_id": -10, "origin_slot": 0, "target_id": 3, "target_slot": 0, "type": "VIDEO"},
-          {"id": 22, "origin_id": -10, "origin_slot": 1, "target_id": 9, "target_slot": 0, "type": "INT"},
-          {"id": 23, "origin_id": -10, "origin_slot": 2, "target_id": 13, "target_slot": 0, "type": "BOOLEAN"},
-          {"id": 24, "origin_id": -10, "origin_slot": 3, "target_id": 1, "target_slot": 0, "type": "COMBO"},
-          {"id": 26, "origin_id": 5, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "VIDEO"},
-          {"id": 28, "origin_id": 2, "origin_slot": 0, "target_id": -20, "target_slot": 1, "type": "IMAGE"}
-        ],
-        "extra": {},
-        "category": "Video Tools",
-        "description": "Increases video frame rate by synthesizing intermediate frames with a frame interpolation model."
-      }
-    ]
-  },
-  "extra": {}
-}
```
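The only logic in this deleted subgraph sits in the `ComfyMathExpression` string `min(abs(b), 16) * a` and the `ComfySwitchNode` that follows it. A short sketch of what those two nodes compute together (function name invented for illustration):

```python
# Illustration of the FPS math in the deleted "Frame Interpolation" subgraph:
# a = source fps, b = interpolation multiplier (capped at 16). The switch node
# then picks the scaled fps or the original, per the enable_fps_multiplier flag.
def output_fps(a: float, b: int, enable: bool) -> float:
    scaled = min(abs(b), 16) * a
    return scaled if enable else a

print(output_fps(24.0, 2, True))   # 48.0: interpolated frames play at 2x fps
print(output_fps(24.0, 2, False))  # 24.0: extra frames slow the motion instead
```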
```diff
@@ -1,485 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 98,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 98,
-      "type": "dca6e78d-fb06-421e-97f7-6ce17a665260",
-      "pos": [-410, -2230],
-      "size": [270, 104],
-      "flags": {},
-      "order": 7,
-      "mode": 0,
-      "inputs": [
-        {"name": "video", "type": "VIDEO", "link": null},
-        {"label": "frame_index", "name": "value", "type": "INT", "widget": {"name": "value"}, "link": null}
-      ],
-      "outputs": [{"name": "IMAGE", "type": "IMAGE", "links": []}],
-      "title": "Get Any Video Frame",
-      "properties": {"proxyWidgets": [["100", "value"]]},
-      "widgets_values": []
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "dca6e78d-fb06-421e-97f7-6ce17a665260",
-        "version": 1,
-        "state": {"lastGroupId": 1, "lastNodeId": 136, "lastLinkId": 302, "lastRerouteId": 0},
-        "revision": 0,
-        "config": {},
-        "name": "Get Any Video Frame",
-        "inputNode": {"id": -10, "bounding": [380, -57, 120, 80]},
-        "outputNode": {"id": -20, "bounding": [1460, -57, 120, 60]},
-        "inputs": [
-          {"id": "2ceec378-8dcf-4340-8570-155967f59a93", "name": "video", "type": "VIDEO", "linkIds": [4], "pos": [480, -37]},
-          {"id": "819955f6-c686-4896-8032-ff2d0059109a", "name": "value", "type": "INT", "linkIds": [283], "label": "frame_index", "pos": [480, -17]}
-        ],
-        "outputs": [
-          {"id": "1ab0684d-6a44-45b6-8aa4-a0b971a1d41e", "name": "IMAGE", "type": "IMAGE", "linkIds": [5], "pos": [1480, -37]}
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 1,
-            "type": "GetVideoComponents",
-            "pos": [560, -150],
-            "size": [230, 120],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [{"localized_name": "video", "name": "video", "type": "VIDEO", "link": 4}],
-            "outputs": [
-              {"localized_name": "images", "name": "images", "type": "IMAGE", "links": [1, 2]},
-              {"localized_name": "audio", "name": "audio", "type": "AUDIO", "links": null},
-              {"localized_name": "fps", "name": "fps", "type": "FLOAT", "links": null}
-            ],
-            "properties": {"Node name for S&R": "GetVideoComponents"}
-          },
-          {
-            "id": 2,
-            "type": "GetImageSize",
-            "pos": [560, 50],
-            "size": [230, 120],
-            "flags": {},
-            "order": 1,
-            "mode": 0,
-            "inputs": [{"localized_name": "image", "name": "image", "type": "IMAGE", "link": 1}],
-            "outputs": [
-              {"localized_name": "width", "name": "width", "type": "INT", "links": null},
-              {"localized_name": "height", "name": "height", "type": "INT", "links": null},
-              {"localized_name": "batch_size", "name": "batch_size", "type": "INT", "links": [285]}
-            ],
-            "properties": {"Node name for S&R": "GetImageSize"}
-          },
-          {
-            "id": 3,
-            "type": "ImageFromBatch",
-            "pos": [1130, -150],
-            "size": [270, 140],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [
-              {"localized_name": "image", "name": "image", "type": "IMAGE", "link": 2},
-              {"localized_name": "batch_index", "name": "batch_index", "type": "INT", "widget": {"name": "batch_index"}, "link": 286},
-              {"localized_name": "length", "name": "length", "type": "INT", "widget": {"name": "length"}, "link": null}
-            ],
-            "outputs": [{"localized_name": "IMAGE", "name": "IMAGE", "type": "IMAGE", "links": [5]}],
-            "properties": {"Node name for S&R": "ImageFromBatch"},
-            "widgets_values": [0, 1]
-          },
-          {
-            "id": 99,
-            "type": "ComfyMathExpression",
-            "pos": [910, 100],
-            "size": [400, 200],
-            "flags": {},
-            "order": 3,
-            "mode": 0,
-            "inputs": [
-              {"label": "a", "localized_name": "values.a", "name": "values.a", "type": "FLOAT,INT", "link": 284},
-              {"label": "b", "localized_name": "values.b", "name": "values.b", "shape": 7, "type": "FLOAT,INT", "link": 285},
-              {"label": "c", "localized_name": "values.c", "name": "values.c", "shape": 7, "type": "FLOAT,INT", "link": null},
-              {"localized_name": "expression", "name": "expression", "type": "STRING", "widget": {"name": "expression"}, "link": null}
-            ],
-            "outputs": [
-              {"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": null},
-              {"localized_name": "INT", "name": "INT", "type": "INT", "links": [286]}
-            ],
-            "properties": {"Node name for S&R": "ComfyMathExpression"},
-            "widgets_values": ["min(max(int(a if a >= 0 else b + a), 0), b - 1)"]
-          },
-          {
-            "id": 100,
-            "type": "PrimitiveInt",
-            "pos": [560, 250],
-            "size": [270, 110],
-            "flags": {},
-            "order": 4,
-            "mode": 0,
-            "inputs": [{"localized_name": "value", "name": "value", "type": "INT", "widget": {"name": "value"}, "link": 283}],
-            "outputs": [{"localized_name": "INT", "name": "INT", "type": "INT", "links": [284]}],
-            "properties": {"Node name for S&R": "PrimitiveInt"},
-            "widgets_values": [0, "fixed"]
-          }
-        ],
-        "groups": [],
-        "links": [
-          {"id": 1, "origin_id": 1, "origin_slot": 0, "target_id": 2, "target_slot": 0, "type": "IMAGE"},
-          {"id": 2, "origin_id": 1, "origin_slot": 0, "target_id": 3, "target_slot": 0, "type": "IMAGE"},
-          {"id": 4, "origin_id": -10, "origin_slot": 0, "target_id": 1, "target_slot": 0, "type": "VIDEO"},
-          {"id": 5, "origin_id": 3, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "IMAGE"},
-          {"id": 283, "origin_id": -10, "origin_slot": 1, "target_id": 100, "target_slot": 0, "type": "INT"},
-          {"id": 284, "origin_id": 100, "origin_slot": 0, "target_id": 99, "target_slot": 0, "type": "INT"},
-          {"id": 285, "origin_id": 2, "origin_slot": 2, "target_id": 99, "target_slot": 1, "type": "INT"},
-          {"id": 286, "origin_id": 99, "origin_slot": 1, "target_id": 3, "target_slot": 1, "type": "INT"}
-        ],
-        "extra": {},
-        "category": "Video Tools",
-        "description": "Extracts one image frame from a video at a chosen index, with optional trim and FPS control."
-      }
-    ]
-  },
-  "extra": {
-    "ds": {"scale": 1.197015527856339, "offset": [-168.76833554248222, 540.6638955283997]},
-    "frontendVersion": "1.42.8"
-  }
-}
```
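Here too the interesting part is one expression: `min(max(int(a if a >= 0 else b + a), 0), b - 1)`, where `a` is the requested frame index and `b` the batch size. It resolves negative indices from the end of the video and clamps the result into range; a short sketch (function name invented for illustration):

```python
# Illustration of the frame-index expression in the deleted
# "Get Any Video Frame" subgraph: a = requested index, b = frame count.
def frame_index(a: int, b: int) -> int:
    return min(max(int(a if a >= 0 else b + a), 0), b - 1)

print(frame_index(0, 48))    # 0  (first frame)
print(frame_index(-1, 48))   # 47 (last frame, Python-style negative index)
print(frame_index(999, 48))  # 47 (out-of-range index clamped to the end)
```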
```diff
@@ -575,9 +575,8 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Adds a glow/bloom effect around bright image areas via GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
 }
 }
```
```diff
@@ -752,9 +752,8 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts hue, saturation, and lightness of an image using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
 }
 }
```
```diff
@@ -374,8 +374,7 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Blur",
-        "description": "Applies Gaussian, Box, or Radial blur to soften images and create stylized depth or motion effects."
+        "category": "Image Tools/Blur"
       }
     ]
   }
```
```diff
@@ -310,8 +310,7 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Text generation/Image Captioning",
-        "description": "Generates descriptive captions for images using Google's Gemini multimodal LLM."
+        "category": "Text generation/Image Captioning"
       }
     ]
   }
```
```diff
@@ -315,9 +315,8 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Manipulates individual RGBA channels for masking, compositing, and channel effects."
+        "category": "Image Tools/Color adjust"
       }
     ]
   }
 }
 }
```
File diff suppressed because it is too large
File diff suppressed because it is too large
```diff
@@ -128,7 +128,7 @@
       },
       "revision": 0,
       "config": {},
-      "name": "Image Edit (Flux.2 Klein 4B)",
+      "name": "local-Image Edit (Flux.2 Klein 4B)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -1472,8 +1472,7 @@
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "category": "Image generation and editing/Edit image",
-      "description": "Edits an input image via text instructions using FLUX.2 [klein] 4B."
+      "category": "Image generation and editing/Edit image"
     },
     {
       "id": "6007e698-2ebd-4917-84d8-299b35d7b7ab",
@@ -1822,8 +1821,7 @@
       ],
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "description": "Applies reference image conditioning for style/identity transfer (Flux.2 Klein 4B)."
     }
   }
 ]
 },
```
File diff suppressed because it is too large
File diff suppressed because it is too large
```diff
@@ -132,7 +132,7 @@
       },
       "revision": 0,
      "config": {},
-      "name": "Image Edit (Qwen 2511)",
+      "name": "local-Image Edit (Qwen 2511)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -1468,8 +1468,7 @@
         "VHS_MetadataImage": true,
         "VHS_KeepIntermediate": true
       },
-      "category": "Image generation and editing/Edit image",
-      "description": "Edits images via text instructions using Qwen-Image-Edit-2511 with improved character consistency and integrated LoRA."
+      "category": "Image generation and editing/Edit image"
     }
   ]
 },
@@ -1490,4 +1489,4 @@
   }
 },
 "version": 0.4
 }
 }
```
File diff suppressed because it is too large
```diff
@@ -124,7 +124,7 @@
       },
       "revision": 0,
       "config": {},
-      "name": "Image Inpainting (Qwen-image)",
+      "name": "local-Image Inpainting (Qwen-image)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -1548,8 +1548,7 @@
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "category": "Image generation and editing/Inpaint image",
-      "description": "Inpaints masked regions using Qwen-Image, extending its multilingual text rendering to inpainting tasks."
+      "category": "Image generation and editing/Inpaint image"
     },
     {
       "id": "56a1f603-fbd2-40ed-94ef-c9ecbd96aca8",
@@ -1908,8 +1907,7 @@
       ],
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "description": "Expands and softens mask edges to reduce visible seams after image processing."
     }
   }
 ]
 },
@@ -1925,4 +1923,4 @@
   "workflowRendererVersion": "LG"
 },
 "version": 0.4
 }
 }
```
```diff
@@ -742,10 +742,9 @@
         "extra": {
           "workflowRendererVersion": "LG"
         },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts black point, white point, and gamma for tonal range control via GPU shader."
+        "category": "Image Tools/Color adjust"
       }
     ]
   },
   "extra": {}
 }
 }
```
```diff
@@ -204,7 +204,7 @@
       },
       "revision": 0,
       "config": {},
-      "name": "Image Outpainting (Qwen-Image)",
+      "name": "local-Image Outpainting (Qwen-Image)",
       "inputNode": {
         "id": -10,
         "bounding": [
@@ -1919,8 +1919,7 @@
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "category": "Image generation and editing/Outpaint image",
-      "description": "Outpaints beyond image boundaries using Qwen-Image's outpainting capabilities."
+      "category": "Image generation and editing/Outpaint image"
     },
     {
       "id": "f93c215e-c393-460e-9534-ed2c3d8a652e",
@@ -2279,8 +2278,7 @@
       ],
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "description": "Expands and softens mask edges to reduce visible seams after image processing."
     }
   },
   {
     "id": "2a4b2cc0-db37-4302-a067-da392f38f06b",
@@ -2735,8 +2733,7 @@
       ],
       "extra": {
         "workflowRendererVersion": "LG"
       },
-      "description": "Scales both image and mask together while preserving alignment for editing workflows."
     }
   }
 ]
 },
@@ -2752,4 +2749,4 @@
   }
 },
 "version": 0.4
 }
 }
```
```diff
@@ -1,714 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 99,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 99,
-      "type": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
-      "pos": [-1630, -3270],
-      "size": [290, 370],
-      "flags": {},
-      "order": 3,
-      "mode": 0,
-      "inputs": [
-        {"label": "image", "localized_name": "image", "name": "image", "type": "IMAGE", "link": null},
-        {"label": "object", "name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null},
-        {"name": "bboxes", "type": "BOUNDING_BOX", "link": null},
-        {"name": "positive_coords", "type": "STRING", "link": null},
-        {"name": "negative_coords", "type": "STRING", "link": null},
-        {"name": "threshold", "type": "FLOAT", "widget": {"name": "threshold"}, "link": null},
-        {"name": "refine_iterations", "type": "INT", "widget": {"name": "refine_iterations"}, "link": null},
-        {"name": "individual_masks", "type": "BOOLEAN", "widget": {"name": "individual_masks"}, "link": null},
-        {"name": "ckpt_name", "type": "COMBO", "widget": {"name": "ckpt_name"}, "link": null}
-      ],
-      "outputs": [
-        {"localized_name": "masks", "name": "masks", "type": "MASK", "links": []},
-        {"localized_name": "bboxes", "name": "bboxes", "type": "BOUNDING_BOX", "links": []}
-      ],
-      "properties": {
-        "proxyWidgets": [["78", "text"], ["75", "threshold"], ["75", "refine_iterations"], ["75", "individual_masks"], ["77", "ckpt_name"]],
-        "ue_properties": {"widget_ue_connectable": {"text": true}, "version": "7.7", "input_ue_unconnectable": {}},
-        "cnr_id": "comfy-core",
-        "ver": "0.19.3",
-        "enableTabs": false, "tabWidth": 65, "tabXOffset": 10,
-        "hasSecondTab": false, "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65
-      },
-      "widgets_values": [],
-      "title": "Image Segmentation (SAM3)"
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
-        "version": 1,
-        "state": {"lastGroupId": 0, "lastNodeId": 113, "lastLinkId": 283, "lastRerouteId": 0},
-        "revision": 0,
-        "config": {},
-        "name": "Image Segmentation (SAM3)",
-        "inputNode": {"id": -10, "bounding": [-2260, -3450, 136.369140625, 220]},
-        "outputNode": {"id": -20, "bounding": [-1130, -3305, 120, 80]},
-        "inputs": [
-          {"id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6", "name": "image", "type": "IMAGE", "linkIds": [264], "localized_name": "image", "label": "image", "pos": [-2143.630859375, -3430]},
-          {"id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745", "name": "text", "type": "STRING", "linkIds": [265], "label": "object", "pos": [-2143.630859375, -3410]},
-          {"id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed", "name": "bboxes", "type": "BOUNDING_BOX", "linkIds": [266], "pos": [-2143.630859375, -3390]},
-          {"id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899", "name": "positive_coords", "type": "STRING", "linkIds": [267], "pos": [-2143.630859375, -3370]},
-          {"id": "c65f8b87-9bd7-48be-9fc2-823431e95019", "name": "negative_coords", "type": "STRING", "linkIds": [268], "pos": [-2143.630859375, -3350]},
-          {"id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb", "name": "threshold", "type": "FLOAT", "linkIds": [269], "pos": [-2143.630859375, -3330]},
-          {"id": "b1439668-b050-490b-a5dc-fc4052c55666", "name": "refine_iterations", "type": "INT", "linkIds": [270], "pos": [-2143.630859375, -3310]},
-          {"id": "86e239e5-c098-4302-b54d-d42a38bc0f89", "name": "individual_masks", "type": "BOOLEAN", "linkIds": [271], "pos": [-2143.630859375, -3290]},
-          {"id": "f9e0b9d4-b2f1-4907-a4a5-305656576706", "name": "ckpt_name", "type": "COMBO", "linkIds": [272], "pos": [-2143.630859375, -3270]}
-        ],
-        "outputs": [
-          {"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913", "name": "masks", "type": "MASK", "linkIds": [231], "localized_name": "masks", "pos": [-1110, -3285]},
-          {"id": "8f622e40-8528-4078-b7d3-147e9f872194", "name": "bboxes", "type": "BOUNDING_BOX", "linkIds": [232], "localized_name": "bboxes", "pos": [-1110, -3265]}
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 75,
-            "type": "SAM3_Detect",
-            "pos": [-1470, -3460],
-            "size": [270, 260],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [
-              {"label": "model", "localized_name": "model", "name": "model", "type": "MODEL", "link": 237},
-              {"label": "image", "localized_name": "image", "name": "image", "type": "IMAGE", "link": 264},
-              {"label": "conditioning", "localized_name": "conditioning", "name": "conditioning", "shape": 7, "type": "CONDITIONING", "link": 200},
-              {"label": "bboxes", "localized_name": "bboxes", "name": "bboxes", "shape": 7, "type": "BOUNDING_BOX", "link": 266},
-              {"label": "positive_coords", "localized_name": "positive_coords", "name": "positive_coords", "shape": 7, "type": "STRING", "link": 267},
-              {"label": "negative_coords", "localized_name": "negative_coords", "name": "negative_coords", "shape": 7, "type": "STRING", "link": 268},
-              {"localized_name": "threshold", "name": "threshold", "type": "FLOAT", "widget": {"name": "threshold"}, "link": 269},
-              {"localized_name": "refine_iterations",
```
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": 270
|
||||
},
|
||||
{
|
||||
"localized_name": "individual_masks",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": 271
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
231
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": [
|
||||
232
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "SAM3_Detect",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
0.5,
|
||||
2,
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 77,
|
||||
"type": "CheckpointLoaderSimple",
|
||||
"pos": [
|
||||
-1970,
|
||||
-3200
|
||||
],
|
||||
"size": [
|
||||
330,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "ckpt_name",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": 272
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"links": [
|
||||
237
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "CLIP",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
240
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "VAE",
|
||||
"name": "VAE",
|
||||
"type": "VAE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "CheckpointLoaderSimple",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"models": [
|
||||
{
|
||||
"name": "sam3.1_multiplex_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
|
||||
"directory": "checkpoints"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"sam3.1_multiplex_fp16.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 78,
|
||||
"type": "CLIPTextEncode",
|
||||
"pos": [
|
||||
-2000,
|
||||
-3000
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "clip",
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 240
|
||||
},
|
||||
{
|
||||
"localized_name": "text",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": 265
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CONDITIONING",
|
||||
"name": "CONDITIONING",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
200
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 237,
|
||||
"origin_id": 77,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 0,
|
||||
"type": "MODEL"
|
||||
},
|
||||
{
|
||||
"id": 200,
|
||||
"origin_id": 78,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 2,
|
||||
"type": "CONDITIONING"
|
||||
},
|
||||
{
|
||||
"id": 240,
|
||||
"origin_id": 77,
|
||||
"origin_slot": 1,
|
||||
"target_id": 78,
|
||||
"target_slot": 0,
|
||||
"type": "CLIP"
|
||||
},
|
||||
{
|
||||
"id": 231,
|
||||
"origin_id": 75,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 232,
|
||||
"origin_id": 75,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 265,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 78,
|
||||
"target_slot": 1,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 266,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 75,
|
||||
"target_slot": 3,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 267,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 75,
|
||||
"target_slot": 4,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 268,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 75,
|
||||
"target_slot": 5,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 269,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 75,
|
||||
"target_slot": 6,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 270,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 75,
|
||||
"target_slot": 7,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 271,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 75,
|
||||
"target_slot": 8,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 272,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 77,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Image Segmentation",
|
||||
"description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
@ -141,7 +141,7 @@
},
"revision": 0,
"config": {},
"name": "Image Upscale (Z-image-Turbo)",
"name": "local-Image Upscale(Z-image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@ -1302,8 +1302,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Enhance",
"description": "Upscales images to higher resolution using Z-Image-Turbo."
"category": "Image generation and editing/Enhance"
}
]
},

@ -99,7 +99,7 @@
},
"revision": 0,
"config": {},
"name": "Image to Depth Map (Lotus)",
"name": "local-Image to Depth Map (Lotus)",
"inputNode": {
"id": -10,
"bounding": [
@ -948,8 +948,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Depth to image",
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
"category": "Image generation and editing/Depth to image"
}
]
},
@ -965,4 +964,4 @@
"workflowRendererVersion": "LG"
},
"version": 0.4
}
}

@ -1,14 +1,15 @@
{
"id": "1a761372-7c82-4016-b9bf-fa285967e1e9",
"revision": 0,
"last_node_id": 176,
"last_node_id": 83,
"last_link_id": 0,
"nodes": [
{
"id": 176,
"type": "2d2e3c8e-53b3-4618-be52-6d1d99382f0e",
"id": 83,
"type": "f754a936-daaf-4b6e-9658-41fdc54d301d",
"pos": [
-1150,
200
61.999827823554256,
153.3332507624185
],
"size": [
400,
@ -55,38 +56,6 @@
"name": "layers"
},
"link": null
},
{
"name": "seed",
"type": "INT",
"widget": {
"name": "seed"
},
"link": null
},
{
"name": "unet_name",
"type": "COMBO",
"widget": {
"name": "unet_name"
},
"link": null
},
{
"name": "clip_name",
"type": "COMBO",
"widget": {
"name": "clip_name"
},
"link": null
},
{
"name": "vae_name",
"type": "COMBO",
"widget": {
"name": "vae_name"
},
"link": null
}
],
"outputs": [
@ -97,41 +66,28 @@
"links": []
}
],
"title": "Image to Layers (Qwen-Image-Layered)",
"properties": {
"proxyWidgets": [
[
"6",
"-1",
"text"
],
[
"3",
"-1",
"steps"
],
[
"3",
"-1",
"cfg"
],
[
"83",
"-1",
"layers"
],
[
"3",
"seed"
],
[
"37",
"unet_name"
],
[
"38",
"clip_name"
],
[
"39",
"vae_name"
],
[
"3",
"control_after_generate"
@ -139,11 +95,6 @@
],
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -152,20 +103,25 @@
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": []
"widgets_values": [
"",
20,
2.5,
2
]
}
],
"links": [],
"version": 0.4,
"groups": [],
"definitions": {
"subgraphs": [
{
"id": "2d2e3c8e-53b3-4618-be52-6d1d99382f0e",
"id": "f754a936-daaf-4b6e-9658-41fdc54d301d",
"version": 1,
"state": {
"lastGroupId": 8,
"lastNodeId": 176,
"lastLinkId": 380,
"lastGroupId": 3,
"lastNodeId": 83,
"lastLinkId": 159,
"lastRerouteId": 0
},
"revision": 0,
@ -174,10 +130,10 @@
"inputNode": {
"id": -10,
"bounding": [
-720,
720,
-510,
523,
120,
220
140
]
},
"outputNode": {
@ -200,8 +156,8 @@
],
"localized_name": "image",
"pos": [
-620,
740
-410,
543
]
},
{
@ -212,8 +168,8 @@
150
],
"pos": [
-620,
760
-410,
563
]
},
{
@ -224,8 +180,8 @@
153
],
"pos": [
-620,
780
-410,
583
]
},
{
@ -236,8 +192,8 @@
154
],
"pos": [
-620,
800
-410,
603
]
},
{
@ -248,56 +204,8 @@
159
],
"pos": [
-620,
820
]
},
{
"id": "9f76338b-f4ca-4bb3-b61a-57b3f233061e",
"name": "seed",
"type": "INT",
"linkIds": [
377
],
"pos": [
-620,
840
]
},
{
"id": "8d0422d5-5eee-4f7e-9817-dc613cc62eca",
"name": "unet_name",
"type": "COMBO",
"linkIds": [
378
],
"pos": [
-620,
860
]
},
{
"id": "552eece2-a735-4d00-ae78-ded454622bc1",
"name": "clip_name",
"type": "COMBO",
"linkIds": [
379
],
"pos": [
-620,
880
]
},
{
"id": "1e6d141c-d0f9-4a2b-895c-b6780e57cfa0",
"name": "vae_name",
"type": "COMBO",
"linkIds": [
380
],
"pos": [
-620,
900
-410,
623
]
}
],
@ -323,14 +231,14 @@
"type": "CLIPLoader",
"pos": [
-320,
360
310
],
"size": [
350,
150
346.7470703125,
106
],
"flags": {},
"order": 5,
"order": 0,
"mode": 0,
"inputs": [
{
@ -340,7 +248,7 @@
"widget": {
"name": "clip_name"
},
"link": 379
"link": null
},
{
"localized_name": "type",
@ -375,14 +283,9 @@
}
],
"properties": {
"Node name for S&R": "CLIPLoader",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "CLIPLoader",
"models": [
{
"name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
@ -409,14 +312,14 @@
"type": "VAELoader",
"pos": [
-320,
580
460
],
"size": [
350,
110
346.7470703125,
58
],
"flags": {},
"order": 6,
"order": 1,
"mode": 0,
"inputs": [
{
@ -426,7 +329,7 @@
"widget": {
"name": "vae_name"
},
"link": 380
"link": null
}
],
"outputs": [
@ -442,14 +345,9 @@
}
],
"properties": {
"Node name for S&R": "VAELoader",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "VAELoader",
"models": [
{
"name": "qwen_image_layered_vae.safetensors",
@ -477,11 +375,11 @@
420
],
"size": [
430,
190
425.27801513671875,
180.6060791015625
],
"flags": {},
"order": 2,
"order": 3,
"mode": 0,
"inputs": [
{
@ -513,14 +411,9 @@
],
"title": "CLIP Text Encode (Negative Prompt)",
"properties": {
"Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -539,12 +432,12 @@
"id": 70,
"type": "ReferenceLatent",
"pos": [
140,
700
330,
670
],
"size": [
210,
50
204.1666717529297,
46
],
"flags": {
"collapsed": true
@ -577,14 +470,9 @@
}
],
"properties": {
"Node name for S&R": "ReferenceLatent",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "ReferenceLatent",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -592,18 +480,19 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 69,
"type": "ReferenceLatent",
"pos": [
160,
820
330,
710
],
"size": [
210,
50
204.1666717529297,
46
],
"flags": {
"collapsed": true
@ -636,14 +525,9 @@
}
],
"properties": {
"Node name for S&R": "ReferenceLatent",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "ReferenceLatent",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -651,7 +535,8 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 66,
@ -662,10 +547,10 @@
],
"size": [
270,
110
58
],
"flags": {},
"order": 7,
"order": 4,
"mode": 0,
"inputs": [
{
@ -695,14 +580,9 @@
}
],
"properties": {
"Node name for S&R": "ModelSamplingAuraFlow",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "ModelSamplingAuraFlow",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -720,11 +600,11 @@
"type": "LatentCutToBatch",
"pos": [
830,
140
160
],
"size": [
270,
140
82
],
"flags": {},
"order": 11,
@ -766,14 +646,9 @@
}
],
"properties": {
"Node name for S&R": "LatentCutToBatch",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "LatentCutToBatch",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -791,12 +666,12 @@
"id": 71,
"type": "VAEEncode",
"pos": [
-280,
780
100,
690
],
"size": [
230,
100
140,
46
],
"flags": {
"collapsed": false
@ -829,14 +704,9 @@
}
],
"properties": {
"Node name for S&R": "VAEEncode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "VAEEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -844,23 +714,24 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 8,
"type": "VAEDecode",
"pos": [
850,
370
310
],
"size": [
210,
50
46
],
"flags": {
"collapsed": true
},
"order": 3,
"order": 7,
"mode": 0,
"inputs": [
{
@ -888,14 +759,9 @@
}
],
"properties": {
"Node name for S&R": "VAEDecode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "VAEDecode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -903,7 +769,8 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 6,
@ -913,11 +780,11 @@
180
],
"size": [
430,
170
422.84503173828125,
164.31304931640625
],
"flags": {},
"order": 1,
"order": 6,
"mode": 0,
"inputs": [
{
@ -949,14 +816,9 @@
],
"title": "CLIP Text Encode (Positive Prompt)",
"properties": {
"Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -976,14 +838,14 @@
"type": "KSampler",
"pos": [
530,
340
280
],
"size": [
270,
400
],
"flags": {},
"order": 0,
"order": 5,
"mode": 0,
"inputs": [
{
@ -1017,7 +879,7 @@
"widget": {
"name": "seed"
},
"link": 377
"link": null
},
{
"localized_name": "steps",
@ -1077,14 +939,9 @@
}
],
"properties": {
"Node name for S&R": "KSampler",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "KSampler",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -1107,12 +964,12 @@
"id": 78,
"type": "GetImageSize",
"pos": [
-280,
930
80,
790
],
"size": [
230,
140
210,
136
],
"flags": {},
"order": 12,
@ -1150,14 +1007,9 @@
}
],
"properties": {
"Node name for S&R": "GetImageSize",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "GetImageSize",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -1165,23 +1017,23 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 83,
"type": "EmptyQwenImageLayeredLatentImage",
"pos": [
-280,
1120
320,
790
],
"size": [
340,
200
330.9341796875,
130
],
"flags": {},
"order": 13,
"mode": 0,
"showAdvanced": true,
"inputs": [
{
"localized_name": "width",
@ -1231,14 +1083,9 @@
}
],
"properties": {
"Node name for S&R": "EmptyQwenImageLayeredLatentImage",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "EmptyQwenImageLayeredLatentImage",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -1262,11 +1109,11 @@
180
],
"size": [
350,
110
346.7470703125,
82
],
"flags": {},
"order": 4,
"order": 2,
"mode": 0,
"inputs": [
{
@ -1276,7 +1123,7 @@
"widget": {
"name": "unet_name"
},
"link": 378
"link": null
},
{
"localized_name": "weight_dtype",
@ -1300,14 +1147,9 @@
}
],
"properties": {
"Node name for S&R": "UNETLoader",
"cnr_id": "comfy-core",
"ver": "0.5.1",
"ue_properties": {
"widget_ue_connectable": {},
"input_ue_unconnectable": {},
"version": "7.7"
},
"Node name for S&R": "UNETLoader",
"models": [
{
"name": "qwen_image_layered_bf16.safetensors",
@ -1349,8 +1191,8 @@
"bounding": [
-330,
110,
370,
610
366.7470703125,
421.6
],
"color": "#3f789e",
"font_size": 24,
@ -1549,48 +1391,16 @@
"target_id": 83,
"target_slot": 2,
"type": "INT"
},
{
"id": 377,
"origin_id": -10,
"origin_slot": 5,
"target_id": 3,
"target_slot": 4,
"type": "INT"
},
{
"id": 378,
"origin_id": -10,
"origin_slot": 6,
"target_id": 37,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 379,
"origin_id": -10,
"origin_slot": 7,
"target_id": 38,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 380,
"origin_id": -10,
"origin_slot": 8,
"target_id": 39,
"target_slot": 0,
"type": "COMBO"
}
],
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Image to layers",
"description": "Decomposes an image into variable-resolution RGBA layers for independent editing using Qwen-Image-Layered."
"category": "Image generation and editing/Image to layers"
}
]
},
"config": {},
"extra": {
"ds": {
"scale": 1.14,
@ -1599,6 +1409,7 @@
6.855893974423647
]
},
"ue_links": []
}
}
"workflowRendererVersion": "LG"
},
"version": 0.4
}
@ -72,7 +72,7 @@
},
"revision": 0,
"config": {},
"name": "Image to 3D Model (Hunyuan3d 2.1)",
"name": "local-Image to Model (Hunyuan3d 2.1)",
"inputNode": {
"id": -10,
"bounding": [
@ -765,8 +765,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "3D/Image to 3D Model",
"description": "Generates 3D mesh models from a single input image using Hunyuan3D 2.0/2.1."
"category": "3D/Image to 3D Model"
}
]
},

File diff suppressed because it is too large
@ -206,7 +206,7 @@
},
"revision": 0,
"config": {},
"name": "Image to Video (Wan 2.2)",
"name": "local-Image to Video (Wan 2.2)",
"inputNode": {
"id": -10,
"bounding": [
@ -2027,8 +2027,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Image to video",
"description": "Image-to-video with Wan 2.2 using a start image plus text prompt to extend motion from the still frame."
"category": "Video generation and editing/Image to video"
}
]
},

@ -134,7 +134,7 @@
},
"revision": 0,
"config": {},
"name": "Pose to Image (Z-Image-Turbo)",
"name": "local-Pose to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@ -1298,8 +1298,7 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
"category": "Image generation and editing/Pose to image",
"description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning."
"category": "Image generation and editing/Pose to image"
}
]
},
@ -1320,4 +1319,4 @@
}
},
"version": 0.4
}
}

File diff suppressed because it is too large
@ -270,10 +270,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Text generation/Prompt enhance",
"description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality."
"category": "Text generation/Prompt enhance"
}
]
},
"extra": {}
}
}

@ -1,397 +0,0 @@
{
"revision": 0,
"last_node_id": 19,
"last_link_id": 0,
"nodes": [
{
"id": 19,
"type": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
"pos": [
-6411.330578108367,
1940.2638932730042
],
"size": [
349.609375,
145.9375
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": null
},
{
"name": "bg_removal_name",
"type": "COMBO",
"widget": {
"name": "bg_removal_name"
},
"link": null
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": []
},
{
"name": "mask",
"type": "MASK",
"links": []
}
],
"properties": {
"proxyWidgets": [
[
"14",
"bg_removal_name"
]
]
},
"widgets_values": [],
"title": "Remove Background (BiRefNet)"
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
"version": 1,
"state": {
"lastGroupId": 0,
"lastNodeId": 21,
"lastLinkId": 16,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Remove Background (BiRefNet)",
"description": "Removes or replaces image backgrounds using BiRefNet segmentation and alpha compositing.",
"inputNode": {
"id": -10,
"bounding": [
-6728.534070722246,
1475.2619799128663,
150.9140625,
88
]
},
"outputNode": {
"id": -20,
"bounding": [
-6169.049695722246,
1475.2619799128663,
128,
88
]
},
"inputs": [
{
"id": "7bc321cd-df31-4c39-aaf7-7f0d01326189",
"name": "image",
"type": "IMAGE",
"linkIds": [
5,
7
],
"localized_name": "image",
"pos": [
-6601.620008222246,
1499.2619799128663
]
},
{
"id": "e89d2cd8-daa3-4e29-8a69-851db85072cb",
"name": "bg_removal_name",
"type": "COMBO",
"linkIds": [
12
],
"pos": [
-6601.620008222246,
1519.2619799128663
]
}
],
"outputs": [
{
"id": "16e7863c-4c38-46c2-aa74-e82991fbfe8d",
"name": "IMAGE",
"type": "IMAGE",
"linkIds": [
8
],
"localized_name": "IMAGE",
"pos": [
-6145.049695722246,
1499.2619799128663
]
},
{
"id": "f7240c19-5b80-406e-a8e2-9b12440ee2d6",
"name": "mask",
"type": "MASK",
"linkIds": [
11
],
"pos": [
-6145.049695722246,
1519.2619799128663
]
}
],
"widgets": [],
"nodes": [
{
"id": 13,
"type": "RemoveBackground",
"pos": [
-6536.764823982709,
1444.9963409012412
],
"size": [
302.25,
72
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 5
},
{
"localized_name": "bg_removal_model",
"name": "bg_removal_model",
"type": "BACKGROUND_REMOVAL",
"link": 3
}
],
"outputs": [
{
"localized_name": "mask",
"name": "mask",
"type": "MASK",
"links": [
4,
11
]
}
],
"properties": {
"Node name for S&R": "RemoveBackground"
}
},
{
"id": 14,
"type": "LoadBackgroundRemovalModel",
"pos": [
-6540.534070722246,
1302.223464635445
],
"size": [
311.484375,
85.515625
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "bg_removal_name",
"name": "bg_removal_name",
"type": "COMBO",
"widget": {
"name": "bg_removal_name"
},
"link": 12
}
],
"outputs": [
{
"localized_name": "bg_model",
"name": "bg_model",
"type": "BACKGROUND_REMOVAL",
"links": [
3
]
}
],
"properties": {
"Node name for S&R": "LoadBackgroundRemovalModel",
"models": [
{
"name": "birefnet.safetensors",
"url": "https://huggingface.co/Comfy-Org/BiRefNet/resolve/main/background_removal/birefnet.safetensors",
"directory": "background_removal"
}
]
},
"widgets_values": [
"birefnet.safetensors"
]
},
{
"id": 15,
"type": "InvertMask",
"pos": [
-6532.446160529669,
1571.1111286839914
],
"size": [
285.984375,
48
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "mask",
"name": "mask",
"type": "MASK",
"link": 4
}
],
"outputs": [
{
"localized_name": "MASK",
"name": "MASK",
"type": "MASK",
"links": [
6
]
}
],
"properties": {
"Node name for S&R": "InvertMask"
}
},
{
"id": 16,
"type": "JoinImageWithAlpha",
"pos": [
-6527.4370171636665,
1674.3004951902876
],
"size": [
284.96875,
72
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 7
},
{
"localized_name": "alpha",
"name": "alpha",
"type": "MASK",
"link": 6
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": [
8
]
}
],
"properties": {
"Node name for S&R": "JoinImageWithAlpha"
}
}
],
"groups": [],
"links": [
{
"id": 3,
"origin_id": 14,
"origin_slot": 0,
"target_id": 13,
"target_slot": 1,
"type": "BACKGROUND_REMOVAL"
},
{
"id": 4,
"origin_id": 13,
"origin_slot": 0,
"target_id": 15,
"target_slot": 0,
"type": "MASK"
},
{
"id": 6,
"origin_id": 15,
"origin_slot": 0,
"target_id": 16,
"target_slot": 1,
"type": "MASK"
},
{
"id": 5,
"origin_id": -10,
"origin_slot": 0,
"target_id": 13,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 7,
"origin_id": -10,
"origin_slot": 0,
"target_id": 16,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 8,
"origin_id": 16,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 11,
"origin_id": 13,
"origin_slot": 0,
"target_id": -20,
"target_slot": 1,
"type": "MASK"
},
{
"id": 12,
"origin_id": -10,
"origin_slot": 1,
"target_id": 14,
"target_slot": 0,
"type": "COMBO"
}
],
"extra": {},
"category": "Image generation and editing/Background Removal"
}
]
},
"extra": {}
}
@ -302,9 +302,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Sharpen",
"description": "Sharpens image details using a GPU fragment shader for enhanced clarity."
"category": "Image Tools/Sharpen"
}
]
}
}
}

@ -222,7 +222,7 @@
},
"revision": 0,
"config": {},
"name": "Text to Audio (ACE-Step 1.5)",
"name": "local-Text to Audio (ACE-Step 1.5)",
"inputNode": {
"id": -10,
"bounding": [
@ -1502,8 +1502,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Audio/Music generation",
"description": "Generates audio/music from text prompts using ACE-Step 1.5, a diffusion-based audio generation model."
"category": "Audio/Music generation"
}
]
},
@ -1519,4 +1518,4 @@
}
},
"version": 0.4
}
}

File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,21 +1,22 @@
{
"id": "1c3eaa76-5cfa-4dc7-8571-97a570324e01",
"revision": 0,
"last_node_id": 57,
"last_link_id": 0,
"last_node_id": 34,
"last_link_id": 40,
"nodes": [
{
"id": 57,
"type": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
"id": 5,
"type": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
"pos": [
130,
200
-2.5766491043910378e-05,
1229.999928629805
],
"size": [
400,
470
],
"flags": {},
"order": 1,
"order": 0,
"mode": 0,
"inputs": [
{
@ -43,22 +44,6 @@
},
"link": null
},
{
"name": "seed",
"type": "INT",
"widget": {
"name": "seed"
},
"link": null
},
{
"name": "steps",
"type": "INT",
"widget": {
"name": "steps"
},
"link": null
},
{
"name": "unet_name",
"type": "COMBO",
@ -95,15 +80,15 @@
"properties": {
"proxyWidgets": [
[
"27",
"-1",
"text"
],
[
"13",
"-1",
"width"
],
[
"13",
"-1",
"height"
],
[
@ -112,23 +97,19 @@
],
[
"3",
"steps"
"control_after_generate"
],
[
"28",
"-1",
"unet_name"
],
[
"30",
"-1",
"clip_name"
],
[
"29",
"-1",
"vae_name"
],
[
"3",
"control_after_generate"
]
],
"cnr_id": "comfy-core",
@ -141,40 +122,48 @@
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [],
"title": "Text to Image (Z-Image-Turbo)"
"widgets_values": [
"",
1024,
1024,
null,
null,
"z_image_turbo_bf16.safetensors",
"qwen_3_4b.safetensors",
"ae.safetensors"
]
}
],
"links": [],
"version": 0.4,
"groups": [],
"definitions": {
"subgraphs": [
{
"id": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
"id": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
"version": 1,
"state": {
"lastGroupId": 4,
"lastNodeId": 61,
"lastLinkId": 75,
"lastNodeId": 34,
"lastLinkId": 40,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
"name": "Text to Image (Z-Image-Turbo)",
"name": "local-Text to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
-560,
480,
-80,
425,
120,
200
160
]
},
"outputNode": {
"id": -20,
"bounding": [
1670,
320,
1490,
415,
120,
60
]
@ -189,8 +178,8 @@
],
"label": "prompt",
"pos": [
-460,
500
20,
445
]
},
{
@ -201,8 +190,8 @@
35
],
"pos": [
-460,
520
20,
465
]
},
{
@ -213,68 +202,44 @@
36
],
"pos": [
-460,
540
20,
485
]
},
{
"id": "f77677f7-6bf6-4c19-a71f-c4a553d5981e",
"name": "seed",
"type": "INT",
"linkIds": [
71
],
"pos": [
-460,
560
]
},
{
"id": "ef9a9fb1-5983-4bc9-a60b-cf5aec48bff1",
"name": "steps",
"type": "INT",
"linkIds": [
72
],
"pos": [
-460,
580
]
},
{
"id": "a20a1b30-785f-4a04-bb6d-3d61adab9764",
"id": "23087d15-8412-4fbd-b71e-9b6d7ef76de1",
"name": "unet_name",
"type": "COMBO",
"linkIds": [
73
38
],
"pos": [
-460,
600
20,
505
]
},
{
"id": "4af8fc2b-4655-4086-8240-45f8cb38c6f6",
"id": "0677f5c3-2a3f-43d4-98ac-a4c56d5efdc0",
"name": "clip_name",
"type": "COMBO",
"linkIds": [
74
39
],
"pos": [
-460,
620
20,
525
]
},
{
"id": "4d518693-2807-439c-9cb6-cffd23ccba2c",
"id": "c85c0445-2641-48b1-bbca-95057edf2fcf",
"name": "vae_name",
"type": "COMBO",
"linkIds": [
75
40
],
"pos": [
-460,
640
20,
545
]
}
],
@ -288,8 +253,8 @@
],
"localized_name": "IMAGE",
"pos": [
1690,
340
1510,
435
]
}
],
@ -299,15 +264,15 @@
"id": 30,
"type": "CLIPLoader",
"pos": [
30,
420
109.99997264844609,
329.99999029608756
],
"size": [
270,
150
269.9869791666667,
106
],
"flags": {},
"order": 7,
"order": 0,
"mode": 0,
"inputs": [
{
@ -317,7 +282,7 @@
"widget": {
"name": "clip_name"
},
"link": 74
"link": 39
},
{
"localized_name": "type",
@ -350,9 +315,9 @@
}
],
"properties": {
"Node name for S&R": "CLIPLoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "CLIPLoader",
"models": [
{
"name": "qwen_3_4b.safetensors",
@ -378,15 +343,15 @@
"id": 29,
"type": "VAELoader",
"pos": [
30,
650
109.99997264844609,
479.9999847172637
],
"size": [
270,
110
269.9869791666667,
58
],
"flags": {},
"order": 6,
"order": 1,
"mode": 0,
"inputs": [
{
@ -396,7 +361,7 @@
"widget": {
"name": "vae_name"
},
"link": 75
"link": 40
}
],
"outputs": [
@ -410,9 +375,9 @@
}
],
"properties": {
"Node name for S&R": "VAELoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "VAELoader",
"models": [
{
"name": "ae.safetensors",
@ -436,12 +401,12 @@
"id": 33,
"type": "ConditioningZeroOut",
"pos": [
630,
960
639.9999103333332,
620.0000271257795
],
"size": [
230,
80
204.134765625,
26
],
"flags": {},
"order": 8,
@ -465,9 +430,9 @@
}
],
"properties": {
"Node name for S&R": "ConditioningZeroOut",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "ConditioningZeroOut",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -475,21 +440,22 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 8,
"type": "VAEDecode",
"pos": [
1320,
230
1219.9999088104782,
160.00009184959066
],
"size": [
230,
100
209.98697916666669,
46
],
"flags": {},
"order": 1,
"order": 5,
"mode": 0,
"inputs": [
{
@ -517,9 +483,9 @@
}
],
"properties": {
"Node name for S&R": "VAEDecode",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "VAEDecode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -527,21 +493,22 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
}
},
"widgets_values": []
},
{
"id": 28,
"type": "UNETLoader",
"pos": [
30,
230
109.99997264844609,
200.0000502647102
],
"size": [
270,
110
269.9869791666667,
82
],
"flags": {},
"order": 5,
"order": 2,
"mode": 0,
"inputs": [
{
@ -551,7 +518,7 @@
"widget": {
"name": "unet_name"
},
"link": 73
"link": 38
},
{
"localized_name": "weight_dtype",
@ -574,9 +541,9 @@
}
],
"properties": {
"Node name for S&R": "UNETLoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "UNETLoader",
"models": [
{
"name": "z_image_turbo_bf16.safetensors",
@ -601,15 +568,15 @@
"id": 27,
"type": "CLIPTextEncode",
"pos": [
400,
230
429.99997828947767,
200.0000502647102
],
"size": [
450,
650
409.9869791666667,
319.9869791666667
],
"flags": {},
"order": 4,
"order": 7,
"mode": 0,
"inputs": [
{
@ -640,9 +607,9 @@
}
],
"properties": {
"Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.3.73",
"Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -659,15 +626,15 @@
"id": 13,
"type": "EmptySD3LatentImage",
"pos": [
40,
890
109.99997264844609,
629.9999791384399
],
"size": [
260,
170
259.9869791666667,
106
],
"flags": {},
"order": 3,
"order": 6,
"mode": 0,
"inputs": [
{
@ -710,9 +677,9 @@
}
],
"properties": {
"Node name for S&R": "EmptySD3LatentImage",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "EmptySD3LatentImage",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -727,77 +694,19 @@
1
]
},
{
"id": 11,
"type": "ModelSamplingAuraFlow",
"pos": [
950,
230
],
"size": [
310,
110
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "model",
"name": "model",
"type": "MODEL",
"link": 26
},
{
"localized_name": "shift",
"name": "shift",
"type": "FLOAT",
"widget": {
"name": "shift"
},
"link": null
}
],
"outputs": [
{
"localized_name": "MODEL",
"name": "MODEL",
"type": "MODEL",
"slot_index": 0,
"links": [
13
]
}
],
"properties": {
"Node name for S&R": "ModelSamplingAuraFlow",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
3
]
},
{
"id": 3,
"type": "KSampler",
"pos": [
950,
400
879.9999615530063,
269.9999774911694
],
"size": [
320,
350
314.9869791666667,
262
],
"flags": {},
"order": 0,
"order": 4,
"mode": 0,
"inputs": [
{
@ -831,7 +740,7 @@
"widget": {
"name": "seed"
},
"link": 71
"link": null
},
{
"localized_name": "steps",
@ -840,7 +749,7 @@
"widget": {
"name": "steps"
},
"link": 72
"link": null
},
{
"localized_name": "cfg",
@ -891,9 +800,9 @@
}
],
"properties": {
"Node name for S&R": "KSampler",
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "KSampler",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@ -905,23 +814,81 @@
"widgets_values": [
0,
"randomize",
8,
4,
1,
"res_multistep",
"simple",
1
]
},
{
"id": 11,
"type": "ModelSamplingAuraFlow",
"pos": [
879.9999615530063,
160.00009184959066
],
"size": [
309.9869791666667,
58
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "model",
"name": "model",
"type": "MODEL",
"link": 26
},
{
"localized_name": "shift",
"name": "shift",
"type": "FLOAT",
"widget": {
"name": "shift"
},
"link": null
}
],
"outputs": [
{
"localized_name": "MODEL",
"name": "MODEL",
"type": "MODEL",
"slot_index": 0,
"links": [
13
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.64",
"Node name for S&R": "ModelSamplingAuraFlow",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
},
"widgets_values": [
3
]
}
],
"groups": [
{
"id": 2,
"title": "Step2 - Image size",
"title": "Image size",
"bounding": [
10,
820,
320,
280
100,
560,
290,
200
],
"color": "#3f789e",
"font_size": 24,
@ -929,12 +896,12 @@
},
{
"id": 3,
"title": "Step3 - Prompt",
"title": "Prompt",
"bounding": [
360,
410,
130,
530,
970
450,
540
],
"color": "#3f789e",
"font_size": 24,
@ -942,12 +909,12 @@
},
{
"id": 4,
"title": "Step1 - Load models",
"title": "Models",
"bounding": [
0,
100,
130,
330,
660
290,
413.6
],
"color": "#3f789e",
"font_size": 24,
@ -1060,41 +1027,25 @@
"type": "INT"
},
{
"id": 71,
"id": 38,
"origin_id": -10,
"origin_slot": 3,
"target_id": 3,
"target_slot": 4,
"type": "INT"
},
{
"id": 72,
"origin_id": -10,
"origin_slot": 4,
"target_id": 3,
"target_slot": 5,
"type": "INT"
},
{
"id": 73,
"origin_id": -10,
"origin_slot": 5,
"target_id": 28,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 74,
"id": 39,
"origin_id": -10,
"origin_slot": 6,
"origin_slot": 4,
"target_id": 30,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 75,
"id": 40,
"origin_id": -10,
"origin_slot": 7,
"origin_slot": 5,
"target_id": 29,
"target_slot": 0,
"type": "COMBO"
@ -1103,10 +1054,25 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image generation and editing/Text to image",
"description": "Generates images from text prompts using Z-Image-Turbo, Alibaba's distilled 6B DiT model."
"category": "Image generation and editing/Text to image"
}
]
},
"extra": {}
}
"config": {},
"extra": {
"frontendVersion": "1.37.10",
"workflowRendererVersion": "LG",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true,
"ds": {
"scale": 0.8401370345180755,
"offset": [
940.0587067393087,
-830.7121087564725
]
}
},
"version": 0.4
}

File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1572,8 +1572,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Text to video",
"description": "Generates video from text prompts using Wan2.2, Alibaba's diffusion video model."
"category": "Video generation and editing/Text to video"
}
]
},
@ -1587,4 +1586,4 @@
"VHS_KeepIntermediate": true
},
"version": 0.4
}
}

@ -434,9 +434,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Image Tools/Sharpen",
"description": "Enhances edge contrast via unsharp masking for a sharper image appearance."
"category": "Image Tools/Sharpen"
}
]
}
}
}

@ -307,8 +307,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Text generation/Video Captioning",
"description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM."
"category": "Text generation/Video Captioning"
}
]
}

@ -165,7 +165,7 @@
},
"revision": 0,
"config": {},
"name": "Video Inpaint (Wan 2.1 VACE)",
"name": "local-Video Inpaint(Wan2.1 VACE)",
"inputNode": {
"id": -10,
"bounding": [
@ -2368,8 +2368,7 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Inpaint video",
"description": "Inpaints masked regions in video frames using Wan 2.1 VACE."
"category": "Video generation and editing/Inpaint video"
}
]
},

@ -1,827 +0,0 @@
{
  "revision": 0,
  "last_node_id": 130,
  "last_link_id": 0,
  "nodes": [
    {
      "id": 130,
      "type": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
      "pos": [-1210, -2780],
      "size": [300, 370],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {"name": "video", "type": "VIDEO", "link": null},
        {"name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null},
        {"name": "bboxes", "type": "BOUNDING_BOX", "link": null},
        {"name": "positive_coords", "type": "STRING", "link": null},
        {"name": "negative_coords", "type": "STRING", "link": null},
        {"name": "threshold", "type": "FLOAT", "widget": {"name": "threshold"}, "link": null},
        {"name": "refine_iterations", "type": "INT", "widget": {"name": "refine_iterations"}, "link": null},
        {"name": "individual_masks", "type": "BOOLEAN", "widget": {"name": "individual_masks"}, "link": null},
        {"name": "ckpt_name", "type": "COMBO", "widget": {"name": "ckpt_name"}, "link": null}
      ],
      "outputs": [
        {"localized_name": "masks", "name": "masks", "type": "MASK", "links": []},
        {"localized_name": "bboxes", "name": "bboxes", "type": "BOUNDING_BOX", "links": []},
        {"name": "audio", "type": "AUDIO", "links": null},
        {"name": "fps", "type": "FLOAT", "links": null}
      ],
      "properties": {
        "proxyWidgets": [["125", "text"], ["126", "threshold"], ["126", "refine_iterations"], ["126", "individual_masks"], ["127", "ckpt_name"]],
        "cnr_id": "comfy-core",
        "ver": "0.19.3",
        "enableTabs": false,
        "tabWidth": 65,
        "tabXOffset": 10,
        "hasSecondTab": false,
        "secondTabText": "Send Back",
        "secondTabOffset": 80,
        "secondTabWidth": 65
      },
      "widgets_values": [],
      "title": "Video Segmentation (SAM3)"
    }
  ],
  "links": [],
  "version": 0.4,
  "definitions": {
    "subgraphs": [
      {
        "id": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
        "version": 1,
        "state": {"lastGroupId": 0, "lastNodeId": 130, "lastLinkId": 299, "lastRerouteId": 0},
        "revision": 0,
        "config": {},
        "name": "Video Segmentation (SAM3)",
        "inputNode": {"id": -10, "bounding": [-2260, -3450, 136.369140625, 220]},
        "outputNode": {"id": -20, "bounding": [-1050, -3510, 120, 120]},
        "inputs": [
          {"id": "680ffd88-32fe-48be-88d6-91ea44d5eaee", "name": "video", "type": "VIDEO", "linkIds": [252], "pos": [-2143.630859375, -3430]},
          {"id": "ceaf249c-32d7-4624-8bf6-e590e347ed90", "name": "text", "type": "STRING", "linkIds": [254], "pos": [-2143.630859375, -3410]},
          {"id": "1ffbff36-da0c-4854-8cb4-88ad31e64f99", "name": "bboxes", "type": "BOUNDING_BOX", "linkIds": [255], "pos": [-2143.630859375, -3390]},
          {"id": "67b7f4c7-cec0-4e00-b154-23cc1abf880e", "name": "positive_coords", "type": "STRING", "linkIds": [256], "pos": [-2143.630859375, -3370]},
          {"id": "b090a498-2bde-46b9-9554-18501401d687", "name": "negative_coords", "type": "STRING", "linkIds": [257], "pos": [-2143.630859375, -3350]},
          {"id": "1a76dfcf-ce95-46af-bba5-c42160c683dd", "name": "threshold", "type": "FLOAT", "linkIds": [261], "pos": [-2143.630859375, -3330]},
          {"id": "999523fa-c476-4c53-80c3-0a2f554d18ab", "name": "refine_iterations", "type": "INT", "linkIds": [262], "pos": [-2143.630859375, -3310]},
          {"id": "d2371011-7fe5-4a39-b0c1-df2e0bbd6ece", "name": "individual_masks", "type": "BOOLEAN", "linkIds": [263], "pos": [-2143.630859375, -3290]},
          {"id": "675a8b37-17db-48d1-853c-2fe5d6a74582", "name": "ckpt_name", "type": "COMBO", "linkIds": [273], "pos": [-2143.630859375, -3270]}
        ],
        "outputs": [
          {"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913", "name": "masks", "type": "MASK", "linkIds": [231], "localized_name": "masks", "pos": [-1030, -3490]},
          {"id": "8f622e40-8528-4078-b7d3-147e9f872194", "name": "bboxes", "type": "BOUNDING_BOX", "linkIds": [232], "localized_name": "bboxes", "pos": [-1030, -3470]},
          {"id": "6c9924ec-f0fa-4509-83ea-8f97f5889bcc", "name": "audio", "type": "AUDIO", "linkIds": [259], "pos": [-1030, -3450]},
          {"id": "82c1cddc-ab11-44eb-9e2f-1a5c7ea5645b", "name": "fps", "type": "FLOAT", "linkIds": [260], "pos": [-1030, -3430]}
        ],
        "widgets": [],
        "nodes": [
          {
            "id": 125,
            "type": "CLIPTextEncode",
            "pos": [-2010, -3040],
            "size": [400, 200],
            "flags": {},
            "order": 1,
            "mode": 0,
            "inputs": [
              {"localized_name": "clip", "name": "clip", "type": "CLIP", "link": 240},
              {"localized_name": "text", "name": "text", "type": "STRING", "widget": {"name": "text"}, "link": 254}
            ],
            "outputs": [
              {"localized_name": "CONDITIONING", "name": "CONDITIONING", "type": "CONDITIONING", "links": [200]}
            ],
            "properties": {
              "Node name for S&R": "CLIPTextEncode",
              "cnr_id": "comfy-core",
              "ver": "0.19.3",
              "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false,
              "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65
            },
            "widgets_values": [""]
          },
          {
            "id": 126,
            "type": "SAM3_Detect",
            "pos": [-1520, -3520],
            "size": [270, 290],
            "flags": {},
            "order": 2,
            "mode": 0,
            "inputs": [
              {"label": "model", "localized_name": "model", "name": "model", "type": "MODEL", "link": 237},
              {"label": "image", "localized_name": "image", "name": "image", "type": "IMAGE", "link": 253},
              {"label": "conditioning", "localized_name": "conditioning", "name": "conditioning", "shape": 7, "type": "CONDITIONING", "link": 200},
              {"label": "bboxes", "localized_name": "bboxes", "name": "bboxes", "shape": 7, "type": "BOUNDING_BOX", "link": 255},
              {"label": "positive_coords", "localized_name": "positive_coords", "name": "positive_coords", "shape": 7, "type": "STRING", "link": 256},
              {"label": "negative_coords", "localized_name": "negative_coords", "name": "negative_coords", "shape": 7, "type": "STRING", "link": 257},
              {"localized_name": "threshold", "name": "threshold", "type": "FLOAT", "widget": {"name": "threshold"}, "link": 261},
              {"localized_name": "refine_iterations", "name": "refine_iterations", "type": "INT", "widget": {"name": "refine_iterations"}, "link": 262},
              {"localized_name": "individual_masks", "name": "individual_masks", "type": "BOOLEAN", "widget": {"name": "individual_masks"}, "link": 263}
            ],
            "outputs": [
              {"localized_name": "masks", "name": "masks", "type": "MASK", "links": [231]},
              {"localized_name": "bboxes", "name": "bboxes", "type": "BOUNDING_BOX", "links": [232]}
            ],
            "properties": {
              "Node name for S&R": "SAM3_Detect",
              "cnr_id": "comfy-core",
              "ver": "0.19.3",
              "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false,
              "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65
            },
            "widgets_values": [0.5, 2, false]
          },
          {
            "id": 127,
            "type": "CheckpointLoaderSimple",
            "pos": [-1970, -3310],
            "size": [330, 160],
            "flags": {},
            "order": 3,
            "mode": 0,
            "inputs": [
              {"localized_name": "ckpt_name", "name": "ckpt_name", "type": "COMBO", "widget": {"name": "ckpt_name"}, "link": 273}
            ],
            "outputs": [
              {"localized_name": "MODEL", "name": "MODEL", "type": "MODEL", "links": [237]},
              {"localized_name": "CLIP", "name": "CLIP", "type": "CLIP", "links": [240]},
              {"localized_name": "VAE", "name": "VAE", "type": "VAE", "links": null}
            ],
            "properties": {
              "Node name for S&R": "CheckpointLoaderSimple",
              "cnr_id": "comfy-core",
              "ver": "0.19.3",
              "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false,
              "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65,
              "models": [
                {"name": "sam3.1_multiplex_fp16.safetensors", "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors", "directory": "checkpoints"}
              ]
            },
            "widgets_values": ["sam3.1_multiplex_fp16.safetensors"]
          },
          {
            "id": 128,
            "type": "GetVideoComponents",
            "pos": [-1910, -3540],
            "size": [230, 120],
            "flags": {},
            "order": 4,
            "mode": 0,
            "inputs": [
              {"localized_name": "video", "name": "video", "type": "VIDEO", "link": 252}
            ],
            "outputs": [
              {"localized_name": "images", "name": "images", "type": "IMAGE", "links": [253]},
              {"localized_name": "audio", "name": "audio", "type": "AUDIO", "links": [259]},
              {"localized_name": "fps", "name": "fps", "type": "FLOAT", "links": [260]}
            ],
            "properties": {
              "Node name for S&R": "GetVideoComponents",
              "cnr_id": "comfy-core",
              "ver": "0.19.3",
              "enableTabs": false, "tabWidth": 65, "tabXOffset": 10, "hasSecondTab": false,
              "secondTabText": "Send Back", "secondTabOffset": 80, "secondTabWidth": 65
            }
          },
          {
            "id": 129,
            "type": "Note",
            "pos": [-1980, -2790],
            "size": [370, 250],
            "flags": {},
            "order": 0,
            "mode": 0,
            "inputs": [],
            "outputs": [],
            "title": "Note: Prompt format",
            "properties": {},
            "widgets_values": [
              "Max tokens for this model is only 32, to separately prompt multiple subjects you can separate prompts with comma, and set the max amount of objects detected for each prompt with :N\n\nFor example above test prompt finds 2 cakes, one apron, 4 window panels"
            ],
            "color": "#432",
            "bgcolor": "#653"
          }
        ],
        "groups": [],
        "links": [
          {"id": 237, "origin_id": 127, "origin_slot": 0, "target_id": 126, "target_slot": 0, "type": "MODEL"},
          {"id": 200, "origin_id": 125, "origin_slot": 0, "target_id": 126, "target_slot": 2, "type": "CONDITIONING"},
          {"id": 240, "origin_id": 127, "origin_slot": 1, "target_id": 125, "target_slot": 0, "type": "CLIP"},
          {"id": 231, "origin_id": 126, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "MASK"},
          {"id": 232, "origin_id": 126, "origin_slot": 1, "target_id": -20, "target_slot": 1, "type": "BOUNDING_BOX"},
          {"id": 252, "origin_id": -10, "origin_slot": 0, "target_id": 128, "target_slot": 0, "type": "VIDEO"},
          {"id": 253, "origin_id": 128, "origin_slot": 0, "target_id": 126, "target_slot": 1, "type": "IMAGE"},
          {"id": 254, "origin_id": -10, "origin_slot": 1, "target_id": 125, "target_slot": 1, "type": "STRING"},
          {"id": 255, "origin_id": -10, "origin_slot": 2, "target_id": 126, "target_slot": 3, "type": "BOUNDING_BOX"},
          {"id": 256, "origin_id": -10, "origin_slot": 3, "target_id": 126, "target_slot": 4, "type": "STRING"},
          {"id": 257, "origin_id": -10, "origin_slot": 4, "target_id": 126, "target_slot": 5, "type": "STRING"},
          {"id": 259, "origin_id": 128, "origin_slot": 1, "target_id": -20, "target_slot": 2, "type": "AUDIO"},
          {"id": 260, "origin_id": 128, "origin_slot": 2, "target_id": -20, "target_slot": 3, "type": "FLOAT"},
          {"id": 261, "origin_id": -10, "origin_slot": 5, "target_id": 126, "target_slot": 6, "type": "FLOAT"},
          {"id": 262, "origin_id": -10, "origin_slot": 6, "target_id": 126, "target_slot": 7, "type": "INT"},
          {"id": 263, "origin_id": -10, "origin_slot": 7, "target_id": 126, "target_slot": 8, "type": "BOOLEAN"},
          {"id": 273, "origin_id": -10, "origin_slot": 8, "target_id": 127, "target_slot": 0, "type": "COMBO"}
        ],
        "extra": {},
        "category": "Video Tools",
        "description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
      }
    ]
  },
  "extra": {}
}

@ -1,21 +1,21 @@
{
"revision": 0,
"last_node_id": 85,
"last_node_id": 84,
"last_link_id": 0,
"nodes": [
{
"id": 85,
"type": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
"id": 84,
"type": "8e8aa94a-647e-436d-8440-8ee4691864de",
"pos": [
-880, -2260
-6100, 2620
],
"size": [290, 160],
"flags": {},
"order": 2,
"order": 0,
"mode": 0,
"inputs": [
{
@ -76,26 +76,31 @@
"properties": {
"proxyWidgets": [
[
"79",
"-1",
"direction"
],
[
"79",
"-1",
"match_image_size"
],
[
"79",
"-1",
"spacing_width"
],
[
"79",
"-1",
"spacing_color"
]
],
"cnr_id": "comfy-core",
"ver": "0.13.0"
},
"widgets_values": [],
"widgets_values": ["right", true, 0, "white"],
"title": "Video Stitch"
}
],
@ -104,12 +109,12 @@
"definitions": {
"subgraphs": [
{
"id": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
"id": "8e8aa94a-647e-436d-8440-8ee4691864de",
"version": 1,
"state": {
"lastGroupId": 1,
"lastNodeId": 97,
"lastLinkId": 282,
"lastNodeId": 84,
"lastLinkId": 262,
"lastRerouteId": 0
},
"revision": 0,
@ -118,8 +123,8 @@
"inputNode": {
"id": -10,
"bounding": [
-6810, 2580,
-6580, 2649,
143.55859375, 160
]
@ -127,8 +132,8 @@
"outputNode": {
"id": -20,
"bounding": [
-4770, 2600,
-5720, 2659,
120, 60
]
@ -144,8 +149,8 @@
"localized_name": "video",
"label": "Before Video",
"pos": [
-6686.44140625, 2600
-6456.44140625, 2669
]
},
{
@ -158,8 +163,8 @@
"localized_name": "video_1",
"label": "After Video",
"pos": [
-6686.44140625, 2620
-6456.44140625, 2689
]
},
{
@ -170,8 +175,8 @@
259
],
"pos": [
-6686.44140625, 2640
-6456.44140625, 2709
]
},
{
@ -182,8 +187,8 @@
260
],
"pos": [
-6686.44140625, 2660
-6456.44140625, 2729
]
},
{
@ -194,8 +199,8 @@
261
],
"pos": [
-6686.44140625, 2680
-6456.44140625, 2749
]
},
{
@ -206,8 +211,8 @@
262
],
"pos": [
-6686.44140625, 2700
-6456.44140625, 2769
]
}
],
@ -221,8 +226,8 @@
],
"localized_name": "VIDEO",
"pos": [
-4750, 2620
-5700, 2679
]
}
],
@ -233,11 +238,11 @@
"type": "GetVideoComponents",
"pos": [
-6390,
2600
2560
],
"size": [
230, 120
193.530859375, 66
],
"flags": {},
"order": 1,
@ -273,9 +278,9 @@
}
],
"properties": {
"Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
"ver": "0.13.0"
"ver": "0.13.0",
"Node name for S&R": "GetVideoComponents"
}
},
{
@ -286,8 +291,8 @@
2420
],
"size": [
230, 120
193.530859375, 66
],
"flags": {},
"order": 0,
@ -327,254 +332,21 @@
}
],
"properties": {
"Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
"ver": "0.13.0"
"ver": "0.13.0",
"Node name for S&R": "GetVideoComponents"
}
},
{
  "id": 90,
  "type": "GetImageSize",
  "pos": [-6390, 3030],
  "size": [230, 120],
  "flags": {},
  "order": 4,
  "mode": 0,
  "inputs": [
    {"localized_name": "image", "name": "image", "type": "IMAGE", "link": 266}
  ],
  "outputs": [
    {"localized_name": "width", "name": "width", "type": "INT", "links": [274]},
    {"localized_name": "height", "name": "height", "type": "INT", "links": [276]},
    {"localized_name": "batch_size", "name": "batch_size", "type": "INT", "links": null}
  ],
  "properties": {"Node name for S&R": "GetImageSize"}
},
{
  "id": 80,
  "type": "CreateVideo",
  "pos": [-5190, 2420],
  "size": [270, 130],
  "flags": {},
  "order": 3,
  "mode": 0,
  "inputs": [
    {"localized_name": "images", "name": "images", "type": "IMAGE", "link": 282},
    {"localized_name": "audio", "name": "audio", "shape": 7, "type": "AUDIO", "link": 251},
    {"localized_name": "fps", "name": "fps", "type": "FLOAT", "widget": {"name": "fps"}, "link": 252}
  ],
  "outputs": [
    {"localized_name": "VIDEO", "name": "VIDEO", "type": "VIDEO", "links": [255]}
  ],
  "properties": {"Node name for S&R": "CreateVideo", "cnr_id": "comfy-core", "ver": "0.13.0"},
  "widgets_values": [30]
},
{
  "id": 95,
  "type": "ComfyMathExpression",
  "pos": [-6040, 3020],
  "size": [400, 200],
  "flags": {},
  "order": 5,
  "mode": 0,
  "inputs": [
    {"label": "a", "localized_name": "values.a", "name": "values.a", "type": "FLOAT,INT", "link": 274},
    {"label": "b", "localized_name": "values.b", "name": "values.b", "shape": 7, "type": "FLOAT,INT", "link": null},
    {"localized_name": "expression", "name": "expression", "type": "STRING", "widget": {"name": "expression"}, "link": null}
  ],
  "outputs": [
    {"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": null},
    {"localized_name": "INT", "name": "INT", "type": "INT", "links": [279]}
  ],
  "properties": {"Node name for S&R": "ComfyMathExpression"},
  "widgets_values": ["a & ~1"]
},
{
  "id": 96,
  "type": "ComfyMathExpression",
  "pos": [-6040, 3290],
  "size": [400, 200],
  "flags": {},
  "order": 6,
  "mode": 0,
  "inputs": [
    {"label": "a", "localized_name": "values.a", "name": "values.a", "type": "FLOAT,INT", "link": 276},
    {"label": "b", "localized_name": "values.b", "name": "values.b", "shape": 7, "type": "FLOAT,INT", "link": null},
    {"localized_name": "expression", "name": "expression", "type": "STRING", "widget": {"name": "expression"}, "link": null}
  ],
  "outputs": [
    {"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": null},
    {"localized_name": "INT", "name": "INT", "type": "INT", "links": [280]}
  ],
  "properties": {"Node name for S&R": "ComfyMathExpression"},
  "widgets_values": ["a & ~1"]
},
{
"id": 79,
"type": "ImageStitch",
"pos": [
-6390,
2780
2700
],
"size": [
270,
160
150
],
"flags": {},
"order": 2,
@ -636,15 +408,14 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
266,
281
250
]
}
],
"properties": {
"Node name for S&R": "ImageStitch",
"cnr_id": "comfy-core",
"ver": "0.13.0"
"ver": "0.13.0",
"Node name for S&R": "ImageStitch"
},
"widgets_values": [
"right",
@ -654,91 +425,60 @@
]
},
{
"id": 97,
"type": "ResizeImageMaskNode",
"id": 80,
"type": "CreateVideo",
"pos": [
-5560, 2790
-6040, 2610
],
"size": [
270,
160
78
],
"flags": {},
"order": 7,
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "input",
"name": "input",
"type": "IMAGE,MASK",
"link": 281
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"link": 250
},
{
"localized_name": "resize_type",
"name": "resize_type",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "resize_type"
},
"link": null
"localized_name": "audio",
"name": "audio",
"shape": 7,
"type": "AUDIO",
"link": 251
},
{
"localized_name": "width",
"name": "resize_type.width",
"type": "INT",
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"widget": {
"name": "resize_type.width"
"name": "fps"
},
"link": 279
},
{
"localized_name": "height",
"name": "resize_type.height",
"type": "INT",
"widget": {
"name": "resize_type.height"
},
"link": 280
},
{
"localized_name": "crop",
"name": "resize_type.crop",
"type": "COMBO",
"widget": {
"name": "resize_type.crop"
},
"link": null
},
{
"localized_name": "scale_method",
"name": "scale_method",
"type": "COMBO",
"widget": {
"name": "scale_method"
},
"link": null
"link": 252
}
],
"outputs": [
{
"localized_name": "resized",
"name": "resized",
"type": "*",
"localized_name": "VIDEO",
"name": "VIDEO",
"type": "VIDEO",
"links": [
282
255
]
}
],
"properties": {
"Node name for S&R": "ResizeImageMaskNode"
"cnr_id": "comfy-core",
"ver": "0.13.0",
"Node name for S&R": "CreateVideo"
},
"widgets_values": [
"scale dimensions",
512,
512,
"center",
"area"
30
]
}
],
@ -760,6 +500,14 @@
"target_slot": 1,
"type": "IMAGE"
},
{"id": 250, "origin_id": 79, "origin_slot": 0, "target_id": 80, "target_slot": 0, "type": "IMAGE"},
{
"id": 251,
"origin_id": 77,
@ -831,71 +579,13 @@
"target_id": 79,
"target_slot": 5,
"type": "COMBO"
},
{"id": 266, "origin_id": 79, "origin_slot": 0, "target_id": 90, "target_slot": 0, "type": "IMAGE"},
{"id": 274, "origin_id": 90, "origin_slot": 0, "target_id": 95, "target_slot": 0, "type": "INT"},
{"id": 276, "origin_id": 90, "origin_slot": 1, "target_id": 96, "target_slot": 0, "type": "INT"},
{"id": 279, "origin_id": 95, "origin_slot": 1, "target_id": 97, "target_slot": 2, "type": "INT"},
{"id": 280, "origin_id": 96, "origin_slot": 1, "target_id": 97, "target_slot": 3, "type": "INT"},
{"id": 281, "origin_id": 79, "origin_slot": 0, "target_id": 97, "target_slot": 0, "type": "IMAGE"},
{"id": 282, "origin_id": 97, "origin_slot": 0, "target_id": 80, "target_slot": 0, "type": "IMAGE"}
],
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video Tools/Stitch videos",
"description": "Stitches multiple video clips into a single sequential video file."
"category": "Video Tools/Stitch videos"
}
]
},
"extra": {}
}
}
}

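An aside on the removed ComfyMathExpression nodes above: their expression "a & ~1" clears the lowest bit of an integer, rounding the stitched frame's width or height down to the nearest even value, which most video codecs require. A minimal sketch (plain Python, helper name hypothetical):

def round_down_to_even(a: int) -> int:
    # a & ~1 clears bit 0, e.g. 1081 -> 1080, 1080 -> 1080
    return a & ~1

assert round_down_to_even(1081) == 1080
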
@ -412,10 +412,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
"category": "Video generation and editing/Enhance video",
"description": "Upscales video to 4× resolution using a GAN-based upscaling model."
"category": "Video generation and editing/Enhance video"
}
]
},
"extra": {}
}
}

@ -1,7 +0,0 @@
{
  "model_type": "birefnet",
  "image_std": [1.0, 1.0, 1.0],
  "image_mean": [0.0, 0.0, 0.0],
  "image_size": 1024,
  "resize_to_original": true
}
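A minimal sketch of what the deleted config above implies for preprocessing (editor's illustration; the helper name is hypothetical, not ComfyUI API): with image_mean 0 and image_std 1 the normalization is the identity, so only the bilinear resize to image_size matters.

import torch
import torch.nn.functional as F

def birefnet_preprocess(image: torch.Tensor, image_size: int = 1024) -> torch.Tensor:
    # image: [B, 3, H, W] in [0, 1]; (x - 0) / 1 == x, so just resize
    return F.interpolate(image, size=(image_size, image_size), mode="bilinear", align_corners=False)
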
@ -1,689 +0,0 @@
import torch
import comfy.ops
import numpy as np
import torch.nn as nn
from functools import partial
import torch.nn.functional as F
from torchvision.ops import deform_conv2d
from comfy.ldm.modules.attention import optimized_attention_for_device

CXT = [3072, 1536, 768, 384][1:][::-1][-3:]

class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, device=None, dtype=None, operations=None):
        super().__init__()

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q = operations.Linear(dim, dim, bias=qkv_bias, device=device, dtype=dtype)
        self.kv = operations.Linear(dim, dim * 2, bias=qkv_bias, device=device, dtype=dtype)
        self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)

    def forward(self, x):
        B, N, C = x.shape
        optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        x = optimized_attention(
            q, k, v, heads=self.num_heads, skip_output_reshape=True, skip_reshape=True
        ).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)

        return x


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, device=None, dtype=None, operations=None):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = operations.Linear(in_features, hidden_features, device=device, dtype=dtype)
        self.act = nn.GELU()
        self.fc2 = operations.Linear(hidden_features, out_features, device=device, dtype=dtype)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x

def window_partition(x, window_size):
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x

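# [Editor's sketch, not part of the original file] window_partition and
# window_reverse above are exact inverses whenever H and W are multiples of
# window_size; a quick self-check:
def _editor_demo_window_roundtrip():
    x = torch.randn(2, 14, 14, 96)                        # B, H, W, C
    w = window_partition(x, 7)                            # -> [2 * 2 * 2, 7, 7, 96]
    assert w.shape == (8, 7, 7, 96)
    assert torch.equal(window_reverse(w, 7, 14, 14), x)
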
class WindowAttention(nn.Module):
    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, device=None, dtype=None, operations=None):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads, device=device, dtype=dtype))

        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij'))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, device=device, dtype=dtype)
        self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.long().view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        return x

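# [Editor's sketch] The bias table above has (2*Wh - 1) * (2*Ww - 1) rows
# because the per-axis relative offset ranges over [-(W - 1), W - 1]. A quick
# sanity check that the flattened index covers exactly that range:
def _editor_demo_relative_position_index(window_size=7):
    coords = torch.stack(torch.meshgrid([torch.arange(window_size), torch.arange(window_size)], indexing='ij'))
    flat = torch.flatten(coords, 1)
    rel = (flat[:, :, None] - flat[:, None, :]).permute(1, 2, 0).contiguous()
    rel[:, :, 0] += window_size - 1
    rel[:, :, 1] += window_size - 1
    rel[:, :, 0] *= 2 * window_size - 1
    idx = rel.sum(-1)
    assert int(idx.max()) == (2 * window_size - 1) ** 2 - 1  # 168 for a 7x7 window
    return idx
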
class SwinTransformerBlock(nn.Module):
    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 norm_layer=nn.LayerNorm, device=None, dtype=None, operations=None):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio

        self.norm1 = norm_layer(dim, device=device, dtype=dtype)
        self.attn = WindowAttention(
            dim, window_size=(self.window_size, self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, device=device, dtype=dtype, operations=operations)

        self.norm2 = norm_layer(dim, device=device, dtype=dtype)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, device=device, dtype=dtype, operations=operations)

        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        B, L, C = x.shape
        H, W = self.H, self.W

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        x_windows = window_partition(shifted_x, self.window_size)
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)

        attn_windows = self.attn(x_windows, mask=attn_mask)

        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        x = shortcut + x
        x = x + self.mlp(self.norm2(x))

        return x

class PatchMerging(nn.Module):
    def __init__(self, dim, device=None, dtype=None, operations=None):
        super().__init__()
        self.dim = dim
        self.reduction = operations.Linear(4 * dim, 2 * dim, bias=False, device=device, dtype=dtype)
        self.norm = operations.LayerNorm(4 * dim, device=device, dtype=dtype)

    def forward(self, x, H, W):
        B, L, C = x.shape
        x = x.view(B, H, W, C)

        # padding
        pad_input = (H % 2 == 1) or (W % 2 == 1)
        if pad_input:
            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x

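# [Editor's sketch] PatchMerging above halves each spatial dimension (padding
# odd sizes first) and maps C channels to 2C, so the token count shrinks 4x:
def _editor_demo_patch_merging_shape(B=1, H=7, W=7, C=96):
    Ho, Wo = (H + 1) // 2, (W + 1) // 2
    return (B, Ho * Wo, 2 * C)  # (1, 16, 192) for a 7x7, 96-channel grid
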
class BasicLayer(nn.Module):
    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 device=None, dtype=None, operations=None):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                norm_layer=norm_layer,
                device=device, dtype=dtype, operations=operations)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, device=device, dtype=dtype, operations=operations)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        for blk in self.blocks:
            blk.H, blk.W = H, W
            x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W

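# [Editor's note] The attn_mask built in BasicLayer.forward above implements
# the standard shifted-window trick: img_mask assigns one of up to nine zone
# ids to each padded position, and window pairs whose positions come from
# different zones get -100.0 added to their logits (an effective -inf before
# softmax), so cyclically shifted content cannot attend across the wrap seam.
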
class PatchEmbed(nn.Module):
    def __init__(self, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None, device=None, dtype=None, operations=None):
        super().__init__()
        patch_size = (patch_size, patch_size)
        self.patch_size = patch_size

        self.in_channels = in_channels
        self.embed_dim = embed_dim

        self.proj = operations.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim, device=device, dtype=dtype)
        else:
            self.norm = None

    def forward(self, x):
        _, _, H, W = x.size()
        if W % self.patch_size[1] != 0:
            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
        if H % self.patch_size[0] != 0:
            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is not None:
            Wh, Ww = x.size(2), x.size(3)
            x = x.flatten(2).transpose(1, 2)
            x = self.norm(x)
            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)

        return x

class SwinTransformer(nn.Module):
    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_channels=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 device=None, dtype=None, operations=None):
        super().__init__()

        norm_layer = partial(operations.LayerNorm, device=device, dtype=dtype)
        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim,
            device=device, dtype=dtype, operations=operations,
            norm_layer=norm_layer if self.patch_norm else None)

        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                device=device, dtype=dtype, operations=operations)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

    def forward(self, x):
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)

        outs = []
        x = x.flatten(2).transpose(1, 2)
        for i in range(self.num_layers):
            layer = self.layers[i]
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs.append(out)

        return tuple(outs)

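# [Editor's sketch] With patch_size 4 and four stages, the backbone above
# emits a stride-4/8/16/32 feature pyramid; for the BiRefNet configuration
# below (embed_dim=192) the per-level channel widths are 192/384/768/1536:
def _editor_demo_pyramid_shapes(H=1024, W=1024, embed_dim=192):
    # [(192, 256, 256), (384, 128, 128), (768, 64, 64), (1536, 32, 32)]
    return [(embed_dim * 2 ** i, H // (4 * 2 ** i), W // (4 * 2 ** i)) for i in range(4)]
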
class DeformableConv2d(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 bias=False, device=None, dtype=None, operations=None):

        super(DeformableConv2d, self).__init__()

        kernel_size = kernel_size if type(kernel_size) is tuple else (kernel_size, kernel_size)
        self.stride = stride if type(stride) is tuple else (stride, stride)
        self.padding = padding

        self.offset_conv = operations.Conv2d(in_channels,
                                             2 * kernel_size[0] * kernel_size[1],
                                             kernel_size=kernel_size,
                                             stride=stride,
                                             padding=self.padding,
                                             bias=True, device=device, dtype=dtype)

        self.modulator_conv = operations.Conv2d(in_channels,
                                                1 * kernel_size[0] * kernel_size[1],
                                                kernel_size=kernel_size,
                                                stride=stride,
                                                padding=self.padding,
                                                bias=True, device=device, dtype=dtype)

        self.regular_conv = operations.Conv2d(in_channels,
                                              out_channels=out_channels,
                                              kernel_size=kernel_size,
                                              stride=stride,
                                              padding=self.padding,
                                              bias=bias, device=device, dtype=dtype)

    def forward(self, x):
        offset = self.offset_conv(x)
        modulator = 2. * torch.sigmoid(self.modulator_conv(x))
        weight, bias, offload_info = comfy.ops.cast_bias_weight(self.regular_conv, x, offloadable=True)

        x = deform_conv2d(
            input=x,
            offset=offset,
            weight=weight,
            bias=None,
            padding=self.padding,
            mask=modulator,
            stride=self.stride,
        )
        comfy.ops.uncast_bias_weight(self.regular_conv, weight, bias, offload_info)
        return x

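# [Editor's note] DeformableConv2d above is modulated (v2) deformable
# convolution via torchvision's deform_conv2d: offset_conv predicts 2*K*K
# per-tap (dy, dx) offsets, modulator_conv predicts K*K gates that are passed
# as `mask` after 2 * sigmoid(...) scaling (mean ~1 at init), and the regular
# conv's weight is cast/offloaded through comfy.ops around the sampled-tap
# accumulation.
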
class BasicDecBlk(nn.Module):
    def __init__(self, in_channels=64, out_channels=64, inter_channels=64, device=None, dtype=None, operations=None):
        super(BasicDecBlk, self).__init__()
        inter_channels = 64
        self.conv_in = operations.Conv2d(in_channels, inter_channels, 3, 1, padding=1, device=device, dtype=dtype)
        self.relu_in = nn.ReLU(inplace=True)
        self.dec_att = ASPPDeformable(in_channels=inter_channels, device=device, dtype=dtype, operations=operations)
        self.conv_out = operations.Conv2d(inter_channels, out_channels, 3, 1, padding=1, device=device, dtype=dtype)
        self.bn_in = operations.BatchNorm2d(inter_channels, device=device, dtype=dtype)
        self.bn_out = operations.BatchNorm2d(out_channels, device=device, dtype=dtype)

    def forward(self, x):
        x = self.conv_in(x)
        x = self.bn_in(x)
        x = self.relu_in(x)
        x = self.dec_att(x)
        x = self.conv_out(x)
        x = self.bn_out(x)
        return x


class BasicLatBlk(nn.Module):
    def __init__(self, in_channels=64, out_channels=64, device=None, dtype=None, operations=None):
        super(BasicLatBlk, self).__init__()
        self.conv = operations.Conv2d(in_channels, out_channels, 1, 1, 0, device=device, dtype=dtype)

    def forward(self, x):
        x = self.conv(x)
        return x

class _ASPPModuleDeformable(nn.Module):
    def __init__(self, in_channels, planes, kernel_size, padding, device, dtype, operations):
        super(_ASPPModuleDeformable, self).__init__()
        self.atrous_conv = DeformableConv2d(in_channels, planes, kernel_size=kernel_size,
                                            stride=1, padding=padding, bias=False, device=device, dtype=dtype, operations=operations)
        self.bn = operations.BatchNorm2d(planes, device=device, dtype=dtype)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.atrous_conv(x)
        x = self.bn(x)

        return self.relu(x)


class ASPPDeformable(nn.Module):
    def __init__(self, in_channels, out_channels=None, parallel_block_sizes=[1, 3, 7], device=None, dtype=None, operations=None):
        super(ASPPDeformable, self).__init__()
        self.down_scale = 1
        if out_channels is None:
            out_channels = in_channels
        self.in_channelster = 256 // self.down_scale

        self.aspp1 = _ASPPModuleDeformable(in_channels, self.in_channelster, 1, padding=0, device=device, dtype=dtype, operations=operations)
        self.aspp_deforms = nn.ModuleList([
            _ASPPModuleDeformable(in_channels, self.in_channelster, conv_size, padding=int(conv_size//2), device=device, dtype=dtype, operations=operations)
            for conv_size in parallel_block_sizes
        ])

        self.global_avg_pool = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
                                             operations.Conv2d(in_channels, self.in_channelster, 1, stride=1, bias=False, device=device, dtype=dtype),
                                             operations.BatchNorm2d(self.in_channelster, device=device, dtype=dtype),
                                             nn.ReLU(inplace=True))
        self.conv1 = operations.Conv2d(self.in_channelster * (2 + len(self.aspp_deforms)), out_channels, 1, bias=False, device=device, dtype=dtype)
        self.bn1 = operations.BatchNorm2d(out_channels, device=device, dtype=dtype)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x1 = self.aspp1(x)
        x_aspp_deforms = [aspp_deform(x) for aspp_deform in self.aspp_deforms]
        x5 = self.global_avg_pool(x)
        x5 = F.interpolate(x5, size=x1.size()[2:], mode='bilinear', align_corners=True)
        x = torch.cat((x1, *x_aspp_deforms, x5), dim=1)

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        return x

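# [Editor's sketch] ASPPDeformable above concatenates aspp1, the deformable
# branches, and the global-average branch, so conv1's input width is
# in_channelster * (2 + len(parallel_block_sizes)):
def _editor_demo_aspp_in_channels(in_channelster=256, parallel_block_sizes=(1, 3, 7)):
    return in_channelster * (2 + len(parallel_block_sizes))  # 1280
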
class BiRefNet(nn.Module):
    def __init__(self, config=None, dtype=None, device=None, operations=None):
        super(BiRefNet, self).__init__()
        self.bb = SwinTransformer(embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12, device=device, dtype=dtype, operations=operations)

        channels = [1536, 768, 384, 192]
        channels = [c * 2 for c in channels]
        self.cxt = channels[1:][::-1][-3:]
        self.squeeze_module = nn.Sequential(*[
            BasicDecBlk(channels[0]+sum(self.cxt), channels[0], device=device, dtype=dtype, operations=operations)
            for _ in range(1)
        ])

        self.decoder = Decoder(channels, device=device, dtype=dtype, operations=operations)

    def forward_enc(self, x):
        x1, x2, x3, x4 = self.bb(x)
        B, C, H, W = x.shape
        x1_, x2_, x3_, x4_ = self.bb(F.interpolate(x, size=(H//2, W//2), mode='bilinear', align_corners=True))
        x1 = torch.cat([x1, F.interpolate(x1_, size=x1.shape[2:], mode='bilinear', align_corners=True)], dim=1)
        x2 = torch.cat([x2, F.interpolate(x2_, size=x2.shape[2:], mode='bilinear', align_corners=True)], dim=1)
        x3 = torch.cat([x3, F.interpolate(x3_, size=x3.shape[2:], mode='bilinear', align_corners=True)], dim=1)
        x4 = torch.cat([x4, F.interpolate(x4_, size=x4.shape[2:], mode='bilinear', align_corners=True)], dim=1)
        x4 = torch.cat(
            (
                *[
                    F.interpolate(x1, size=x4.shape[2:], mode='bilinear', align_corners=True),
                    F.interpolate(x2, size=x4.shape[2:], mode='bilinear', align_corners=True),
                    F.interpolate(x3, size=x4.shape[2:], mode='bilinear', align_corners=True),
                ][-len(CXT):],
                x4
            ),
            dim=1
        )
        return (x1, x2, x3, x4)

    def forward_ori(self, x):
        (x1, x2, x3, x4) = self.forward_enc(x)
        x4 = self.squeeze_module(x4)
        features = [x, x1, x2, x3, x4]
        scaled_preds = self.decoder(features)
        return scaled_preds

    def forward(self, pixel_values, intermediate_output=None):
        scaled_preds = self.forward_ori(pixel_values)
        return scaled_preds

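# [Editor's note] forward_enc above runs the Swin backbone twice, at full and
# half resolution, and concatenates the two pyramids level by level, which is
# why __init__ doubles `channels`; the deepest map x4 additionally receives
# the last len(CXT) shallower levels resized onto it before squeeze_module.
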
class Decoder(nn.Module):
    def __init__(self, channels, device, dtype, operations):
        super(Decoder, self).__init__()
        # factory kwargs
        fk = {"device": device, "dtype": dtype, "operations": operations}
        DecoderBlock = partial(BasicDecBlk, **fk)
        LateralBlock = partial(BasicLatBlk, **fk)
        DBlock = partial(SimpleConvs, **fk)

        self.split = True
        N_dec_ipt = 64
        ic = 64
        ipt_cha_opt = 1
        self.ipt_blk5 = DBlock(2**10*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
        self.ipt_blk4 = DBlock(2**8*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
        self.ipt_blk3 = DBlock(2**6*3 if self.split else 3, [N_dec_ipt, channels[1]//8][ipt_cha_opt], inter_channels=ic)
        self.ipt_blk2 = DBlock(2**4*3 if self.split else 3, [N_dec_ipt, channels[2]//8][ipt_cha_opt], inter_channels=ic)
        self.ipt_blk1 = DBlock(2**0*3 if self.split else 3, [N_dec_ipt, channels[3]//8][ipt_cha_opt], inter_channels=ic)

        self.decoder_block4 = DecoderBlock(channels[0]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[1])
        self.decoder_block3 = DecoderBlock(channels[1]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[2])
        self.decoder_block2 = DecoderBlock(channels[2]+([N_dec_ipt, channels[1]//8][ipt_cha_opt]), channels[3])
        self.decoder_block1 = DecoderBlock(channels[3]+([N_dec_ipt, channels[2]//8][ipt_cha_opt]), channels[3]//2)

        fk = {"device": device, "dtype": dtype}

        self.conv_out1 = nn.Sequential(operations.Conv2d(channels[3]//2+([N_dec_ipt, channels[3]//8][ipt_cha_opt]), 1, 1, 1, 0, **fk))

        self.lateral_block4 = LateralBlock(channels[1], channels[1])
        self.lateral_block3 = LateralBlock(channels[2], channels[2])
        self.lateral_block2 = LateralBlock(channels[3], channels[3])

        self.conv_ms_spvn_4 = operations.Conv2d(channels[1], 1, 1, 1, 0, **fk)
        self.conv_ms_spvn_3 = operations.Conv2d(channels[2], 1, 1, 1, 0, **fk)
        self.conv_ms_spvn_2 = operations.Conv2d(channels[3], 1, 1, 1, 0, **fk)

        _N = 16

        self.gdt_convs_4 = nn.Sequential(operations.Conv2d(channels[0] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
        self.gdt_convs_3 = nn.Sequential(operations.Conv2d(channels[1] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
        self.gdt_convs_2 = nn.Sequential(operations.Conv2d(channels[2] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))

        [setattr(self, f"gdt_convs_pred_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk))) for i in range(2, 5)]
        [setattr(self, f"gdt_convs_attn_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk))) for i in range(2, 5)]

    def get_patches_batch(self, x, p):
        _size_h, _size_w = p.shape[2:]
        patches_batch = []
        for idx in range(x.shape[0]):
            columns_x = torch.split(x[idx], split_size_or_sections=_size_w, dim=-1)
            patches_x = []
            for column_x in columns_x:
                patches_x += [p.unsqueeze(0) for p in torch.split(column_x, split_size_or_sections=_size_h, dim=-2)]
            patch_sample = torch.cat(patches_x, dim=1)
            patches_batch.append(patch_sample)
        return torch.cat(patches_batch, dim=0)

    def forward(self, features):
        x, x1, x2, x3, x4 = features

        patches_batch = self.get_patches_batch(x, x4) if self.split else x
        x4 = torch.cat((x4, self.ipt_blk5(F.interpolate(patches_batch, size=x4.shape[2:], mode='bilinear', align_corners=True))), 1)
        p4 = self.decoder_block4(x4)
        p4_gdt = self.gdt_convs_4(p4)
        gdt_attn_4 = self.gdt_convs_attn_4(p4_gdt).sigmoid()
        p4 = p4 * gdt_attn_4
        _p4 = F.interpolate(p4, size=x3.shape[2:], mode='bilinear', align_corners=True)
        _p3 = _p4 + self.lateral_block4(x3)

        patches_batch = self.get_patches_batch(x, _p3) if self.split else x
        _p3 = torch.cat((_p3, self.ipt_blk4(F.interpolate(patches_batch, size=x3.shape[2:], mode='bilinear', align_corners=True))), 1)
        p3 = self.decoder_block3(_p3)

        p3_gdt = self.gdt_convs_3(p3)
        gdt_attn_3 = self.gdt_convs_attn_3(p3_gdt).sigmoid()
        p3 = p3 * gdt_attn_3
        _p3 = F.interpolate(p3, size=x2.shape[2:], mode='bilinear', align_corners=True)
        _p2 = _p3 + self.lateral_block3(x2)

        patches_batch = self.get_patches_batch(x, _p2) if self.split else x
        _p2 = torch.cat((_p2, self.ipt_blk3(F.interpolate(patches_batch, size=x2.shape[2:], mode='bilinear', align_corners=True))), 1)
        p2 = self.decoder_block2(_p2)

        p2_gdt = self.gdt_convs_2(p2)
        gdt_attn_2 = self.gdt_convs_attn_2(p2_gdt).sigmoid()
        p2 = p2 * gdt_attn_2

        _p2 = F.interpolate(p2, size=x1.shape[2:], mode='bilinear', align_corners=True)
        _p1 = _p2 + self.lateral_block2(x1)

        patches_batch = self.get_patches_batch(x, _p1) if self.split else x
|
||||
_p1 = torch.cat((_p1, self.ipt_blk2(F.interpolate(patches_batch, size=x1.shape[2:], mode='bilinear', align_corners=True))), 1)
|
||||
_p1 = self.decoder_block1(_p1)
|
||||
_p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)
|
||||
|
||||
patches_batch = self.get_patches_batch(x, _p1) if self.split else x
|
||||
_p1 = torch.cat((_p1, self.ipt_blk1(F.interpolate(patches_batch, size=x.shape[2:], mode='bilinear', align_corners=True))), 1)
|
||||
p1_out = self.conv_out1(_p1)
|
||||
return p1_out
|
||||
|
||||
|
||||
class SimpleConvs(nn.Module):
|
||||
def __init__(
|
||||
self, in_channels: int, out_channels: int, inter_channels=64, device=None, dtype=None, operations=None
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.conv1 = operations.Conv2d(in_channels, inter_channels, 3, 1, 1, device=device, dtype=dtype)
|
||||
self.conv_out = operations.Conv2d(inter_channels, out_channels, 3, 1, 1, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
return self.conv_out(self.conv1(x))
|
||||
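
# To see why the decoder channel counts above are doubled: forward_enc runs
# the backbone at full and half resolution and concatenates each stage pair
# along the channel dim. A minimal sketch of that pattern with a stand-in
# `backbone` callable (illustrative only, not part of this model):
def _dual_scale_features(backbone, x):
    feats_full = backbone(x)  # list of per-stage feature maps, e.g. C=192 at stage 1
    h, w = x.shape[2:]
    feats_half = backbone(F.interpolate(x, size=(h // 2, w // 2), mode='bilinear', align_corners=True))
    # upsample the half-res features back and concat: C doubles (192 -> 384, ...)
    return [torch.cat([f, F.interpolate(g, size=f.shape[2:], mode='bilinear', align_corners=True)], dim=1)
            for f, g in zip(feats_full, feats_half)]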
@ -1,78 +0,0 @@
from .utils import load_torch_file
import os
import json
import torch
import logging

import comfy.ops
import comfy.model_patcher
import comfy.model_management
import comfy.clip_model
import comfy.background_removal.birefnet

BG_REMOVAL_MODELS = {
    "birefnet": comfy.background_removal.birefnet.BiRefNet
}

class BackgroundRemovalModel():
    def __init__(self, json_config):
        with open(json_config) as f:
            config = json.load(f)

        self.image_size = config.get("image_size", 1024)
        self.image_mean = config.get("image_mean", [0.0, 0.0, 0.0])
        self.image_std = config.get("image_std", [1.0, 1.0, 1.0])
        self.model_type = config.get("model_type", "birefnet")
        self.config = config.copy()
        model_class = BG_REMOVAL_MODELS.get(self.model_type)

        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
        self.model.eval()

        self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)

    def load_sd(self, sd):
        return self.model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())

    def get_sd(self):
        return self.model.state_dict()

    def encode_image(self, image):
        comfy.model_management.load_model_gpu(self.patcher)
        H, W = image.shape[1], image.shape[2]
        pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=False)
        out = self.model(pixel_values=pixel_values)
        out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False)

        mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
        if mask.ndim == 3:
            mask = mask.unsqueeze(0)
        if mask.shape[1] != 1:
            mask = mask.movedim(-1, 1)

        return mask


def load_background_removal_model(sd):
    if "bb.layers.1.blocks.0.attn.relative_position_index" in sd:
        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "background_removal"), "birefnet.json")
    else:
        return None

    bg_model = BackgroundRemovalModel(json_config)
    m, u = bg_model.load_sd(sd)
    if len(m) > 0:
        logging.warning("missing background removal: {}".format(m))
    u = set(u)
    keys = list(sd.keys())
    for k in keys:
        if k not in u:
            sd.pop(k)
    return bg_model

def load(ckpt_path):
    sd = load_torch_file(ckpt_path)
    return load_background_removal_model(sd)
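
# End-to-end usage sketch (illustrative; the module path and checkpoint
# filename below are assumptions and are not shown in this diff):
#
#     import comfy.bg_removal
#     bg_model = comfy.bg_removal.load("models/birefnet/birefnet_general.safetensors")
#     # image: [1, H, W, 3] float tensor in 0..1, as elsewhere in ComfyUI
#     mask = bg_model.encode_image(image)  # [1, 1, H, W] soft matte in 0..1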
@ -90,8 +90,8 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch
parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")

parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables the ipex.optimize default when loading models with Intel's Extension for PyTorch.")
parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act as if the device supports fp8 compute.")
parser.add_argument("--enable-triton-backend", action="store_true", help="ComfyUI will enable the use of the Triton backend in comfy-kitchen. It is disabled at launch by default.")

class LatentPreviewMethod(enum.Enum):
    NoPreviews = "none"
@ -238,8 +238,6 @@ database_default_path = os.path.abspath(
)
parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
parser.add_argument("--enable-assets", action="store_true", help="Enable the assets system (API routes, database synchronization, and background scanning).")
parser.add_argument("--feature-flag", type=str, action='append', default=[], metavar="KEY[=VALUE]", help="Set a server feature flag. Use KEY=VALUE to set an explicit value, or bare KEY to set it to true. Can be specified multiple times. Boolean values (true/false) and numbers are auto-converted. Examples: --feature-flag show_signin_button=true or --feature-flag show_signin_button")
parser.add_argument("--list-feature-flags", action="store_true", help="Print the registry of known CLI-settable feature flags as JSON and exit.")

if comfy.options.args_parsing:
    args = parser.parse_args()
@ -63,11 +63,7 @@ class IndexListContextWindow(ContextWindowABC):
        dim = self.dim
        if dim == 0 and full.shape[dim] == 1:
            return full
        indices = self.index_list
        anchor_idx = getattr(self, 'causal_anchor_index', None)
        if anchor_idx is not None and anchor_idx >= 0:
            indices = [anchor_idx] + list(indices)
        idx = tuple([slice(None)] * dim + [indices])
        idx = tuple([slice(None)] * dim + [self.index_list])
        window = full[idx]
        if retain_index_list:
            idx = tuple([slice(None)] * dim + [retain_index_list])
@ -117,14 +113,7 @@ def slice_cond(cond_value, window: IndexListContextWindow, x_in: torch.Tensor, d

    # skip leading latent positions that have no corresponding conditioning (e.g. reference frames)
    if temporal_offset > 0:
        anchor_idx = getattr(window, 'causal_anchor_index', None)
        if anchor_idx is not None and anchor_idx >= 0:
            # anchor occupies one of the no-cond positions, so skip one fewer from window.index_list
            skip_count = temporal_offset - 1
        else:
            skip_count = temporal_offset

        indices = [i - temporal_offset for i in window.index_list[skip_count:]]
        indices = [i - temporal_offset for i in window.index_list[temporal_offset:]]
        indices = [i for i in indices if 0 <= i]
    else:
        indices = list(window.index_list)
@ -161,8 +150,7 @@ class ContextFuseMethod:
ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
class IndexListContextHandler(ContextHandlerABC):
    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1,
                 closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False,
                 causal_window_fix: bool=True):
                 closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False):
        self.context_schedule = context_schedule
        self.fuse_method = fuse_method
        self.context_length = context_length
@ -174,7 +162,6 @@ class IndexListContextHandler(ContextHandlerABC):
        self.freenoise = freenoise
        self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
        self.split_conds_to_windows = split_conds_to_windows
        self.causal_window_fix = causal_window_fix

        self.callbacks = {}

@ -331,14 +318,6 @@ class IndexListContextHandler(ContextHandlerABC):
            # allow processing to end between context window executions for faster Cancel
            comfy.model_management.throw_exception_if_processing_interrupted()

            # causal_window_fix: prepend a pre-window frame that will be stripped post-forward
            anchor_applied = False
            if self.causal_window_fix:
                anchor_idx = window.index_list[0] - 1
                if 0 <= anchor_idx < x_in.size(self.dim):
                    window.causal_anchor_index = anchor_idx
                    anchor_applied = True

            for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks):
                callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device)

@ -353,12 +332,6 @@ class IndexListContextHandler(ContextHandlerABC):
            if device is not None:
                for i in range(len(sub_conds_out)):
                    sub_conds_out[i] = sub_conds_out[i].to(x_in.device)

            # strip causal_window_fix anchor if applied
            if anchor_applied:
                for i in range(len(sub_conds_out)):
                    sub_conds_out[i] = sub_conds_out[i].narrow(self.dim, 1, sub_conds_out[i].shape[self.dim] - 1)

            results.append(ContextResults(window_idx, sub_conds_out, sub_conds, window))
        return results

@ -1,34 +0,0 @@
import functools
import logging
import os

logger = logging.getLogger(__name__)

_DEFAULT_DEPLOY_ENV = "local-git"
_ENV_FILENAME = ".comfy_environment"

# Resolve the ComfyUI install directory (the parent of this `comfy/` package).
# We deliberately avoid `folder_paths.base_path` here because that is overridden
# by the `--base-directory` CLI arg to a user-supplied path, whereas the
# `.comfy_environment` marker is written by launchers/installers next to the
# ComfyUI install itself.
_COMFY_INSTALL_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))


@functools.cache
def get_deploy_environment() -> str:
    env_file = os.path.join(_COMFY_INSTALL_DIR, _ENV_FILENAME)
    try:
        with open(env_file, encoding="utf-8") as f:
            # Cap the read so a malformed or maliciously crafted file (e.g.
            # a single huge line with no newline) can't blow up memory.
            first_line = f.readline(128).strip()
            value = "".join(c for c in first_line if 32 <= ord(c) < 127)
            if value:
                return value
    except FileNotFoundError:
        pass
    except Exception as e:
        logger.error("Failed to read %s: %s", env_file, e)

    return _DEFAULT_DEPLOY_ENV
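
# The marker is a one-line plain-text file next to the ComfyUI install; only
# the first 128 bytes of the first line are read and non-printable characters
# are stripped. A sketch of how a launcher might write it (the values are
# examples; "local-portable" is what the portable build writes):
def _write_env_marker(install_dir: str, value: str = "local-portable") -> None:
    with open(os.path.join(install_dir, _ENV_FILENAME), "w", encoding="utf-8") as f:
        f.write(value + "\n")  # e.g. "local-git", "local-portable"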
@ -93,7 +93,7 @@ class Hook:
        self.hook_scope = hook_scope
        '''Scope of where this hook should apply in terms of the conds used in sampling run.'''
        self.custom_should_register = default_should_register
        '''Can be overridden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
        '''Can be overriden with a compatible function to decide if this hook should be registered without the need to override .should_register'''

    @property
    def strength(self):
@ -1,15 +1,7 @@
import math

import torch
import torch.nn.functional as F

from comfy.text_encoders.bert import BertAttention
import comfy.model_management
from comfy.ldm.modules.attention import optimized_attention_for_device
from comfy.ldm.depth_anything_3.reference_view_selector import (
    select_reference_view, reorder_by_reference, restore_original_order,
    THRESH_FOR_REF_SELECTION,
)


class Dino2AttentionOutput(torch.nn.Module):
@ -22,42 +14,13 @@ class Dino2AttentionOutput(torch.nn.Module):


class Dino2AttentionBlock(torch.nn.Module):
    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations,
                 qk_norm=False):
    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
        super().__init__()
        self.heads = heads
        self.head_dim = embed_dim // heads
        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
        if qk_norm:
            self.q_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
            self.k_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
        else:
            self.q_norm = None
            self.k_norm = None

    def forward(self, x, mask, optimized_attention, pos=None, rope=None):
        # Fast path used by the existing CLIP-vision DINOv2 (no DA3 extensions).
        if self.q_norm is None and rope is None:
            return self.output(self.attention(x, mask, optimized_attention))

        # DA3 path: do QKV manually so we can apply per-head QK-norm and 2D RoPE.
        attn = self.attention
        B, N, C = x.shape
        h = self.heads
        d = self.head_dim
        q = attn.query(x).view(B, N, h, d).transpose(1, 2)
        k = attn.key(x).view(B, N, h, d).transpose(1, 2)
        v = attn.value(x).view(B, N, h, d).transpose(1, 2)
        if self.q_norm is not None:
            q = self.q_norm(q)
            k = self.k_norm(k)
        if rope is not None and pos is not None:
            q = rope(q, pos)
            k = rope(k, pos)
        out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
        out = out.transpose(1, 2).reshape(B, N, C)
        return self.output(out)
    def forward(self, x, mask, optimized_attention):
        return self.output(self.attention(x, mask, optimized_attention))


class LayerScale(torch.nn.Module):
@ -101,11 +64,9 @@ class SwiGLUFFN(torch.nn.Module):


class Dino2Block(torch.nn.Module):
    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn,
                 qk_norm=False):
    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn):
        super().__init__()
        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations,
                                             qk_norm=qk_norm)
        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
        if use_swiglu_ffn:
@ -115,93 +76,19 @@ class Dino2Block(torch.nn.Module):
            self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
            self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

    def forward(self, x, optimized_attention, pos=None, rope=None, attn_mask=None):
        x = x + self.layer_scale1(self.attention(self.norm1(x), attn_mask, optimized_attention,
                                                 pos=pos, rope=rope))
    def forward(self, x, optimized_attention):
        x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
        x = x + self.layer_scale2(self.mlp(self.norm2(x)))
        return x


# -----------------------------------------------------------------------------
# 2D Rotary position embedding (DA3 extension)
# -----------------------------------------------------------------------------


class _PositionGetter:
    """Cache (h, w) -> flat (y, x) position grid used to feed ``rope``."""

    def __init__(self):
        self._cache: dict = {}

    def __call__(self, batch_size: int, height: int, width: int, device) -> torch.Tensor:
        key = (height, width, device)
        if key not in self._cache:
            y = torch.arange(height, device=device)
            x = torch.arange(width, device=device)
            self._cache[key] = torch.cartesian_prod(y, x)
        cached = self._cache[key]
        return cached.view(1, height * width, 2).expand(batch_size, -1, -1).clone()


class RotaryPositionEmbedding2D(torch.nn.Module):
    """2D RoPE used by DA3-Small/Base. No learnable parameters."""

    def __init__(self, frequency: float = 100.0):
        super().__init__()
        self.base_frequency = frequency
        self._freq_cache: dict = {}

    def _components(self, dim: int, seq_len: int, device, dtype):
        key = (dim, seq_len, device, dtype)
        if key not in self._freq_cache:
            exp = torch.arange(0, dim, 2, device=device).float() / dim
            inv_freq = 1.0 / (self.base_frequency ** exp)
            pos = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
            ang = torch.einsum("i,j->ij", pos, inv_freq)
            ang = ang.to(dtype)
            ang = torch.cat((ang, ang), dim=-1)
            self._freq_cache[key] = (ang.cos().to(dtype), ang.sin().to(dtype))
        return self._freq_cache[key]

    @staticmethod
    def _rotate(x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1]
        x1, x2 = x[..., : d // 2], x[..., d // 2:]
        return torch.cat((-x2, x1), dim=-1)

    def _apply_1d(self, tokens, positions, cos_c, sin_c):
        cos = F.embedding(positions, cos_c)[:, None, :, :]
        sin = F.embedding(positions, sin_c)[:, None, :, :]
        return (tokens * cos) + (self._rotate(tokens) * sin)

    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
        feature_dim = tokens.size(-1) // 2
        max_pos = int(positions.max()) + 1
        cos_c, sin_c = self._components(feature_dim, max_pos, tokens.device, tokens.dtype)
        v, h = tokens.chunk(2, dim=-1)
        v = self._apply_1d(v, positions[..., 0], cos_c, sin_c)
        h = self._apply_1d(h, positions[..., 1], cos_c, sin_c)
        return torch.cat((v, h), dim=-1)

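
# Illustrative self-check of the 2D RoPE above (assumes the classes as defined
# in this file): positions come from _PositionGetter as (y, x) pairs, the
# channel dim is split in half, and each half gets its own 1D rotation, so
# per-token norms are preserved.
def _rope2d_sanity_check():
    rope = RotaryPositionEmbedding2D(frequency=100.0)
    # (2, 24, 2) grid positions, shifted by 1 as in Dinov2Model._prepare_rope_positions
    pos = _PositionGetter()(batch_size=2, height=4, width=6, device='cpu') + 1
    q = torch.randn(2, 8, 24, 64)  # (B, heads, seq, head_dim); head_dim divisible by 4
    q_rot = rope(q, pos)
    assert q_rot.shape == q.shape
    # rotations are orthogonal, so per-token norms survive
    assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-4)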
class Dino2Encoder(torch.nn.Module):
    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn,
                 qknorm_start: int = -1, rope: "RotaryPositionEmbedding2D | None" = None,
                 rope_start: int = -1):
    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn):
        super().__init__()
        self.layer = torch.nn.ModuleList([
            Dino2Block(
                dim, num_heads, layer_norm_eps, dtype, device, operations,
                use_swiglu_ffn=use_swiglu_ffn,
                qk_norm=(qknorm_start != -1 and i >= qknorm_start),
            )
            for i in range(num_layers)
        ])
        self.rope = rope
        self.rope_start = rope_start
        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
                                          for _ in range(num_layers)])

    def forward(self, x, intermediate_output=None):
        # Backward-compat path used by ``ClipVisionModel`` (no DA3 extensions).
        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)

        if intermediate_output is not None:
@ -234,84 +121,25 @@ class Dino2PatchEmbeddings(torch.nn.Module):


class Dino2Embeddings(torch.nn.Module):
    def __init__(self, dim, dtype, device, operations,
                 patch_size: int = 14, image_size: int = 518,
                 use_mask_token: bool = True,
                 num_camera_tokens: int = 0):
    def __init__(self, dim, dtype, device, operations):
        super().__init__()
        self.patch_size = patch_size
        self.image_size = image_size
        patch_size = 14
        image_size = 518

        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
        if use_mask_token:
            self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
        else:
            self.mask_token = None
        if num_camera_tokens > 0:
            # DA3 stores (ref_token, src_token) pairs that get injected at the
            # alt-attn boundary; see ``Dinov2Model._inject_camera_token``.
            self.camera_token = torch.nn.Parameter(torch.empty(1, num_camera_tokens, dim, dtype=dtype, device=device))
        else:
            self.camera_token = None

    def _interpolate_pos_encoding(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor:
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.position_embeddings.shape[1] - 1
        pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype).float()
        if npatch == N and w == h:
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        ph = h // self.patch_size  # patch grid height
        pw = w // self.patch_size  # patch grid width
        M = int(math.sqrt(N))
        assert N == M * M
        # Historical 0.1 offset preserves bicubic resample compatibility with
        # the original DINOv2 release; see the upstream PR for context.
        # ``scale_factor`` is interpreted as (height_scale, width_scale) by
        # ``F.interpolate`` so we must put the height scale FIRST. Earlier
        # revisions of this function had it swapped, which only worked for
        # square inputs (e.g. CLIP-vision square crops); non-square inputs
        # like DA3-Small / DA3-Base multi-view paths exposed the bug.
        sh = float(ph + 0.1) / M
        sw = float(pw + 0.1) / M
        patch_pos_embed = F.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            scale_factor=(sh, sw), mode="bicubic", antialias=False,
        )
        assert (ph, pw) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))

    def forward(self, pixel_values):
        _, _, H, W = pixel_values.shape
        x = self.patch_embeddings(pixel_values)
        # TODO: mask_token?
        x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self._interpolate_pos_encoding(x, H, W)
        x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
        return x

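
# Illustrative check of the scale_factor ordering note above, independent of
# the model code: F.interpolate treats scale_factor as (height, width) for
# 4-D input, so a swap only goes unnoticed on square grids.
def _interpolate_order_check():
    x = torch.zeros(1, 3, 37, 37)  # square M x M grid, M = 37
    sh, sw = (10 + 0.1) / 37, (20 + 0.1) / 37  # target patch grid 10 x 20
    y = F.interpolate(x, scale_factor=(sh, sw), mode="bicubic", antialias=False)
    assert y.shape[-2:] == (10, 20)  # height scale first; (sw, sh) would give (20, 10)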
class Dinov2Model(torch.nn.Module):
    """DINOv2 vision backbone.

    Supports two operating modes:

    * **CLIP-vision DINOv2** (default): vanilla DINOv2-ViT used for
      ``ClipVisionModel`` and SigLIP-style image encoding.
    * **Depth Anything 3** extensions (opt-in via config keys): 2D RoPE,
      QK-norm, alternating local/global attention, camera-token injection,
      ``cat_token`` output and multi-layer feature extraction. These are
      enabled when the corresponding fields (``alt_start``, ``qknorm_start``,
      ``rope_start``, ``cat_token``) are set in ``config_dict``. When all of
      them are at their disabled defaults this module behaves identically to
      the historical ``Dinov2Model``.
    """

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        num_layers = config_dict["num_hidden_layers"]
@ -319,209 +147,14 @@ class Dinov2Model(torch.nn.Module):
        heads = config_dict["num_attention_heads"]
        layer_norm_eps = config_dict["layer_norm_eps"]
        use_swiglu_ffn = config_dict["use_swiglu_ffn"]
        patch_size = config_dict.get("patch_size", 14)
        image_size = config_dict.get("image_size", 518)
        use_mask_token = config_dict.get("use_mask_token", True)

        # DA3 extensions (all default to disabled).
        self.alt_start = config_dict.get("alt_start", -1)
        self.qknorm_start = config_dict.get("qknorm_start", -1)
        self.rope_start = config_dict.get("rope_start", -1)
        self.cat_token = config_dict.get("cat_token", False)
        rope_freq = config_dict.get("rope_freq", 100.0)

        self.embed_dim = dim
        self.patch_size = patch_size
        self.num_register_tokens = 0
        self.patch_start_idx = 1

        if self.rope_start != -1 and rope_freq > 0:
            self.rope = RotaryPositionEmbedding2D(frequency=rope_freq)
            self._position_getter = _PositionGetter()
        else:
            self.rope = None
            self._position_getter = None

        # camera_token shape: (1, 2, dim) -> (ref_token, src_token).
        num_cam_tokens = 2 if self.alt_start != -1 else 0

        self.embeddings = Dino2Embeddings(
            dim, dtype, device, operations,
            patch_size=patch_size, image_size=image_size,
            use_mask_token=use_mask_token, num_camera_tokens=num_cam_tokens,
        )
        self.encoder = Dino2Encoder(
            dim, heads, layer_norm_eps, num_layers, dtype, device, operations,
            use_swiglu_ffn=use_swiglu_ffn,
            qknorm_start=self.qknorm_start,
            rope=self.rope, rope_start=self.rope_start,
        )
        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

    # ------------------------------------------------------------------
    # CLIP-vision-style forward (no DA3 extensions, no multi-layer output).
    # Kept for backward compatibility with ``ClipVisionModel.encode_image``.
    # ------------------------------------------------------------------
    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
        x = self.embeddings(pixel_values)
        x, i = self.encoder(x, intermediate_output=intermediate_output)
        x = self.layernorm(x)
        pooled_output = x[:, 0, :]
        return x, i, pooled_output, None

    # ------------------------------------------------------------------
    # Depth Anything 3 forward
    # ------------------------------------------------------------------
    def _prepare_rope_positions(self, B, S, H, W, device):
        if self.rope is None:
            return None, None
        ph, pw = H // self.patch_size, W // self.patch_size
        pos = self._position_getter(B * S, ph, pw, device=device)
        # Shift patch positions by 1 so index 0 stays reserved for the cls/cam token.
        pos = pos + 1
        cls_pos = torch.zeros(B * S, self.patch_start_idx, 2, device=device, dtype=pos.dtype)
        # Per-view local: real grid positions for patches, 0 for cls token.
        pos_local = torch.cat([cls_pos, pos], dim=1)
        # Global (across views): cls token stays at 0 and every patch gets
        # position 1, so cross-view attention carries no positional bias.
        pos_global = torch.cat([cls_pos, torch.zeros_like(pos) + 1], dim=1)
        return pos_local, pos_global

    def _inject_camera_token(self, x: torch.Tensor, B: int, S: int,
                             cam_token: "torch.Tensor | None") -> torch.Tensor:
        # x: (B, S, N, C). Replace token at index 0 with the camera token.
        if cam_token is not None:
            inj = cam_token
        else:
            ct = comfy.model_management.cast_to_device(self.embeddings.camera_token, x.device, x.dtype)
            ref_token = ct[:, :1].expand(B, -1, -1)
            src_token = ct[:, 1:].expand(B, max(S - 1, 0), -1)
            inj = torch.cat([ref_token, src_token], dim=1)
        x = x.clone()
        x[:, :, 0] = inj
        return x

    def get_intermediate_layers(self, pixel_values, out_layers, cam_token=None,
                                ref_view_strategy="saddle_balanced",
                                export_feat_layers=None):
        """Multi-layer DINOv2 feature extraction used by Depth Anything 3.

        Args:
            pixel_values: ``(B, S, 3, H, W)`` views or ``(B, 3, H, W)``.
            out_layers: indices into ``self.encoder.layer``.
            cam_token: optional ``(B, S, dim)`` camera token to inject at
                ``alt_start``. If ``None`` and the model has its own
                ``camera_token`` parameter, that is used.
            ref_view_strategy: when ``S >= 3`` and ``cam_token is None``,
                pick a reference view via this strategy and move it to
                position 0 right before the first alt-attention block.
                The original view order is restored on the way out.
            export_feat_layers: optional iterable of layer indices whose
                local attention outputs to also return as auxiliary
                features (``(B, S, N_patch, C)`` after final norm). Used
                by the multi-view path to expose intermediate features
                to the nested-architecture wrapper.

        Returns:
            ``(layer_outputs, aux_outputs)`` where ``layer_outputs`` is a
            list of ``(patch_tokens, cls_or_cam_token)`` tuples (one per
            ``out_layers`` entry) and ``aux_outputs`` is a list of
            ``(B, S, N_patch, C)`` features for ``export_feat_layers``
            (empty list when not requested).
        """
        if pixel_values.ndim == 4:
            pixel_values = pixel_values.unsqueeze(1)
        assert pixel_values.ndim == 5 and pixel_values.shape[2] == 3, \
            f"expected (B,3,H,W) or (B,S,3,H,W); got {tuple(pixel_values.shape)}"
        B, S, _, H, W = pixel_values.shape

        # Patch + cls + (interpolated) pos embed for each view.
        x = pixel_values.reshape(B * S, 3, H, W)
        x = self.embeddings(x)  # (B*S, 1+N, C)
        x = x.reshape(B, S, x.shape[-2], x.shape[-1])  # (B, S, 1+N, C)

        pos_local, pos_global = self._prepare_rope_positions(B, S, H, W, x.device)
        # ``optimized_attention`` is only used by blocks without QK-norm/RoPE
        # (vanilla DINOv2 path); blocks with the DA3 extensions fall through
        # to SDPA instead.
        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)

        out_set = set(out_layers)
        export_set = set(export_feat_layers) if export_feat_layers else set()
        outputs: list[torch.Tensor] = []
        aux_outputs: list[torch.Tensor] = []
        local_x = x
        b_idx = None


        for i, blk in enumerate(self.encoder.layer):
            apply_rope = self.rope is not None and i >= self.rope_start
            block_rope = self.rope if apply_rope else None
            l_pos = pos_local if apply_rope else None
            g_pos = pos_global if apply_rope else None

            # Reference-view selection threshold: matches the upstream constant
            # ``THRESH_FOR_REF_SELECTION = 3``. Skipped when a user-supplied
            # cam_token is provided (camera info already pins the geometry).
            if (self.alt_start != -1 and i == self.alt_start - 1
                    and S >= THRESH_FOR_REF_SELECTION and cam_token is None):
                b_idx = select_reference_view(x, strategy=ref_view_strategy)
                x = reorder_by_reference(x, b_idx)
                local_x = reorder_by_reference(local_x, b_idx)

            if self.alt_start != -1 and i == self.alt_start:
                x = self._inject_camera_token(x, B, S, cam_token)

            if self.alt_start != -1 and i >= self.alt_start and (i % 2 == 1):
                # Global attention across views: flatten S into the seq dim.
                t = x.reshape(B, S * x.shape[-2], x.shape[-1])
                p = g_pos.reshape(B, S * g_pos.shape[-2], g_pos.shape[-1]) if g_pos is not None else None
                t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
                x = t.reshape(B, S, x.shape[-2], x.shape[-1])
            else:
                # Per-view local attention.
                t = x.reshape(B * S, x.shape[-2], x.shape[-1])
                p = l_pos.reshape(B * S, l_pos.shape[-2], l_pos.shape[-1]) if l_pos is not None else None
                t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
                x = t.reshape(B, S, x.shape[-2], x.shape[-1])
                local_x = x

            if i in out_set:
                if self.cat_token:
                    out_x = torch.cat([local_x, x], dim=-1)
                else:
                    out_x = x
                # Restore original view order on the way out so heads see views
                # in the user's expected order.
                if b_idx is not None and self.alt_start != -1:
                    out_x = restore_original_order(out_x, b_idx)
                outputs.append(out_x)

            if i in export_set:
                aux = x
                if b_idx is not None and self.alt_start != -1:
                    aux = restore_original_order(aux, b_idx)
                aux_outputs.append(aux)

        # Apply final norm. When ``cat_token`` is set, only the right half
        # ("global" features) is normalised; the left half is left as-is to
        # match the upstream DA3 head signature.
        normed: list[torch.Tensor] = []
        cls_tokens: list[torch.Tensor] = []
        for out_x in outputs:
            cls_tokens.append(out_x[:, :, 0])
            if out_x.shape[-1] == self.embed_dim:
                normed.append(self.layernorm(out_x))
            elif out_x.shape[-1] == self.embed_dim * 2:
                left = out_x[..., :self.embed_dim]
                right = self.layernorm(out_x[..., self.embed_dim:])
                normed.append(torch.cat([left, right], dim=-1))
            else:
                raise ValueError(f"Unexpected token width: {out_x.shape[-1]}")

        # Drop cls/cam token from the patch sequence.
        normed = [o[..., 1 + self.num_register_tokens:, :] for o in normed]

        # Final layernorm + drop cls token from auxiliary features too.
        aux_normed = [self.layernorm(o)[..., 1 + self.num_register_tokens:, :]
                      for o in aux_outputs]
        return list(zip(normed, cls_tokens)), aux_normed

@ -242,7 +242,6 @@ def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -374,7 +373,6 @@ def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -688,7 +686,6 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
    lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
@ -750,7 +747,6 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -836,7 +832,6 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    old_denoised = None
    h, h_last = None, None
@ -894,7 +889,6 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    denoised_1, denoised_2 = None, None
    h, h_1, h_2 = None, None, None
@ -1012,39 +1006,23 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
    return generic_step_sampler(model, x, sigmas, extra_args, callback, disable, noise_sampler, DDPMSampler_step)

@torch.no_grad()
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, s_noise=1.0, s_noise_end=None, noise_clip_std=0.0):

    # s_noise / s_noise_end: per-step noise multiplier, linearly interpolated across steps
    # noise_clip_std: clamp injected noise to +/- N stddevs (0 disables).

def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    n_steps = max(1, len(sigmas) - 1)
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')

    s_start = float(s_noise)
    s_end = s_start if s_noise_end is None else float(s_noise_end)
    for i in trange(n_steps, disable=disable):
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})

        x = denoised
        if sigmas[i + 1] > 0:
            noise = noise_sampler(sigmas[i], sigmas[i + 1])
            if noise_clip_std > 0:
                clip_val = noise_clip_std * noise.std()
                noise = noise.clamp(min=-clip_val, max=clip_val)
            t = (i / (n_steps - 1)) if n_steps > 1 else 0.0
            s_noise_i = s_start + (s_end - s_start) * t
            if s_noise_i != 1.0:
                noise = noise * s_noise_i
            x = model_sampling.noise_scaling(sigmas[i + 1], noise, x)
            x = model.inner_model.inner_model.model_sampling.noise_scaling(sigmas[i + 1], noise_sampler(sigmas[i], sigmas[i + 1]), x)
    return x

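
# The s_noise / s_noise_end pair above defines a plain linear ramp over the
# noised steps; in isolation the schedule is just (illustrative helper, not
# part of the file):
def _lcm_noise_scale(i, n_steps, s_noise=1.0, s_noise_end=None):
    s_start = float(s_noise)
    s_end = s_start if s_noise_end is None else float(s_noise_end)
    t = (i / (n_steps - 1)) if n_steps > 1 else 0.0
    return s_start + (s_end - s_start) * t
# e.g. s_noise=1.0, s_noise_end=0.5 over 4 steps -> 1.0, 0.833..., 0.667..., 0.5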
@torch.no_grad()
def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    # From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
@ -1271,7 +1249,6 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No

    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    uncond_denoised = None

@ -1319,7 +1296,6 @@ def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)

    temp = [0]
    def post_cfg_function(args):
@ -1395,7 +1371,6 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@ -1529,7 +1504,6 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])

    def default_er_sde_noise_scaler(x):
@ -1600,10 +1574,9 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    inject_noise = eta > 0 and s_noise > 0

    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
    inject_noise = eta > 0 and s_noise > 0
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@ -1672,10 +1645,9 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    inject_noise = eta > 0 and s_noise > 0

    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
    inject_noise = eta > 0 and s_noise > 0
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@ -1741,7 +1713,6 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
    s_in = x.new_ones([x.shape[0]])

    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)

@ -1839,119 +1810,3 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False):
    """Stochastic Adams Solver with PECE (Predict–Evaluate–Correct–Evaluate) mode (NeurIPS 2023)."""
    return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2)


@torch.no_grad()
def sample_ar_video(model, x, sigmas, extra_args=None, callback=None, disable=None,
                    num_frame_per_block=1):
    """
    Autoregressive video sampler: block-by-block denoising with KV cache
    and flow-match re-noising for Causal Forcing / Self-Forcing models.

    Requires a Causal-WAN compatible model (diffusion_model must expose
    init_kv_caches / init_crossattn_caches) and 5-D latents [B,C,T,H,W].

    All AR-loop parameters are passed via the SamplerARVideo node, not read
    from the checkpoint or transformer_options.
    """
    extra_args = {} if extra_args is None else extra_args
    model_options = extra_args.get("model_options", {})
    transformer_options = model_options.get("transformer_options", {})

    if x.ndim != 5:
        raise ValueError(
            f"ar_video sampler requires 5-D video latents [B,C,T,H,W], got {x.ndim}-D tensor with shape {x.shape}. "
            "This sampler is only compatible with autoregressive video models (e.g. Causal-WAN)."
        )

    inner_model = model.inner_model.inner_model
    causal_model = inner_model.diffusion_model

    if not (hasattr(causal_model, "init_kv_caches") and hasattr(causal_model, "init_crossattn_caches")):
        raise TypeError(
            "ar_video sampler requires a Causal-WAN compatible model whose diffusion_model "
            "exposes init_kv_caches() and init_crossattn_caches(). The loaded checkpoint "
            "does not support this interface — choose a different sampler."
        )

    seed = extra_args.get("seed", 0)

    bs, c, lat_t, lat_h, lat_w = x.shape
    frame_seq_len = -(-lat_h // 2) * -(-lat_w // 2)  # ceiling division
    num_blocks = -(-lat_t // num_frame_per_block)  # ceiling division
    device = x.device
    model_dtype = inner_model.get_dtype()

    kv_caches = causal_model.init_kv_caches(bs, lat_t * frame_seq_len, device, model_dtype)
    crossattn_caches = causal_model.init_crossattn_caches(bs, device, model_dtype)

    output = torch.zeros_like(x)
    s_in = x.new_ones([x.shape[0]])
    current_start_frame = 0

    # I2V: seed KV cache with the initial image latent before the denoising loop
    initial_latent = transformer_options.get("ar_config", {}).get("initial_latent", None)
    if initial_latent is not None:
        initial_latent = inner_model.process_latent_in(initial_latent).to(device=device, dtype=model_dtype)
        n_init = initial_latent.shape[2]
        output[:, :, :n_init] = initial_latent

        ar_state = {"start_frame": 0, "kv_caches": kv_caches, "crossattn_caches": crossattn_caches}
        transformer_options["ar_state"] = ar_state
        zero_sigma = sigmas.new_zeros([1])
        _ = model(initial_latent, zero_sigma * s_in, **extra_args)

        current_start_frame = n_init
        remaining = lat_t - n_init
        num_blocks = -(-remaining // num_frame_per_block)

    num_sigma_steps = len(sigmas) - 1
    total_real_steps = num_blocks * num_sigma_steps
    step_count = 0

    try:
        for block_idx in trange(num_blocks, disable=disable):
            bf = min(num_frame_per_block, lat_t - current_start_frame)
            fs, fe = current_start_frame, current_start_frame + bf
            noisy_input = x[:, :, fs:fe]

            ar_state = {
                "start_frame": current_start_frame,
                "kv_caches": kv_caches,
                "crossattn_caches": crossattn_caches,
            }
            transformer_options["ar_state"] = ar_state

            for i in range(num_sigma_steps):
                denoised = model(noisy_input, sigmas[i] * s_in, **extra_args)

                if callback is not None:
                    scaled_i = step_count * num_sigma_steps // total_real_steps
                    callback({"x": noisy_input, "i": scaled_i, "sigma": sigmas[i],
                              "sigma_hat": sigmas[i], "denoised": denoised})

                if sigmas[i + 1] == 0:
                    noisy_input = denoised
                else:
                    sigma_next = sigmas[i + 1]
                    torch.manual_seed(seed + block_idx * 1000 + i)
                    fresh_noise = torch.randn_like(denoised)
                    noisy_input = (1.0 - sigma_next) * denoised + sigma_next * fresh_noise

                for cache in kv_caches:
                    cache["end"] -= bf * frame_seq_len

                step_count += 1

            output[:, :, fs:fe] = noisy_input

            for cache in kv_caches:
                cache["end"] -= bf * frame_seq_len
            zero_sigma = sigmas.new_zeros([1])
            _ = model(noisy_input, zero_sigma * s_in, **extra_args)

            current_start_frame += bf
    finally:
        transformer_options.pop("ar_state", None)

    return output
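
# The re-noising line in the block loop above is the flow-matching forward
# process: instead of an ODE/SDE transition, the block is pushed back to the
# next noise level by linear interpolation with fresh noise. As a standalone
# step (illustrative helper mirroring the math used by sample_ar_video):
def _flowmatch_renoise(denoised, sigma_next):
    # x_t = (1 - sigma) * x0 + sigma * eps
    return (1.0 - sigma_next) * denoised + sigma_next * torch.randn_like(denoised)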
@ -9,7 +9,6 @@ class LatentFormat:
    latent_rgb_factors_reshape = None
    taesd_decoder_name = None
    spacial_downscale_ratio = 8
    temporal_downscale_ratio = 1

    def process_in(self, latent):
        return latent * self.scale_factor
@ -225,7 +224,6 @@ class Flux2(LatentFormat):

        self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
        self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
        self.taesd_decoder_name = "taef2_decoder"

    def process_in(self, latent):
        return latent
@ -236,7 +234,6 @@ class Flux2(LatentFormat):
class Mochi(LatentFormat):
    latent_channels = 12
    latent_dimensions = 3
    temporal_downscale_ratio = 6

    def __init__(self):
        self.scale_factor = 1.0
@ -280,7 +277,6 @@ class LTXV(LatentFormat):
    latent_channels = 128
    latent_dimensions = 3
    spacial_downscale_ratio = 32
    temporal_downscale_ratio = 8

    def __init__(self):
        self.latent_rgb_factors = [
@ -424,7 +420,6 @@ class LTXAV(LTXV):
class HunyuanVideo(LatentFormat):
    latent_channels = 16
    latent_dimensions = 3
    temporal_downscale_ratio = 4
    scale_factor = 0.476986
    latent_rgb_factors = [
        [-0.0395, -0.0331, 0.0445],
@ -451,7 +446,6 @@ class HunyuanVideo(LatentFormat):
class Cosmos1CV8x8x8(LatentFormat):
    latent_channels = 16
    latent_dimensions = 3
    temporal_downscale_ratio = 8

    latent_rgb_factors = [
        [ 0.1817, 0.2284, 0.2423],
@ -477,7 +471,6 @@ class Cosmos1CV8x8x8(LatentFormat):
class Wan21(LatentFormat):
    latent_channels = 16
    latent_dimensions = 3
    temporal_downscale_ratio = 4

    latent_rgb_factors = [
        [-0.1299, -0.1692, 0.2932],
@ -740,7 +733,6 @@ class HunyuanVideo15(LatentFormat):
    latent_channels = 32
    latent_dimensions = 3
    spacial_downscale_ratio = 16
    temporal_downscale_ratio = 4
    scale_factor = 1.03682
    taesd_decoder_name = "lighttaehy1_5"

@ -791,36 +783,3 @@ class ZImagePixelSpace(ChromaRadiance):
    No VAE encoding/decoding — the model operates directly on RGB pixels.
    """
    pass


class HiDreamO1Pixel(ChromaRadiance):
    """Pixel-space latent format for HiDream-O1.
    No VAE — model patches/unpatches raw RGB internally with patch_size=32.
    """
    pass


class CogVideoX(LatentFormat):
    """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).

    scale_factor matches the vae/config.json scaling_factor for the 2b variant.
    The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*)
    use a different value; see CogVideoX1_5 below.
    """
    latent_channels = 16
    latent_dimensions = 3
    temporal_downscale_ratio = 4

    def __init__(self):
        self.scale_factor = 1.15258426


class CogVideoX1_5(CogVideoX):
    """Latent format for 5b-class CogVideoX checkpoints.

    Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun
    V1.5-5b family (including VOID inpainting). All of these have
    scaling_factor=0.7 in their vae/config.json. Auto-selected in
    supported_models.CogVideoX_T2V based on transformer hidden dim.
    """
    def __init__(self):
        self.scale_factor = 0.7

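# --- Added illustration (not part of the diff): how these latent formats are
# applied. process_in multiplies by scale_factor before sampling, and the
# inverse is assumed on the way out, so with the CogVideoX1_5 value above:
#
#   fmt = CogVideoX1_5()                          # scale_factor = 0.7
#   z = fmt.process_in(raw_latent)                # raw_latent * 0.7
#   assert torch.allclose(z / 0.7, raw_latent)    # inverse recovers the input
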
@ -1,573 +0,0 @@
# CogVideoX 3D Transformer - ported to ComfyUI native ops
# Architecture reference: diffusers CogVideoXTransformer3DModel
# Style reference: comfy/ldm/wan/model.py

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from comfy.ldm.modules.attention import optimized_attention
import comfy.patcher_extension
import comfy.ldm.common_dit


def _get_1d_rotary_pos_embed(dim, pos, theta=10000.0):
    """Returns (cos, sin) each with shape [seq_len, dim].

    Frequencies are computed at dim//2 resolution then repeat_interleaved
    to full dim, matching CogVideoX's interleaved (real, imag) pair format.
    """
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim))
    angles = torch.outer(pos.float(), freqs.float())
    cos = angles.cos().repeat_interleave(2, dim=-1).float()
    sin = angles.sin().repeat_interleave(2, dim=-1).float()
    return (cos, sin)

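# --- Added sanity check (illustrative, not in the original file): with dim=8
# and 5 positions, cos/sin come back as [5, 8] — frequencies are built at
# dim//2 = 4 and repeat_interleaved up to the full dim:
#
#   cos, sin = _get_1d_rotary_pos_embed(8, torch.arange(5))
#   assert cos.shape == (5, 8) and sin.shape == (5, 8)
#   assert torch.equal(cos[:, 0], cos[:, 1])  # interleaved (real, imag) pairs
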
def apply_rotary_emb(x, freqs_cos_sin):
    """Apply CogVideoX rotary embedding to query or key tensor.

    x: [B, heads, seq_len, head_dim]
    freqs_cos_sin: (cos, sin) each [seq_len, head_dim]

    Uses interleaved pair rotation (same as diffusers CogVideoX/Flux).
    head_dim is reshaped to (-1, 2) pairs, rotated, then flattened back.
    """
    cos, sin = freqs_cos_sin
    cos = cos[None, None, :, :].to(x.device)
    sin = sin[None, None, :, :].to(x.device)

    # Interleaved pairs: [B, H, S, D] -> [B, H, S, D//2, 2] -> (real, imag)
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)

    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

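# --- Added sanity check (illustrative): the interleaved pair rotation is a
# plane rotation per (real, imag) pair, so it preserves the norm of q/k:
#
#   x = torch.randn(1, 2, 5, 8)                    # [B, heads, seq, head_dim]
#   y = apply_rotary_emb(x, _get_1d_rotary_pos_embed(8, torch.arange(5)))
#   assert torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-5)
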
def get_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1, max_period=10000):
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half)
    args = timesteps[:, None].float() * freqs[None] * scale
    embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
    if flip_sin_to_cos:
        embedding = torch.cat([embedding[:, half:], embedding[:, :half]], dim=-1)
    if dim % 2:
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    return embedding

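# --- Added sanity check (illustrative): with flip_sin_to_cos=True the cos
# half is moved in front of the sin half, matching the diffusers layout:
#
#   emb = get_timestep_embedding(torch.tensor([0.0]), 8)
#   assert emb.shape == (1, 8)
#   assert torch.equal(emb[0, :4], torch.ones(4))  # cos(0) = 1 half first
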
def get_3d_sincos_pos_embed(embed_dim, spatial_size, temporal_size, spatial_interpolation_scale=1.0, temporal_interpolation_scale=1.0, device=None):
    if isinstance(spatial_size, int):
        spatial_size = (spatial_size, spatial_size)

    grid_w = torch.arange(spatial_size[0], dtype=torch.float32, device=device) / spatial_interpolation_scale
    grid_h = torch.arange(spatial_size[1], dtype=torch.float32, device=device) / spatial_interpolation_scale
    grid_t = torch.arange(temporal_size, dtype=torch.float32, device=device) / temporal_interpolation_scale

    grid_t, grid_h, grid_w = torch.meshgrid(grid_t, grid_h, grid_w, indexing="ij")

    embed_dim_spatial = 2 * (embed_dim // 3)
    embed_dim_temporal = embed_dim // 3

    pos_embed_spatial = _get_2d_sincos_pos_embed(embed_dim_spatial, grid_h, grid_w, device=device)
    pos_embed_temporal = _get_1d_sincos_pos_embed(embed_dim_temporal, grid_t[:, 0, 0], device=device)

    T, H, W = grid_t.shape
    pos_embed_temporal = pos_embed_temporal.unsqueeze(1).unsqueeze(1).expand(-1, H, W, -1)
    pos_embed = torch.cat([pos_embed_temporal, pos_embed_spatial], dim=-1)

    return pos_embed


def _get_2d_sincos_pos_embed(embed_dim, grid_h, grid_w, device=None):
    T, H, W = grid_h.shape
    half_dim = embed_dim // 2
    pos_h = _get_1d_sincos_pos_embed(half_dim, grid_h.reshape(-1), device=device).reshape(T, H, W, half_dim)
    pos_w = _get_1d_sincos_pos_embed(half_dim, grid_w.reshape(-1), device=device).reshape(T, H, W, half_dim)
    return torch.cat([pos_h, pos_w], dim=-1)


def _get_1d_sincos_pos_embed(embed_dim, pos, device=None):
    half = embed_dim // 2
    freqs = torch.exp(-math.log(10000.0) * torch.arange(start=0, end=half, dtype=torch.float32, device=device) / half)
    args = pos.float().reshape(-1)[:, None] * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if embed_dim % 2:
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    return embedding


class CogVideoXPatchEmbed(nn.Module):
    def __init__(self, patch_size=2, patch_size_t=None, in_channels=16, dim=1920,
                 text_dim=4096, bias=True, sample_width=90, sample_height=60,
                 sample_frames=49, temporal_compression_ratio=4,
                 max_text_seq_length=226, spatial_interpolation_scale=1.875,
                 temporal_interpolation_scale=1.0, use_positional_embeddings=True,
                 use_learned_positional_embeddings=True,
                 device=None, dtype=None, operations=None):
        super().__init__()
        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.dim = dim
        self.sample_height = sample_height
        self.sample_width = sample_width
        self.sample_frames = sample_frames
        self.temporal_compression_ratio = temporal_compression_ratio
        self.max_text_seq_length = max_text_seq_length
        self.spatial_interpolation_scale = spatial_interpolation_scale
        self.temporal_interpolation_scale = temporal_interpolation_scale
        self.use_positional_embeddings = use_positional_embeddings
        self.use_learned_positional_embeddings = use_learned_positional_embeddings

        if patch_size_t is None:
            self.proj = operations.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size, bias=bias, device=device, dtype=dtype)
        else:
            self.proj = operations.Linear(in_channels * patch_size * patch_size * patch_size_t, dim, device=device, dtype=dtype)

        self.text_proj = operations.Linear(text_dim, dim, device=device, dtype=dtype)

        if use_positional_embeddings or use_learned_positional_embeddings:
            persistent = use_learned_positional_embeddings
            pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
            self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)

    def _get_positional_embeddings(self, sample_height, sample_width, sample_frames, device=None):
        post_patch_height = sample_height // self.patch_size
        post_patch_width = sample_width // self.patch_size
        post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
        if self.patch_size_t is not None:
            post_time_compression_frames = post_time_compression_frames // self.patch_size_t
        num_patches = post_patch_height * post_patch_width * post_time_compression_frames

        pos_embedding = get_3d_sincos_pos_embed(
            self.dim,
            (post_patch_width, post_patch_height),
            post_time_compression_frames,
            self.spatial_interpolation_scale,
            self.temporal_interpolation_scale,
            device=device,
        )
        pos_embedding = pos_embedding.reshape(-1, self.dim)
        joint_pos_embedding = pos_embedding.new_zeros(
            1, self.max_text_seq_length + num_patches, self.dim, requires_grad=False
        )
        joint_pos_embedding.data[:, self.max_text_seq_length:].copy_(pos_embedding)
        return joint_pos_embedding

    def forward(self, text_embeds, image_embeds):
        input_dtype = text_embeds.dtype
        text_embeds = self.text_proj(text_embeds.to(self.text_proj.weight.dtype)).to(input_dtype)
        batch_size, num_frames, channels, height, width = image_embeds.shape

        proj_dtype = self.proj.weight.dtype
        if self.patch_size_t is None:
            image_embeds = image_embeds.reshape(-1, channels, height, width)
            image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
            image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
            image_embeds = image_embeds.flatten(3).transpose(2, 3)
            image_embeds = image_embeds.flatten(1, 2)
        else:
            p = self.patch_size
            p_t = self.patch_size_t
            image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
            image_embeds = image_embeds.reshape(
                batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
            )
            image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
            image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)

        embeds = torch.cat([text_embeds, image_embeds], dim=1).contiguous()

        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
            text_seq_length = text_embeds.shape[1]
            num_image_patches = image_embeds.shape[1]

            if self.use_learned_positional_embeddings:
                image_pos = self.pos_embedding[
                    :, self.max_text_seq_length:self.max_text_seq_length + num_image_patches
                ].to(device=embeds.device, dtype=embeds.dtype)
            else:
                image_pos = get_3d_sincos_pos_embed(
                    self.dim,
                    (width // self.patch_size, height // self.patch_size),
                    num_image_patches // ((height // self.patch_size) * (width // self.patch_size)),
                    self.spatial_interpolation_scale,
                    self.temporal_interpolation_scale,
                    device=embeds.device,
                ).reshape(1, num_image_patches, self.dim).to(dtype=embeds.dtype)

            # Build joint: zeros for text + sincos for image
            joint_pos = torch.zeros(1, text_seq_length + num_image_patches, self.dim, device=embeds.device, dtype=embeds.dtype)
            joint_pos[:, text_seq_length:] = image_pos
            embeds = embeds + joint_pos

        return embeds


class CogVideoXLayerNormZero(nn.Module):
    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5, bias=True,
                 device=None, dtype=None, operations=None):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = operations.Linear(time_dim, 6 * dim, bias=bias, device=device, dtype=dtype)
        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)

    def forward(self, hidden_states, encoder_hidden_states, temb):
        shift, scale, gate, enc_shift, enc_scale, enc_gate = self.linear(self.silu(temb)).chunk(6, dim=1)
        hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale)[:, None, :] + enc_shift[:, None, :]
        return hidden_states, encoder_hidden_states, gate[:, None, :], enc_gate[:, None, :]


class CogVideoXAdaLayerNorm(nn.Module):
    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5,
                 device=None, dtype=None, operations=None):
        super().__init__()
        self.silu = nn.SiLU()
        self.linear = operations.Linear(time_dim, 2 * dim, device=device, dtype=dtype)
        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)

    def forward(self, x, temb):
        temb = self.linear(self.silu(temb))
        shift, scale = temb.chunk(2, dim=1)
        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
        return x


class CogVideoXBlock(nn.Module):
    def __init__(self, dim, num_heads, head_dim, time_dim,
                 eps=1e-5, ff_inner_dim=None, ff_bias=True,
                 device=None, dtype=None, operations=None):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = head_dim

        self.norm1 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)

        # Self-attention (joint text + latent)
        self.q = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
        self.k = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
        self.v = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
        self.norm_q = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
        self.norm_k = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
        self.attn_out = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)

        self.norm2 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)

        # Feed-forward (GELU approximate)
        inner_dim = ff_inner_dim or dim * 4
        self.ff_proj = operations.Linear(dim, inner_dim, bias=ff_bias, device=device, dtype=dtype)
        self.ff_out = operations.Linear(inner_dim, dim, bias=ff_bias, device=device, dtype=dtype)

    def forward(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=None, transformer_options=None):
        if transformer_options is None:
            transformer_options = {}
        text_seq_length = encoder_hidden_states.size(1)

        # Norm & modulate
        norm_hidden, norm_encoder, gate_msa, enc_gate_msa = self.norm1(hidden_states, encoder_hidden_states, temb)

        # Joint self-attention
        qkv_input = torch.cat([norm_encoder, norm_hidden], dim=1)
        b, s, _ = qkv_input.shape
        n, d = self.num_heads, self.head_dim

        q = self.q(qkv_input).view(b, s, n, d)
        k = self.k(qkv_input).view(b, s, n, d)
        v = self.v(qkv_input)

        q = self.norm_q(q).view(b, s, n, d)
        k = self.norm_k(k).view(b, s, n, d)

        # Apply rotary embeddings to image tokens only (diffusers format: [B, heads, seq, head_dim])
        if image_rotary_emb is not None:
            q_img = q[:, text_seq_length:].transpose(1, 2)  # [B, heads, img_seq, head_dim]
            k_img = k[:, text_seq_length:].transpose(1, 2)
            q_img = apply_rotary_emb(q_img, image_rotary_emb)
            k_img = apply_rotary_emb(k_img, image_rotary_emb)
            q = torch.cat([q[:, :text_seq_length], q_img.transpose(1, 2)], dim=1)
            k = torch.cat([k[:, :text_seq_length], k_img.transpose(1, 2)], dim=1)

        attn_out = optimized_attention(
            q.reshape(b, s, n * d),
            k.reshape(b, s, n * d),
            v,
            heads=self.num_heads,
            transformer_options=transformer_options,
        )

        attn_out = self.attn_out(attn_out)

        attn_encoder, attn_hidden = attn_out.split([text_seq_length, s - text_seq_length], dim=1)

        hidden_states = hidden_states + gate_msa * attn_hidden
        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder

        # Norm & modulate for FF
        norm_hidden, norm_encoder, gate_ff, enc_gate_ff = self.norm2(hidden_states, encoder_hidden_states, temb)

        # Feed-forward (GELU on concatenated text + latent)
        ff_input = torch.cat([norm_encoder, norm_hidden], dim=1)
        ff_output = self.ff_out(F.gelu(self.ff_proj(ff_input), approximate="tanh"))

        hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]

        return hidden_states, encoder_hidden_states


class CogVideoXTransformer3DModel(nn.Module):
    def __init__(self,
                 num_attention_heads=30,
                 attention_head_dim=64,
                 in_channels=16,
                 out_channels=16,
                 flip_sin_to_cos=True,
                 freq_shift=0,
                 time_embed_dim=512,
                 ofs_embed_dim=None,
                 text_embed_dim=4096,
                 num_layers=30,
                 dropout=0.0,
                 attention_bias=True,
                 sample_width=90,
                 sample_height=60,
                 sample_frames=49,
                 patch_size=2,
                 patch_size_t=None,
                 temporal_compression_ratio=4,
                 max_text_seq_length=226,
                 spatial_interpolation_scale=1.875,
                 temporal_interpolation_scale=1.0,
                 use_rotary_positional_embeddings=False,
                 use_learned_positional_embeddings=False,
                 patch_bias=True,
                 image_model=None,
                 device=None,
                 dtype=None,
                 operations=None,
                 ):
        super().__init__()
        self.dtype = dtype
        dim = num_attention_heads * attention_head_dim
        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.max_text_seq_length = max_text_seq_length
        self.use_rotary_positional_embeddings = use_rotary_positional_embeddings

        # 1. Patch embedding
        self.patch_embed = CogVideoXPatchEmbed(
            patch_size=patch_size,
            patch_size_t=patch_size_t,
            in_channels=in_channels,
            dim=dim,
            text_dim=text_embed_dim,
            bias=patch_bias,
            sample_width=sample_width,
            sample_height=sample_height,
            sample_frames=sample_frames,
            temporal_compression_ratio=temporal_compression_ratio,
            max_text_seq_length=max_text_seq_length,
            spatial_interpolation_scale=spatial_interpolation_scale,
            temporal_interpolation_scale=temporal_interpolation_scale,
            use_positional_embeddings=not use_rotary_positional_embeddings,
            use_learned_positional_embeddings=use_learned_positional_embeddings,
            device=device, dtype=torch.float32, operations=operations,
        )

        # 2. Time embedding
        self.time_proj_dim = dim
        self.time_proj_flip = flip_sin_to_cos
        self.time_proj_shift = freq_shift
        self.time_embedding_linear_1 = operations.Linear(dim, time_embed_dim, device=device, dtype=dtype)
        self.time_embedding_act = nn.SiLU()
        self.time_embedding_linear_2 = operations.Linear(time_embed_dim, time_embed_dim, device=device, dtype=dtype)

        # Optional OFS embedding (CogVideoX 1.5 I2V)
        self.ofs_proj_dim = ofs_embed_dim
        if ofs_embed_dim:
            self.ofs_embedding_linear_1 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
            self.ofs_embedding_act = nn.SiLU()
            self.ofs_embedding_linear_2 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
        else:
            self.ofs_embedding_linear_1 = None

        # 3. Transformer blocks
        self.blocks = nn.ModuleList([
            CogVideoXBlock(
                dim=dim,
                num_heads=num_attention_heads,
                head_dim=attention_head_dim,
                time_dim=time_embed_dim,
                eps=1e-5,
                device=device, dtype=dtype, operations=operations,
            )
            for _ in range(num_layers)
        ])

        self.norm_final = operations.LayerNorm(dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype)

        # 4. Output
        self.norm_out = CogVideoXAdaLayerNorm(
            time_dim=time_embed_dim, dim=dim, eps=1e-5,
            device=device, dtype=dtype, operations=operations,
        )

        if patch_size_t is None:
            output_dim = patch_size * patch_size * out_channels
        else:
            output_dim = patch_size * patch_size * patch_size_t * out_channels

        self.proj_out = operations.Linear(dim, output_dim, device=device, dtype=dtype)

        self.spatial_interpolation_scale = spatial_interpolation_scale
        self.temporal_interpolation_scale = temporal_interpolation_scale
        self.temporal_compression_ratio = temporal_compression_ratio

    def forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
        if transformer_options is None:
            transformer_options = {}
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
            self._forward,
            self,
            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
        ).execute(x, timestep, context, ofs, transformer_options, **kwargs)

    def _forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
        if transformer_options is None:
            transformer_options = {}
        # ComfyUI passes [B, C, T, H, W]
        batch_size, channels, t, h, w = x.shape

        # Pad to patch size (temporal + spatial), same pattern as WAN
        p_t = self.patch_size_t if self.patch_size_t is not None else 1
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (p_t, self.patch_size, self.patch_size))

        # CogVideoX expects [B, T, C, H, W]
        x = x.permute(0, 2, 1, 3, 4)
        batch_size, num_frames, channels, height, width = x.shape

        # Time embedding
        t_emb = get_timestep_embedding(timestep, self.time_proj_dim, self.time_proj_flip, self.time_proj_shift)
        t_emb = t_emb.to(dtype=x.dtype)
        emb = self.time_embedding_linear_2(self.time_embedding_act(self.time_embedding_linear_1(t_emb)))

        if self.ofs_embedding_linear_1 is not None and ofs is not None:
            ofs_emb = get_timestep_embedding(ofs, self.ofs_proj_dim, self.time_proj_flip, self.time_proj_shift)
            ofs_emb = ofs_emb.to(dtype=x.dtype)
            ofs_emb = self.ofs_embedding_linear_2(self.ofs_embedding_act(self.ofs_embedding_linear_1(ofs_emb)))
            emb = emb + ofs_emb

        # Patch embedding
        hidden_states = self.patch_embed(context, x)

        text_seq_length = context.shape[1]
        encoder_hidden_states = hidden_states[:, :text_seq_length]
        hidden_states = hidden_states[:, text_seq_length:]

        # Rotary embeddings (if used)
        image_rotary_emb = None
        if self.use_rotary_positional_embeddings:
            post_patch_height = height // self.patch_size
            post_patch_width = width // self.patch_size
            if self.patch_size_t is None:
                post_time = num_frames
            else:
                post_time = num_frames // self.patch_size_t
            image_rotary_emb = self._get_rotary_emb(post_patch_height, post_patch_width, post_time, device=x.device)

        # Transformer blocks
        for i, block in enumerate(self.blocks):
            hidden_states, encoder_hidden_states = block(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                temb=emb,
                image_rotary_emb=image_rotary_emb,
                transformer_options=transformer_options,
            )

        hidden_states = self.norm_final(hidden_states)

        # Output projection
        hidden_states = self.norm_out(hidden_states, temb=emb)
        hidden_states = self.proj_out(hidden_states)

        # Unpatchify
        p = self.patch_size
        p_t = self.patch_size_t

        if p_t is None:
            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
        else:
            output = hidden_states.reshape(
                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
            )
            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)

        # Back to ComfyUI format [B, C, T, H, W] and crop padding
        output = output.permute(0, 2, 1, 3, 4)[:, :, :t, :h, :w]
        return output

    def _get_rotary_emb(self, h, w, t, device):
        """Compute CogVideoX 3D rotary positional embeddings.

        For CogVideoX 1.5 (patch_size_t != None): uses "slice" mode — grid positions
        are integer arange computed at max_size, then sliced to actual size.
        For CogVideoX 1.0 (patch_size_t == None): uses "linspace" mode with crop coords
        scaled by spatial_interpolation_scale.
        """
        d = self.attention_head_dim
        dim_t = d // 4
        dim_h = d // 8 * 3
        dim_w = d // 8 * 3

        if self.patch_size_t is not None:
            # CogVideoX 1.5: "slice" mode — positions are simple integer indices
            # Compute at max(sample_size, actual_size) then slice to actual
            base_h = self.patch_embed.sample_height // self.patch_size
            base_w = self.patch_embed.sample_width // self.patch_size
            max_h = max(base_h, h)
            max_w = max(base_w, w)

            grid_h = torch.arange(max_h, device=device, dtype=torch.float32)
            grid_w = torch.arange(max_w, device=device, dtype=torch.float32)
            grid_t = torch.arange(t, device=device, dtype=torch.float32)
        else:
            # CogVideoX 1.0: "linspace" mode with interpolation scale
            grid_h = torch.linspace(0, h - 1, h, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
            grid_w = torch.linspace(0, w - 1, w, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
            grid_t = torch.arange(t, device=device, dtype=torch.float32)

        freqs_t = _get_1d_rotary_pos_embed(dim_t, grid_t)
        freqs_h = _get_1d_rotary_pos_embed(dim_h, grid_h)
        freqs_w = _get_1d_rotary_pos_embed(dim_w, grid_w)

        t_cos, t_sin = freqs_t
        h_cos, h_sin = freqs_h
        w_cos, w_sin = freqs_w

        # Slice to actual size (for "slice" mode where grids may be larger)
        t_cos, t_sin = t_cos[:t], t_sin[:t]
        h_cos, h_sin = h_cos[:h], h_sin[:h]
        w_cos, w_sin = w_cos[:w], w_sin[:w]

        # Broadcast and concatenate into [T*H*W, head_dim]
        t_cos = t_cos[:, None, None, :].expand(-1, h, w, -1)
        t_sin = t_sin[:, None, None, :].expand(-1, h, w, -1)
        h_cos = h_cos[None, :, None, :].expand(t, -1, w, -1)
        h_sin = h_sin[None, :, None, :].expand(t, -1, w, -1)
        w_cos = w_cos[None, None, :, :].expand(t, h, -1, -1)
        w_sin = w_sin[None, None, :, :].expand(t, h, -1, -1)

        cos = torch.cat([t_cos, h_cos, w_cos], dim=-1).reshape(t * h * w, -1)
        sin = torch.cat([t_sin, h_sin, w_sin], dim=-1).reshape(t * h * w, -1)
        return (cos, sin)

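# --- Added arithmetic note (illustrative): the per-axis RoPE split in
# _get_rotary_emb always sums back to the full head dim. For the default
# attention_head_dim d = 64:
#
#   dim_t = 64 // 4     = 16
#   dim_h = 64 // 8 * 3 = 24
#   dim_w = 64 // 8 * 3 = 24
#   16 + 24 + 24 = 64  ->  cos/sin are [T*H*W, 64]
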
@ -1,566 +0,0 @@
# CogVideoX VAE - ported to ComfyUI native ops
# Architecture reference: diffusers AutoencoderKLCogVideoX
# Style reference: comfy/ldm/wan/vae.py

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import comfy.ops
ops = comfy.ops.disable_weight_init


class CausalConv3d(nn.Module):
    """Causal 3D convolution with temporal padding.

    Uses comfy.ops.Conv3d with autopad='causal_zero' fast path: when input has
    a single temporal frame and no cache, the 3D conv weight is sliced to act
    as a 2D conv, avoiding computation on zero-padded temporal dimensions.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size,) * 3

        time_kernel, height_kernel, width_kernel = kernel_size
        self.time_kernel_size = time_kernel
        self.pad_mode = pad_mode

        height_pad = (height_kernel - 1) // 2
        width_pad = (width_kernel - 1) // 2
        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_kernel - 1, 0)

        stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
        dilation = (dilation, 1, 1)
        self.conv = ops.Conv3d(
            in_channels, out_channels, kernel_size,
            stride=stride, dilation=dilation,
            padding=(0, height_pad, width_pad),
        )

    def forward(self, x, conv_cache=None):
        if self.pad_mode == "replicate":
            x = F.pad(x, self.time_causal_padding, mode="replicate")
            conv_cache = None
        else:
            kernel_t = self.time_kernel_size
            if kernel_t > 1:
                if conv_cache is None and x.shape[2] == 1:
                    # Fast path: single frame, no cache. All temporal padding
                    # frames are copies of the input (replicate-style), so the
                    # 3D conv reduces to a 2D conv with summed temporal kernel.
                    w = comfy.ops.cast_to_input(self.conv.weight, x)
                    b = comfy.ops.cast_to_input(self.conv.bias, x) if self.conv.bias is not None else None
                    w2d = w.sum(dim=2, keepdim=True)
                    out = F.conv3d(x, w2d, b,
                                   self.conv.stride, self.conv.padding,
                                   self.conv.dilation, self.conv.groups)
                    return out, None
                cached = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
                x = torch.cat(cached + [x], dim=2)
            conv_cache = x[:, :, -self.time_kernel_size + 1:].clone() if self.time_kernel_size > 1 else None

        out = self.conv(x)
        return out, conv_cache

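# --- Added sanity check (illustrative, not in the original file): for a
# single frame, causal replicate-style temporal padding makes every temporal
# tap see the same frame, so summing the kernel over time is exact:
#
#   x = torch.randn(1, 4, 1, 8, 8)                     # one temporal frame
#   w = torch.randn(6, 4, 3, 3, 3)                     # time kernel k_t = 3
#   x_rep = torch.cat([x[:, :, :1]] * 2 + [x], dim=2)  # k_t - 1 pad frames
#   full = F.conv3d(x_rep, w, padding=(0, 1, 1))
#   fast = F.conv3d(x, w.sum(dim=2, keepdim=True), padding=(0, 1, 1))
#   assert torch.allclose(full, fast, atol=1e-4)
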
def _interpolate_zq(zq, target_size):
    """Interpolate latent z to target (T, H, W), matching CogVideoX's first-frame-special handling."""
    t = target_size[0]
    if t > 1 and t % 2 == 1:
        z_first = F.interpolate(zq[:, :, :1], size=(1, target_size[1], target_size[2]))
        z_rest = F.interpolate(zq[:, :, 1:], size=(t - 1, target_size[1], target_size[2]))
        return torch.cat([z_first, z_rest], dim=2)
    return F.interpolate(zq, size=target_size)

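# --- Added worked example (illustrative): for an odd target t > 1, say t=5,
# frame 0 of zq is resized on its own and the remaining frames are resized to
# t - 1 = 4, then concatenated (1 + 4 = 5) — mirroring the causal VAE's
# special handling of the first frame.
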
class SpatialNorm3D(nn.Module):
    """Spatially conditioned normalization."""
    def __init__(self, f_channels, zq_channels, groups=32):
        super().__init__()
        self.norm_layer = ops.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
        self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
        self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)

    def forward(self, f, zq, conv_cache=None):
        new_cache = {}
        conv_cache = conv_cache or {}

        if zq.shape[-3:] != f.shape[-3:]:
            zq = _interpolate_zq(zq, f.shape[-3:])

        conv_y, new_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
        conv_b, new_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))

        return self.norm_layer(f) * conv_y + conv_b, new_cache


class ResnetBlock3D(nn.Module):
    """3D ResNet block with optional spatial norm."""
    def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32,
                 eps=1e-6, act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
        super().__init__()
        out_channels = out_channels or in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.spatial_norm_dim = spatial_norm_dim

        if act_fn == "silu":
            self.nonlinearity = nn.SiLU()
        elif act_fn == "swish":
            self.nonlinearity = nn.SiLU()
        else:
            self.nonlinearity = nn.SiLU()

        if spatial_norm_dim is None:
            self.norm1 = ops.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
            self.norm2 = ops.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
        else:
            self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
            self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)

        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)

        if temb_channels > 0:
            self.temb_proj = ops.Linear(temb_channels, out_channels)

        self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)

        if in_channels != out_channels:
            self.conv_shortcut = ops.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        else:
            self.conv_shortcut = None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        new_cache = {}
        conv_cache = conv_cache or {}
        residual = x

        if zq is not None:
            x, new_cache["norm1"] = self.norm1(x, zq, conv_cache=conv_cache.get("norm1"))
        else:
            x = self.norm1(x)

        x = self.nonlinearity(x)
        x, new_cache["conv1"] = self.conv1(x, conv_cache=conv_cache.get("conv1"))

        if temb is not None and hasattr(self, "temb_proj"):
            x = x + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]

        if zq is not None:
            x, new_cache["norm2"] = self.norm2(x, zq, conv_cache=conv_cache.get("norm2"))
        else:
            x = self.norm2(x)

        x = self.nonlinearity(x)
        x, new_cache["conv2"] = self.conv2(x, conv_cache=conv_cache.get("conv2"))

        if self.conv_shortcut is not None:
            residual = self.conv_shortcut(residual)

        return x + residual, new_cache


class Downsample3D(nn.Module):
    """3D downsampling with optional temporal compression."""
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
        super().__init__()
        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.compress_time = compress_time

    def forward(self, x):
        if self.compress_time:
            b, c, t, h, w = x.shape
            x = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, t)
            if t % 2 == 1:
                x_first, x_rest = x[..., 0], x[..., 1:]
                if x_rest.shape[-1] > 0:
                    x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
                x = torch.cat([x_first[..., None], x_rest], dim=-1)
                x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)
            else:
                x = F.avg_pool1d(x, kernel_size=2, stride=2)
                x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)

        pad = (0, 1, 0, 1)
        x = F.pad(x, pad, mode="constant", value=0)
        b, c, t, h, w = x.shape
        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
        x = self.conv(x)
        x = x.reshape(b, t, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
        return x


class Upsample3D(nn.Module):
    """3D upsampling with optional temporal decompression."""
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
        super().__init__()
        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.compress_time = compress_time

    def forward(self, x):
        if self.compress_time:
            if x.shape[2] > 1 and x.shape[2] % 2 == 1:
                x_first, x_rest = x[:, :, 0], x[:, :, 1:]
                x_first = F.interpolate(x_first, scale_factor=2.0)
                x_rest = F.interpolate(x_rest, scale_factor=2.0)
                x = torch.cat([x_first[:, :, None, :, :], x_rest], dim=2)
            elif x.shape[2] > 1:
                x = F.interpolate(x, scale_factor=2.0)
            else:
                x = x.squeeze(2)
                x = F.interpolate(x, scale_factor=2.0)
                x = x[:, :, None, :, :]
        else:
            b, c, t, h, w = x.shape
            x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
            x = F.interpolate(x, scale_factor=2.0)
            x = x.reshape(b, t, c, *x.shape[2:]).permute(0, 2, 1, 3, 4)

        b, c, t, h, w = x.shape
        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
        x = self.conv(x)
        x = x.reshape(b, t, *x.shape[1:]).permute(0, 2, 1, 3, 4)
        return x


class DownBlock3D(nn.Module):
    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
                 eps=1e-6, act_fn="silu", groups=32, add_downsample=True,
                 compress_time=False, pad_mode="first"):
        super().__init__()
        self.resnets = nn.ModuleList([
            ResnetBlock3D(
                in_channels=in_channels if i == 0 else out_channels,
                out_channels=out_channels,
                temb_channels=temb_channels,
                groups=groups, eps=eps, act_fn=act_fn, pad_mode=pad_mode,
            )
            for i in range(num_layers)
        ])
        self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_downsample else None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        new_cache = {}
        conv_cache = conv_cache or {}
        for i, resnet in enumerate(self.resnets):
            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
        if self.downsamplers is not None:
            for ds in self.downsamplers:
                x = ds(x)
        return x, new_cache


class MidBlock3D(nn.Module):
    def __init__(self, in_channels, temb_channels=0, num_layers=1,
                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
        super().__init__()
        self.resnets = nn.ModuleList([
            ResnetBlock3D(
                in_channels=in_channels, out_channels=in_channels,
                temb_channels=temb_channels, groups=groups, eps=eps,
                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
            )
            for _ in range(num_layers)
        ])

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        new_cache = {}
        conv_cache = conv_cache or {}
        for i, resnet in enumerate(self.resnets):
            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
        return x, new_cache


class UpBlock3D(nn.Module):
    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=16,
                 add_upsample=True, compress_time=False, pad_mode="first"):
        super().__init__()
        self.resnets = nn.ModuleList([
            ResnetBlock3D(
                in_channels=in_channels if i == 0 else out_channels,
                out_channels=out_channels,
                temb_channels=temb_channels, groups=groups, eps=eps,
                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
            )
            for i in range(num_layers)
        ])
        self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_upsample else None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        new_cache = {}
        conv_cache = conv_cache or {}
        for i, resnet in enumerate(self.resnets):
            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
        if self.upsamplers is not None:
            for us in self.upsamplers:
                x = us(x)
        return x, new_cache


class Encoder3D(nn.Module):
    def __init__(self, in_channels=3, out_channels=16,
                 block_out_channels=(128, 256, 256, 512),
                 layers_per_block=3, act_fn="silu",
                 eps=1e-6, groups=32, pad_mode="first",
                 temporal_compression_ratio=4):
        super().__init__()
        temporal_compress_level = int(np.log2(temporal_compression_ratio))

        self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)

        self.down_blocks = nn.ModuleList()
        output_channel = block_out_channels[0]
        for i in range(len(block_out_channels)):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final = i == len(block_out_channels) - 1
            compress_time = i < temporal_compress_level

            self.down_blocks.append(DownBlock3D(
                in_channels=input_channel, out_channels=output_channel,
                temb_channels=0, num_layers=layers_per_block,
                eps=eps, act_fn=act_fn, groups=groups,
                add_downsample=not is_final, compress_time=compress_time,
            ))

        self.mid_block = MidBlock3D(
            in_channels=block_out_channels[-1], temb_channels=0,
            num_layers=2, eps=eps, act_fn=act_fn, groups=groups, pad_mode=pad_mode,
        )

        self.norm_out = ops.GroupNorm(groups, block_out_channels[-1], eps=1e-6)
        self.conv_act = nn.SiLU()
        self.conv_out = CausalConv3d(block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode)

    def forward(self, x, conv_cache=None):
        new_cache = {}
        conv_cache = conv_cache or {}

        x, new_cache["conv_in"] = self.conv_in(x, conv_cache=conv_cache.get("conv_in"))

        for i, block in enumerate(self.down_blocks):
            key = f"down_block_{i}"
            x, new_cache[key] = block(x, None, None, conv_cache.get(key))

        x, new_cache["mid_block"] = self.mid_block(x, None, None, conv_cache=conv_cache.get("mid_block"))

        x = self.norm_out(x)
        x = self.conv_act(x)
        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))

        return x, new_cache


class Decoder3D(nn.Module):
    def __init__(self, in_channels=16, out_channels=3,
                 block_out_channels=(128, 256, 256, 512),
                 layers_per_block=3, act_fn="silu",
                 eps=1e-6, groups=32, pad_mode="first",
                 temporal_compression_ratio=4):
        super().__init__()
        reversed_channels = list(reversed(block_out_channels))
        temporal_compress_level = int(np.log2(temporal_compression_ratio))

        self.conv_in = CausalConv3d(in_channels, reversed_channels[0], kernel_size=3, pad_mode=pad_mode)

        self.mid_block = MidBlock3D(
            in_channels=reversed_channels[0], temb_channels=0,
            num_layers=2, eps=eps, act_fn=act_fn, groups=groups,
            spatial_norm_dim=in_channels, pad_mode=pad_mode,
        )

        self.up_blocks = nn.ModuleList()
        output_channel = reversed_channels[0]
        for i in range(len(block_out_channels)):
            prev_channel = output_channel
            output_channel = reversed_channels[i]
            is_final = i == len(block_out_channels) - 1
            compress_time = i < temporal_compress_level

            self.up_blocks.append(UpBlock3D(
                in_channels=prev_channel, out_channels=output_channel,
                temb_channels=0, num_layers=layers_per_block + 1,
                eps=eps, act_fn=act_fn, groups=groups,
                spatial_norm_dim=in_channels,
                add_upsample=not is_final, compress_time=compress_time,
            ))

        self.norm_out = SpatialNorm3D(reversed_channels[-1], in_channels, groups=groups)
        self.conv_act = nn.SiLU()
        self.conv_out = CausalConv3d(reversed_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode)

    def forward(self, sample, conv_cache=None):
        new_cache = {}
        conv_cache = conv_cache or {}

        x, new_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))

        x, new_cache["mid_block"] = self.mid_block(x, None, sample, conv_cache=conv_cache.get("mid_block"))

        for i, block in enumerate(self.up_blocks):
            key = f"up_block_{i}"
            x, new_cache[key] = block(x, None, sample, conv_cache=conv_cache.get(key))

        x, new_cache["norm_out"] = self.norm_out(x, sample, conv_cache=conv_cache.get("norm_out"))
        x = self.conv_act(x)
        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))

        return x, new_cache


class AutoencoderKLCogVideoX(nn.Module):
    """CogVideoX VAE. Spatial tiling/slicing handled by ComfyUI's VAE wrapper.

    Uses rolling temporal decode: conv_in + mid_block + temporal up_blocks run
    on the full (low-res) tensor, then the expensive spatial-only up_blocks +
    norm_out + conv_out are processed in small temporal chunks with conv_cache
    carrying causal state between chunks. This keeps peak VRAM proportional to
    chunk_size rather than total frame count.
    """

    def __init__(self,
                 in_channels=3, out_channels=3,
                 block_out_channels=(128, 256, 256, 512),
                 latent_channels=16, layers_per_block=3,
                 act_fn="silu", eps=1e-6, groups=32,
                 temporal_compression_ratio=4,
                 ):
        super().__init__()
        self.latent_channels = latent_channels
        self.temporal_compression_ratio = temporal_compression_ratio

        self.encoder = Encoder3D(
            in_channels=in_channels, out_channels=latent_channels,
            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
            act_fn=act_fn, eps=eps, groups=groups,
            temporal_compression_ratio=temporal_compression_ratio,
        )
        self.decoder = Decoder3D(
            in_channels=latent_channels, out_channels=out_channels,
            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
            act_fn=act_fn, eps=eps, groups=groups,
            temporal_compression_ratio=temporal_compression_ratio,
        )

        self.num_latent_frames_batch_size = 2
        self.num_sample_frames_batch_size = 8

    def encode(self, x):
        t = x.shape[2]
        frame_batch = self.num_sample_frames_batch_size
        remainder = t % frame_batch
        conv_cache = None
        enc = []

        # Process remainder frames first so only the first chunk can have an
        # odd temporal dimension — where Downsample3D's first-frame-special
        # handling in temporal compression is actually correct.
        if remainder > 0:
            chunk, conv_cache = self.encoder(x[:, :, :remainder], conv_cache=conv_cache)
            enc.append(chunk.to(x.device))

        for start in range(remainder, t, frame_batch):
            chunk, conv_cache = self.encoder(x[:, :, start:start + frame_batch], conv_cache=conv_cache)
            enc.append(chunk.to(x.device))

        enc = torch.cat(enc, dim=2)
        mean, _ = enc.chunk(2, dim=1)
        return mean

    def decode(self, z):
        return self._decode_rolling(z)

    def _decode_batched(self, z):
        """Original batched decode - processes 2 latent frames through full decoder."""
        t = z.shape[2]
        frame_batch = self.num_latent_frames_batch_size
        num_batches = max(t // frame_batch, 1)
        conv_cache = None
        dec = []
        for i in range(num_batches):
            remaining = t % frame_batch
            start = frame_batch * i + (0 if i == 0 else remaining)
            end = frame_batch * (i + 1) + remaining
            chunk, conv_cache = self.decoder(z[:, :, start:end], conv_cache=conv_cache)
            dec.append(chunk.cpu())
        return torch.cat(dec, dim=2).to(z.device)

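    # --- Added worked example (illustrative): the start/end arithmetic above
    # front-loads the remainder into the first batch. For t = 7, frame_batch = 2:
    #   num_batches = 3, remaining = 1
    #   i=0 -> z[:, :, 0:3], i=1 -> z[:, :, 3:5], i=2 -> z[:, :, 5:7]
    # so every latent frame is decoded exactly once.
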
    def _decode_rolling(self, z):
        """Rolling decode - processes low-res layers on full tensor, then rolls
        through expensive high-res layers in temporal chunks."""
        decoder = self.decoder
        device = z.device

        # Determine which up_blocks have temporal upsample vs spatial-only.
        # Temporal up_blocks are cheap (low res), spatial-only are expensive.
        temporal_compress_level = int(np.log2(self.temporal_compression_ratio))
        split_at = temporal_compress_level  # first N up_blocks do temporal upsample

        # Phase 1: conv_in + mid_block + temporal up_blocks on full tensor (low/medium res)
        x, _ = decoder.conv_in(z)
        x, _ = decoder.mid_block(x, None, z)

        for i in range(split_at):
            x, _ = decoder.up_blocks[i](x, None, z)

        # Phase 2: remaining spatial-only up_blocks + norm_out + conv_out in temporal chunks
        remaining_blocks = list(range(split_at, len(decoder.up_blocks)))
        chunk_size = 4  # pixel frames per chunk through high-res layers
        t_expanded = x.shape[2]

        if t_expanded <= chunk_size or len(remaining_blocks) == 0:
            # Small enough to process in one go
            for i in remaining_blocks:
                x, _ = decoder.up_blocks[i](x, None, z)
            x, _ = decoder.norm_out(x, z)
            x = decoder.conv_act(x)
            x, _ = decoder.conv_out(x)
            return x

        # Expand z temporally once to match Phase 2's time dimension.
        # z stays at latent spatial resolution so this is small (~16 MB vs ~1.3 GB
        # for the old approach of pre-interpolating to every pixel resolution).
        z_time_expanded = _interpolate_zq(z, (t_expanded, z.shape[3], z.shape[4]))

        # Process in temporal chunks, interpolating spatially per-chunk to avoid
        # allocating full [B, C, t_expanded, H, W] tensors at each resolution.
        dec_out = []
        conv_caches = {}

        for chunk_start in range(0, t_expanded, chunk_size):
            chunk_end = min(chunk_start + chunk_size, t_expanded)
            x_chunk = x[:, :, chunk_start:chunk_end]
            z_t_chunk = z_time_expanded[:, :, chunk_start:chunk_end]
            z_spatial_cache = {}

            for i in remaining_blocks:
                block = decoder.up_blocks[i]
                cache_key = f"up_block_{i}"
                hw_key = (x_chunk.shape[3], x_chunk.shape[4])
                if hw_key not in z_spatial_cache:
                    if z_t_chunk.shape[3] == hw_key[0] and z_t_chunk.shape[4] == hw_key[1]:
                        z_spatial_cache[hw_key] = z_t_chunk
                    else:
                        z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
                x_chunk, new_cache = block(x_chunk, None, z_spatial_cache[hw_key], conv_cache=conv_caches.get(cache_key))
                conv_caches[cache_key] = new_cache

            hw_key = (x_chunk.shape[3], x_chunk.shape[4])
            if hw_key not in z_spatial_cache:
                z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
            x_chunk, new_cache = decoder.norm_out(x_chunk, z_spatial_cache[hw_key], conv_cache=conv_caches.get("norm_out"))
            conv_caches["norm_out"] = new_cache
            x_chunk = decoder.conv_act(x_chunk)
            x_chunk, new_cache = decoder.conv_out(x_chunk, conv_cache=conv_caches.get("conv_out"))
            conv_caches["conv_out"] = new_cache

            dec_out.append(x_chunk.cpu())
            del z_spatial_cache

        del x, z_time_expanded
        return torch.cat(dec_out, dim=2).to(device)

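# --- Added arithmetic note (illustrative, assuming a 49-frame 720x480 clip):
# the latent is [1, 16, 13, 60, 90]; expanding it in time only (to ~49 frames)
# at latent spatial resolution costs roughly
#   16 * 49 * 60 * 90 * 4 bytes   ≈ 17 MB  (fp32)
# while pre-interpolating z to full pixel resolution would cost roughly
#   16 * 49 * 480 * 720 * 4 bytes ≈ 1.1 GB
# which is the "~16 MB vs ~1.3 GB" trade-off mentioned in _decode_rolling.
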
@ -1,7 +0,0 @@
# Depth Anything 3 - native ComfyUI port (Apache-2.0 monocular variants only).
#
# Supported variants:
#   DA3-Small, DA3-Base (vits/vitb backbone, DualDPT head)
#   DA3Mono-Large, DA3Metric-Large (vitl backbone, DPT head + sky mask)
#
# Original repo: https://github.com/ByteDance-Seed/Depth-Anything-3
@ -1,214 +0,0 @@
"""Camera-token encoder and decoder for Depth Anything 3.

* :class:`CameraEnc` takes per-view extrinsics + intrinsics and produces a
  per-view camera token that gets injected at the alt-attention boundary
  in the DINOv2 backbone (block ``alt_start``).
* :class:`CameraDec` takes the final-layer camera token output by the
  backbone and predicts a 9-D pose encoding (translation, quaternion,
  field-of-view).

The module/parameter names match the upstream ``cam_enc.py``/``cam_dec.py``
so HF safetensors load directly with no key remapping (the upstream uses
fused QKV linears, which we replicate here).
"""

from __future__ import annotations

import torch
import torch.nn as nn
import torch.nn.functional as F

from .transform import affine_inverse, extri_intri_to_pose_encoding


# -----------------------------------------------------------------------------
# Building blocks (mirror ``depth_anything_3.model.utils.{attention,block}``)
# -----------------------------------------------------------------------------


class _Mlp(nn.Module):
    """Standard 2-layer MLP with GELU. Matches upstream ``utils.attention.Mlp``."""

    def __init__(self, in_features, hidden_features=None, out_features=None,
                 *, device=None, dtype=None, operations=None):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = operations.Linear(in_features, hidden_features, bias=True,
                                     device=device, dtype=dtype)
        self.fc2 = operations.Linear(hidden_features, out_features, bias=True,
                                     device=device, dtype=dtype)

    def forward(self, x):
        return self.fc2(F.gelu(self.fc1(x)))


class _LayerScale(nn.Module):
    """Per-channel learnable scaling. Matches upstream ``LayerScale``."""

    def __init__(self, dim, *, device=None, dtype=None):
        super().__init__()
        self.gamma = nn.Parameter(torch.empty(dim, device=device, dtype=dtype))

    def forward(self, x):
        return x * self.gamma.to(dtype=x.dtype, device=x.device)


class _Attention(nn.Module):
    """Self-attention with fused QKV projection.

    Mirrors upstream ``utils.attention.Attention``; layout matches the
    HF safetensors (``attn.qkv.{weight,bias}`` and ``attn.proj.{weight,bias}``).
    """

    def __init__(self, dim, num_heads,
                 *, device=None, dtype=None, operations=None):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = operations.Linear(dim, dim * 3, bias=True,
                                     device=device, dtype=dtype)
        self.proj = operations.Linear(dim, dim, bias=True,
                                      device=device, dtype=dtype)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # 3, B, h, N, d
        q, k, v = qkv.unbind(0)
        out = F.scaled_dot_product_attention(q, k, v)
        out = out.transpose(1, 2).reshape(B, N, C)
        return self.proj(out)
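# --- Added shape walk (illustrative): for x of shape [B, N, C] with h heads,
#   self.qkv(x)           -> [B, N, 3C]
#   reshape + permute     -> 3 tensors of [B, h, N, C // h]
#   SDPA, transpose, view -> [B, N, C], then self.proj keeps [B, N, C]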
|
||||
|
||||
class _Block(nn.Module):
|
||||
"""Pre-norm transformer block with LayerScale.
|
||||
|
||||
Used by :class:`CameraEnc`. Layout follows upstream ``utils.block.Block``.
|
||||
"""
|
||||
|
||||
def __init__(self, dim, num_heads, mlp_ratio=4, init_values=0.01,
|
||||
*, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.norm1 = operations.LayerNorm(dim, device=device, dtype=dtype)
|
||||
self.attn = _Attention(dim, num_heads,
|
||||
device=device, dtype=dtype, operations=operations)
|
||||
self.ls1 = _LayerScale(dim, device=device, dtype=dtype) if init_values else nn.Identity()
|
||||
self.norm2 = operations.LayerNorm(dim, device=device, dtype=dtype)
|
||||
self.mlp = _Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
|
||||
device=device, dtype=dtype, operations=operations)
|
||||
self.ls2 = _LayerScale(dim, device=device, dtype=dtype) if init_values else nn.Identity()
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.ls1(self.attn(self.norm1(x)))
|
||||
x = x + self.ls2(self.mlp(self.norm2(x)))
|
||||
return x
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Camera encoder
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class CameraEnc(nn.Module):
|
||||
"""Encode per-view (extrinsics, intrinsics) into a camera token.
|
||||
|
||||
Maps a 9-D pose-encoding vector through a small MLP up to the backbone's
|
||||
``embed_dim``, then runs ``trunk_depth`` transformer blocks. The output
|
||||
has shape ``(B, S, embed_dim)`` and is injected at block ``alt_start``
|
||||
of the DINOv2 backbone in place of the cls token.
|
||||
|
||||
Parameters mirror the upstream ``cam_enc.py`` so HF weights load directly.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim_out: int = 1024,
|
||||
dim_in: int = 9,
|
||||
trunk_depth: int = 4,
|
||||
target_dim: int = 9,
|
||||
num_heads: int = 16,
|
||||
mlp_ratio: int = 4,
|
||||
init_values: float = 0.01,
|
||||
*,
|
||||
device=None, dtype=None, operations=None,
|
||||
**_kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
self.target_dim = target_dim
|
||||
self.trunk_depth = trunk_depth
|
||||
self.trunk = nn.Sequential(*[
|
||||
_Block(dim_out, num_heads=num_heads, mlp_ratio=mlp_ratio,
|
||||
init_values=init_values,
|
||||
device=device, dtype=dtype, operations=operations)
|
||||
for _ in range(trunk_depth)
|
||||
])
|
||||
self.token_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
|
||||
self.trunk_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
|
||||
self.pose_branch = _Mlp(
|
||||
in_features=dim_in,
|
||||
hidden_features=dim_out // 2,
|
||||
out_features=dim_out,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
|
||||
def forward(self, extrinsics: torch.Tensor, intrinsics: torch.Tensor,
|
||||
image_size_hw) -> torch.Tensor:
|
||||
"""Encode camera parameters into ``(B, S, dim_out)`` tokens."""
|
||||
c2ws = affine_inverse(extrinsics)
|
||||
pose_encoding = extri_intri_to_pose_encoding(c2ws, intrinsics, image_size_hw)
|
||||
tokens = self.pose_branch(pose_encoding.to(self.pose_branch.fc1.weight.dtype))
|
||||
tokens = self.token_norm(tokens)
|
||||
tokens = self.trunk(tokens)
|
||||
tokens = self.trunk_norm(tokens)
|
||||
return tokens
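
# Usage sketch (illustrative only; dummy identity cameras, not part of the
# upstream port). ``operations`` can be any namespace exposing ``Linear`` and
# ``LayerNorm`` with ``device``/``dtype`` kwargs, e.g. plain ``torch.nn``:
#
#   enc = CameraEnc(dim_out=1024, operations=nn)
#   extr = torch.eye(4).expand(2, 3, 4, 4)      # (B=2, S=3) world-to-camera
#   intr = torch.eye(3).expand(2, 3, 3, 3)      # pixel-space intrinsics
#   tok = enc(extr, intr, (518, 518))           # -> (2, 3, 1024)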


# -----------------------------------------------------------------------------
# Camera decoder
# -----------------------------------------------------------------------------


class CameraDec(nn.Module):
    """Decode the final cam token into a 9-D pose encoding.

    Output layout: ``[T(3), quat_xyzw(4), fov_h, fov_w]``. The translation is
    always predicted by the network; the quaternion and FoV can either be
    predicted or supplied via ``camera_encoding`` (used at training time
    when GT cameras are available -- not exercised at inference here).

    Parameters mirror the upstream ``cam_dec.py`` so HF weights load directly.
    """

    def __init__(self, dim_in: int = 1536,
                 *, device=None, dtype=None, operations=None, **_kwargs):
        super().__init__()
        d = dim_in
        self.backbone = nn.Sequential(
            operations.Linear(d, d, device=device, dtype=dtype),
            nn.ReLU(),
            operations.Linear(d, d, device=device, dtype=dtype),
            nn.ReLU(),
        )
        self.fc_t = operations.Linear(d, 3, device=device, dtype=dtype)
        self.fc_qvec = operations.Linear(d, 4, device=device, dtype=dtype)
        self.fc_fov = nn.Sequential(
            operations.Linear(d, 2, device=device, dtype=dtype),
            nn.ReLU(),
        )

    def forward(self, feat: torch.Tensor,
                camera_encoding: "torch.Tensor | None" = None) -> torch.Tensor:
        """Decode ``(B, N, dim_in)`` cam tokens into ``(B, N, 9)`` pose enc."""
        B, N = feat.shape[:2]
        feat = feat.reshape(B * N, -1)
        feat = self.backbone(feat)
        out_t = self.fc_t(feat.float()).reshape(B, N, 3)
        if camera_encoding is None:
            out_qvec = self.fc_qvec(feat.float()).reshape(B, N, 4)
            out_fov = self.fc_fov(feat.float()).reshape(B, N, 2)
        else:
            out_qvec = camera_encoding[..., 3:7]
            out_fov = camera_encoding[..., -2:]
        return torch.cat([out_t, out_qvec, out_fov], dim=-1)
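
# Slicing sketch for the 9-D layout above (illustrative):
#   pose = dec(cam_tokens)                      # (B, N, 9)
#   t, quat_xyzw, fov_hw = pose[..., :3], pose[..., 3:7], pose[..., 7:]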
@ -1,549 +0,0 @@
# DPT / DualDPT heads for Depth Anything 3.
#
# Ported from:
#   src/depth_anything_3/model/dpt.py     (DPT - single main head + sky head)
#   src/depth_anything_3/model/dualdpt.py (DualDPT - depth + auxiliary "ray" head)
#
# In the monocular path we always discard the auxiliary "ray" output of
# DualDPT. The auxiliary branch is still constructed so that DA3 HF weights
# load cleanly without missing-key warnings.

from __future__ import annotations

from typing import List, Optional, Sequence, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


# -----------------------------------------------------------------------------
# Helpers (matching upstream head_utils.py)
# -----------------------------------------------------------------------------


class Permute(nn.Module):
    def __init__(self, dims: Tuple[int, ...]):
        super().__init__()
        self.dims = dims

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x.permute(*self.dims)


def _custom_interpolate(
    x: torch.Tensor,
    size: Optional[Tuple[int, int]] = None,
    scale_factor: Optional[float] = None,
    mode: str = "bilinear",
    align_corners: bool = True,
) -> torch.Tensor:
    if size is None:
        assert scale_factor is not None
        size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
    INT_MAX = 1610612736  # 1.5 * 2**30; keep a single interpolate call under 32-bit indexing
    total = size[0] * size[1] * x.shape[0] * x.shape[1]
    if total > INT_MAX:
        chunks = torch.chunk(x, chunks=(total // INT_MAX) + 1, dim=0)
        outs = [F.interpolate(c, size=size, mode=mode, align_corners=align_corners) for c in chunks]
        return torch.cat(outs, dim=0).contiguous()
    return F.interpolate(x, size=size, mode=mode, align_corners=align_corners)


def _create_uv_grid(width: int, height: int, aspect_ratio: float,
                    dtype, device) -> torch.Tensor:
    """Normalised UV grid spanning (-x_span, -y_span)..(x_span, y_span)."""
    diag_factor = (aspect_ratio ** 2 + 1.0) ** 0.5
    span_x = aspect_ratio / diag_factor
    span_y = 1.0 / diag_factor
    left_x = -span_x * (width - 1) / width
    right_x = span_x * (width - 1) / width
    top_y = -span_y * (height - 1) / height
    bottom_y = span_y * (height - 1) / height
    x_coords = torch.linspace(left_x, right_x, steps=width, dtype=dtype, device=device)
    y_coords = torch.linspace(top_y, bottom_y, steps=height, dtype=dtype, device=device)
    uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
    return torch.stack((uu, vv), dim=-1)  # (H, W, 2)


def _make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100.0) -> torch.Tensor:
    omega = torch.arange(embed_dim // 2, dtype=torch.float32, device=pos.device)
    omega = 1.0 / omega_0 ** (omega / (embed_dim / 2.0))
    pos = pos.reshape(-1)
    out = torch.einsum("m,d->md", pos, omega)
    return torch.cat([out.sin(), out.cos()], dim=1).float()


def _position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int,
                            omega_0: float = 100.0) -> torch.Tensor:
    H, W, _ = pos_grid.shape
    pos_flat = pos_grid.reshape(-1, 2)
    emb_x = _make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 0], omega_0=omega_0)
    emb_y = _make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 1], omega_0=omega_0)
    emb = torch.cat([emb_x, emb_y], dim=-1)
    return emb.view(H, W, embed_dim)
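
# Shape check (illustrative): with embed_dim=256 each axis contributes 128
# channels (64 sin + 64 cos at frequencies 1 / 100**(i/64)), so a (7, 9, 2)
# UV grid maps to a (7, 9, 256) embedding:
#   pe = _position_grid_to_embed(torch.randn(7, 9, 2), 256)   # -> (7, 9, 256)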


def _add_pos_embed(x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
    """Stateless UV positional embedding added to a feature map (B, C, h, w)."""
    pw, ph = x.shape[-1], x.shape[-2]
    pe = _create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
    pe = _position_grid_to_embed(pe, x.shape[1]) * ratio
    pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1).to(dtype=x.dtype)
    return x + pe


def _apply_activation(x: torch.Tensor, activation: str) -> torch.Tensor:
    act = (activation or "linear").lower()
    if act == "exp":
        return torch.exp(x)
    if act == "expp1":
        return torch.exp(x) + 1
    if act == "expm1":
        return torch.expm1(x)
    if act == "relu":
        return torch.relu(x)
    if act == "sigmoid":
        return torch.sigmoid(x)
    if act == "softplus":
        return F.softplus(x)
    if act == "tanh":
        return torch.tanh(x)
    return x


# -----------------------------------------------------------------------------
# Fusion building blocks
# -----------------------------------------------------------------------------


class ResidualConvUnit(nn.Module):
    def __init__(self, features: int,
                 device=None, dtype=None, operations=None):
        super().__init__()
        self.conv1 = operations.Conv2d(features, features, 3, 1, 1, bias=True,
                                       device=device, dtype=dtype)
        self.conv2 = operations.Conv2d(features, features, 3, 1, 1, bias=True,
                                       device=device, dtype=dtype)
        self.activation = nn.ReLU(inplace=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.activation(x)
        out = self.conv1(out)
        out = self.activation(out)
        out = self.conv2(out)
        return out + x


class FeatureFusionBlock(nn.Module):
    def __init__(self, features: int, has_residual: bool = True,
                 align_corners: bool = True,
                 device=None, dtype=None, operations=None):
        super().__init__()
        self.align_corners = align_corners
        self.has_residual = has_residual
        if has_residual:
            self.resConfUnit1 = ResidualConvUnit(features, device=device, dtype=dtype, operations=operations)
        else:
            self.resConfUnit1 = None
        self.resConfUnit2 = ResidualConvUnit(features, device=device, dtype=dtype, operations=operations)
        self.out_conv = operations.Conv2d(features, features, 1, 1, 0, bias=True,
                                          device=device, dtype=dtype)

    def forward(self, *xs: torch.Tensor, size: Optional[Tuple[int, int]] = None) -> torch.Tensor:
        y = xs[0]
        if self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None:
            y = y + self.resConfUnit1(xs[1])
        y = self.resConfUnit2(y)
        if size is None:
            up_kwargs = {"scale_factor": 2.0}
        else:
            up_kwargs = {"size": size}
        y = _custom_interpolate(y, **up_kwargs, mode="bilinear",
                                align_corners=self.align_corners)
        y = self.out_conv(y)
        return y
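
# Fusion sketch (illustrative; plain ``nn`` works as ``operations`` here since
# only Conv2d modules are constructed). Each block fuses an optional skip input
# and upsamples 2x unless an explicit ``size`` is given:
#   fb = FeatureFusionBlock(8, operations=nn)
#   y = fb(torch.randn(1, 8, 16, 16))                      # -> (1, 8, 32, 32)
#   y = fb(y, torch.randn(1, 8, 32, 32), size=(40, 40))    # -> (1, 8, 40, 40)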


class _Scratch(nn.Module):
    """Container that mirrors upstream ``scratch`` attribute layout."""


def _make_scratch(in_shape: List[int], out_shape: int,
                  device=None, dtype=None, operations=None) -> _Scratch:
    scratch = _Scratch()
    scratch.layer1_rn = operations.Conv2d(in_shape[0], out_shape, 3, 1, 1, bias=False,
                                          device=device, dtype=dtype)
    scratch.layer2_rn = operations.Conv2d(in_shape[1], out_shape, 3, 1, 1, bias=False,
                                          device=device, dtype=dtype)
    scratch.layer3_rn = operations.Conv2d(in_shape[2], out_shape, 3, 1, 1, bias=False,
                                          device=device, dtype=dtype)
    scratch.layer4_rn = operations.Conv2d(in_shape[3], out_shape, 3, 1, 1, bias=False,
                                          device=device, dtype=dtype)
    return scratch


def _make_fusion_block(features: int, has_residual: bool = True,
                       device=None, dtype=None, operations=None) -> FeatureFusionBlock:
    return FeatureFusionBlock(features, has_residual=has_residual,
                              align_corners=True,
                              device=device, dtype=dtype, operations=operations)


# -----------------------------------------------------------------------------
# DPT (single head + optional sky head) -- used by DA3Mono/Metric
# -----------------------------------------------------------------------------


class DPT(nn.Module):
    """Single-head DPT used by DA3Mono-Large and DA3Metric-Large."""

    def __init__(
        self,
        dim_in: int,
        patch_size: int = 14,
        output_dim: int = 1,
        activation: str = "exp",
        conf_activation: str = "expp1",
        features: int = 256,
        out_channels: Sequence[int] = (256, 512, 1024, 1024),
        pos_embed: bool = False,
        down_ratio: int = 1,
        head_name: str = "depth",
        use_sky_head: bool = True,
        sky_name: str = "sky",
        sky_activation: str = "relu",
        norm_type: str = "idt",
        device=None, dtype=None, operations=None,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.activation = activation
        self.conf_activation = conf_activation
        self.pos_embed = pos_embed
        self.down_ratio = down_ratio
        self.head_main = head_name
        self.sky_name = sky_name
        self.out_dim = output_dim
        self.has_conf = output_dim > 1
        self.use_sky_head = use_sky_head
        self.sky_activation = sky_activation
        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)

        if norm_type == "layer":
            self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
        else:
            self.norm = nn.Identity()

        out_channels = list(out_channels)
        self.projects = nn.ModuleList([
            operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0,
                              device=device, dtype=dtype)
            for oc in out_channels
        ])
        self.resize_layers = nn.ModuleList([
            operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0,
                                       device=device, dtype=dtype),
            operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0,
                                       device=device, dtype=dtype),
            nn.Identity(),
            operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1,
                              device=device, dtype=dtype),
        ])

        self.scratch = _make_scratch(out_channels, features,
                                     device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False,
                                                     device=device, dtype=dtype, operations=operations)

        head_features_1 = features
        head_features_2 = 32
        self.scratch.output_conv1 = operations.Conv2d(
            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
            device=device, dtype=dtype,
        )
        self.scratch.output_conv2 = nn.Sequential(
            operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
                              device=device, dtype=dtype),
            nn.ReLU(inplace=False),
            operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0,
                              device=device, dtype=dtype),
        )

        if self.use_sky_head:
            self.scratch.sky_output_conv2 = nn.Sequential(
                operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
                                  device=device, dtype=dtype),
                nn.ReLU(inplace=False),
                operations.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0,
                                  device=device, dtype=dtype),
            )

    def forward(self, feats: List[torch.Tensor], H: int, W: int,
                patch_start_idx: int = 0, **_kwargs) -> dict:
        # feats[i][0] is the patch-token tensor with shape (B, S, N_patch, C)
        B, S, N, C = feats[0][0].shape
        feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]

        ph, pw = H // self.patch_size, W // self.patch_size
        resized = []
        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
            x = feats_flat[take_idx][:, patch_start_idx:]
            x = self.norm(x)
            x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
            x = self.projects[stage_idx](x)
            if self.pos_embed:
                x = _add_pos_embed(x, W, H)
            x = self.resize_layers[stage_idx](x)
            resized.append(x)

        l1_rn = self.scratch.layer1_rn(resized[0])
        l2_rn = self.scratch.layer2_rn(resized[1])
        l3_rn = self.scratch.layer3_rn(resized[2])
        l4_rn = self.scratch.layer4_rn(resized[3])

        out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
        out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
        out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
        out = self.scratch.refinenet1(out, l1_rn)

        h_out = int(ph * self.patch_size / self.down_ratio)
        w_out = int(pw * self.patch_size / self.down_ratio)

        fused = self.scratch.output_conv1(out)
        fused = _custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)
        if self.pos_embed:
            fused = _add_pos_embed(fused, W, H)
        feat = fused

        main_logits = self.scratch.output_conv2(feat)
        outs = {}
        if self.has_conf:
            fmap = main_logits.permute(0, 2, 3, 1)
            pred = _apply_activation(fmap[..., :-1], self.activation)
            conf = _apply_activation(fmap[..., -1], self.conf_activation)
            outs[self.head_main] = pred.squeeze(-1).view(B, S, *pred.shape[1:-1])
            outs[f"{self.head_main}_conf"] = conf.view(B, S, *conf.shape[1:])
        else:
            pred = _apply_activation(main_logits, self.activation)
            outs[self.head_main] = pred.squeeze(1).view(B, S, *pred.shape[2:])

        if self.use_sky_head:
            sky_logits = self.scratch.sky_output_conv2(feat)
            if self.sky_activation.lower() == "sigmoid":
                sky = torch.sigmoid(sky_logits)
            elif self.sky_activation.lower() == "relu":
                sky = F.relu(sky_logits)
            else:
                sky = sky_logits
            outs[self.sky_name] = sky.squeeze(1).view(B, S, *sky.shape[2:])

        return outs
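
# Call sketch with a deliberately tiny, hypothetical config (not a real DA3
# variant). The head expects one (patch_tokens, cam_token) tuple per stage;
# for a 224x224 input at patch 14 that is N = 16 * 16 = 256 tokens per view:
#   head = DPT(dim_in=32, features=8, out_channels=(8, 16, 32, 32), operations=nn)
#   feats = [(torch.randn(1, 1, 256, 32), None)] * 4
#   outs = head(feats, H=224, W=224)
#   outs["depth"].shape, outs["sky"].shape     # -> (1, 1, 224, 224) each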


# -----------------------------------------------------------------------------
# DualDPT (depth + auxiliary "ray" head) -- used by DA3-Small / DA3-Base
# -----------------------------------------------------------------------------


class DualDPT(nn.Module):
    """Two-head DPT used by DA3-Small / DA3-Base.

    The auxiliary "ray" head is constructed so that HF state-dict keys load
    cleanly. It is only executed when :attr:`enable_aux` is set on the
    instance (typically by ``DepthAnything3Net`` when running multi-view
    with ``use_ray_pose=True``); otherwise the monocular path skips it for
    speed and the auxiliary submodules sit idle.
    """

    def __init__(
        self,
        dim_in: int,
        patch_size: int = 14,
        output_dim: int = 2,
        activation: str = "exp",
        conf_activation: str = "expp1",
        features: int = 256,
        out_channels: Sequence[int] = (256, 512, 1024, 1024),
        pos_embed: bool = True,
        down_ratio: int = 1,
        aux_pyramid_levels: int = 4,
        aux_out1_conv_num: int = 5,
        head_names: Tuple[str, str] = ("depth", "ray"),
        device=None, dtype=None, operations=None,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.activation = activation
        self.conf_activation = conf_activation
        self.pos_embed = pos_embed
        self.down_ratio = down_ratio
        self.aux_levels = aux_pyramid_levels
        self.aux_out1_conv_num = aux_out1_conv_num
        self.head_main, self.head_aux = head_names
        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
        # Toggle the auxiliary ray branch at runtime. Default off (mono path).
        # ``DepthAnything3Net`` flips this on when running multi-view + ray-pose.
        self.enable_aux: bool = False

        self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
        out_channels = list(out_channels)
        self.projects = nn.ModuleList([
            operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0,
                              device=device, dtype=dtype)
            for oc in out_channels
        ])
        self.resize_layers = nn.ModuleList([
            operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0,
                                       device=device, dtype=dtype),
            operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0,
                                       device=device, dtype=dtype),
            nn.Identity(),
            operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1,
                              device=device, dtype=dtype),
        ])

        self.scratch = _make_scratch(out_channels, features,
                                     device=device, dtype=dtype, operations=operations)
        # Main fusion chain
        self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False,
                                                     device=device, dtype=dtype, operations=operations)
        # Auxiliary fusion chain (separate copies)
        self.scratch.refinenet1_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet2_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet3_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
        self.scratch.refinenet4_aux = _make_fusion_block(features, has_residual=False,
                                                         device=device, dtype=dtype, operations=operations)

        head_features_1 = features
        head_features_2 = 32

        # Main head neck + final projection
        self.scratch.output_conv1 = operations.Conv2d(
            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
            device=device, dtype=dtype,
        )
        self.scratch.output_conv2 = nn.Sequential(
            operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
                              device=device, dtype=dtype),
            nn.ReLU(inplace=False),
            operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0,
                              device=device, dtype=dtype),
        )

        # Aux pre-head per level (multi-level pyramid)
        self.scratch.output_conv1_aux = nn.ModuleList([
            self._make_aux_out1_block(head_features_1, device=device, dtype=dtype, operations=operations)
            for _ in range(self.aux_levels)
        ])

        # Aux final projection per level (includes LayerNorm permute path).
        # Each level gets a fresh LayerNorm: sharing one instance across the
        # Sequentials would alias the per-level state-dict keys and silently
        # tie their weights on load.
        def _ln_seq():
            return [Permute((0, 2, 3, 1)),
                    operations.LayerNorm(head_features_2, device=device, dtype=dtype),
                    Permute((0, 3, 1, 2))]
        self.scratch.output_conv2_aux = nn.ModuleList([
            nn.Sequential(
                operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
                                  device=device, dtype=dtype),
                *_ln_seq(),
                nn.ReLU(inplace=False),
                operations.Conv2d(head_features_2, 7, kernel_size=1, stride=1, padding=0,
                                  device=device, dtype=dtype),
            )
            for _ in range(self.aux_levels)
        ])

    @staticmethod
    def _make_aux_out1_block(in_ch: int, *, device=None, dtype=None, operations=None) -> nn.Sequential:
        # aux_out1_conv_num=5 in all Apache-2.0 variants.
        return nn.Sequential(
            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
            operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
            operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
        )

    def forward(self, feats: List[torch.Tensor], H: int, W: int,
                patch_start_idx: int = 0, **_kwargs) -> dict:
        B, S, N, C = feats[0][0].shape
        feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]

        ph, pw = H // self.patch_size, W // self.patch_size
        resized = []
        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
            x = feats_flat[take_idx][:, patch_start_idx:]
            x = self.norm(x)
            x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
            x = self.projects[stage_idx](x)
            if self.pos_embed:
                x = _add_pos_embed(x, W, H)
            x = self.resize_layers[stage_idx](x)
            resized.append(x)

        l1_rn = self.scratch.layer1_rn(resized[0])
        l2_rn = self.scratch.layer2_rn(resized[1])
        l3_rn = self.scratch.layer3_rn(resized[2])
        l4_rn = self.scratch.layer4_rn(resized[3])

        # Main pyramid (output_conv1 is applied inside the upstream `_fuse`,
        # before interpolation -- replicate that order here).
        m = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
        if self.enable_aux:
            a4 = self.scratch.refinenet4_aux(l4_rn, size=l3_rn.shape[2:])
            aux_pyr = [a4]
        m = self.scratch.refinenet3(m, l3_rn, size=l2_rn.shape[2:])
        if self.enable_aux:
            aux_pyr.append(self.scratch.refinenet3_aux(aux_pyr[-1], l3_rn, size=l2_rn.shape[2:]))
        m = self.scratch.refinenet2(m, l2_rn, size=l1_rn.shape[2:])
        if self.enable_aux:
            aux_pyr.append(self.scratch.refinenet2_aux(aux_pyr[-1], l2_rn, size=l1_rn.shape[2:]))
        m = self.scratch.refinenet1(m, l1_rn)
        if self.enable_aux:
            aux_pyr.append(self.scratch.refinenet1_aux(aux_pyr[-1], l1_rn))
        m = self.scratch.output_conv1(m)

        h_out = int(ph * self.patch_size / self.down_ratio)
        w_out = int(pw * self.patch_size / self.down_ratio)

        m = _custom_interpolate(m, (h_out, w_out), mode="bilinear", align_corners=True)
        if self.pos_embed:
            m = _add_pos_embed(m, W, H)
        main_logits = self.scratch.output_conv2(m)
        fmap = main_logits.permute(0, 2, 3, 1)
        depth_pred = _apply_activation(fmap[..., :-1], self.activation)
        depth_conf = _apply_activation(fmap[..., -1], self.conf_activation)

        outs = {
            self.head_main: depth_pred.squeeze(-1).view(B, S, *depth_pred.shape[1:-1]),
            f"{self.head_main}_conf": depth_conf.view(B, S, *depth_conf.shape[1:]),
        }

        if self.enable_aux:
            # Auxiliary "ray" head (multi-level inside) -- only the last level
            # is returned. Mirrors upstream ``DualDPT._fuse`` + ``_forward_impl``:
            # each aux pyramid level goes through ``output_conv1_aux[i]``
            # (5-layer conv stack that ends at ``features // 2`` channels),
            # then the last level optionally gets a pos-embed and finally
            # ``output_conv2_aux[-1]``.
            aux_processed = [
                self.scratch.output_conv1_aux[i](a) for i, a in enumerate(aux_pyr)
            ]
            last_aux = aux_processed[-1]
            if self.pos_embed:
                last_aux = _add_pos_embed(last_aux, W, H)
            last_aux_logits = self.scratch.output_conv2_aux[-1](last_aux)
            fmap_last = last_aux_logits.permute(0, 2, 3, 1)
            # Channels: [ray(6), ray_conf(1)]; ray uses 'linear' activation.
            aux_pred = fmap_last[..., :-1]
            aux_conf = _apply_activation(fmap_last[..., -1], self.conf_activation)
            outs[self.head_aux] = aux_pred.view(B, S, *aux_pred.shape[1:])
            outs[f"{self.head_aux}_conf"] = aux_conf.view(B, S, *aux_conf.shape[1:])

        return outs
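
# Aux-toggle sketch: the ray branch only runs when ``enable_aux`` is set
# (``DepthAnything3Net`` does this when ``use_ray_pose=True``):
#   head.enable_aux = True
#   outs = head(feats, H=H, W=W)   # now also carries "ray" / "ray_conf"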
@ -1,309 +0,0 @@
# DepthAnything3Net: top-level wrapper that combines backbone + head.
#
# Supports both the monocular and the multi-view + camera path:
#
# * Monocular: ``S = 1``, no camera encoder/decoder. Mirrors the original
#   port that only handled ``DA3-MONO/METRIC-LARGE`` and the auxiliary-disabled
#   ``DA3-SMALL/BASE`` configs.
# * Multi-view + camera: ``S > 1``. ``cam_enc`` (optional) maps user-supplied
#   extrinsics + intrinsics into a per-view camera token; ``cam_dec`` decodes
#   the final layer's camera token into a 9-D pose encoding. When the
#   auxiliary "ray" head of ``DualDPT`` is enabled the predicted ray map can
#   alternatively be used to estimate pose via RANSAC (``use_ray_pose=True``).
#   The 3D-Gaussian head and the nested-architecture wrapper are intentionally
#   left out of scope here; their state-dict keys are filtered in
#   ``comfy.supported_models.DepthAnything3.process_unet_state_dict``.
#
# The backbone is shared with the CLIP-vision DINOv2 path
# (``comfy.image_encoders.dino2.Dinov2Model``); the DA3-specific extensions
# (RoPE, QK-norm, alternating local/global attention, camera token, multi-
# layer feature extraction, reference-view reordering) are opt-in via the
# config dict and are all disabled for the Mono/Metric variants.

from __future__ import annotations

from typing import Dict, Optional, Sequence

import torch
import torch.nn as nn

from comfy.image_encoders.dino2 import Dinov2Model

from .camera import CameraDec, CameraEnc
from .dpt import DPT, DualDPT
from .ray_pose import get_extrinsic_from_camray
from .transform import affine_inverse, pose_encoding_to_extri_intri


_HEAD_REGISTRY = {
    "dpt": DPT,
    "dualdpt": DualDPT,
}


# Backbone presets (mirror the upstream DINOv2 ViT variants).
_BACKBONE_PRESETS = {
    "vits": dict(hidden_size=384, num_hidden_layers=12, num_attention_heads=6, use_swiglu_ffn=False),
    "vitb": dict(hidden_size=768, num_hidden_layers=12, num_attention_heads=12, use_swiglu_ffn=False),
    "vitl": dict(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, use_swiglu_ffn=False),
    "vitg": dict(hidden_size=1536, num_hidden_layers=40, num_attention_heads=24, use_swiglu_ffn=True),
}


def _build_backbone_config(
    backbone_name: str,
    *,
    alt_start: int,
    qknorm_start: int,
    rope_start: int,
    cat_token: bool,
) -> dict:
    if backbone_name not in _BACKBONE_PRESETS:
        raise ValueError(f"Unknown DINOv2 backbone variant: {backbone_name!r}")
    cfg = dict(_BACKBONE_PRESETS[backbone_name])
    cfg.update(dict(
        layer_norm_eps=1e-6,
        patch_size=14,
        image_size=518,
        # DA3 weights have no mask_token; skip registering it to avoid spurious
        # missing-key warnings on load.
        use_mask_token=False,
        alt_start=alt_start,
        qknorm_start=qknorm_start,
        rope_start=rope_start,
        cat_token=cat_token,
        rope_freq=100.0,
    ))
    return cfg
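
# Preset sketch (illustrative): a Mono/Metric-Large backbone resolves to
# hidden_size=1024 with every DA3 extension disabled:
#   cfg = _build_backbone_config("vitl", alt_start=-1, qknorm_start=-1,
#                                rope_start=-1, cat_token=False)
#   cfg["hidden_size"], cfg["patch_size"], cfg["alt_start"]   # -> 1024, 14, -1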


class DepthAnything3Net(nn.Module):
    """ComfyUI-side DepthAnything3 network.

    Parameters mirror the variant YAML configs from the upstream repo and
    are auto-detected from the state dict by ``comfy/model_detection.py``.
    The kwargs ``device``, ``dtype`` and ``operations`` are injected by
    ``BaseModel``.
    """

    PATCH_SIZE = 14

    def __init__(
        self,
        # --- Backbone ---
        backbone_name: str = "vitl",
        out_layers: Sequence[int] = (4, 11, 17, 23),
        alt_start: int = -1,
        qknorm_start: int = -1,
        rope_start: int = -1,
        cat_token: bool = False,
        # --- Head ---
        head_type: str = "dpt",  # "dpt" or "dualdpt"
        head_dim_in: int = 1024,
        head_output_dim: int = 1,  # 1 = depth only, 2 = depth+conf
        head_features: int = 256,
        head_out_channels: Sequence[int] = (256, 512, 1024, 1024),
        head_use_sky_head: bool = True,  # ignored by DualDPT
        head_pos_embed: Optional[bool] = None,  # default: True for DualDPT, False for DPT
        # --- Camera (multi-view) ---
        has_cam_enc: bool = False,
        has_cam_dec: bool = False,
        cam_dim_out: Optional[int] = None,  # CameraEnc dim_out (defaults to embed_dim)
        cam_dec_dim_in: Optional[int] = None,  # CameraDec dim_in (defaults to 2*embed_dim with cat_token)
        # ComfyUI plumbing
        device=None, dtype=None, operations=None,
        **_ignored,
    ):
        super().__init__()
        head_cls = _HEAD_REGISTRY[head_type.lower()]
        self.head_type = head_type.lower()
        self.has_sky = (self.head_type == "dpt") and head_use_sky_head
        self.has_conf = head_output_dim > 1
        self.out_layers = list(out_layers)

        backbone_cfg = _build_backbone_config(
            backbone_name,
            alt_start=alt_start,
            qknorm_start=qknorm_start,
            rope_start=rope_start,
            cat_token=cat_token,
        )
        self.backbone = Dinov2Model(backbone_cfg, dtype, device, operations)

        head_kwargs = dict(
            dim_in=head_dim_in,
            patch_size=self.PATCH_SIZE,
            output_dim=head_output_dim,
            features=head_features,
            out_channels=tuple(head_out_channels),
            device=device, dtype=dtype, operations=operations,
        )
        if self.head_type == "dpt":
            head_kwargs.update(
                use_sky_head=head_use_sky_head,
                pos_embed=(False if head_pos_embed is None else head_pos_embed),
            )
        else:  # dualdpt
            head_kwargs.update(
                pos_embed=(True if head_pos_embed is None else head_pos_embed),
            )
        self.head = head_cls(**head_kwargs)

        # Camera encoder / decoder are only constructed when their weights are
        # present in the checkpoint; the multi-view / pose forward path becomes
        # available accordingly. ``cam_enc.dim_out`` matches the backbone's
        # ``embed_dim`` so the cam token slots into block ``alt_start``.
        embed_dim = backbone_cfg["hidden_size"]
        if has_cam_enc:
            self.cam_enc = CameraEnc(
                dim_out=cam_dim_out if cam_dim_out is not None else embed_dim,
                num_heads=max(1, embed_dim // 64),
                device=device, dtype=dtype, operations=operations,
            )
        else:
            self.cam_enc = None
        if has_cam_dec:
            # Default cam_dec dim_in is 2*embed_dim when cat_token is on
            # (the cls/cam token in the output is the cat'd version).
            default_dim = embed_dim * (2 if cat_token else 1)
            self.cam_dec = CameraDec(
                dim_in=cam_dec_dim_in if cam_dec_dim_in is not None else default_dim,
                device=device, dtype=dtype, operations=operations,
            )
        else:
            self.cam_dec = None

        self.dtype = dtype

    # ------------------------------------------------------------------
    # Forward
    # ------------------------------------------------------------------
    def forward(
        self,
        image: torch.Tensor,
        extrinsics: Optional[torch.Tensor] = None,
        intrinsics: Optional[torch.Tensor] = None,
        *,
        use_ray_pose: bool = False,
        ref_view_strategy: str = "saddle_balanced",
        export_feat_layers: Optional[Sequence[int]] = None,
        **_unused,
    ) -> Dict[str, torch.Tensor]:
        """Run depth (and optionally pose) prediction.

        Args:
            image: ``(B, 3, H, W)`` ImageNet-normalised image tensor, or
                ``(B, S, 3, H, W)`` for multi-view inputs. ``H`` and ``W``
                must be multiples of 14.
            extrinsics: optional ``(B, S, 4, 4)`` world-to-camera extrinsics.
                When provided together with ``intrinsics``, ``CameraEnc``
                converts them into per-view camera tokens that the backbone
                injects at block ``alt_start``.
            intrinsics: optional ``(B, S, 3, 3)`` pixel-space intrinsics.
            use_ray_pose: if True, predict pose from the auxiliary "ray" head
                (RANSAC over per-pixel rays). Only available on DualDPT
                variants. If False (default) and ``cam_dec`` is present,
                the final-layer cam token is decoded into pose instead.
            ref_view_strategy: reference-view selection strategy used when
                ``S >= 3`` and no extrinsics are supplied. See
                :mod:`comfy.ldm.depth_anything_3.reference_view_selector`.
            export_feat_layers: optional list of backbone layer indices whose
                local features to also return as auxiliary outputs (used by
                downstream nested-architecture wrappers; empty by default).

        Returns:
            Dict with a subset of:
                - ``depth`` ``(B*S, H, W)`` raw depth values.
                - ``depth_conf`` ``(B*S, H, W)`` confidence (DualDPT only).
                - ``sky`` ``(B*S, H, W)`` sky probability (DPT + sky head).
                - ``ray`` ``(B, S, h, w, 6)`` per-pixel cam ray (DualDPT,
                  multi-view, ``use_ray_pose=True`` only).
                - ``ray_conf`` ``(B, S, h, w)`` ray confidence.
                - ``extrinsics`` ``(B, S, 3, 4)`` world-to-cam (homogeneous
                  row dropped), when pose prediction is active.
                - ``intrinsics`` ``(B, S, 3, 3)`` pixel-space intrinsics.
                - ``aux_features`` list of ``(B, S, h_p, w_p, C)`` features
                  when ``export_feat_layers`` is non-empty.
        """
        if image.ndim == 4:
            image = image.unsqueeze(1)  # (B, 1, 3, H, W)
        assert image.ndim == 5 and image.shape[2] == 3, \
            f"image must be (B,3,H,W) or (B,S,3,H,W); got {tuple(image.shape)}"

        B, S, _, H, W = image.shape
        assert H % self.PATCH_SIZE == 0 and W % self.PATCH_SIZE == 0, \
            f"image H,W must be multiples of {self.PATCH_SIZE}; got {(H, W)}"

        # Camera-token preparation (multi-view path).
        cam_token = None
        if extrinsics is not None and intrinsics is not None and self.cam_enc is not None:
            cam_token = self.cam_enc(extrinsics, intrinsics, (H, W))

        # Toggle aux ray output on/off depending on what the caller asked for.
        if isinstance(self.head, DualDPT):
            self.head.enable_aux = bool(use_ray_pose)

        feats, aux_feats = self.backbone.get_intermediate_layers(
            image, self.out_layers, cam_token=cam_token,
            ref_view_strategy=ref_view_strategy,
            export_feat_layers=export_feat_layers,
        )
        head_out = self.head(feats, H=H, W=W, patch_start_idx=0)

        # Pose prediction.
        out: Dict[str, torch.Tensor] = {}
        if use_ray_pose and "ray" in head_out and "ray_conf" in head_out:
            ray = head_out["ray"]
            ray_conf = head_out["ray_conf"]
            extr_c2w, focal, pp = get_extrinsic_from_camray(
                ray, ray_conf, ray.shape[-3], ray.shape[-2],
            )
            # Match the upstream output: w2c, drop the homogeneous row.
            extr_w2c = affine_inverse(extr_c2w)[:, :, :3, :]
            # Build pixel-space intrinsics from the normalised focal/pp output.
            intr = torch.eye(3, device=ray.device, dtype=ray.dtype)
            intr = intr[None, None].expand(extr_c2w.shape[0], extr_c2w.shape[1], 3, 3).clone()
            intr[:, :, 0, 0] = focal[:, :, 0] / 2 * W
            intr[:, :, 1, 1] = focal[:, :, 1] / 2 * H
            intr[:, :, 0, 2] = pp[:, :, 0] * W * 0.5
            intr[:, :, 1, 2] = pp[:, :, 1] * H * 0.5
            out["extrinsics"] = extr_w2c
            out["intrinsics"] = intr
        elif self.cam_dec is not None and S > 1:
            # Decode the cam-token of the final out_layer into a pose encoding.
            cam_feat = feats[-1][1]  # (B, S, dim_in_to_cam_dec)
            pose_enc = self.cam_dec(cam_feat)
            c2w_3x4, intr = pose_encoding_to_extri_intri(pose_enc, (H, W))
            # Match the upstream output convention: w2c (world->camera), 3x4.
            c2w_4x4 = torch.cat([
                c2w_3x4,
                torch.tensor([0, 0, 0, 1], device=c2w_3x4.device, dtype=c2w_3x4.dtype)
                .view(1, 1, 1, 4).expand(B, S, 1, 4),
            ], dim=-2)
            out["extrinsics"] = affine_inverse(c2w_4x4)[:, :, :3, :]
            out["intrinsics"] = intr

        # Flatten the views axis for per-pixel outputs (depth/conf/sky) so the
        # per-image consumer keeps its (B*S, H, W) interface.
        for k, v in head_out.items():
            if k in ("ray", "ray_conf"):
                # Keep multi-view shape for downstream pose work.
                out[k] = v
            elif v.ndim >= 3 and v.shape[0] == B and v.shape[1] == S:
                out[k] = v.reshape(B * S, *v.shape[2:])
            else:
                out[k] = v

        if export_feat_layers:
            out["aux_features"] = self._reshape_aux_features(aux_feats, H, W)
        return out
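
    # Monocular call sketch (illustrative pseudo-usage; real instantiation goes
    # through ``BaseModel``, which injects ``device``/``dtype``/``operations``):
    #   net = DepthAnything3Net(backbone_name="vitl", operations=ops)
    #   out = net(torch.randn(1, 3, 518, 518))    # H, W multiples of 14
    #   out["depth"].shape                        # -> (1, 518, 518)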

    def _reshape_aux_features(self, aux_feats, H: int, W: int):
        """Reshape ``(B, S, N, C)`` aux features into ``(B, S, h_p, w_p, C)``."""
        ph, pw = H // self.PATCH_SIZE, W // self.PATCH_SIZE
        out = []
        for f in aux_feats:
            B, S, N, C = f.shape
            assert N == ph * pw, f"aux feature seq mismatch: {N} != {ph}*{pw}"
            out.append(f.reshape(B, S, ph, pw, C))
        return out
@ -1,184 +0,0 @@
# Input/output preprocessing helpers for Depth Anything 3.
#
# Ported from:
#   src/depth_anything_3/utils/io/input_processor.py (image normalisation)
#   src/depth_anything_3/utils/alignment.py          (sky-aware depth clip)
#   src/depth_anything_3/model/da3.py::_process_mono_sky_estimation
#
# Resize: ``comfy.utils.common_upscale`` with ``upscale_method="lanczos"``.
# Upstream uses cv2 INTER_CUBIC (upscale) / INTER_AREA (downscale); a sweep
# across {bilinear, bicubic, area, lanczos, bislerp} on a 768->504 test image
# showed lanczos has the lowest max-abs-diff vs the upstream cv2 output
# (~0.13 vs 0.21-0.71 for the others), so we use it in both directions for
# simplicity. This keeps the path stateless, on-device, and free of any
# OpenCV dependency.

from __future__ import annotations

from typing import Tuple

import torch

import comfy.utils

PATCH_SIZE = 14

# ImageNet normalization constants used during DA3 training.
_IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406])
_IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225])


def _round_to_patch(x: int, patch: int = PATCH_SIZE) -> int:
    down = (x // patch) * patch
    up = down + patch
    return up if abs(up - x) <= abs(x - down) else down


def compute_target_size(orig_h: int, orig_w: int, process_res: int,
                        method: str = "upper_bound_resize") -> Tuple[int, int]:
    """Compute (target_h, target_w) for a single image.

    Methods:
    - "upper_bound_resize": scale the longest side to ``process_res``, then
      round each dim to the nearest multiple of 14 (default upstream method).
    - "lower_bound_resize": scale the shortest side to ``process_res``, then
      round.
    """
    if method == "upper_bound_resize":
        longest = max(orig_h, orig_w)
        scale = process_res / float(longest)
    elif method == "lower_bound_resize":
        shortest = min(orig_h, orig_w)
        scale = process_res / float(shortest)
    else:
        raise ValueError(f"Unsupported process_res_method: {method}")

    new_w = max(1, _round_to_patch(int(round(orig_w * scale))))
    new_h = max(1, _round_to_patch(int(round(orig_h * scale))))
    return new_h, new_w
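
# Worked example: a 768x1024 (H, W) image at process_res=504 with
# "upper_bound_resize" scales by 504/1024; 378 and 504 are already multiples
# of 14 (27 * 14 and 36 * 14), so no rounding shift occurs:
#   compute_target_size(768, 1024, 504)   # -> (378, 504)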


def preprocess_image(
    image: torch.Tensor,
    process_res: int = 504,
    method: str = "upper_bound_resize",
) -> torch.Tensor:
    """Preprocess a ComfyUI ``IMAGE`` batch for DA3.

    Args:
        image: ``(B, H, W, 3)`` float in [0, 1] (ComfyUI ``IMAGE`` convention).
        process_res: target resolution (longest or shortest side, depending
            on ``method``).
        method: resize strategy.

    Returns:
        ``(B, 3, H', W')`` tensor with H' and W' multiples of 14, normalised
        with ImageNet statistics. The tensor lives on the same device as
        ``image``.
    """
    assert image.ndim == 4 and image.shape[-1] == 3, \
        f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
    B, H, W, _ = image.shape
    target_h, target_w = compute_target_size(H, W, process_res, method)

    # (B, H, W, 3) -> (B, 3, H, W)
    x = image.movedim(-1, 1).contiguous()
    if (target_h, target_w) != (H, W):
        # Upstream uses cv2 INTER_CUBIC (upscale) / INTER_AREA (downscale).
        # Lanczos in ``common_upscale`` is anti-aliased and produces the
        # closest pixel-wise match in a sweep across {bilinear, bicubic,
        # area, lanczos, bislerp}. Used in both directions for simplicity.
        x = comfy.utils.common_upscale(
            x.float(), target_w, target_h, "lanczos", "disabled",
        )
        x = x.clamp(0.0, 1.0)

    mean = _IMAGENET_MEAN.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
    std = _IMAGENET_STD.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
    x = (x - mean) / std
    return x
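
# Usage sketch (ComfyUI IMAGE convention, illustrative):
#   img = torch.rand(1, 768, 1024, 3)     # (B, H, W, 3) floats in [0, 1]
#   x = preprocess_image(img)             # -> (1, 3, 378, 504), ImageNet-normalised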


# -----------------------------------------------------------------------------
# Output post-processing (sky-aware clipping for Mono/Metric variants)
# -----------------------------------------------------------------------------


def compute_non_sky_mask(sky_prediction: torch.Tensor, threshold: float = 0.3) -> torch.Tensor:
    """Boolean mask: True for non-sky pixels (sky probability < threshold)."""
    return sky_prediction < threshold


def apply_sky_aware_clip(
    depth: torch.Tensor,
    sky: torch.Tensor,
    threshold: float = 0.3,
    quantile: float = 0.99,
) -> torch.Tensor:
    """Replicates ``_process_mono_sky_estimation`` from upstream.

    Clips sky regions to the 99th percentile of non-sky depth. Returns a new
    depth tensor; ``depth`` is not modified in place.
    """
    non_sky = compute_non_sky_mask(sky, threshold=threshold)
    if non_sky.sum() <= 10 or (~non_sky).sum() <= 10:
        return depth.clone()

    non_sky_depth = depth[non_sky]
    if non_sky_depth.numel() > 100_000:
        idx = torch.randint(0, non_sky_depth.numel(), (100_000,), device=non_sky_depth.device)
        sampled = non_sky_depth[idx]
    else:
        sampled = non_sky_depth

    max_depth = torch.quantile(sampled, quantile)
    out = depth.clone()
    out[~non_sky] = max_depth
    return out


def normalize_depth_v2_style(
    depth: torch.Tensor,
    sky: torch.Tensor | None = None,
    low_quantile: float = 0.01,
    high_quantile: float = 0.99,
) -> torch.Tensor:
    """V2-style normalization for ControlNet workflows.

    Computes percentile bounds over non-sky pixels (when available),
    then maps depth into [0, 1] with near = white (1.0).
    """
    if sky is not None:
        mask = compute_non_sky_mask(sky)
        if mask.any():
            valid = depth[mask]
        else:
            valid = depth.flatten()
    else:
        valid = depth.flatten()

    if valid.numel() > 100_000:
        idx = torch.randint(0, valid.numel(), (100_000,), device=valid.device)
        sample = valid[idx]
    else:
        sample = valid

    lo = torch.quantile(sample, low_quantile)
    hi = torch.quantile(sample, high_quantile)
    rng = (hi - lo).clamp(min=1e-6)
    norm = ((depth - lo) / rng).clamp(0.0, 1.0)
    # ControlNet convention: nearer pixels are brighter (1.0).
    norm = 1.0 - norm
    if sky is not None:
        # Sky pixels become black (far / unknown).
        sky_mask = ~compute_non_sky_mask(sky)
        norm = torch.where(sky_mask, torch.zeros_like(norm), norm)
    return norm


def normalize_depth_min_max(depth: torch.Tensor) -> torch.Tensor:
    """Simple per-frame min/max normalization with near=1.0 convention."""
    lo = depth.amin(dim=(-2, -1), keepdim=True)
    hi = depth.amax(dim=(-2, -1), keepdim=True)
    rng = (hi - lo).clamp(min=1e-6)
    return 1.0 - ((depth - lo) / rng).clamp(0.0, 1.0)
@ -1,320 +0,0 @@
"""Ray-to-pose conversion for the multi-view path of Depth Anything 3.

Converts the auxiliary "ray" output of :class:`DualDPT` (per-pixel camera
ray vectors, predicted on the per-view local feature map) into per-view
extrinsics + intrinsics. Implementation is a 1:1 port of
``depth_anything_3.utils.ray_utils`` upstream, using a weighted-RANSAC
homography fit followed by a QL decomposition.

No learned parameters; pure tensor math. Output:

* ``R`` -- ``(B, S, 3, 3)`` rotation matrix
* ``T`` -- ``(B, S, 3)`` camera-space translation
* ``focal_lengths`` -- ``(B, S, 2)`` in normalised image space (image=2x2)
* ``principal_points`` -- ``(B, S, 2)`` ditto

:func:`get_extrinsic_from_camray` wraps these into a 4x4 extrinsic matrix
that the public node converts back into pixel-space intrinsics.
"""

from __future__ import annotations

from typing import Optional, Tuple

import torch


# -----------------------------------------------------------------------------
# Linear-algebra helpers
# -----------------------------------------------------------------------------


def _ql_decomposition(A: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Decompose ``A = Q @ L`` with ``Q`` orthogonal and ``L`` lower-triangular.

    Implemented in terms of QR by reversing the columns/rows; the standard
    trick from the upstream reference. Inputs ``A`` are ``(3, 3)``.
    """
    P = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]],
                     device=A.device, dtype=A.dtype)
    A_tilde = A @ P
    # CUDA QR is not implemented for fp16/bf16; upcast just for this call.
    Q_tilde, R_tilde = torch.linalg.qr(A_tilde.float())
    Q_tilde = Q_tilde.to(A.dtype)
    R_tilde = R_tilde.to(A.dtype)
    Q = Q_tilde @ P
    L = P @ R_tilde @ P
    d = torch.diag(L)
    sign = torch.sign(d)
    Q = Q * sign[None, :]  # scale columns of Q
    L = L * sign[:, None]  # scale rows of L
    return Q, L
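
# Sanity sketch: ``Q`` is orthogonal, ``L`` lower-triangular with positive
# diagonal, and the product reconstructs ``A`` up to float error:
#   A = torch.randn(3, 3)
#   Q, L = _ql_decomposition(A)
#   torch.allclose(Q @ L, A, atol=1e-5)                  # -> True
#   torch.allclose(Q @ Q.T, torch.eye(3), atol=1e-5)     # -> True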


def _homogenize_points(points: torch.Tensor) -> torch.Tensor:
    return torch.cat([points, torch.ones_like(points[..., :1])], dim=-1)


# -----------------------------------------------------------------------------
# Weighted-LSQ + RANSAC homography (batched)
# -----------------------------------------------------------------------------


def _find_homography_weighted_lsq(
    src_pts: torch.Tensor,
    dst_pts: torch.Tensor,
    confident_weight: torch.Tensor,
) -> torch.Tensor:
    """Solve a single ``H`` with weighted least-squares (DLT)."""
    N = src_pts.shape[0]
    if N < 4:
        raise ValueError("At least 4 points are required to compute a homography.")
    w = confident_weight.sqrt().unsqueeze(1)  # (N, 1)
    x = src_pts[:, 0:1]
    y = src_pts[:, 1:2]
    u = dst_pts[:, 0:1]
    v = dst_pts[:, 1:2]
    zeros = torch.zeros_like(x)
    A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=1)
    A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=1)
    A = torch.cat([A1, A2], dim=0)  # (2N, 9)
    # CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
    _, _, Vh = torch.linalg.svd(A.float())
    Vh = Vh.to(A.dtype)
    H = Vh[-1].reshape(3, 3)
    return H / H[-1, -1]


def _find_homography_weighted_lsq_batched(
    src_pts_batch: torch.Tensor,
    dst_pts_batch: torch.Tensor,
    confident_weight_batch: torch.Tensor,
) -> torch.Tensor:
    """Batched DLT solver. Inputs ``(B, K, 2)`` / ``(B, K)``; output ``(B, 3, 3)``."""
    B, K, _ = src_pts_batch.shape
    w = confident_weight_batch.sqrt().unsqueeze(2)
    x = src_pts_batch[:, :, 0:1]
    y = src_pts_batch[:, :, 1:2]
    u = dst_pts_batch[:, :, 0:1]
    v = dst_pts_batch[:, :, 1:2]
    zeros = torch.zeros_like(x)
    A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=2)
    A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=2)
    A = torch.cat([A1, A2], dim=1)  # (B, 2K, 9)
    # CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
    _, _, Vh = torch.linalg.svd(A.float())
    Vh = Vh.to(A.dtype)
    H = Vh[:, -1].reshape(B, 3, 3)
    return H / H[:, 2:3, 2:3]


def _ransac_find_homography_weighted_batched(
    src_pts: torch.Tensor,           # (B, N, 2)
    dst_pts: torch.Tensor,           # (B, N, 2)
    confident_weight: torch.Tensor,  # (B, N)
    n_sample: int,
    n_iter: int = 100,
    reproj_threshold: float = 3.0,
    num_sample_for_ransac: int = 8,
    max_inlier_num: int = 10000,
    rand_sample_iters_idx: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Batched weighted-RANSAC homography estimator.

    Returns ``(B, 3, 3)`` homography matrices.
    """
    B, N, _ = src_pts.shape
    assert N >= 4
    device = src_pts.device

    sorted_idx = torch.argsort(confident_weight, descending=True, dim=1)
    candidate_idx = sorted_idx[:, :n_sample]  # (B, n_sample)

    if rand_sample_iters_idx is None:
        rand_sample_iters_idx = torch.stack(
            [torch.randperm(n_sample, device=device)[:num_sample_for_ransac]
             for _ in range(n_iter)],
            dim=0,
        )

    rand_idx = candidate_idx[:, rand_sample_iters_idx]  # (B, n_iter, k)
    b_idx = (
        torch.arange(B, device=device)
        .view(B, 1, 1)
        .expand(B, n_iter, num_sample_for_ransac)
    )
    src_b = src_pts[b_idx, rand_idx]
    dst_b = dst_pts[b_idx, rand_idx]
    w_b = confident_weight[b_idx, rand_idx]

    cB, cN = src_b.shape[:2]
    H_batch = _find_homography_weighted_lsq_batched(
        src_b.flatten(0, 1), dst_b.flatten(0, 1), w_b.flatten(0, 1),
    ).unflatten(0, (cB, cN))  # (B, n_iter, 3, 3)

    src_homo = torch.cat([src_pts, torch.ones(B, N, 1, device=device, dtype=src_pts.dtype)], dim=2)
    proj = torch.bmm(
        src_homo.unsqueeze(1).expand(B, n_iter, N, 3).reshape(-1, N, 3),
        H_batch.reshape(-1, 3, 3).transpose(1, 2),
    )  # (B*n_iter, N, 3)
    proj_xy = (proj[:, :, :2] / proj[:, :, 2:3]).reshape(B, n_iter, N, 2)
    err = ((proj_xy - dst_pts.unsqueeze(1)) ** 2).sum(-1).sqrt()  # (B, n_iter, N)
    inlier_mask = err < reproj_threshold
    score = (inlier_mask * confident_weight.unsqueeze(1)).sum(dim=2)
    best_idx = torch.argmax(score, dim=1)
    best_inlier_mask = inlier_mask[torch.arange(B, device=device), best_idx]

    # Refit with the inlier set (per-batch, since the inlier counts vary).
    H_inlier_list = []
    for b in range(B):
        mask = best_inlier_mask[b]
        in_src = src_pts[b][mask]
        in_dst = dst_pts[b][mask]
        in_w = confident_weight[b][mask]
        if in_src.shape[0] < 4:
            # Fall back to identity when RANSAC fails to find enough inliers.
            H_inlier_list.append(torch.eye(3, device=device, dtype=src_pts.dtype))
            continue
        sorted_w = torch.argsort(in_w, descending=True)
        if len(sorted_w) > max_inlier_num:
            keep = max(int(len(sorted_w) * 0.95), max_inlier_num)
            sorted_w = sorted_w[:keep][torch.randperm(keep, device=device)[:max_inlier_num]]
        H_inlier_list.append(
            _find_homography_weighted_lsq(in_src[sorted_w], in_dst[sorted_w], in_w[sorted_w])
        )
    return torch.stack(H_inlier_list, dim=0)
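
# Call sketch (dummy near-identity correspondences, illustrative): fit one
# homography per batch item from B=2 sets of N=64 weighted point pairs:
#   src = torch.rand(2, 64, 2)
#   dst = src + 0.01 * torch.randn(2, 64, 2)
#   H = _ransac_find_homography_weighted_batched(src, dst, torch.ones(2, 64),
#                                                n_sample=32)
#   H.shape   # -> (2, 3, 3), each normalised so H[b, 2, 2] == 1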
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Camera-ray utilities
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _unproject_identity(num_y: int, num_x: int, B: int, S: int,
|
||||
device, dtype) -> torch.Tensor:
|
||||
"""Camera-space unit rays for an identity intrinsic on a 2x2 image plane.
|
||||
|
||||
Replicates ``unproject_depth(..., ixt_normalized=True)`` upstream: pixel
|
||||
coords ``(x, y)`` in ``[dx, 2-dx] x [dy, 2-dy]`` get mapped to
|
||||
camera-space rays ``(x-1, y-1, 1)`` via the identity intrinsic
|
||||
``[[1,0,1],[0,1,1],[0,0,1]]``. Returns ``(B, S, num_y, num_x, 3)``.
|
||||
"""
|
||||
dx = 1.0 / num_x
|
||||
dy = 1.0 / num_y
|
||||
# Centered camera-space coords directly (skip the K^-1 step since it's
|
||||
# just a translation by -1 on x and y when K is identity-with-center=1).
|
||||
y = torch.linspace(-(1 - dy), (1 - dy), num_y, device=device, dtype=dtype)
|
||||
x = torch.linspace(-(1 - dx), (1 - dx), num_x, device=device, dtype=dtype)
|
||||
yy, xx = torch.meshgrid(y, x, indexing="ij")
|
||||
grid = torch.stack((xx, yy), dim=-1) # (h, w, 2)
|
||||
grid = grid.unsqueeze(0).unsqueeze(0).expand(B, S, num_y, num_x, 2)
|
||||
return torch.cat([grid, torch.ones_like(grid[..., :1])], dim=-1)


def _camray_to_caminfo(
    camray: torch.Tensor,  # (B, S, h, w, 6)
    confidence: Optional[torch.Tensor] = None,  # (B, S, h, w)
    reproj_threshold: float = 0.2,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Convert per-pixel camera rays to per-view (R, T, focal, principal)."""
    if confidence is None:
        confidence = torch.ones_like(camray[..., 0])
    B, S, h, w, _ = camray.shape
    device = camray.device
    dtype = camray.dtype

    rays_target = camray[..., :3]  # (B, S, h, w, 3)
    rays_origin = _unproject_identity(h, w, B, S, device, dtype)

    # Flatten (B*S, h*w, *) for the RANSAC routine.
    rays_target = rays_target.flatten(0, 1).flatten(1, 2)
    rays_origin = rays_origin.flatten(0, 1).flatten(1, 2)
    weights = confidence.flatten(0, 1).flatten(1, 2).clone()

    # Project to 2D in homogeneous form (upstream calls this "perspective division").
    z_thresh = 1e-4
    mask = (rays_target[:, :, 2].abs() > z_thresh) & (rays_origin[:, :, 2].abs() > z_thresh)
    weights = torch.where(mask, weights, torch.zeros_like(weights))
    src = rays_origin.clone()
    dst = rays_target.clone()
    src[..., 0] = torch.where(mask, src[..., 0] / src[..., 2], src[..., 0])
    src[..., 1] = torch.where(mask, src[..., 1] / src[..., 2], src[..., 1])
    dst[..., 0] = torch.where(mask, dst[..., 0] / dst[..., 2], dst[..., 0])
    dst[..., 1] = torch.where(mask, dst[..., 1] / dst[..., 2], dst[..., 1])
    src = src[..., :2]
    dst = dst[..., :2]

    N = src.shape[1]
    n_iter = 100
    sample_ratio = 0.3
    num_sample_for_ransac = 8
    n_sample = max(num_sample_for_ransac, int(N * sample_ratio))
    rand_idx = torch.stack(
        [torch.randperm(n_sample, device=device)[:num_sample_for_ransac] for _ in range(n_iter)],
        dim=0,
    )

    # Chunk along the view axis to keep peak memory predictable.
    chunk = 2
    A_list = []
    for i in range(0, src.shape[0], chunk):
        A = _ransac_find_homography_weighted_batched(
            src[i:i + chunk], dst[i:i + chunk], weights[i:i + chunk],
            n_sample=n_sample, n_iter=n_iter,
            num_sample_for_ransac=num_sample_for_ransac,
            reproj_threshold=reproj_threshold,
            rand_sample_iters_idx=rand_idx,
            max_inlier_num=8000,
        )
        # Flip the sign of any H whose determinant is negative (so the QL
        # decomposition produces a right-handed rotation). ``det`` lacks
        # fp16/bf16 CUDA kernels, so do the comparison in fp32.
        flip = torch.linalg.det(A.float()) < 0
        A = torch.where(flip[:, None, None], -A, A)
        A_list.append(A)
    A = torch.cat(A_list, dim=0)  # (B*S, 3, 3)

    R_list, f_list, pp_list = [], [], []
    for i in range(A.shape[0]):
        R, L = _ql_decomposition(A[i])
        L = L / L[2][2]
        f_list.append(torch.stack((L[0][0], L[1][1])))
        pp_list.append(torch.stack((L[2][0], L[2][1])))
        R_list.append(R)
    R = torch.stack(R_list).reshape(B, S, 3, 3)
    focal = torch.stack(f_list).reshape(B, S, 2)
    pp = torch.stack(pp_list).reshape(B, S, 2)

    # Translation: confidence-weighted average of the camray translation
    # channels (the last three of the six).
    cf = confidence.flatten(0, 1).flatten(1, 2)
    T = (camray.flatten(0, 1).flatten(1, 2)[..., 3:] * cf.unsqueeze(-1)).sum(dim=1)
    T = T / cf.sum(dim=-1, keepdim=True)
    T = T.reshape(B, S, 3)

    # Match upstream output convention: focal -> 1/focal, pp + 1.
    return R, T, 1.0 / focal, pp + 1.0


def get_extrinsic_from_camray(
    camray: torch.Tensor,  # (B, S, h, w, 6)
    conf: torch.Tensor,  # (B, S, h, w, 1) or (B, S, h, w)
    patch_size_y: int,
    patch_size_x: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Wrap :func:`_camray_to_caminfo` into a 4x4 extrinsic plus per-view
    focal and principal-point output.

    Returns:
        * extrinsic ``(B, S, 4, 4)`` camera-to-world (the inverse is what
          the caller stores in ``output.extrinsics``).
        * focals ``(B, S, 2)`` in normalised image space.
        * pp ``(B, S, 2)`` in normalised image space.
    """
    if conf.ndim == 5 and conf.shape[-1] == 1:
        conf = conf.squeeze(-1)
    R, T, focal, pp = _camray_to_caminfo(camray, confidence=conf)
    extr = torch.cat([R, T.unsqueeze(-1)], dim=-1)  # (B, S, 3, 4)
    homo_row = torch.tensor([0, 0, 0, 1], dtype=R.dtype, device=R.device)
    homo_row = homo_row.view(1, 1, 1, 4).expand(R.shape[0], R.shape[1], 1, 4)
    extr = torch.cat([extr, homo_row], dim=-2)  # (B, S, 4, 4)
    return extr, focal, pp
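
# Hedged usage sketch (random inputs, shapes only; the values are meaningless):
# >>> camray = torch.randn(1, 2, 16, 16, 6)  # 3 ray-dir + 3 translation channels
# >>> conf = torch.rand(1, 2, 16, 16, 1)
# >>> extr, focal, pp = get_extrinsic_from_camray(camray, conf, 16, 16)
# >>> extr.shape, focal.shape, pp.shape
# (torch.Size([1, 2, 4, 4]), torch.Size([1, 2, 2]), torch.Size([1, 2, 2]))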

@ -1,116 +0,0 @@
"""Reference-view selection for the multi-view path of Depth Anything 3.

Pure tensor math, no learned parameters. Exposed as three free functions:

* :func:`select_reference_view` -- pick a reference view per batch.
* :func:`reorder_by_reference` -- move the reference view to position 0.
* :func:`restore_original_order` -- inverse of :func:`reorder_by_reference`.

Mirrors ``depth_anything_3.model.reference_view_selector`` upstream.
The default strategy (``"saddle_balanced"``) selects the view whose
cls-token metrics (mean pairwise similarity, feature norm, feature
variance) sit closest to the middle of their per-batch ranges.
"""

from __future__ import annotations

from typing import Literal

import torch


RefViewStrategy = Literal["first", "middle", "saddle_balanced", "saddle_sim_range"]


# Per the upstream constants module: ``THRESH_FOR_REF_SELECTION = 3``.
# Reference selection only runs when there are at least this many views.
THRESH_FOR_REF_SELECTION: int = 3


def select_reference_view(
    x: torch.Tensor,
    strategy: RefViewStrategy = "saddle_balanced",
) -> torch.Tensor:
    """Pick a reference view index per batch element.

    Args:
        x: ``(B, S, N, C)`` token tensor. Index 0 along ``N`` is the
            cls/cam token used by the feature-based strategies.
        strategy: One of ``"first" | "middle" | "saddle_balanced" |
            "saddle_sim_range"``.

    Returns:
        ``(B,)`` long tensor with the chosen reference view index for
        each batch element.
    """
    B, S, _, _ = x.shape
    if S <= 1:
        return torch.zeros(B, dtype=torch.long, device=x.device)
    if strategy == "first":
        return torch.zeros(B, dtype=torch.long, device=x.device)
    if strategy == "middle":
        return torch.full((B,), S // 2, dtype=torch.long, device=x.device)

    # Feature-based strategies: normalised cls/cam token per view.
    img_class_feat = x[:, :, 0] / x[:, :, 0].norm(dim=-1, keepdim=True)  # (B,S,C)

    if strategy == "saddle_balanced":
        sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2))  # (B,S,S)
        sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
        sim_score = sim_no_diag.sum(dim=-1) / (S - 1)  # (B,S)
        feat_norm = x[:, :, 0].norm(dim=-1)  # (B,S)
        feat_var = img_class_feat.var(dim=-1)  # (B,S)

        def _normalize(metric):
            mn = metric.min(dim=1, keepdim=True).values
            mx = metric.max(dim=1, keepdim=True).values
            return (metric - mn) / (mx - mn + 1e-8)

        sim_n, norm_n, var_n = _normalize(sim_score), _normalize(feat_norm), _normalize(feat_var)
        balance = (sim_n - 0.5).abs() + (norm_n - 0.5).abs() + (var_n - 0.5).abs()
        return balance.argmin(dim=1)

    if strategy == "saddle_sim_range":
        sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2))
        sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
        sim_max = sim_no_diag.max(dim=-1).values
        sim_min = sim_no_diag.min(dim=-1).values
        return (sim_max - sim_min).argmax(dim=1)

    raise ValueError(
        f"Unknown reference view selection strategy: {strategy!r}. "
        f"Must be one of: 'first', 'middle', 'saddle_balanced', 'saddle_sim_range'"
    )
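
# Hedged sketch: with four views the "middle" strategy always returns S // 2.
# >>> x = torch.randn(2, 4, 5, 8)  # (B, S, N, C); index 0 along N is the cls token
# >>> select_reference_view(x, strategy="middle")
# tensor([2, 2])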


def reorder_by_reference(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
    """Reorder ``x`` so the reference view is at position 0 along axis ``S``."""
    B, S = x.shape[0], x.shape[1]
    if S <= 1:
        return x
    positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
    b_idx_exp = b_idx.unsqueeze(1)
    reorder = torch.where(
        (positions > 0) & (positions <= b_idx_exp),
        positions - 1,
        positions,
    )
    reorder[:, 0] = b_idx
    batch = torch.arange(B, device=x.device).unsqueeze(1)
    return x[batch, reorder]


def restore_original_order(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
    """Inverse of :func:`reorder_by_reference`."""
    B, S = x.shape[0], x.shape[1]
    if S <= 1:
        return x
    target_positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
    b_idx_exp = b_idx.unsqueeze(1)
    restore = torch.where(target_positions < b_idx_exp,
                          target_positions + 1,
                          target_positions)
    restore = torch.scatter(
        restore, dim=1, index=b_idx_exp, src=torch.zeros_like(b_idx_exp),
    )
    batch = torch.arange(B, device=x.device).unsqueeze(1)
    return x[batch, restore]
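
# Hedged round-trip check: reordering then restoring is the identity.
# >>> x = torch.arange(8, dtype=torch.float32).view(1, 4, 2)
# >>> ref = torch.tensor([2])
# >>> y = reorder_by_reference(x, ref)  # view 2 moves to position 0
# >>> torch.equal(restore_original_order(y, ref), x)
# True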

@ -1,180 +0,0 @@
"""Geometry / camera transform helpers for Depth Anything 3.

Pure tensor math, no learned parameters. Mirrors the upstream
``depth_anything_3.model.utils.transform`` and the parts of
``depth_anything_3.utils.geometry`` used at inference time on the
multi-view + camera path. Kept self-contained so the DA3 module is fully
ported and does not depend on the upstream repo at runtime.
"""

from __future__ import annotations

from typing import Tuple

import torch
import torch.nn.functional as F


# -----------------------------------------------------------------------------
# Affine 4x4 helpers
# -----------------------------------------------------------------------------


def as_homogeneous(ext: torch.Tensor) -> torch.Tensor:
    """Promote ``(...,3,4)`` extrinsics to ``(...,4,4)`` homogeneous form.

    A no-op when the input is already ``(...,4,4)``.
    """
    if ext.shape[-2:] == (4, 4):
        return ext
    if ext.shape[-2:] == (3, 4):
        bottom = torch.zeros_like(ext[..., :1, :4])
        bottom[..., 0, 3] = 1.0
        return torch.cat([ext, bottom], dim=-2)
    raise ValueError(f"Invalid affine shape: {ext.shape}")


def affine_inverse(A: torch.Tensor) -> torch.Tensor:
    """Inverse of an affine matrix ``[R|T; 0 0 0 1]``."""
    R = A[..., :3, :3]
    T = A[..., :3, 3:]
    P = A[..., 3:, :]
    return torch.cat([torch.cat([R.mT, -R.mT @ T], dim=-1), P], dim=-2)
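
# Hedged check: an affine composed with its affine_inverse is the identity.
# >>> A = torch.eye(4)
# >>> A[:3, 3] = torch.tensor([1.0, 2.0, 3.0])
# >>> torch.allclose(affine_inverse(A) @ A, torch.eye(4))
# True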


# -----------------------------------------------------------------------------
# Quaternion <-> rotation matrix (xyzw / scalar-last)
# -----------------------------------------------------------------------------


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """``sqrt(max(0, x))`` with a zero subgradient where ``x == 0``."""
    ret = torch.zeros_like(x)
    positive_mask = x > 0
    if torch.is_grad_enabled():
        ret[positive_mask] = torch.sqrt(x[positive_mask])
    else:
        ret = torch.where(positive_mask, torch.sqrt(x), ret)
    return ret


def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """Force the real part of a unit quaternion (xyzw) to be non-negative."""
    return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)


def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
    """Convert quaternions (xyzw) to ``(...,3,3)`` rotation matrices."""
    i, j, k, r = torch.unbind(quaternions, -1)
    two_s = 2.0 / (quaternions * quaternions).sum(-1)
    o = torch.stack(
        (
            1 - two_s * (j * j + k * k),
            two_s * (i * j - k * r),
            two_s * (i * k + j * r),
            two_s * (i * j + k * r),
            1 - two_s * (i * i + k * k),
            two_s * (j * k - i * r),
            two_s * (i * k - j * r),
            two_s * (j * k + i * r),
            1 - two_s * (i * i + j * j),
        ),
        -1,
    )
    return o.reshape(quaternions.shape[:-1] + (3, 3))


def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
    """Convert ``(...,3,3)`` rotation matrices to quaternions (xyzw)."""
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    batch_dim = matrix.shape[:-2]
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
        matrix.reshape(batch_dim + (9,)), dim=-1
    )

    q_abs = _sqrt_positive_part(
        torch.stack(
            [
                1.0 + m00 + m11 + m22,
                1.0 + m00 - m11 - m22,
                1.0 - m00 + m11 - m22,
                1.0 - m00 - m11 + m22,
            ],
            dim=-1,
        )
    )

    quat_by_rijk = torch.stack(
        [
            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
        ],
        dim=-2,
    )

    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))

    out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(
        batch_dim + (4,)
    )
    # Reorder rijk -> xyzw (i.e. ijkr).
    out = out[..., [1, 2, 3, 0]]
    return standardize_quaternion(out)
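
# Hedged round-trip: the identity rotation maps to the unit quaternion (xyzw)
# with w = 1, and back.
# >>> q = mat_to_quat(torch.eye(3))
# >>> q
# tensor([0., 0., 0., 1.])
# >>> torch.allclose(quat_to_mat(q), torch.eye(3))
# True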


# -----------------------------------------------------------------------------
# Pose-encoding <-> extrinsics + intrinsics
# -----------------------------------------------------------------------------


def extri_intri_to_pose_encoding(
    extrinsics: torch.Tensor,
    intrinsics: torch.Tensor,
    image_size_hw: Tuple[int, int],
) -> torch.Tensor:
    """Pack ``(extr, intr, image_size)`` into the 9-D pose-encoding vector.

    ``extrinsics`` are camera-to-world (c2w) ``(B,S,4,4)`` matrices,
    ``intrinsics`` are pixel-space ``(B,S,3,3)`` matrices, and
    ``image_size_hw`` is an ``(H, W)`` pair. The encoding is
    ``[T(3), quat_xyzw(4), fov_h, fov_w]``.
    """
    R = extrinsics[..., :3, :3]
    T = extrinsics[..., :3, 3]
    quat = mat_to_quat(R)
    H, W = image_size_hw
    fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
    fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
    return torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()


def pose_encoding_to_extri_intri(
    pose_encoding: torch.Tensor,
    image_size_hw: Tuple[int, int],
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Inverse of :func:`extri_intri_to_pose_encoding`.

    Returns a ``(B,S,3,4)`` c2w extrinsic matrix and a ``(B,S,3,3)``
    pixel-space intrinsic matrix.
    """
    T = pose_encoding[..., :3]
    quat = pose_encoding[..., 3:7]
    fov_h = pose_encoding[..., 7]
    fov_w = pose_encoding[..., 8]
    R = quat_to_mat(quat)
    extrinsics = torch.cat([R, T[..., None]], dim=-1)
    H, W = image_size_hw
    fy = (H / 2.0) / torch.clamp(torch.tan(fov_h / 2.0), 1e-6)
    fx = (W / 2.0) / torch.clamp(torch.tan(fov_w / 2.0), 1e-6)
    intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3),
                             device=pose_encoding.device, dtype=pose_encoding.dtype)
    intrinsics[..., 0, 0] = fx
    intrinsics[..., 1, 1] = fy
    intrinsics[..., 0, 2] = W / 2
    intrinsics[..., 1, 2] = H / 2
    intrinsics[..., 2, 2] = 1.0
    return extrinsics, intrinsics
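
# Hedged round-trip (identity pose, 90-degree FOV on a 64x64 image):
# >>> extr = torch.eye(4).expand(1, 1, 4, 4)
# >>> intr = torch.tensor([[[[32., 0., 32.], [0., 32., 32.], [0., 0., 1.]]]])
# >>> enc = extri_intri_to_pose_encoding(extr, intr, (64, 64))
# >>> e2, i2 = pose_encoding_to_extri_intri(enc, (64, 64))
# >>> torch.allclose(e2, extr[..., :3, :]) and torch.allclose(i2, intr)
# True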

@ -1,41 +0,0 @@
"""HiDream-O1 two-pass attention: tokens [0, ar_len) are causal, [ar_len, T)
attend full K/V. Splitting Q at the boundary avoids the (B, 1, T, T) additive
mask the general-purpose path would build (~500 MB at T~16K) and lets the
gen half hit the user's preferred backend via optimized_attention.
"""

import torch

import comfy.ops
from comfy.ldm.modules.attention import optimized_attention


def make_two_pass_attention(ar_len: int, transformer_options=None):
    """Build a two-pass attention callable: the AR pass uses causal SDPA
    directly, the gen pass routes through optimized_attention.

    The AR pass goes through SDPA directly and bypasses attention wrappers;
    it is only ~1% of T at typical edit sizes.
    """

    def two_pass_attention(q, k, v, heads, **kwargs):
        B, H, T, D = q.shape

        if T < k.shape[2]:
            # KV-cache hot path: Q is shorter than K/V (the cached AR prefix
            # lives in K/V only), so every fresh Q position sits in the gen
            # region and a single full-attention call suffices.
            out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
        elif ar_len >= T:
            out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
        elif ar_len <= 0:
            out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
        else:
            out_ar = comfy.ops.scaled_dot_product_attention(
                q[:, :, :ar_len], k[:, :, :ar_len], v[:, :, :ar_len],
                attn_mask=None, dropout_p=0.0, is_causal=True,
            )
            out_gen = optimized_attention(
                q[:, :, ar_len:], k, v, heads,
                mask=None, skip_reshape=True, skip_output_reshape=True,
                transformer_options=transformer_options,
            )
            out = torch.cat([out_ar, out_gen], dim=2)

        return out.transpose(1, 2).reshape(B, T, H * D)

    return two_pass_attention
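
# Hedged shape sketch: with T = 6 and ar_len = 2, the AR half attends
# causally inside [0, 2) while the gen half sees all 6 keys.
# >>> attn = make_two_pass_attention(ar_len=2)
# >>> q = k = v = torch.randn(1, 4, 6, 8)  # (B, heads, T, head_dim)
# >>> attn(q, k, v, heads=4).shape
# torch.Size([1, 6, 32])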

@ -1,230 +0,0 @@
"""HiDream-O1 conditioning prep — ref-image dual path + extra_conds assembly.

Each ref image goes through two paths: a 32x32 patchified stream concatenated
to the noised target, and a Qwen3-VL ViT path producing tokens that scatter
into input_ids at <|image_pad|> positions.
"""

from typing import List, Optional

import torch

import comfy.utils
from comfy.text_encoders.qwen_vl import process_qwen2vl_images

from .utils import (PATCH_SIZE, calculate_dimensions, cond_image_size, ref_max_size, resize_tensor)

# Qwen3-VL ViT preprocessing constants (preprocessor_config.json).
VIT_PATCH = 16
VIT_MERGE = 2
VIT_IMAGE_MEAN = [0.5, 0.5, 0.5]
VIT_IMAGE_STD = [0.5, 0.5, 0.5]


def prepare_ref_images(
    ref_images: List[torch.Tensor],
    target_h: int,
    target_w: int,
    device: torch.device,
    dtype: torch.dtype,
):
    """Build the dual-path tensors for K reference images at (target_h, target_w).

    Returns None for K=0, else a dict with ref_patches, ref_pixel_values,
    ref_image_grid_thw, per_ref_vit_tokens, per_ref_patch_grids.
    """
    K = len(ref_images)
    if K == 0:
        return None
    max_size = ref_max_size(max(target_h, target_w), K)
    cis = cond_image_size(K)

    refs_t = [img[0].clamp(0, 1).permute(2, 0, 1).unsqueeze(0).contiguous().float() for img in ref_images]
    refs_t = [resize_tensor(t, max_size, PATCH_SIZE) for t in refs_t]

    # 32-patch path.
    ref_patches_per = []
    per_ref_patch_grids = []
    for t in refs_t:
        t_norm = (t.squeeze(0) - 0.5) / 0.5  # (3, H, W) in [-1, 1]
        h_p, w_p = t_norm.shape[-2] // PATCH_SIZE, t_norm.shape[-1] // PATCH_SIZE
        per_ref_patch_grids.append((h_p, w_p))
        patches = (
            t_norm.reshape(3, h_p, PATCH_SIZE, w_p, PATCH_SIZE)
            .permute(1, 3, 0, 2, 4)
            .reshape(h_p * w_p, 3 * PATCH_SIZE * PATCH_SIZE)
        )
        ref_patches_per.append(patches)
    ref_patches = torch.cat(ref_patches_per, dim=0).unsqueeze(0).to(device=device, dtype=dtype)

    # ViT path.
    refs_vlm_t = []
    for t in refs_t:
        _, _, h, w = t.shape
        cond_w, cond_h = calculate_dimensions(cis, w / h)
        cond_w = max(cond_w, VIT_PATCH * VIT_MERGE)
        cond_h = max(cond_h, VIT_PATCH * VIT_MERGE)
        refs_vlm_t.append(comfy.utils.common_upscale(t, cond_w, cond_h, "lanczos", "disabled"))

    pv_list, grid_list, per_ref_vit_tokens = [], [], []
    for t_v in refs_vlm_t:
        pv, grid_thw = process_qwen2vl_images(
            t_v.permute(0, 2, 3, 1),
            min_pixels=0, max_pixels=10**12,
            patch_size=VIT_PATCH, merge_size=VIT_MERGE,
            image_mean=VIT_IMAGE_MEAN, image_std=VIT_IMAGE_STD,
        )
        grid_thw = grid_thw[0]
        pv_list.append(pv.to(device=device, dtype=dtype))
        grid_list.append(grid_thw.to(device=device))
        # Post-merge token count = the number of <|image_pad|> tokens this
        # image expands to in input_ids.
        gh, gw = int(grid_thw[1].item()), int(grid_thw[2].item())
        per_ref_vit_tokens.append((gh // VIT_MERGE) * (gw // VIT_MERGE))

    return {
        "ref_patches": ref_patches,
        "ref_pixel_values": torch.cat(pv_list, dim=0),
        "ref_image_grid_thw": torch.stack(grid_list, dim=0),
        "per_ref_vit_tokens": per_ref_vit_tokens,
        "per_ref_patch_grids": per_ref_patch_grids,
    }
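
# Hedged patchify sketch (toy tensor, not upstream data): a (3, 64, 96) image
# at PATCH_SIZE = 32 becomes a (2*3, 3072) matrix of raw-RGB patch features.
# >>> t = torch.randn(3, 64, 96)
# >>> t.reshape(3, 2, 32, 3, 32).permute(1, 3, 0, 2, 4).reshape(6, 3 * 32 * 32).shape
# torch.Size([6, 3072])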


def build_ref_input_ids(
    text_input_ids: torch.Tensor,
    per_ref_vit_tokens: List[int],
    image_token_id: int,
    vision_start_id: int,
    vision_end_id: int,
):
    """Splice [vision_start, image_pad*N, vision_end] blocks into input_ids
    after the [im_start, user, \\n] prefix (matches the original chat template).
    """
    ids = text_input_ids[0].tolist()
    inserted = []
    for n_pad in per_ref_vit_tokens:
        inserted.extend([vision_start_id] + [image_token_id] * n_pad + [vision_end_id])
    new_ids = ids[:3] + inserted + ids[3:]  # 3 = len([im_start, user, \n])
    return torch.tensor([new_ids], dtype=text_input_ids.dtype, device=text_input_ids.device)
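
# Hedged splice sketch (toy token ids, not the real vocabulary): a 5-token
# prompt with one ref that expands to two <|image_pad|> tokens.
# >>> ids = torch.tensor([[10, 11, 12, 13, 14]])
# >>> build_ref_input_ids(ids, [2], image_token_id=7, vision_start_id=8, vision_end_id=9)
# tensor([[10, 11, 12,  8,  7,  7,  9, 13, 14]])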


def build_extra_conds(
    text_input_ids: torch.Tensor,
    noise: torch.Tensor,
    ref_images: Optional[List[torch.Tensor]] = None,
    target_patch_size: int = 32,
):
    """Assemble all conditioning tensors for HiDreamO1Transformer.forward:
    input_ids (with ref-vision tokens spliced in for the edit/IP path),
    position_ids (MRoPE), token_types, vinput_mask, plus the ref
    dual-path tensors when refs are provided.
    """
    from .utils import get_rope_index_fix_point
    from comfy.text_encoders.hidream_o1 import (
        IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
    )

    if text_input_ids.dim() == 1:
        text_input_ids = text_input_ids.unsqueeze(0)
    text_input_ids = text_input_ids.long().to(noise.device)
    B = noise.shape[0]
    if text_input_ids.shape[0] == 1 and B > 1:
        text_input_ids = text_input_ids.expand(B, -1)

    H, W = noise.shape[-2], noise.shape[-1]
    h_p, w_p = H // target_patch_size, W // target_patch_size
    image_len = h_p * w_p
    image_grid_thw_tgt = torch.tensor(
        [[1, h_p, w_p]], dtype=torch.long, device=text_input_ids.device,
    )

    out = {}
    if ref_images:
        ref = prepare_ref_images(ref_images, H, W, device=noise.device, dtype=noise.dtype)
        text_input_ids = build_ref_input_ids(
            text_input_ids, ref["per_ref_vit_tokens"],
            IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
        )
        new_txt_len = text_input_ids.shape[1]

        # Each ref's patchified stream gets a [vision_start, image_pad*(N-1)]
        # block in the position-id stream after the noised target.
        ref_grid_lengths = [hp * wp for (hp, wp) in ref["per_ref_patch_grids"]]
        tgt_vision = torch.full((1, image_len), IMAGE_TOKEN_ID,
                                dtype=text_input_ids.dtype, device=text_input_ids.device)
        tgt_vision[:, 0] = VISION_START_ID
        ref_vision_blocks = []
        for rl in ref_grid_lengths:
            blk = torch.full((1, rl), IMAGE_TOKEN_ID,
                             dtype=text_input_ids.dtype, device=text_input_ids.device)
            blk[:, 0] = VISION_START_ID
            ref_vision_blocks.append(blk)
        ref_vision_cat = torch.cat([tgt_vision] + ref_vision_blocks, dim=1)
        input_ids_pad = torch.cat([text_input_ids, ref_vision_cat], dim=-1)
        total_ref_patches_len = sum(ref_grid_lengths)
        total_len = new_txt_len + image_len + total_ref_patches_len

        # K (ViT, post-merge) + 1 (target) + K (ref-patches) image grids.
        K = len(ref_images)
        igthw_cond = ref["ref_image_grid_thw"].clone()
        igthw_cond[:, 1] //= 2
        igthw_cond[:, 2] //= 2
        image_grid_thw_ref = torch.tensor(
            [[1, hp, wp] for (hp, wp) in ref["per_ref_patch_grids"]],
            dtype=torch.long, device=text_input_ids.device,
        )
        igthw_all = torch.cat([
            igthw_cond.to(text_input_ids.device),
            image_grid_thw_tgt,
            image_grid_thw_ref,
        ], dim=0)
        position_ids, _ = get_rope_index_fix_point(
            spatial_merge_size=1,
            image_token_id=IMAGE_TOKEN_ID,
            vision_start_token_id=VISION_START_ID,
            input_ids=input_ids_pad, image_grid_thw=igthw_all,
            attention_mask=None,
            skip_vision_start_token=[0] * K + [1] + [1] * K,
            fix_point=4096,
        )

        # tms + target_image + ref_patches are all gen.
        tms_pos = new_txt_len - 1
        ar_len = tms_pos
        token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
        token_types[:, tms_pos:] = 1
        vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
        vinput_mask[:, new_txt_len:] = True

        # A leading batch dim sidesteps CONDRegular.process_cond's
        # repeat_to_batch_size truncation.
        out["ref_pixel_values"] = ref["ref_pixel_values"].unsqueeze(0)
        out["ref_image_grid_thw"] = ref["ref_image_grid_thw"].unsqueeze(0)
        out["ref_patches"] = ref["ref_patches"]
    else:
        # T2I: text + noised target only; vision_start replaces the first image token.
        txt_len = text_input_ids.shape[1]
        total_len = txt_len + image_len
        vision_tokens = torch.full((B, image_len), IMAGE_TOKEN_ID,
                                   dtype=text_input_ids.dtype, device=text_input_ids.device)
        vision_tokens[:, 0] = VISION_START_ID
        input_ids_pad = torch.cat([text_input_ids, vision_tokens], dim=-1)
        position_ids, _ = get_rope_index_fix_point(
            spatial_merge_size=1,
            image_token_id=IMAGE_TOKEN_ID,
            vision_start_token_id=VISION_START_ID,
            input_ids=input_ids_pad, image_grid_thw=image_grid_thw_tgt,
            attention_mask=None,
            skip_vision_start_token=[1],
        )
        ar_len = txt_len - 1
        token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
        token_types[:, ar_len:] = 1
        vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
        vinput_mask[:, txt_len:] = True

    out["input_ids"] = text_input_ids
    # Collapse the position_ids batch axis and add a leading dim so
    # CONDRegular's batch-resize doesn't truncate the 3-axis MRoPE dim.
    out["position_ids"] = position_ids[:, 0].unsqueeze(0)
    out["token_types"] = token_types
    out["vinput_mask"] = vinput_mask
    out["ar_len"] = ar_len
    return out
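
# Hedged layout sketch for the T2I branch (toy lengths; assumes, per the ref
# branch's tms_pos, that the prompt's final token is <|tms_token|>): with
# txt_len = 5 and a 2x2 target grid (image_len = 4), the unified sequence is
#   [t0 t1 t2 t3 tms | vision_start img img img]
# so ar_len = 4: positions [0, 4) run causal, [4, 9) attend full K/V.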

@ -1,306 +0,0 @@
"""HiDream-O1-Image transformer.

Pixel-space DiT built on Qwen3-VL: the vision tower (Qwen35VisionModel)
encodes ref images, the Qwen3-VL-8B decoder (Llama2_ with interleaved MRoPE)
processes a unified text+image sequence, and 32x32 patch embed/unembed
shims map raw RGB in and out of LLM hidden space. The Qwen3-VL deepstack
mergers go unused — their weights are dropped at load.
"""

from dataclasses import dataclass, field
from typing import List, Optional

import einops
import torch
import torch.nn as nn

import comfy.patcher_extension
from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
from comfy.text_encoders.llama import Llama2_
from comfy.text_encoders.qwen35 import Qwen35VisionModel

from .attention import make_two_pass_attention


IMAGE_TOKEN_ID = 151655  # Qwen3-VL <|image_pad|>
TMS_TOKEN_ID = 151673  # HiDream-O1 <|tms_token|>
PATCH_SIZE = 32


@dataclass
class HiDreamO1TextConfig:
    """Qwen3-VL-8B text-decoder dims (matches public Qwen3-VL-8B-Instruct)."""
    vocab_size: int = 151936
    hidden_size: int = 4096
    intermediate_size: int = 12288
    num_hidden_layers: int = 36
    num_attention_heads: int = 32
    num_key_value_heads: int = 8
    head_dim: int = 128
    max_position_embeddings: int = 128000
    rms_norm_eps: float = 1e-6
    rope_theta: float = 5000000.0
    rope_scale: Optional[float] = None
    rope_dims: List[int] = field(default_factory=lambda: [24, 20, 20])
    interleaved_mrope: bool = True
    transformer_type: str = "llama"
    rms_norm_add: bool = False
    mlp_activation: str = "silu"
    qkv_bias: bool = False
    q_norm: str = "gemma3"
    k_norm: str = "gemma3"
    final_norm: bool = True
    lm_head: bool = False
    stop_tokens: List[int] = field(default_factory=lambda: [151643, 151645])


QWEN3VL_VISION_DEFAULTS = dict(
    hidden_size=1152,
    num_heads=16,
    intermediate_size=4304,
    depth=27,
    patch_size=16,
    temporal_patch_size=2,
    in_channels=3,
    spatial_merge_size=2,
    num_position_embeddings=2304,
    deepstack_visual_indexes=(8, 16, 24),
    out_hidden_size=4096,  # the final merger projects directly into LLM hidden space
)


class BottleneckPatchEmbed(nn.Module):
    # 3072 -> 1024 -> 4096 (raw 32x32 RGB patch -> bottleneck -> LLM hidden).
    def __init__(self, patch_size=32, in_chans=3, pca_dim=1024, embed_dim=4096, bias=True, device=None, dtype=None, ops=None):
        super().__init__()
        self.proj1 = ops.Linear(patch_size * patch_size * in_chans, pca_dim, bias=False, device=device, dtype=dtype)
        self.proj2 = ops.Linear(pca_dim, embed_dim, bias=bias, device=device, dtype=dtype)

    def forward(self, x):
        return self.proj2(self.proj1(x))
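
    # Hedged parameter note: the bottleneck factorisation stores
    # 3072*1024 + 1024*4096 ≈ 7.3M weights versus 3072*4096 ≈ 12.6M for a
    # single Linear, at the cost of a rank-1024 constraint on the embedding.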


class FinalLayer(nn.Module):
    # 4096 -> 3072 (LLM hidden -> flat pixel patch).
    def __init__(self, hidden_size, patch_size=32, out_channels=3, device=None, dtype=None, ops=None):
        super().__init__()
        self.linear = ops.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, device=device, dtype=dtype)

    def forward(self, x):
        return self.linear(x)


class HiDreamO1Transformer(nn.Module):
    """HiDream-O1 unified pixel-level transformer."""

    def __init__(self, image_model=None, dtype=None, device=None, operations=None,
                 text_config_overrides=None, vision_config_overrides=None, **kwargs):
        super().__init__()
        self.dtype = dtype

        text_cfg = HiDreamO1TextConfig(**(text_config_overrides or {}))
        vision_cfg = dict(QWEN3VL_VISION_DEFAULTS)
        if vision_config_overrides:
            vision_cfg.update(vision_config_overrides)
        vision_cfg["out_hidden_size"] = text_cfg.hidden_size

        self.text_config = text_cfg
        self.vision_config = vision_cfg
        self.hidden_size = text_cfg.hidden_size
        self.patch_size = PATCH_SIZE
        self.in_channels = 3
        self.tms_token_id = TMS_TOKEN_ID

        self.visual = Qwen35VisionModel(vision_cfg, device=device, dtype=dtype, ops=operations)
        self.language_model = Llama2_(text_cfg, device=device, dtype=dtype, ops=operations)
        self.t_embedder1 = TimestepEmbedder(
            text_cfg.hidden_size, device=device, dtype=dtype, operations=operations,
        )
        self.x_embedder = BottleneckPatchEmbed(
            patch_size=self.patch_size, in_chans=self.in_channels,
            pca_dim=text_cfg.hidden_size // 4, embed_dim=text_cfg.hidden_size,
            bias=True, device=device, dtype=dtype, ops=operations,
        )
        self.final_layer2 = FinalLayer(
            text_cfg.hidden_size, patch_size=self.patch_size,
            out_channels=self.in_channels, device=device, dtype=dtype, ops=operations,
        )

        self._visual_cache = None
        self._kv_cache_entries = []

    def clear_kv_cache(self):
        self._kv_cache_entries = []
        self._visual_cache = None

    def forward(self, x, timesteps, context=None, transformer_options={}, **kwargs):
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
            self._forward,
            self,
            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
        ).execute(x, timesteps, context, transformer_options, **kwargs)

    def _forward(self, x, timesteps, context=None, transformer_options={}, input_ids=None, attention_mask=None, position_ids=None,
                 vinput_mask=None, ar_len=None, ref_pixel_values=None, ref_image_grid_thw=None, ref_patches=None, **kwargs):
        """Return the flow-match velocity ``(x - x_pred) / sigma``."""

        if input_ids is None or position_ids is None:
            raise ValueError("HiDreamO1Transformer requires input_ids and position_ids in conditioning")

        B, _, H, W = x.shape
        h_p, w_p = H // self.patch_size, W // self.patch_size
        tgt_image_len = h_p * w_p

        z = einops.rearrange(
            x, 'B C (H p1) (W p2) -> B (H W) (C p1 p2)',
            p1=self.patch_size, p2=self.patch_size,
        )
        vinputs = torch.cat([z, ref_patches.to(z.dtype)], dim=1) if ref_patches is not None else z

        inputs_embeds = self.language_model.embed_tokens(input_ids).to(x.dtype)

        if ref_pixel_values is not None and ref_image_grid_thw is not None:
            # The ViT output is constant across sampling steps within a
            # generation; identity-key the cache by the input tensor so refs
            # don't recompute every step.
            cached = self._visual_cache
            if cached is not None and cached[0] is ref_pixel_values:
                image_embeds = cached[1]
            else:
                ref_pv = ref_pixel_values.to(inputs_embeds.device)
                ref_grid = ref_image_grid_thw.to(inputs_embeds.device).long()
                # extra_conds wraps these with a leading batch dim; refs are
                # model-level, so [0] always recovers them.
                if ref_pv.dim() == 3:
                    ref_pv = ref_pv[0]
                if ref_grid.dim() == 3:
                    ref_grid = ref_grid[0]
                image_embeds = self.visual(ref_pv, ref_grid).to(inputs_embeds.dtype)
                self._visual_cache = (ref_pixel_values, image_embeds)
            # image_pad positions are identical across the batch (input_ids
            # are shared between cond and uncond).
            image_idx = (input_ids[0] == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0]
            if image_idx.shape[0] != image_embeds.shape[0]:
                raise ValueError(
                    f"Image-token count {image_idx.shape[0]} != ViT output count "
                    f"{image_embeds.shape[0]}; check tokenizer/processor alignment."
                )
            inputs_embeds[:, image_idx] = image_embeds.unsqueeze(0).expand(B, -1, -1)

        sigma = timesteps.float() / 1000.0
        t_pixeldit = 1.0 - sigma
        t_emb = self.t_embedder1(t_pixeldit * 1000, inputs_embeds.dtype)
        tms_mask_3d = (input_ids == self.tms_token_id).unsqueeze(-1).expand_as(inputs_embeds)
        inputs_embeds = torch.where(tms_mask_3d, t_emb.unsqueeze(1).expand_as(inputs_embeds), inputs_embeds)

        vinputs_embedded = self.x_embedder(vinputs.to(inputs_embeds.dtype))
        inputs_embeds = torch.cat([inputs_embeds, vinputs_embedded], dim=1)

        # extra_conds stores position_ids as (1, 3, T); process_cond repeats
        # dim 0 to B. Take row 0.
        freqs_cis = self.language_model.compute_freqs_cis(position_ids[0].to(x.device), x.device)
        freqs_cis = tuple(t.to(x.dtype) for t in freqs_cis)

        two_pass_attn = make_two_pass_attention(ar_len, transformer_options=transformer_options)
        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
        transformer_options["total_blocks"] = len(self.language_model.layers)
        transformer_options["block_type"] = "double"

        # Cache the prefix K/V across steps. The key includes input_ids (the
        # prompt), ref_id (refs scatter into inputs_embeds), and position_ids
        # (RoPE is baked into the cached K).
        can_cache = not blocks_replace and ar_len > 0
        cache_len = ar_len if can_cache else 0
        ref_id = id(ref_pixel_values) if ref_pixel_values is not None else None
        pos_ids_key = position_ids[..., :cache_len] if can_cache else position_ids
        cache_entries = self._kv_cache_entries
        # Drop stale entries from a previous device (model was unloaded and reloaded).
        if cache_entries and cache_entries[0]["input_ids"].device != input_ids.device:
            cache_entries = []
            self._kv_cache_entries = []
        kv_cache = None
        if can_cache:
            for entry in cache_entries:
                ck = entry["input_ids"]
                ep = entry["position_ids"]
                if (entry["cache_len"] == cache_len
                        and ck.shape == input_ids.shape and torch.equal(ck, input_ids)
                        and entry["ref_id"] == ref_id
                        and ep.shape == pos_ids_key.shape and torch.equal(ep, pos_ids_key)):
                    kv_cache = entry
                    break

        if kv_cache is not None:
            # Hot path: project Q/K/V only for fresh positions; past_key_value
            # prepends the cached AR K/V.
            hidden_states = inputs_embeds[:, cache_len:]
            sliced_freqs = tuple(t[..., cache_len:, :] for t in freqs_cis)
            for i, layer in enumerate(self.language_model.layers):
                transformer_options["block_index"] = i
                K_i, V_i = kv_cache["kv"][i]
                hidden_states, _ = layer(
                    x=hidden_states, attention_mask=None, freqs_cis=sliced_freqs, optimized_attention=two_pass_attn,
                    past_key_value=(K_i, V_i, cache_len),
                )
        else:
            # Cold path: run the full sequence; if cacheable, snapshot K/V at
            # the AR positions.
            snapshots = [] if can_cache else None
            past_kv_cold = () if can_cache else None
            hidden_states = inputs_embeds
            for i, layer in enumerate(self.language_model.layers):
                transformer_options["block_index"] = i
                if ("double_block", i) in blocks_replace:
                    def block_wrap(args, _layer=layer):
                        out = {}
                        out["x"], _ = _layer(
                            x=args["x"], attention_mask=args.get("attention_mask"),
                            freqs_cis=args["freqs_cis"], optimized_attention=args["optimized_attention"],
                            past_key_value=None,
                        )
                        return out
                    out = blocks_replace[("double_block", i)](
                        {"x": hidden_states, "attention_mask": None,
                         "freqs_cis": freqs_cis, "optimized_attention": two_pass_attn,
                         "transformer_options": transformer_options},
                        {"original_block": block_wrap},
                    )
                    hidden_states = out["x"]
                else:
                    hidden_states, present_kv = layer(
                        x=hidden_states, attention_mask=None,
                        freqs_cis=freqs_cis, optimized_attention=two_pass_attn,
                        past_key_value=past_kv_cold,
                    )
                    if snapshots is not None:
                        K, V, _ = present_kv
                        snapshots.append((K[:, :, :cache_len].contiguous(),
                                          V[:, :, :cache_len].contiguous()))
            if snapshots is not None:
                # Cap at 2 entries (cond + uncond); multi-cond workflows LRU-evict.
                new_entry = {
                    "input_ids": input_ids.clone(),
                    "cache_len": cache_len,
                    "kv": snapshots,
                    "ref_id": ref_id,
                    "position_ids": pos_ids_key.clone(),
                }
                self._kv_cache_entries = (cache_entries + [new_entry])[-2:]

        if self.language_model.norm is not None:
            hidden_states = self.language_model.norm(hidden_states)

        # Slice target-image positions before the final projection so the
        # Linear only runs on tgt_image_len tokens. In the hot path,
        # hidden_states starts at original position cache_len, so masks and
        # indices shift by cache_len.
        sliced_offset = cache_len if kv_cache is not None else 0
        if vinput_mask is not None:
            vmask = vinput_mask.to(x.device).bool()
            if sliced_offset > 0:
                vmask = vmask[:, sliced_offset:]
            target_hidden = hidden_states[vmask].view(B, -1, hidden_states.shape[-1])[:, :tgt_image_len]
        else:
            txt_seq_len = input_ids.shape[1]
            start = txt_seq_len - sliced_offset
            target_hidden = hidden_states[:, start:start + tgt_image_len]
        x_pred_tgt = self.final_layer2(target_hidden)

        # Do the final subtraction in fp32; bf16 here noticeably degrades samples.
        x_pred_img = einops.rearrange(
            x_pred_tgt, 'B (H W) (C p1 p2) -> B C (H p1) (W p2)',
            H=h_p, W=w_p, p1=self.patch_size, p2=self.patch_size,
        )
        return (x.float() - x_pred_img.float()) / sigma.view(B, 1, 1, 1).clamp_min(1e-3)
Some files were not shown because too many files have changed in this diff