Reformat models variable into multiline array CORE-59

2026-05-21 08:47:07 +08:00 · 2026-05-01 11:03:02 +02:00
259 changed files with 2669 additions and 42659 deletions
--- a/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
+++ b/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
@ -1,2 +1,2 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --enable-dynamic-vram
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
 pause
--- a/.github/workflows/openapi-lint.yml
+++ b/.github/workflows/openapi-lint.yml
@ -1,31 +0,0 @@
-name: OpenAPI Lint
-
-on:
-  pull_request:
-    paths:
-      - 'openapi.yaml'
-      - '.spectral.yaml'
-      - '.github/workflows/openapi-lint.yml'
-
-permissions:
-  contents: read
-
-jobs:
-  spectral:
-    name: Run Spectral
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '20'
-
-      - name: Install Spectral
-        run: npm install -g @stoplight/spectral-cli@6
-
-      - name: Lint openapi.yaml
-        run: spectral lint openapi.yaml --ruleset .spectral.yaml --fail-severity=error
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@ -145,8 +145,6 @@ jobs:
          cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./
          cp ../update_comfyui_and_python_dependencies.bat ./update/

-          echo 'local-portable' > ComfyUI/.comfy_environment
-
          cd ..

          "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
--- a/.gitignore
+++ b/.gitignore
@ -23,4 +23,3 @@ web_custom_versions/
 .DS_Store
 filtered-openapi.yaml
 uv.lock
-.comfy_environment
--- a/.spectral.yaml
+++ b/.spectral.yaml
@ -1,100 +0,0 @@
-extends:
-  - spectral:oas
-
-# Severity levels: error, warn, info, hint, off
-# Rules from the built-in "spectral:oas" ruleset are active by default.
-# Below we tune severity and add custom rules for our conventions.
-#
-# This ruleset mirrors Comfy-Org/cloud/.spectral.yaml so specs across the
-# organization are linted against a single consistent standard.
-
-rules:
-  # -----------------------------------------------------------------------
-  # Built-in rule severity overrides
-  # -----------------------------------------------------------------------
-  operation-operationId: error
-  operation-description: warn
-  operation-tag-defined: error
-  info-contact: off
-  info-description: warn
-  no-eval-in-markdown: error
-  no-$ref-siblings: error
-
-  # -----------------------------------------------------------------------
-  # Custom rules: naming conventions
-  # -----------------------------------------------------------------------
-
-  # Property names should be snake_case
-  property-name-snake-case:
-    description: Property names must be snake_case
-    severity: warn
-    given: "$.components.schemas.*.properties[*]~"
-    then:
-      function: pattern
-      functionOptions:
-        match: "^[a-z][a-z0-9]*(_[a-z0-9]+)*$"
-
-  # Operation IDs should be camelCase
-  operation-id-camel-case:
-    description: Operation IDs must be camelCase
-    severity: warn
-    given: "$.paths.*.*.operationId"
-    then:
-      function: pattern
-      functionOptions:
-        match: "^[a-z][a-zA-Z0-9]*$"
-
-  # -----------------------------------------------------------------------
-  # Custom rules: response conventions
-  # -----------------------------------------------------------------------
-
-  # Error responses (4xx, 5xx) should use a consistent shape
-  error-response-schema:
-    description: Error responses should reference a standard error schema
-    severity: hint
-    given: "$.paths.*.*.responses[?(@property >= '400' && @property < '600')].content['application/json'].schema"
-    then:
-      field: "$ref"
-      function: truthy
-
-  # All 2xx responses with JSON body should have a schema
-  response-schema-defined:
-    description: Success responses with JSON content should define a schema
-    severity: warn
-    given: "$.paths.*.*.responses[?(@property >= '200' && @property < '300')].content['application/json']"
-    then:
-      field: schema
-      function: truthy
-
-  # -----------------------------------------------------------------------
-  # Custom rules: best practices
-  # -----------------------------------------------------------------------
-
-  # Path parameters must have a description
-  path-param-description:
-    description: Path parameters should have a description
-    severity: warn
-    given:
-      - "$.paths.*.parameters[?(@.in == 'path')]"
-      - "$.paths.*.*.parameters[?(@.in == 'path')]"
-    then:
-      field: description
-      function: truthy
-
-  # Schemas should have a description
-  schema-description:
-    description: Component schemas should have a description
-    severity: hint
-    given: "$.components.schemas.*"
-    then:
-      field: description
-      function: truthy
-
-overrides:
-  # /ws uses HTTP 101 (Switching Protocols) — a legitimate response for a
-  # WebSocket upgrade, but not a 2xx, so operation-success-response fires
-  # as a false positive. OpenAPI 3.x has no native WebSocket support.
-  - files:
-      - "openapi.yaml#/paths/~1ws"
-    rules:
-      operation-success-response: off
--- a/CODEOWNERS
+++ b/CODEOWNERS
@ -1,5 +1,2 @@
-* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai
-
-/CODEOWNERS @comfyanonymous
-/.ci/ @comfyanonymous
-/.github/ @comfyanonymous
+# Admins
+* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 <div align="center">

 # ComfyUI
-**The most powerful and modular AI engine for content creation.**
+**The most powerful and modular visual AI engine and application.**


 [![Website][website-shield]][website-url]
@ -31,16 +31,10 @@
 [github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
 [github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases

-<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/36e065e0-bfae-4456-8c7f-8369d5ea48a2" />
-<br>
+![ComfyUI Screenshot](https://github.com/user-attachments/assets/7ccaf2c1-9b72-41ae-9a89-5688c94b7abe)
 </div>

-ComfyUI is the AI creation engine for visual professionals who demand control over every model, every parameter, and every output. Its powerful and modular node graph interface empowers creatives to generate images, videos, 3D models, audio, and more...
- ComfyUI natively supports the latest open-source state of the art models.
- API nodes provide access to the best closed source models such as Nano Banana, Seedance, Hunyuan3D, etc.
- It is available on Windows, Linux, and macOS, locally with our [desktop application](https://www.comfy.org/download), our [portable install](#installing) or on our [cloud](https://www.comfy.org/cloud).
- The most sophisticated workflows can be exposed through a simple UI thanks to App Mode.
- It integrates seamlessly into production pipelines with our API endpoints.
+ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.

 ## Get Started

@ -83,7 +77,6 @@ See what ComfyUI can do with the [newer template workflows](https://comfy.org/wo
   - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
   - [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
   - [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
-   - Ernie Image
 - Image Editing Models
   - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
   - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@ -133,7 +126,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
 ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:

 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
-   - Releases a new major stable version (e.g., v0.7.0) roughly every 2 weeks.
+   - Releases a new stable version (e.g., v0.7.0) roughly every week.
   - Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
   - Minor versions will be used for releases off the master branch.
   - Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
@ -200,15 +193,13 @@ If you have trouble extracting it, right click the file -> properties -> unblock

 The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.

-#### All Official Portable Downloads:
+#### Alternative Downloads:

 [Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)

-[Portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
+[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)

-[Portable for Nvidia GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) (supports 20 series and above).
-
-[Portable for Nvidia GPUs with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
+[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).

 #### How do I share models between another UI and ComfyUI?

@ -429,8 +420,6 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w

 See also: [https://www.comfy.org/](https://www.comfy.org/)

-> _psst — we're hiring!_ Help build ComfyUI: [comfy.org/careers](https://www.comfy.org/careers)
-
 ## Frontend Development

 As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
--- a/SECURITY.md
+++ b/SECURITY.md
@ -1,44 +0,0 @@
-# Security Policy
-
-## Scope
-
-ComfyUI is designed to run locally. By default, the server binds to `127.0.0.1`, meaning only the user's own machine can reach it. Our threat model assumes:
-
- The user installed ComfyUI through a supported channel: the desktop application, the portable build, or a manual install following the README.
- The user has not installed untrusted custom nodes. Custom nodes are arbitrary Python code and are trusted as much as any other software the user chooses to install.
- Anyone with access to the ComfyUI URL is trusted (a direct consequence of the localhost-only default).
- PyTorch and other dependencies are at the versions we ship or recommend in the README.
-
-A report is in scope only if it affects a user operating within this threat model.
-
-## What We Consider a Vulnerability
-
-We want to hear about issues where a **reasonable user** — someone who does not install random untrusted nodes and who reads UI prompts and warnings before clicking through them — can be harmed by ComfyUI itself.
-
-The clearest example: a workflow file that such a user might plausibly load and run, using only built-in nodes, that results in **untrusted code execution, arbitrary file read/write outside expected directories, or credential/data exfiltration**.
-
-When submitting a report, please include a clear description of *why this is a problem for a typical local ComfyUI user*. Reports without this context are difficult to act on.
-
-## What We Do Not Consider a Security Vulnerability
-
-Please report the following through our regular [GitHub issues](https://github.com/comfyanonymous/ComfyUI/issues) instead. Filing them as security reports will likely cause them to be deprioritized or closed.
-
- **Issues requiring `--listen` or any non-default network exposure.** ComfyUI binds to localhost by default. If a remote attacker needs to reach the server for the attack to work, the user has chosen to expose it and is responsible for securing that deployment (firewall, reverse proxy, authentication, etc.). These are bugs, not vulnerabilities.
- **`torch.load` and related deserialization issues in old PyTorch versions.** These are upstream PyTorch issues. Our distributions ship with — and our documentation recommends — recent PyTorch versions where these are addressed.
- **Vulnerabilities that depend on outdated library versions** that we neither ship nor recommend (e.g., requiring PyTorch 2.6 or older).
- **Issues that require a specific custom node to be installed.** Custom nodes are third-party code. Report these to the maintainer of that node.
- **Crashes, hangs, or resource exhaustion from a loaded workflow.** Annoying, but not a security issue in our model. File a regular bug.
- **Social-engineering scenarios** where the user is expected to ignore an explicit UI warning or prompt.
-
-## Reporting
-
-If you believe you have found an issue that falls within the scope above, please report it privately via GitHub's [Report a vulnerability](https://github.com/comfyanonymous/ComfyUI/security/advisories/new) feature rather than opening a public issue.
-
-Please include:
-
-1. A description of the vulnerability and the affected component.
-2. Reproduction steps, ideally with a minimal workflow file or proof-of-concept.
-3. The ComfyUI version, install method (desktop / portable / manual), and OS.
-4. An explanation of how this affects a typical local user as described in the threat model.
-
-We will acknowledge valid reports and coordinate a fix and disclosure timeline with you.
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -27,7 +27,7 @@ def frontend_install_warning_message():
    return f"""
 {get_missing_requirements_message()}

-The ComfyUI frontend is shipped in a pip package so it needs to be updated separately from the ComfyUI code.
+This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
 """.strip()

 def parse_version(version: str) -> tuple[int, int, int]:
@ -38,54 +38,40 @@ def is_valid_version(version: str) -> bool:
    pattern = r"^(\d+)\.(\d+)\.(\d+)$"
    return bool(re.match(pattern, version))

+def get_installed_frontend_version():
+    """Get the currently installed frontend package version."""
+    frontend_version_str = version("comfyui-frontend-package")
+    return frontend_version_str
+
+
 def get_required_frontend_version():
    return get_required_packages_versions().get("comfyui-frontend-package", None)


-COMFY_PACKAGE_VERSIONS = []
-def get_comfy_package_versions():
-    """List installed/required versions for every comfy* package in requirements.txt."""
-    if COMFY_PACKAGE_VERSIONS:
-        return COMFY_PACKAGE_VERSIONS.copy()
-    out = COMFY_PACKAGE_VERSIONS
-    for name, required in (get_required_packages_versions() or {}).items():
-        if not name.startswith("comfy"):
-            continue
-        try:
-            installed = version(name)
-        except Exception:
-            installed = None
-        out.append({"name": name, "installed": installed, "required": required})
-    return out.copy()
+def check_frontend_version():
+    """Check if the frontend version is up to date."""

-
-def check_comfy_packages_versions():
-    """Warn for every comfy* package whose installed version is below requirements.txt."""
-    from packaging.version import InvalidVersion, parse as parse_pep440
-    for pkg in get_comfy_package_versions():
-        installed_str = pkg["installed"]
-        required_str = pkg["required"]
-        if not installed_str or not required_str:
-            continue
-        try:
-            outdated = parse_pep440(installed_str) < parse_pep440(required_str)
-        except InvalidVersion as e:
-            logging.error(f"Failed to check {pkg['name']} version: {e}")
-            continue
-        if outdated:
+    try:
+        frontend_version_str = get_installed_frontend_version()
+        frontend_version = parse_version(frontend_version_str)
+        required_frontend_str = get_required_frontend_version()
+        required_frontend = parse_version(required_frontend_str)
+        if frontend_version < required_frontend:
            app.logger.log_startup_warning(
                f"""
 ________________________________________________________________________
 WARNING WARNING WARNING WARNING WARNING

-Installed {pkg["name"]} version {installed_str} is lower than the recommended version {required_str}.
+Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.

-{get_missing_requirements_message()}
+{frontend_install_warning_message()}
 ________________________________________________________________________
 """.strip()
            )
        else:
-            logging.info("{} version: {}".format(pkg["name"], installed_str))
+            logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
+    except Exception as e:
+        logging.error(f"Failed to check frontend version: {e}")


 REQUEST_TIMEOUT = 10  # seconds
@ -215,11 +201,6 @@ class FrontendManager:
    def get_required_templates_version(cls) -> str:
        return get_required_packages_versions().get("comfyui-workflow-templates", None)

-    @classmethod
-    def get_comfy_package_versions(cls):
-        """List installed/required versions for every comfy* package in requirements.txt."""
-        return get_comfy_package_versions()
-
    @classmethod
    def default_frontend_path(cls) -> str:
        try:
@ -360,7 +341,7 @@ comfyui-workflow-templates is not installed.
            main error source might be request timeout or invalid URL.
        """
        if version_string == DEFAULT_VERSION_STRING:
-            check_comfy_packages_versions()
+            check_frontend_version()
            return cls.default_frontend_path()

        repo_owner, repo_name, version = cls.parse_version_string(version_string)
@ -422,7 +403,7 @@ comfyui-workflow-templates is not installed.
        except Exception as e:
            logging.error("Failed to initialize frontend: %s", e)
            logging.info("Falling back to the default frontend.")
-            check_comfy_packages_versions()
+            check_frontend_version()
            return cls.default_frontend_path()
    @classmethod
    def template_asset_handler(cls):
--- a/app/node_replace_manager.py
+++ b/app/node_replace_manager.py
@ -1,7 +1,5 @@
 from __future__ import annotations

-import logging
-
 from aiohttp import web

 from typing import TYPE_CHECKING, TypedDict
@ -33,22 +31,8 @@ class NodeReplaceManager:
        self._replacements: dict[str, list[NodeReplace]] = {}

    def register(self, node_replace: NodeReplace):
-        """Register a node replacement mapping.
-
-        Idempotent: if a replacement with the same (old_node_id, new_node_id)
-        is already registered, the duplicate is ignored. This prevents stale
-        entries from accumulating when custom nodes are reloaded in the same
-        process (e.g. via ComfyUI-Manager).
-        """
-        existing = self._replacements.setdefault(node_replace.old_node_id, [])
-        for entry in existing:
-            if entry.new_node_id == node_replace.new_node_id:
-                logging.debug(
-                    "Node replacement %s -> %s already registered, ignoring duplicate.",
-                    node_replace.old_node_id, node_replace.new_node_id,
-                )
-                return
-        existing.append(node_replace)
+        """Register a node replacement mapping."""
+        self._replacements.setdefault(node_replace.old_node_id, []).append(node_replace)

    def get_replacement(self, old_node_id: str) -> list[NodeReplace] | None:
        """Get replacements for an old node ID."""
--- a/app/user_manager.py
+++ b/app/user_manager.py
@ -28,8 +28,8 @@ def get_file_info(path: str, relative_to: str) -> FileInfo:
    return {
        "path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
        "size": os.path.getsize(path),
-        "modified": int(os.path.getmtime(path) * 1000),
-        "created": int(os.path.getctime(path) * 1000),
+        "modified": os.path.getmtime(path),
+        "created": os.path.getctime(path)
    }


--- a/blueprints/Brightness
+++ b/blueprints/Brightness
@ -431,10 +431,9 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts image brightness and contrast using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  },
  "extra": {}
-}
+}
--- a/(Z-Image-Turbo).json
+++ b/(Z-Image-Turbo).json
@ -162,7 +162,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Canny to Image (Z-Image-Turbo)",
+        "name": "local-Canny to Image (Z-Image-Turbo)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -1553,8 +1553,7 @@
          "VHS_MetadataImage": true,
          "VHS_KeepIntermediate": true
        },
-        "category": "Image generation and editing/Canny to image",
-        "description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning."
+        "category": "Image generation and editing/Canny to image"
      }
    ]
  },
@ -1575,4 +1574,4 @@
    }
  },
  "version": 0.4
-}
+}
--- a/blueprints/Canny
+++ b/blueprints/Canny
@ -192,7 +192,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Canny to Video (LTX 2.0)",
+        "name": "local-Canny to Video (LTX 2.0)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -3600,8 +3600,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Canny to video",
-        "description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio."
+        "category": "Video generation and editing/Canny to video"
      }
    ]
  },
@ -3617,4 +3616,4 @@
    }
  },
  "version": 0.4
-}
+}
--- a/blueprints/Chromatic
+++ b/blueprints/Chromatic
@ -377,9 +377,8 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Adds lens-style chromatic aberration (color fringing) using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
-}
+}
--- a/blueprints/Color
+++ b/blueprints/Color
@ -596,8 +596,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts saturation, temperature, tint, and vibrance using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
--- a/blueprints/Color
+++ b/blueprints/Color
@ -1129,8 +1129,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Balances colors across shadows, midtones, and highlights using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
--- a/blueprints/Color
+++ b/blueprints/Color
@ -608,8 +608,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Fine-tunes tone and color with per-channel curve adjustments using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
--- a/blueprints/ControlNet
+++ b/blueprints/ControlNet
--- a/blueprints/Crop
+++ b/blueprints/Crop
@ -1609,8 +1609,7 @@
          }
        ],
        "extra": {},
-        "category": "Image Tools/Crop",
-        "description": "Splits an image into a 2×2 grid of four equal tiles."
+        "category": "Image Tools/Crop"
      }
    ]
  },
--- a/blueprints/Crop
+++ b/blueprints/Crop
@ -2946,8 +2946,7 @@
          }
        ],
        "extra": {},
-        "category": "Image Tools/Crop",
-        "description": "Splits an image into a 3×3 grid of nine equal tiles."
+        "category": "Image Tools/Crop"
      }
    ]
  },
--- a/(Z-Image-Turbo).json
+++ b/(Z-Image-Turbo).json
@ -1579,8 +1579,7 @@
          "VHS_MetadataImage": true,
          "VHS_KeepIntermediate": true
        },
-        "category": "Image generation and editing/Depth to image",
-        "description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning."
+        "category": "Image generation and editing/Depth to image"
      },
      {
        "id": "458bdf3c-4b58-421c-af50-c9c663a4d74c",
@ -2462,8 +2461,7 @@
            ]
          },
          "workflowRendererVersion": "LG"
-        },
-        "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
+        }
      }
    ]
  },
--- a/blueprints/Depth
+++ b/blueprints/Depth
@ -4233,8 +4233,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Depth to video",
-        "description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
+        "category": "Video generation and editing/Depth to video"
      },
      {
        "id": "38b60539-50a7-42f9-a5fe-bdeca26272e2",
@ -5193,8 +5192,7 @@
        ],
        "extra": {
          "workflowRendererVersion": "LG"
-        },
-        "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
+        }
      }
    ]
  },
--- a/blueprints/Edge-Preserving
+++ b/blueprints/Edge-Preserving
@ -450,10 +450,9 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Blur",
-        "description": "Applies bilateral (edge-preserving) blur to soften images while retaining detail."
+        "category": "Image Tools/Blur"
      }
    ]
  },
  "extra": {}
-}
+}
--- a/blueprints/Film
+++ b/blueprints/Film
@ -580,9 +580,8 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Adds procedural film grain texture for a cinematic look via GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
-}
+}
--- a/blueprints/First-Last-Frame
+++ b/blueprints/First-Last-Frame
@ -3350,8 +3350,7 @@
          }
        ],
        "extra": {},
-        "category": "Video generation and editing/First-Last-Frame to Video",
-        "description": "Generates a video interpolating between first and last keyframes using LTX-2.3."
+        "category": "Video generation and editing/First-Last-Frame to Video"
      }
    ]
  },
--- a/blueprints/First-Last-Frame
+++ b/blueprints/First-Last-Frame
--- a/Interpolation.json
+++ b/Interpolation.json
@ -1,858 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 16,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 16,
-      "type": "022693be-2baa-4009-870a-28921508a7ef",
-      "pos": [
-        -2990,
-        -3240
-      ],
-      "size": [
-        410,
-        200
-      ],
-      "flags": {},
-      "order": 2,
-      "mode": 0,
-      "inputs": [
-        {
-          "localized_name": "video",
-          "name": "video",
-          "type": "VIDEO",
-          "link": null
-        },
-        {
-          "label": "multiplier",
-          "name": "value",
-          "type": "INT",
-          "widget": {
-            "name": "value"
-          },
-          "link": null
-        },
-        {
-          "label": "enable_fps_multiplier",
-          "name": "value_1",
-          "type": "BOOLEAN",
-          "widget": {
-            "name": "value_1"
-          },
-          "link": null
-        },
-        {
-          "name": "model_name",
-          "type": "COMBO",
-          "widget": {
-            "name": "model_name"
-          },
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "label": "VIDEO",
-          "name": "VIDEO_1",
-          "type": "VIDEO",
-          "links": []
-        },
-        {
-          "name": "IMAGE",
-          "type": "IMAGE",
-          "links": null
-        }
-      ],
-      "properties": {
-        "proxyWidgets": [
-          [
-            "9",
-            "value"
-          ],
-          [
-            "13",
-            "value"
-          ],
-          [
-            "1",
-            "model_name"
-          ]
-        ],
-        "enableTabs": false,
-        "tabWidth": 65,
-        "tabXOffset": 10,
-        "hasSecondTab": false,
-        "secondTabText": "Send Back",
-        "secondTabOffset": 80,
-        "secondTabWidth": 65,
-        "cnr_id": "comfy-core",
-        "ver": "0.19.3"
-      },
-      "widgets_values": [],
-      "title": "Frame Interpolation"
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "022693be-2baa-4009-870a-28921508a7ef",
-        "version": 1,
-        "state": {
-          "lastGroupId": 0,
-          "lastNodeId": 17,
-          "lastLinkId": 28,
-          "lastRerouteId": 0
-        },
-        "revision": 0,
-        "config": {},
-        "name": "Frame Interpolation",
-        "inputNode": {
-          "id": -10,
-          "bounding": [
-            -2810,
-            -3070,
-            159.7421875,
-            120
-          ]
-        },
-        "outputNode": {
-          "id": -20,
-          "bounding": [
-            -1270,
-            -3075,
-            120,
-            80
-          ]
-        },
-        "inputs": [
-          {
-            "id": "05e31c51-dcb6-4a1e-9651-1b9ad4f7a287",
-            "name": "video",
-            "type": "VIDEO",
-            "linkIds": [
-              2
-            ],
-            "localized_name": "video",
-            "pos": [
-              -2670.2578125,
-              -3050
-            ]
-          },
-          {
-            "id": "feecb409-7d1c-4a99-9c63-50c5fecdd3c9",
-            "name": "value",
-            "type": "INT",
-            "linkIds": [
-              22
-            ],
-            "label": "multiplier",
-            "pos": [
-              -2670.2578125,
-              -3030
-            ]
-          },
-          {
-            "id": "0b8a861b-b581-4068-9e8c-f8d15daf1ca6",
-            "name": "value_1",
-            "type": "BOOLEAN",
-            "linkIds": [
-              23
-            ],
-            "label": "enable_fps_multiplier",
-            "pos": [
-              -2670.2578125,
-              -3010
-            ]
-          },
-          {
-            "id": "a22b101e-8773-4e17-a297-7ee3aae09162",
-            "name": "model_name",
-            "type": "COMBO",
-            "linkIds": [
-              24
-            ],
-            "pos": [
-              -2670.2578125,
-              -2990
-            ]
-          }
-        ],
-        "outputs": [
-          {
-            "id": "ef2ada05-d5aa-492a-9394-6c3e71e39ebb",
-            "name": "VIDEO_1",
-            "type": "VIDEO",
-            "linkIds": [
-              26
-            ],
-            "label": "VIDEO",
-            "pos": [
-              -1250,
-              -3055
-            ]
-          },
-          {
-            "id": "5aacc622-2a07-4983-b31c-e04461f7f953",
-            "name": "IMAGE",
-            "type": "IMAGE",
-            "linkIds": [
-              28
-            ],
-            "pos": [
-              -1250,
-              -3035
-            ]
-          }
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 1,
-            "type": "FrameInterpolationModelLoader",
-            "pos": [
-              -2510,
-              -3370
-            ],
-            "size": [
-              370,
-              90
-            ],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "model_name",
-                "name": "model_name",
-                "type": "COMBO",
-                "widget": {
-                  "name": "model_name"
-                },
-                "link": 24
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "INTERP_MODEL",
-                "name": "INTERP_MODEL",
-                "type": "INTERP_MODEL",
-                "links": [
-                  1
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "FrameInterpolationModelLoader",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "models": [
-                {
-                  "name": "film_net_fp16.safetensors",
-                  "url": "https://huggingface.co/Comfy-Org/frame_interpolation/resolve/main/frame_interpolation/film_net_fp16.safetensors",
-                  "directory": "frame_interpolation"
-                }
-              ]
-            },
-            "widgets_values": [
-              "film_net_fp16.safetensors"
-            ]
-          },
-          {
-            "id": 2,
-            "type": "FrameInterpolate",
-            "pos": [
-              -2040,
-              -3370
-            ],
-            "size": [
-              270,
-              110
-            ],
-            "flags": {},
-            "order": 1,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "interp_model",
-                "name": "interp_model",
-                "type": "INTERP_MODEL",
-                "link": 1
-              },
-              {
-                "localized_name": "images",
-                "name": "images",
-                "type": "IMAGE",
-                "link": 3
-              },
-              {
-                "localized_name": "multiplier",
-                "name": "multiplier",
-                "type": "INT",
-                "widget": {
-                  "name": "multiplier"
-                },
-                "link": 8
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "IMAGE",
-                "name": "IMAGE",
-                "type": "IMAGE",
-                "links": [
-                  4,
-                  28
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "FrameInterpolate",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3"
-            },
-            "widgets_values": [
-              2
-            ]
-          },
-          {
-            "id": 5,
-            "type": "CreateVideo",
-            "pos": [
-              -1600,
-              -3370
-            ],
-            "size": [
-              270,
-              110
-            ],
-            "flags": {},
-            "order": 3,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "images",
-                "name": "images",
-                "type": "IMAGE",
-                "link": 4
-              },
-              {
-                "localized_name": "audio",
-                "name": "audio",
-                "shape": 7,
-                "type": "AUDIO",
-                "link": 5
-              },
-              {
-                "localized_name": "fps",
-                "name": "fps",
-                "type": "FLOAT",
-                "widget": {
-                  "name": "fps"
-                },
-                "link": 12
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "VIDEO",
-                "name": "VIDEO",
-                "type": "VIDEO",
-                "links": [
-                  26
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "CreateVideo",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3"
-            },
-            "widgets_values": [
-              30
-            ]
-          },
-          {
-            "id": 9,
-            "type": "PrimitiveInt",
-            "pos": [
-              -2500,
-              -2970
-            ],
-            "size": [
-              270,
-              90
-            ],
-            "flags": {},
-            "order": 4,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "value",
-                "name": "value",
-                "type": "INT",
-                "widget": {
-                  "name": "value"
-                },
-                "link": 22
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "INT",
-                "name": "INT",
-                "type": "INT",
-                "links": [
-                  8,
-                  19
-                ]
-              }
-            ],
-            "title": "Int (Multiplier)",
-            "properties": {
-              "Node name for S&R": "PrimitiveInt",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3"
-            },
-            "widgets_values": [
-              2,
-              "fixed"
-            ]
-          },
-          {
-            "id": 10,
-            "type": "ComfySwitchNode",
-            "pos": [
-              -1610,
-              -3120
-            ],
-            "size": [
-              270,
-              130
-            ],
-            "flags": {},
-            "order": 5,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "on_false",
-                "name": "on_false",
-                "type": "*",
-                "link": 11
-              },
-              {
-                "localized_name": "on_true",
-                "name": "on_true",
-                "type": "*",
-                "link": 13
-              },
-              {
-                "localized_name": "switch",
-                "name": "switch",
-                "type": "BOOLEAN",
-                "widget": {
-                  "name": "switch"
-                },
-                "link": 15
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "output",
-                "name": "output",
-                "type": "*",
-                "links": [
-                  12
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "ComfySwitchNode",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3"
-            },
-            "widgets_values": [
-              true
-            ]
-          },
-          {
-            "id": 13,
-            "type": "PrimitiveBoolean",
-            "pos": [
-              -2500,
-              -2770
-            ],
-            "size": [
-              310,
-              90
-            ],
-            "flags": {},
-            "order": 7,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "value",
-                "name": "value",
-                "type": "BOOLEAN",
-                "widget": {
-                  "name": "value"
-                },
-                "link": 23
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "BOOLEAN",
-                "name": "BOOLEAN",
-                "type": "BOOLEAN",
-                "links": [
-                  15
-                ]
-              }
-            ],
-            "title": "Boolean (Apply multiplier to FPS?)",
-            "properties": {
-              "Node name for S&R": "PrimitiveBoolean",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3"
-            },
-            "widgets_values": [
-              true
-            ]
-          },
-          {
-            "id": 3,
-            "type": "GetVideoComponents",
-            "pos": [
-              -2500,
-              -3170
-            ],
-            "size": [
-              230,
-              100
-            ],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "video",
-                "name": "video",
-                "type": "VIDEO",
-                "link": 2
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "images",
-                "name": "images",
-                "type": "IMAGE",
-                "links": [
-                  3
-                ]
-              },
-              {
-                "localized_name": "audio",
-                "name": "audio",
-                "type": "AUDIO",
-                "links": [
-                  5
-                ]
-              },
-              {
-                "localized_name": "fps",
-                "name": "fps",
-                "type": "FLOAT",
-                "links": [
-                  11,
-                  18
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "GetVideoComponents",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3"
-            }
-          },
-          {
-            "id": 11,
-            "type": "ComfyMathExpression",
-            "pos": [
-              -2090,
-              -3070
-            ],
-            "size": [
-              400,
-              210
-            ],
-            "flags": {
-              "collapsed": false
-            },
-            "order": 6,
-            "mode": 0,
-            "inputs": [
-              {
-                "label": "a",
-                "localized_name": "values.a",
-                "name": "values.a",
-                "type": "FLOAT,INT",
-                "link": 18
-              },
-              {
-                "label": "b",
-                "localized_name": "values.b",
-                "name": "values.b",
-                "shape": 7,
-                "type": "FLOAT,INT",
-                "link": 19
-              },
-              {
-                "label": "c",
-                "localized_name": "values.c",
-                "name": "values.c",
-                "shape": 7,
-                "type": "FLOAT,INT",
-                "link": null
-              },
-              {
-                "localized_name": "expression",
-                "name": "expression",
-                "type": "STRING",
-                "widget": {
-                  "name": "expression"
-                },
-                "link": null
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "FLOAT",
-                "name": "FLOAT",
-                "type": "FLOAT",
-                "links": [
-                  13
-                ]
-              },
-              {
-                "localized_name": "INT",
-                "name": "INT",
-                "type": "INT",
-                "links": null
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "ComfyMathExpression",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3"
-            },
-            "widgets_values": [
-              "min(abs(b), 16) * a"
-            ]
-          }
-        ],
-        "groups": [],
-        "links": [
-          {
-            "id": 1,
-            "origin_id": 1,
-            "origin_slot": 0,
-            "target_id": 2,
-            "target_slot": 0,
-            "type": "INTERP_MODEL"
-          },
-          {
-            "id": 3,
-            "origin_id": 3,
-            "origin_slot": 0,
-            "target_id": 2,
-            "target_slot": 1,
-            "type": "IMAGE"
-          },
-          {
-            "id": 8,
-            "origin_id": 9,
-            "origin_slot": 0,
-            "target_id": 2,
-            "target_slot": 2,
-            "type": "INT"
-          },
-          {
-            "id": 4,
-            "origin_id": 2,
-            "origin_slot": 0,
-            "target_id": 5,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 5,
-            "origin_id": 3,
-            "origin_slot": 1,
-            "target_id": 5,
-            "target_slot": 1,
-            "type": "AUDIO"
-          },
-          {
-            "id": 12,
-            "origin_id": 10,
-            "origin_slot": 0,
-            "target_id": 5,
-            "target_slot": 2,
-            "type": "FLOAT"
-          },
-          {
-            "id": 11,
-            "origin_id": 3,
-            "origin_slot": 2,
-            "target_id": 10,
-            "target_slot": 0,
-            "type": "FLOAT"
-          },
-          {
-            "id": 13,
-            "origin_id": 11,
-            "origin_slot": 0,
-            "target_id": 10,
-            "target_slot": 1,
-            "type": "FLOAT"
-          },
-          {
-            "id": 15,
-            "origin_id": 13,
-            "origin_slot": 0,
-            "target_id": 10,
-            "target_slot": 2,
-            "type": "BOOLEAN"
-          },
-          {
-            "id": 18,
-            "origin_id": 3,
-            "origin_slot": 2,
-            "target_id": 11,
-            "target_slot": 0,
-            "type": "FLOAT"
-          },
-          {
-            "id": 19,
-            "origin_id": 9,
-            "origin_slot": 0,
-            "target_id": 11,
-            "target_slot": 1,
-            "type": "INT"
-          },
-          {
-            "id": 2,
-            "origin_id": -10,
-            "origin_slot": 0,
-            "target_id": 3,
-            "target_slot": 0,
-            "type": "VIDEO"
-          },
-          {
-            "id": 22,
-            "origin_id": -10,
-            "origin_slot": 1,
-            "target_id": 9,
-            "target_slot": 0,
-            "type": "INT"
-          },
-          {
-            "id": 23,
-            "origin_id": -10,
-            "origin_slot": 2,
-            "target_id": 13,
-            "target_slot": 0,
-            "type": "BOOLEAN"
-          },
-          {
-            "id": 24,
-            "origin_id": -10,
-            "origin_slot": 3,
-            "target_id": 1,
-            "target_slot": 0,
-            "type": "COMBO"
-          },
-          {
-            "id": 26,
-            "origin_id": 5,
-            "origin_slot": 0,
-            "target_id": -20,
-            "target_slot": 0,
-            "type": "VIDEO"
-          },
-          {
-            "id": 28,
-            "origin_id": 2,
-            "origin_slot": 0,
-            "target_id": -20,
-            "target_slot": 1,
-            "type": "IMAGE"
-          }
-        ],
-        "extra": {},
-        "category": "Video Tools",
-        "description": "Increases video frame rate by synthesizing intermediate frames with a frame interpolation model."
-      }
-    ]
-  },
-  "extra": {}
-}
--- a/blueprints/Get
+++ b/blueprints/Get
@ -1,485 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 98,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 98,
-      "type": "dca6e78d-fb06-421e-97f7-6ce17a665260",
-      "pos": [
-        -410,
-        -2230
-      ],
-      "size": [
-        270,
-        104
-      ],
-      "flags": {},
-      "order": 7,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "video",
-          "type": "VIDEO",
-          "link": null
-        },
-        {
-          "label": "frame_index",
-          "name": "value",
-          "type": "INT",
-          "widget": {
-            "name": "value"
-          },
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "name": "IMAGE",
-          "type": "IMAGE",
-          "links": []
-        }
-      ],
-      "title": "Get Any Video Frame",
-      "properties": {
-        "proxyWidgets": [
-          [
-            "100",
-            "value"
-          ]
-        ]
-      },
-      "widgets_values": []
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "dca6e78d-fb06-421e-97f7-6ce17a665260",
-        "version": 1,
-        "state": {
-          "lastGroupId": 1,
-          "lastNodeId": 136,
-          "lastLinkId": 302,
-          "lastRerouteId": 0
-        },
-        "revision": 0,
-        "config": {},
-        "name": "Get Any Video Frame",
-        "inputNode": {
-          "id": -10,
-          "bounding": [
-            380,
-            -57,
-            120,
-            80
-          ]
-        },
-        "outputNode": {
-          "id": -20,
-          "bounding": [
-            1460,
-            -57,
-            120,
-            60
-          ]
-        },
-        "inputs": [
-          {
-            "id": "2ceec378-8dcf-4340-8570-155967f59a93",
-            "name": "video",
-            "type": "VIDEO",
-            "linkIds": [
-              4
-            ],
-            "pos": [
-              480,
-              -37
-            ]
-          },
-          {
-            "id": "819955f6-c686-4896-8032-ff2d0059109a",
-            "name": "value",
-            "type": "INT",
-            "linkIds": [
-              283
-            ],
-            "label": "frame_index",
-            "pos": [
-              480,
-              -17
-            ]
-          }
-        ],
-        "outputs": [
-          {
-            "id": "1ab0684d-6a44-45b6-8aa4-a0b971a1d41e",
-            "name": "IMAGE",
-            "type": "IMAGE",
-            "linkIds": [
-              5
-            ],
-            "pos": [
-              1480,
-              -37
-            ]
-          }
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 1,
-            "type": "GetVideoComponents",
-            "pos": [
-              560,
-              -150
-            ],
-            "size": [
-              230,
-              120
-            ],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "video",
-                "name": "video",
-                "type": "VIDEO",
-                "link": 4
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "images",
-                "name": "images",
-                "type": "IMAGE",
-                "links": [
-                  1,
-                  2
-                ]
-              },
-              {
-                "localized_name": "audio",
-                "name": "audio",
-                "type": "AUDIO",
-                "links": null
-              },
-              {
-                "localized_name": "fps",
-                "name": "fps",
-                "type": "FLOAT",
-                "links": null
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "GetVideoComponents"
-            }
-          },
-          {
-            "id": 2,
-            "type": "GetImageSize",
-            "pos": [
-              560,
-              50
-            ],
-            "size": [
-              230,
-              120
-            ],
-            "flags": {},
-            "order": 1,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "image",
-                "name": "image",
-                "type": "IMAGE",
-                "link": 1
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "width",
-                "name": "width",
-                "type": "INT",
-                "links": null
-              },
-              {
-                "localized_name": "height",
-                "name": "height",
-                "type": "INT",
-                "links": null
-              },
-              {
-                "localized_name": "batch_size",
-                "name": "batch_size",
-                "type": "INT",
-                "links": [
-                  285
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "GetImageSize"
-            }
-          },
-          {
-            "id": 3,
-            "type": "ImageFromBatch",
-            "pos": [
-              1130,
-              -150
-            ],
-            "size": [
-              270,
-              140
-            ],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "image",
-                "name": "image",
-                "type": "IMAGE",
-                "link": 2
-              },
-              {
-                "localized_name": "batch_index",
-                "name": "batch_index",
-                "type": "INT",
-                "widget": {
-                  "name": "batch_index"
-                },
-                "link": 286
-              },
-              {
-                "localized_name": "length",
-                "name": "length",
-                "type": "INT",
-                "widget": {
-                  "name": "length"
-                },
-                "link": null
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "IMAGE",
-                "name": "IMAGE",
-                "type": "IMAGE",
-                "links": [
-                  5
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "ImageFromBatch"
-            },
-            "widgets_values": [
-              0,
-              1
-            ]
-          },
-          {
-            "id": 99,
-            "type": "ComfyMathExpression",
-            "pos": [
-              910,
-              100
-            ],
-            "size": [
-              400,
-              200
-            ],
-            "flags": {},
-            "order": 3,
-            "mode": 0,
-            "inputs": [
-              {
-                "label": "a",
-                "localized_name": "values.a",
-                "name": "values.a",
-                "type": "FLOAT,INT",
-                "link": 284
-              },
-              {
-                "label": "b",
-                "localized_name": "values.b",
-                "name": "values.b",
-                "shape": 7,
-                "type": "FLOAT,INT",
-                "link": 285
-              },
-              {
-                "label": "c",
-                "localized_name": "values.c",
-                "name": "values.c",
-                "shape": 7,
-                "type": "FLOAT,INT",
-                "link": null
-              },
-              {
-                "localized_name": "expression",
-                "name": "expression",
-                "type": "STRING",
-                "widget": {
-                  "name": "expression"
-                },
-                "link": null
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "FLOAT",
-                "name": "FLOAT",
-                "type": "FLOAT",
-                "links": null
-              },
-              {
-                "localized_name": "INT",
-                "name": "INT",
-                "type": "INT",
-                "links": [
-                  286
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "ComfyMathExpression"
-            },
-            "widgets_values": [
-              "min(max(int(a if a >= 0 else b + a), 0), b - 1)"
-            ]
-          },
-          {
-            "id": 100,
-            "type": "PrimitiveInt",
-            "pos": [
-              560,
-              250
-            ],
-            "size": [
-              270,
-              110
-            ],
-            "flags": {},
-            "order": 4,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "value",
-                "name": "value",
-                "type": "INT",
-                "widget": {
-                  "name": "value"
-                },
-                "link": 283
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "INT",
-                "name": "INT",
-                "type": "INT",
-                "links": [
-                  284
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "PrimitiveInt"
-            },
-            "widgets_values": [
-              0,
-              "fixed"
-            ]
-          }
-        ],
-        "groups": [],
-        "links": [
-          {
-            "id": 1,
-            "origin_id": 1,
-            "origin_slot": 0,
-            "target_id": 2,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 2,
-            "origin_id": 1,
-            "origin_slot": 0,
-            "target_id": 3,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 4,
-            "origin_id": -10,
-            "origin_slot": 0,
-            "target_id": 1,
-            "target_slot": 0,
-            "type": "VIDEO"
-          },
-          {
-            "id": 5,
-            "origin_id": 3,
-            "origin_slot": 0,
-            "target_id": -20,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 283,
-            "origin_id": -10,
-            "origin_slot": 1,
-            "target_id": 100,
-            "target_slot": 0,
-            "type": "INT"
-          },
-          {
-            "id": 284,
-            "origin_id": 100,
-            "origin_slot": 0,
-            "target_id": 99,
-            "target_slot": 0,
-            "type": "INT"
-          },
-          {
-            "id": 285,
-            "origin_id": 2,
-            "origin_slot": 2,
-            "target_id": 99,
-            "target_slot": 1,
-            "type": "INT"
-          },
-          {
-            "id": 286,
-            "origin_id": 99,
-            "origin_slot": 1,
-            "target_id": 3,
-            "target_slot": 1,
-            "type": "INT"
-          }
-        ],
-        "extra": {},
-        "category": "Video Tools",
-        "description": "Extracts one image frame from a video at a chosen index, with optional trim and FPS control."
-      }
-    ]
-  },
-  "extra": {
-    "ds": {
-      "scale": 1.197015527856339,
-      "offset": [
-        -168.76833554248222,
-        540.6638955283997
-      ]
-    },
-    "frontendVersion": "1.42.8"
-  }
-}
--- a/blueprints/Glow.json
+++ b/blueprints/Glow.json
@ -575,9 +575,8 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Adds a glow/bloom effect around bright image areas via GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
-}
+}
--- a/Saturation.json
+++ b/Saturation.json
@ -752,9 +752,8 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts hue, saturation, and lightness of an image using a real-time GPU fragment shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
-}
+}
--- a/blueprints/Image
+++ b/blueprints/Image
@ -374,8 +374,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Blur",
-        "description": "Applies Gaussian, Box, or Radial blur to soften images and create stylized depth or motion effects."
+        "category": "Image Tools/Blur"
      }
    ]
  }
--- a/blueprints/Image
+++ b/blueprints/Image
@ -310,8 +310,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Text generation/Image Captioning",
-        "description": "Generates descriptive captions for images using Google's Gemini multimodal LLM."
+        "category": "Text generation/Image Captioning"
      }
    ]
  }
--- a/blueprints/Image
+++ b/blueprints/Image
@ -315,9 +315,8 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Manipulates individual RGBA channels for masking, compositing, and channel effects."
+        "category": "Image Tools/Color adjust"
      }
    ]
  }
-}
+}
--- a/blueprints/Image
+++ b/blueprints/Image
--- a/blueprints/Image
+++ b/blueprints/Image
--- a/blueprints/Image
+++ b/blueprints/Image
@ -1472,8 +1472,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Edit image",
-        "description": "Edits an input image via text instructions using FLUX.2 [klein] 4B."
+        "category": "Image generation and editing/Edit image"
      },
      {
        "id": "6007e698-2ebd-4917-84d8-299b35d7b7ab",
@ -1822,8 +1821,7 @@
        ],
        "extra": {
          "workflowRendererVersion": "LG"
-        },
-        "description": "Applies reference image conditioning for style/identity transfer (Flux.2 Klein 4B)."
+        }
      }
    ]
  },
@ -1839,4 +1837,4 @@
    }
  },
  "version": 0.4
-}
+}
--- a/blueprints/Image
+++ b/blueprints/Image
@ -1417,8 +1417,7 @@
          }
        ],
        "extra": {},
-        "category": "Image generation and editing/Edit image",
-        "description": "Edits images via text instructions using LongCat Image Edit, an instruction-following image editing diffusion model."
+        "category": "Image generation and editing/Edit image"
      }
    ]
  },
--- a/blueprints/Image
+++ b/blueprints/Image
--- a/blueprints/Image
+++ b/blueprints/Image
@ -132,7 +132,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Image Edit (Qwen 2511)",
+        "name": "local-Image Edit (Qwen 2511)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -1468,8 +1468,7 @@
          "VHS_MetadataImage": true,
          "VHS_KeepIntermediate": true
        },
-        "category": "Image generation and editing/Edit image",
-        "description": "Edits images via text instructions using Qwen-Image-Edit-2511 with improved character consistency and integrated LoRA."
+        "category": "Image generation and editing/Edit image"
      }
    ]
  },
@ -1490,4 +1489,4 @@
    }
  },
  "version": 0.4
-}
+}
--- a/blueprints/Image
+++ b/blueprints/Image
@ -1188,8 +1188,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Inpaint image",
-        "description": "Inpaints masked image regions using Flux.1 fill [dev], Black Forest Labs' inpainting/outpainting model."
+        "category": "Image generation and editing/Inpaint image"
      }
    ]
  },
@ -1203,4 +1202,4 @@
    },
    "ue_links": []
  }
-}
+}
--- a/(Qwen-image).json
+++ b/(Qwen-image).json
@ -1548,8 +1548,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Inpaint image",
-        "description": "Inpaints masked regions using Qwen-Image, extending its multilingual text rendering to inpainting tasks."
+        "category": "Image generation and editing/Inpaint image"
      },
      {
        "id": "56a1f603-fbd2-40ed-94ef-c9ecbd96aca8",
@ -1908,8 +1907,7 @@
        ],
        "extra": {
          "workflowRendererVersion": "LG"
-        },
-        "description": "Expands and softens mask edges to reduce visible seams after image processing."
+        }
      }
    ]
  },
--- a/blueprints/Image
+++ b/blueprints/Image
@ -742,10 +742,9 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Color adjust",
-        "description": "Adjusts black point, white point, and gamma for tonal range control via GPU shader."
+        "category": "Image Tools/Color adjust"
      }
    ]
  },
  "extra": {}
-}
+}
--- a/(Qwen-Image).json
+++ b/(Qwen-Image).json
@ -1919,8 +1919,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Outpaint image",
-        "description": "Outpaints beyond image boundaries using Qwen-Image's outpainting capabilities."
+        "category": "Image generation and editing/Outpaint image"
      },
      {
        "id": "f93c215e-c393-460e-9534-ed2c3d8a652e",
@ -2279,8 +2278,7 @@
        ],
        "extra": {
          "workflowRendererVersion": "LG"
-        },
-        "description": "Expands and softens mask edges to reduce visible seams after image processing."
+        }
      },
      {
        "id": "2a4b2cc0-db37-4302-a067-da392f38f06b",
@ -2735,8 +2733,7 @@
        ],
        "extra": {
          "workflowRendererVersion": "LG"
-        },
-        "description": "Scales both image and mask together while preserving alignment for editing workflows."
+        }
      }
    ]
  },
--- a/blueprints/Image
+++ b/blueprints/Image
@ -1,714 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 99,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 99,
-      "type": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
-      "pos": [
-        -1630,
-        -3270
-      ],
-      "size": [
-        290,
-        370
-      ],
-      "flags": {},
-      "order": 3,
-      "mode": 0,
-      "inputs": [
-        {
-          "label": "image",
-          "localized_name": "image",
-          "name": "image",
-          "type": "IMAGE",
-          "link": null
-        },
-        {
-          "label": "object",
-          "name": "text",
-          "type": "STRING",
-          "widget": {
-            "name": "text"
-          },
-          "link": null
-        },
-        {
-          "name": "bboxes",
-          "type": "BOUNDING_BOX",
-          "link": null
-        },
-        {
-          "name": "positive_coords",
-          "type": "STRING",
-          "link": null
-        },
-        {
-          "name": "negative_coords",
-          "type": "STRING",
-          "link": null
-        },
-        {
-          "name": "threshold",
-          "type": "FLOAT",
-          "widget": {
-            "name": "threshold"
-          },
-          "link": null
-        },
-        {
-          "name": "refine_iterations",
-          "type": "INT",
-          "widget": {
-            "name": "refine_iterations"
-          },
-          "link": null
-        },
-        {
-          "name": "individual_masks",
-          "type": "BOOLEAN",
-          "widget": {
-            "name": "individual_masks"
-          },
-          "link": null
-        },
-        {
-          "name": "ckpt_name",
-          "type": "COMBO",
-          "widget": {
-            "name": "ckpt_name"
-          },
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "localized_name": "masks",
-          "name": "masks",
-          "type": "MASK",
-          "links": []
-        },
-        {
-          "localized_name": "bboxes",
-          "name": "bboxes",
-          "type": "BOUNDING_BOX",
-          "links": []
-        }
-      ],
-      "properties": {
-        "proxyWidgets": [
-          [
-            "78",
-            "text"
-          ],
-          [
-            "75",
-            "threshold"
-          ],
-          [
-            "75",
-            "refine_iterations"
-          ],
-          [
-            "75",
-            "individual_masks"
-          ],
-          [
-            "77",
-            "ckpt_name"
-          ]
-        ],
-        "ue_properties": {
-          "widget_ue_connectable": {
-            "text": true
-          },
-          "version": "7.7",
-          "input_ue_unconnectable": {}
-        },
-        "cnr_id": "comfy-core",
-        "ver": "0.19.3",
-        "enableTabs": false,
-        "tabWidth": 65,
-        "tabXOffset": 10,
-        "hasSecondTab": false,
-        "secondTabText": "Send Back",
-        "secondTabOffset": 80,
-        "secondTabWidth": 65
-      },
-      "widgets_values": [],
-      "title": "Image Segmentation (SAM3)"
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
-        "version": 1,
-        "state": {
-          "lastGroupId": 0,
-          "lastNodeId": 113,
-          "lastLinkId": 283,
-          "lastRerouteId": 0
-        },
-        "revision": 0,
-        "config": {},
-        "name": "Image Segmentation (SAM3)",
-        "inputNode": {
-          "id": -10,
-          "bounding": [
-            -2260,
-            -3450,
-            136.369140625,
-            220
-          ]
-        },
-        "outputNode": {
-          "id": -20,
-          "bounding": [
-            -1130,
-            -3305,
-            120,
-            80
-          ]
-        },
-        "inputs": [
-          {
-            "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6",
-            "name": "image",
-            "type": "IMAGE",
-            "linkIds": [
-              264
-            ],
-            "localized_name": "image",
-            "label": "image",
-            "pos": [
-              -2143.630859375,
-              -3430
-            ]
-          },
-          {
-            "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745",
-            "name": "text",
-            "type": "STRING",
-            "linkIds": [
-              265
-            ],
-            "label": "object",
-            "pos": [
-              -2143.630859375,
-              -3410
-            ]
-          },
-          {
-            "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed",
-            "name": "bboxes",
-            "type": "BOUNDING_BOX",
-            "linkIds": [
-              266
-            ],
-            "pos": [
-              -2143.630859375,
-              -3390
-            ]
-          },
-          {
-            "id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899",
-            "name": "positive_coords",
-            "type": "STRING",
-            "linkIds": [
-              267
-            ],
-            "pos": [
-              -2143.630859375,
-              -3370
-            ]
-          },
-          {
-            "id": "c65f8b87-9bd7-48be-9fc2-823431e95019",
-            "name": "negative_coords",
-            "type": "STRING",
-            "linkIds": [
-              268
-            ],
-            "pos": [
-              -2143.630859375,
-              -3350
-            ]
-          },
-          {
-            "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb",
-            "name": "threshold",
-            "type": "FLOAT",
-            "linkIds": [
-              269
-            ],
-            "pos": [
-              -2143.630859375,
-              -3330
-            ]
-          },
-          {
-            "id": "b1439668-b050-490b-a5dc-fc4052c55666",
-            "name": "refine_iterations",
-            "type": "INT",
-            "linkIds": [
-              270
-            ],
-            "pos": [
-              -2143.630859375,
-              -3310
-            ]
-          },
-          {
-            "id": "86e239e5-c098-4302-b54d-d42a38bc0f89",
-            "name": "individual_masks",
-            "type": "BOOLEAN",
-            "linkIds": [
-              271
-            ],
-            "pos": [
-              -2143.630859375,
-              -3290
-            ]
-          },
-          {
-            "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706",
-            "name": "ckpt_name",
-            "type": "COMBO",
-            "linkIds": [
-              272
-            ],
-            "pos": [
-              -2143.630859375,
-              -3270
-            ]
-          }
-        ],
-        "outputs": [
-          {
-            "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
-            "name": "masks",
-            "type": "MASK",
-            "linkIds": [
-              231
-            ],
-            "localized_name": "masks",
-            "pos": [
-              -1110,
-              -3285
-            ]
-          },
-          {
-            "id": "8f622e40-8528-4078-b7d3-147e9f872194",
-            "name": "bboxes",
-            "type": "BOUNDING_BOX",
-            "linkIds": [
-              232
-            ],
-            "localized_name": "bboxes",
-            "pos": [
-              -1110,
-              -3265
-            ]
-          }
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 75,
-            "type": "SAM3_Detect",
-            "pos": [
-              -1470,
-              -3460
-            ],
-            "size": [
-              270,
-              260
-            ],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [
-              {
-                "label": "model",
-                "localized_name": "model",
-                "name": "model",
-                "type": "MODEL",
-                "link": 237
-              },
-              {
-                "label": "image",
-                "localized_name": "image",
-                "name": "image",
-                "type": "IMAGE",
-                "link": 264
-              },
-              {
-                "label": "conditioning",
-                "localized_name": "conditioning",
-                "name": "conditioning",
-                "shape": 7,
-                "type": "CONDITIONING",
-                "link": 200
-              },
-              {
-                "label": "bboxes",
-                "localized_name": "bboxes",
-                "name": "bboxes",
-                "shape": 7,
-                "type": "BOUNDING_BOX",
-                "link": 266
-              },
-              {
-                "label": "positive_coords",
-                "localized_name": "positive_coords",
-                "name": "positive_coords",
-                "shape": 7,
-                "type": "STRING",
-                "link": 267
-              },
-              {
-                "label": "negative_coords",
-                "localized_name": "negative_coords",
-                "name": "negative_coords",
-                "shape": 7,
-                "type": "STRING",
-                "link": 268
-              },
-              {
-                "localized_name": "threshold",
-                "name": "threshold",
-                "type": "FLOAT",
-                "widget": {
-                  "name": "threshold"
-                },
-                "link": 269
-              },
-              {
-                "localized_name": "refine_iterations",
-                "name": "refine_iterations",
-                "type": "INT",
-                "widget": {
-                  "name": "refine_iterations"
-                },
-                "link": 270
-              },
-              {
-                "localized_name": "individual_masks",
-                "name": "individual_masks",
-                "type": "BOOLEAN",
-                "widget": {
-                  "name": "individual_masks"
-                },
-                "link": 271
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "masks",
-                "name": "masks",
-                "type": "MASK",
-                "links": [
-                  231
-                ]
-              },
-              {
-                "localized_name": "bboxes",
-                "name": "bboxes",
-                "type": "BOUNDING_BOX",
-                "links": [
-                  232
-                ]
-              }
-            ],
-            "properties": {
-              "ue_properties": {
-                "widget_ue_connectable": {},
-                "version": "7.7",
-                "input_ue_unconnectable": {}
-              },
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "Node name for S&R": "SAM3_Detect",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65
-            },
-            "widgets_values": [
-              0.5,
-              2,
-              false
-            ]
-          },
-          {
-            "id": 77,
-            "type": "CheckpointLoaderSimple",
-            "pos": [
-              -1970,
-              -3200
-            ],
-            "size": [
-              330,
-              140
-            ],
-            "flags": {},
-            "order": 1,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "ckpt_name",
-                "name": "ckpt_name",
-                "type": "COMBO",
-                "widget": {
-                  "name": "ckpt_name"
-                },
-                "link": 272
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "MODEL",
-                "name": "MODEL",
-                "type": "MODEL",
-                "links": [
-                  237
-                ]
-              },
-              {
-                "localized_name": "CLIP",
-                "name": "CLIP",
-                "type": "CLIP",
-                "links": [
-                  240
-                ]
-              },
-              {
-                "localized_name": "VAE",
-                "name": "VAE",
-                "type": "VAE",
-                "links": null
-              }
-            ],
-            "properties": {
-              "ue_properties": {
-                "widget_ue_connectable": {},
-                "version": "7.7",
-                "input_ue_unconnectable": {}
-              },
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "Node name for S&R": "CheckpointLoaderSimple",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "models": [
-                {
-                  "name": "sam3.1_multiplex_fp16.safetensors",
-                  "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
-                  "directory": "checkpoints"
-                }
-              ]
-            },
-            "widgets_values": [
-              "sam3.1_multiplex_fp16.safetensors"
-            ]
-          },
-          {
-            "id": 78,
-            "type": "CLIPTextEncode",
-            "pos": [
-              -2000,
-              -3000
-            ],
-            "size": [
-              400,
-              200
-            ],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "clip",
-                "name": "clip",
-                "type": "CLIP",
-                "link": 240
-              },
-              {
-                "localized_name": "text",
-                "name": "text",
-                "type": "STRING",
-                "widget": {
-                  "name": "text"
-                },
-                "link": 265
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "CONDITIONING",
-                "name": "CONDITIONING",
-                "type": "CONDITIONING",
-                "links": [
-                  200
-                ]
-              }
-            ],
-            "properties": {
-              "ue_properties": {
-                "widget_ue_connectable": {},
-                "version": "7.7",
-                "input_ue_unconnectable": {}
-              },
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "Node name for S&R": "CLIPTextEncode",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65
-            },
-            "widgets_values": [
-              ""
-            ]
-          }
-        ],
-        "groups": [],
-        "links": [
-          {
-            "id": 237,
-            "origin_id": 77,
-            "origin_slot": 0,
-            "target_id": 75,
-            "target_slot": 0,
-            "type": "MODEL"
-          },
-          {
-            "id": 200,
-            "origin_id": 78,
-            "origin_slot": 0,
-            "target_id": 75,
-            "target_slot": 2,
-            "type": "CONDITIONING"
-          },
-          {
-            "id": 240,
-            "origin_id": 77,
-            "origin_slot": 1,
-            "target_id": 78,
-            "target_slot": 0,
-            "type": "CLIP"
-          },
-          {
-            "id": 231,
-            "origin_id": 75,
-            "origin_slot": 0,
-            "target_id": -20,
-            "target_slot": 0,
-            "type": "MASK"
-          },
-          {
-            "id": 232,
-            "origin_id": 75,
-            "origin_slot": 1,
-            "target_id": -20,
-            "target_slot": 1,
-            "type": "BOUNDING_BOX"
-          },
-          {
-            "id": 264,
-            "origin_id": -10,
-            "origin_slot": 0,
-            "target_id": 75,
-            "target_slot": 1,
-            "type": "IMAGE"
-          },
-          {
-            "id": 265,
-            "origin_id": -10,
-            "origin_slot": 1,
-            "target_id": 78,
-            "target_slot": 1,
-            "type": "STRING"
-          },
-          {
-            "id": 266,
-            "origin_id": -10,
-            "origin_slot": 2,
-            "target_id": 75,
-            "target_slot": 3,
-            "type": "BOUNDING_BOX"
-          },
-          {
-            "id": 267,
-            "origin_id": -10,
-            "origin_slot": 3,
-            "target_id": 75,
-            "target_slot": 4,
-            "type": "STRING"
-          },
-          {
-            "id": 268,
-            "origin_id": -10,
-            "origin_slot": 4,
-            "target_id": 75,
-            "target_slot": 5,
-            "type": "STRING"
-          },
-          {
-            "id": 269,
-            "origin_id": -10,
-            "origin_slot": 5,
-            "target_id": 75,
-            "target_slot": 6,
-            "type": "FLOAT"
-          },
-          {
-            "id": 270,
-            "origin_id": -10,
-            "origin_slot": 6,
-            "target_id": 75,
-            "target_slot": 7,
-            "type": "INT"
-          },
-          {
-            "id": 271,
-            "origin_id": -10,
-            "origin_slot": 7,
-            "target_id": 75,
-            "target_slot": 8,
-            "type": "BOOLEAN"
-          },
-          {
-            "id": 272,
-            "origin_id": -10,
-            "origin_slot": 8,
-            "target_id": 77,
-            "target_slot": 0,
-            "type": "COMBO"
-          }
-        ],
-        "extra": {},
-        "category": "Image Tools/Image Segmentation",
-        "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
-      }
-    ]
-  },
-  "extra": {
-    "ue_links": []
-  }
-}
--- a/Upscale(Z-image-Turbo).json
+++ b/Upscale(Z-image-Turbo).json
@ -141,7 +141,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Image Upscale (Z-image-Turbo)",
+        "name": "local-Image Upscale(Z-image-Turbo)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -1302,8 +1302,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Enhance",
-        "description": "Upscales images to higher resolution using Z-Image-Turbo."
+        "category": "Image generation and editing/Enhance"
      }
    ]
  },
--- a/blueprints/Image
+++ b/blueprints/Image
@ -99,7 +99,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Image to Depth Map (Lotus)",
+        "name": "local-Image to Depth Map (Lotus)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -948,8 +948,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Depth to image",
-        "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
+        "category": "Image generation and editing/Depth to image"
      }
    ]
  },
@ -965,4 +964,4 @@
    "workflowRendererVersion": "LG"
  },
  "version": 0.4
-}
+}
--- a/Layers(Qwen-Image-Layered).json
+++ b/Layers(Qwen-Image-Layered).json
@ -1586,8 +1586,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Image to layers",
-        "description": "Decomposes an image into variable-resolution RGBA layers for independent editing using Qwen-Image-Layered."
+        "category": "Image generation and editing/Image to layers"
      }
    ]
  },
--- a/blueprints/Image
+++ b/blueprints/Image
@ -72,7 +72,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Image to 3D Model (Hunyuan3d 2.1)",
+        "name": "local-Image to Model (Hunyuan3d 2.1)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -765,8 +765,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "3D/Image to 3D Model",
-        "description": "Generates 3D mesh models from a single input image using Hunyuan3D 2.0/2.1."
+        "category": "3D/Image to 3D Model"
      }
    ]
  },
--- a/blueprints/Image
+++ b/blueprints/Image
@ -4223,8 +4223,7 @@
        "extra": {
          "workflowRendererVersion": "Vue-corrected"
        },
-        "category": "Video generation and editing/Image to video",
-        "description": "Generates video from a single input image using LTX-2.3."
+        "category": "Video generation and editing/Image to video"
      }
    ]
  },
--- a/blueprints/Image
+++ b/blueprints/Image
@ -206,7 +206,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Image to Video (Wan 2.2)",
+        "name": "local-Image to Video (Wan 2.2)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -2027,8 +2027,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Image to video",
-        "description": "Image-to-video with Wan 2.2 using a start image plus text prompt to extend motion from the still frame."
+        "category": "Video generation and editing/Image to video"
      }
    ]
  },
--- a/(Z-Image-Turbo).json
+++ b/(Z-Image-Turbo).json
@ -134,7 +134,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Pose to Image (Z-Image-Turbo)",
+        "name": "local-Pose to Image (Z-Image-Turbo)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -1298,8 +1298,7 @@
          "VHS_MetadataImage": true,
          "VHS_KeepIntermediate": true
        },
-        "category": "Image generation and editing/Pose to image",
-        "description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning."
+        "category": "Image generation and editing/Pose to image"
      }
    ]
  },
@ -1320,4 +1319,4 @@
    }
  },
  "version": 0.4
-}
+}
--- a/blueprints/Pose
+++ b/blueprints/Pose
@ -3870,8 +3870,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Pose to video",
-        "description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio."
+        "category": "Video generation and editing/Pose to video"
      }
    ]
  },
--- a/blueprints/Prompt
+++ b/blueprints/Prompt
@ -270,10 +270,9 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Text generation/Prompt enhance",
-        "description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality."
+        "category": "Text generation/Prompt enhance"
      }
    ]
  },
  "extra": {}
-}
+}
--- a/blueprints/Remove
+++ b/blueprints/Remove
@ -1,397 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 19,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 19,
-      "type": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
-      "pos": [
-        -6411.330578108367,
-        1940.2638932730042
-      ],
-      "size": [
-        349.609375,
-        145.9375
-      ],
-      "flags": {},
-      "order": 2,
-      "mode": 0,
-      "inputs": [
-        {
-          "localized_name": "image",
-          "name": "image",
-          "type": "IMAGE",
-          "link": null
-        },
-        {
-          "name": "bg_removal_name",
-          "type": "COMBO",
-          "widget": {
-            "name": "bg_removal_name"
-          },
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "localized_name": "IMAGE",
-          "name": "IMAGE",
-          "type": "IMAGE",
-          "links": []
-        },
-        {
-          "name": "mask",
-          "type": "MASK",
-          "links": []
-        }
-      ],
-      "properties": {
-        "proxyWidgets": [
-          [
-            "14",
-            "bg_removal_name"
-          ]
-        ]
-      },
-      "widgets_values": [],
-      "title": "Remove Background (BiRefNet)"
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
-        "version": 1,
-        "state": {
-          "lastGroupId": 0,
-          "lastNodeId": 21,
-          "lastLinkId": 16,
-          "lastRerouteId": 0
-        },
-        "revision": 0,
-        "config": {},
-        "name": "Remove Background (BiRefNet)",
-        "description": "Removes or replaces image backgrounds using BiRefNet segmentation and alpha compositing.",
-        "inputNode": {
-          "id": -10,
-          "bounding": [
-            -6728.534070722246,
-            1475.2619799128663,
-            150.9140625,
-            88
-          ]
-        },
-        "outputNode": {
-          "id": -20,
-          "bounding": [
-            -6169.049695722246,
-            1475.2619799128663,
-            128,
-            88
-          ]
-        },
-        "inputs": [
-          {
-            "id": "7bc321cd-df31-4c39-aaf7-7f0d01326189",
-            "name": "image",
-            "type": "IMAGE",
-            "linkIds": [
-              5,
-              7
-            ],
-            "localized_name": "image",
-            "pos": [
-              -6601.620008222246,
-              1499.2619799128663
-            ]
-          },
-          {
-            "id": "e89d2cd8-daa3-4e29-8a69-851db85072cb",
-            "name": "bg_removal_name",
-            "type": "COMBO",
-            "linkIds": [
-              12
-            ],
-            "pos": [
-              -6601.620008222246,
-              1519.2619799128663
-            ]
-          }
-        ],
-        "outputs": [
-          {
-            "id": "16e7863c-4c38-46c2-aa74-e82991fbfe8d",
-            "name": "IMAGE",
-            "type": "IMAGE",
-            "linkIds": [
-              8
-            ],
-            "localized_name": "IMAGE",
-            "pos": [
-              -6145.049695722246,
-              1499.2619799128663
-            ]
-          },
-          {
-            "id": "f7240c19-5b80-406e-a8e2-9b12440ee2d6",
-            "name": "mask",
-            "type": "MASK",
-            "linkIds": [
-              11
-            ],
-            "pos": [
-              -6145.049695722246,
-              1519.2619799128663
-            ]
-          }
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 13,
-            "type": "RemoveBackground",
-            "pos": [
-              -6536.764823982709,
-              1444.9963409012412
-            ],
-            "size": [
-              302.25,
-              72
-            ],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "image",
-                "name": "image",
-                "type": "IMAGE",
-                "link": 5
-              },
-              {
-                "localized_name": "bg_removal_model",
-                "name": "bg_removal_model",
-                "type": "BACKGROUND_REMOVAL",
-                "link": 3
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "mask",
-                "name": "mask",
-                "type": "MASK",
-                "links": [
-                  4,
-                  11
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "RemoveBackground"
-            }
-          },
-          {
-            "id": 14,
-            "type": "LoadBackgroundRemovalModel",
-            "pos": [
-              -6540.534070722246,
-              1302.223464635445
-            ],
-            "size": [
-              311.484375,
-              85.515625
-            ],
-            "flags": {},
-            "order": 1,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "bg_removal_name",
-                "name": "bg_removal_name",
-                "type": "COMBO",
-                "widget": {
-                  "name": "bg_removal_name"
-                },
-                "link": 12
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "bg_model",
-                "name": "bg_model",
-                "type": "BACKGROUND_REMOVAL",
-                "links": [
-                  3
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "LoadBackgroundRemovalModel",
-              "models": [
-                {
-                  "name": "birefnet.safetensors",
-                  "url": "https://huggingface.co/Comfy-Org/BiRefNet/resolve/main/background_removal/birefnet.safetensors",
-                  "directory": "background_removal"
-                }
-              ]
-            },
-            "widgets_values": [
-              "birefnet.safetensors"
-            ]
-          },
-          {
-            "id": 15,
-            "type": "InvertMask",
-            "pos": [
-              -6532.446160529669,
-              1571.1111286839914
-            ],
-            "size": [
-              285.984375,
-              48
-            ],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "mask",
-                "name": "mask",
-                "type": "MASK",
-                "link": 4
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "MASK",
-                "name": "MASK",
-                "type": "MASK",
-                "links": [
-                  6
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "InvertMask"
-            }
-          },
-          {
-            "id": 16,
-            "type": "JoinImageWithAlpha",
-            "pos": [
-              -6527.4370171636665,
-              1674.3004951902876
-            ],
-            "size": [
-              284.96875,
-              72
-            ],
-            "flags": {},
-            "order": 3,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "image",
-                "name": "image",
-                "type": "IMAGE",
-                "link": 7
-              },
-              {
-                "localized_name": "alpha",
-                "name": "alpha",
-                "type": "MASK",
-                "link": 6
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "IMAGE",
-                "name": "IMAGE",
-                "type": "IMAGE",
-                "links": [
-                  8
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "JoinImageWithAlpha"
-            }
-          }
-        ],
-        "groups": [],
-        "links": [
-          {
-            "id": 3,
-            "origin_id": 14,
-            "origin_slot": 0,
-            "target_id": 13,
-            "target_slot": 1,
-            "type": "BACKGROUND_REMOVAL"
-          },
-          {
-            "id": 4,
-            "origin_id": 13,
-            "origin_slot": 0,
-            "target_id": 15,
-            "target_slot": 0,
-            "type": "MASK"
-          },
-          {
-            "id": 6,
-            "origin_id": 15,
-            "origin_slot": 0,
-            "target_id": 16,
-            "target_slot": 1,
-            "type": "MASK"
-          },
-          {
-            "id": 5,
-            "origin_id": -10,
-            "origin_slot": 0,
-            "target_id": 13,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 7,
-            "origin_id": -10,
-            "origin_slot": 0,
-            "target_id": 16,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 8,
-            "origin_id": 16,
-            "origin_slot": 0,
-            "target_id": -20,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 11,
-            "origin_id": 13,
-            "origin_slot": 0,
-            "target_id": -20,
-            "target_slot": 1,
-            "type": "MASK"
-          },
-          {
-            "id": 12,
-            "origin_id": -10,
-            "origin_slot": 1,
-            "target_id": 14,
-            "target_slot": 0,
-            "type": "COMBO"
-          }
-        ],
-        "extra": {},
-        "category": "Image generation and editing/Background Removal"
-      }
-    ]
-  },
-  "extra": {}
-}
--- a/blueprints/Sharpen.json
+++ b/blueprints/Sharpen.json
@ -302,9 +302,8 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Sharpen",
-        "description": "Sharpens image details using a GPU fragment shader for enhanced clarity."
+        "category": "Image Tools/Sharpen"
      }
    ]
  }
-}
+}
--- a/blueprints/Text
+++ b/blueprints/Text
@ -222,7 +222,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Text to Audio (ACE-Step 1.5)",
+        "name": "local-Text to Audio (ACE-Step 1.5)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -1502,8 +1502,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Audio/Music generation",
-        "description": "Generates audio/music from text prompts using ACE-Step 1.5, a diffusion-based audio generation model."
+        "category": "Audio/Music generation"
      }
    ]
  },
@ -1519,4 +1518,4 @@
    }
  },
  "version": 0.4
-}
+}
--- a/blueprints/Text
+++ b/blueprints/Text
--- a/blueprints/Text
+++ b/blueprints/Text
--- a/blueprints/Text
+++ b/blueprints/Text
@ -1029,8 +1029,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Text to image",
-        "description": "Generates images from prompts using FLUX.1 [dev]: a 12B rectified-flow MMDiT with dual CLIP plus T5-XXL text encoders and guidance-distilled sampling for sharp prompt following versus classic DDPM diffusion."
+        "category": "Image generation and editing/Text to image"
      }
    ]
  },
@ -1044,4 +1043,4 @@
    },
    "ue_links": []
  }
-}
+}
--- a/blueprints/Text
+++ b/blueprints/Text
@ -1023,8 +1023,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Text to image",
-        "description": "FLUX.1 Krea [dev] (Black Forest Labs × Krea): open-weight 12B rectified-flow text-to-image drop-in alongside FLUX.1 [dev], tuned away from overcooked saturation toward more natural diversity in people, realism, and style while keeping ecosystem compatibility."
+        "category": "Image generation and editing/Text to image"
      }
    ]
  },
@ -1038,4 +1037,4 @@
    },
    "ue_links": []
  }
-}
+}
--- a/blueprints/Text
+++ b/blueprints/Text
--- a/blueprints/Text
+++ b/blueprints/Text
@ -1104,8 +1104,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Text to image",
-        "description": "Generates images from text prompts using NetaYume Lumina, fine-tuned from Neta Lumina for anime-style and illustration generation."
+        "category": "Image generation and editing/Text to image"
      },
      {
        "id": "a07fdf06-1bda-4dac-bdbd-63ee8ebca1c9",
@ -1459,12 +1458,11 @@
        ],
        "extra": {
          "workflowRendererVersion": "LG"
-        },
-        "description": "Encodes a negative text prompt via CLIP for classifier-free guidance in anime-style generation (NetaYume Lumina)."
+        }
      }
    ]
  },
  "extra": {
    "ue_links": []
  }
-}
+}
--- a/blueprints/Text
+++ b/blueprints/Text
@ -1941,8 +1941,7 @@
        "extra": {
          "workflowRendererVersion": "Vue-corrected"
        },
-        "category": "Image generation and editing/Text to image",
-        "description": "Generates images from text prompts using Qwen-Image-2512, with enhanced human realism and finer natural detail over the base version."
+        "category": "Image generation and editing/Text to image"
      }
    ]
  },
--- a/(Qwen-Image).json
+++ b/(Qwen-Image).json
@ -1873,8 +1873,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Text to image",
-        "description": "Generates images from text prompts using Qwen-Image, Alibaba's 20B MMDiT model with excellent multilingual text rendering."
+        "category": "Image generation and editing/Text to image"
      }
    ]
  },
--- a/(Z-Image-Base).json
+++ b/(Z-Image-Base).json
--- a/(Z-Image-Turbo).json
+++ b/(Z-Image-Turbo).json
@ -1,21 +1,22 @@
 {
+  "id": "1c3eaa76-5cfa-4dc7-8571-97a570324e01",
  "revision": 0,
-  "last_node_id": 57,
-  "last_link_id": 0,
+  "last_node_id": 34,
+  "last_link_id": 40,
  "nodes": [
    {
-      "id": 57,
-      "type": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
+      "id": 5,
+      "type": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
      "pos": [
-        130,
-        200
+        -2.5766491043910378e-05,
+        1229.999928629805
      ],
      "size": [
        400,
        470
      ],
      "flags": {},
-      "order": 1,
+      "order": 0,
      "mode": 0,
      "inputs": [
        {
@ -43,22 +44,6 @@
          },
          "link": null
        },
-        {
-          "name": "seed",
-          "type": "INT",
-          "widget": {
-            "name": "seed"
-          },
-          "link": null
-        },
-        {
-          "name": "steps",
-          "type": "INT",
-          "widget": {
-            "name": "steps"
-          },
-          "link": null
-        },
        {
          "name": "unet_name",
          "type": "COMBO",
@ -95,15 +80,15 @@
      "properties": {
        "proxyWidgets": [
          [
-            "27",
+            "-1",
            "text"
          ],
          [
-            "13",
+            "-1",
            "width"
          ],
          [
-            "13",
+            "-1",
            "height"
          ],
          [
@ -112,23 +97,19 @@
          ],
          [
            "3",
-            "steps"
+            "control_after_generate"
          ],
          [
-            "28",
+            "-1",
            "unet_name"
          ],
          [
-            "30",
+            "-1",
            "clip_name"
          ],
          [
-            "29",
+            "-1",
            "vae_name"
-          ],
-          [
-            "3",
-            "control_after_generate"
          ]
        ],
        "cnr_id": "comfy-core",
@ -141,40 +122,48 @@
        "secondTabOffset": 80,
        "secondTabWidth": 65
      },
-      "widgets_values": [],
-      "title": "Text to Image (Z-Image-Turbo)"
+      "widgets_values": [
+        "",
+        1024,
+        1024,
+        null,
+        null,
+        "z_image_turbo_bf16.safetensors",
+        "qwen_3_4b.safetensors",
+        "ae.safetensors"
+      ]
    }
  ],
  "links": [],
-  "version": 0.4,
+  "groups": [],
  "definitions": {
    "subgraphs": [
      {
-        "id": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
+        "id": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
        "version": 1,
        "state": {
          "lastGroupId": 4,
-          "lastNodeId": 61,
-          "lastLinkId": 75,
+          "lastNodeId": 34,
+          "lastLinkId": 40,
          "lastRerouteId": 0
        },
        "revision": 0,
        "config": {},
-        "name": "Text to Image (Z-Image-Turbo)",
+        "name": "local-Text to Image (Z-Image-Turbo)",
        "inputNode": {
          "id": -10,
          "bounding": [
-            -560,
-            480,
+            -80,
+            425,
            120,
-            200
+            160
          ]
        },
        "outputNode": {
          "id": -20,
          "bounding": [
-            1670,
-            320,
+            1490,
+            415,
            120,
            60
          ]
@ -189,8 +178,8 @@
            ],
            "label": "prompt",
            "pos": [
-              -460,
-              500
+              20,
+              445
            ]
          },
          {
@ -201,8 +190,8 @@
              35
            ],
            "pos": [
-              -460,
-              520
+              20,
+              465
            ]
          },
          {
@ -213,68 +202,44 @@
              36
            ],
            "pos": [
-              -460,
-              540
+              20,
+              485
            ]
          },
          {
-            "id": "f77677f7-6bf6-4c19-a71f-c4a553d5981e",
-            "name": "seed",
-            "type": "INT",
-            "linkIds": [
-              71
-            ],
-            "pos": [
-              -460,
-              560
-            ]
-          },
-          {
-            "id": "ef9a9fb1-5983-4bc9-a60b-cf5aec48bff1",
-            "name": "steps",
-            "type": "INT",
-            "linkIds": [
-              72
-            ],
-            "pos": [
-              -460,
-              580
-            ]
-          },
-          {
-            "id": "a20a1b30-785f-4a04-bb6d-3d61adab9764",
+            "id": "23087d15-8412-4fbd-b71e-9b6d7ef76de1",
            "name": "unet_name",
            "type": "COMBO",
            "linkIds": [
-              73
+              38
            ],
            "pos": [
-              -460,
-              600
+              20,
+              505
            ]
          },
          {
-            "id": "4af8fc2b-4655-4086-8240-45f8cb38c6f6",
+            "id": "0677f5c3-2a3f-43d4-98ac-a4c56d5efdc0",
            "name": "clip_name",
            "type": "COMBO",
            "linkIds": [
-              74
+              39
            ],
            "pos": [
-              -460,
-              620
+              20,
+              525
            ]
          },
          {
-            "id": "4d518693-2807-439c-9cb6-cffd23ccba2c",
+            "id": "c85c0445-2641-48b1-bbca-95057edf2fcf",
            "name": "vae_name",
            "type": "COMBO",
            "linkIds": [
-              75
+              40
            ],
            "pos": [
-              -460,
-              640
+              20,
+              545
            ]
          }
        ],
@ -288,8 +253,8 @@
            ],
            "localized_name": "IMAGE",
            "pos": [
-              1690,
-              340
+              1510,
+              435
            ]
          }
        ],
@ -299,15 +264,15 @@
            "id": 30,
            "type": "CLIPLoader",
            "pos": [
-              30,
-              420
+              109.99997264844609,
+              329.99999029608756
            ],
            "size": [
-              270,
-              150
+              269.9869791666667,
+              106
            ],
            "flags": {},
-            "order": 7,
+            "order": 0,
            "mode": 0,
            "inputs": [
              {
@ -317,7 +282,7 @@
                "widget": {
                  "name": "clip_name"
                },
-                "link": 74
+                "link": 39
              },
              {
                "localized_name": "type",
@ -350,9 +315,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "CLIPLoader",
              "cnr_id": "comfy-core",
              "ver": "0.3.73",
+              "Node name for S&R": "CLIPLoader",
              "models": [
                {
                  "name": "qwen_3_4b.safetensors",
@ -378,15 +343,15 @@
            "id": 29,
            "type": "VAELoader",
            "pos": [
-              30,
-              650
+              109.99997264844609,
+              479.9999847172637
            ],
            "size": [
-              270,
-              110
+              269.9869791666667,
+              58
            ],
            "flags": {},
-            "order": 6,
+            "order": 1,
            "mode": 0,
            "inputs": [
              {
@ -396,7 +361,7 @@
                "widget": {
                  "name": "vae_name"
                },
-                "link": 75
+                "link": 40
              }
            ],
            "outputs": [
@ -410,9 +375,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "VAELoader",
              "cnr_id": "comfy-core",
              "ver": "0.3.73",
+              "Node name for S&R": "VAELoader",
              "models": [
                {
                  "name": "ae.safetensors",
@ -436,12 +401,12 @@
            "id": 33,
            "type": "ConditioningZeroOut",
            "pos": [
-              630,
-              960
+              639.9999103333332,
+              620.0000271257795
            ],
            "size": [
-              230,
-              80
+              204.134765625,
+              26
            ],
            "flags": {},
            "order": 8,
@ -465,9 +430,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "ConditioningZeroOut",
              "cnr_id": "comfy-core",
              "ver": "0.3.73",
+              "Node name for S&R": "ConditioningZeroOut",
              "enableTabs": false,
              "tabWidth": 65,
              "tabXOffset": 10,
@ -475,21 +440,22 @@
              "secondTabText": "Send Back",
              "secondTabOffset": 80,
              "secondTabWidth": 65
-            }
+            },
+            "widgets_values": []
          },
          {
            "id": 8,
            "type": "VAEDecode",
            "pos": [
-              1320,
-              230
+              1219.9999088104782,
+              160.00009184959066
            ],
            "size": [
-              230,
-              100
+              209.98697916666669,
+              46
            ],
            "flags": {},
-            "order": 1,
+            "order": 5,
            "mode": 0,
            "inputs": [
              {
@ -517,9 +483,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "VAEDecode",
              "cnr_id": "comfy-core",
              "ver": "0.3.64",
+              "Node name for S&R": "VAEDecode",
              "enableTabs": false,
              "tabWidth": 65,
              "tabXOffset": 10,
@ -527,21 +493,22 @@
              "secondTabText": "Send Back",
              "secondTabOffset": 80,
              "secondTabWidth": 65
-            }
+            },
+            "widgets_values": []
          },
          {
            "id": 28,
            "type": "UNETLoader",
            "pos": [
-              30,
-              230
+              109.99997264844609,
+              200.0000502647102
            ],
            "size": [
-              270,
-              110
+              269.9869791666667,
+              82
            ],
            "flags": {},
-            "order": 5,
+            "order": 2,
            "mode": 0,
            "inputs": [
              {
@ -551,7 +518,7 @@
                "widget": {
                  "name": "unet_name"
                },
-                "link": 73
+                "link": 38
              },
              {
                "localized_name": "weight_dtype",
@ -574,9 +541,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "UNETLoader",
              "cnr_id": "comfy-core",
              "ver": "0.3.73",
+              "Node name for S&R": "UNETLoader",
              "models": [
                {
                  "name": "z_image_turbo_bf16.safetensors",
@ -601,15 +568,15 @@
            "id": 27,
            "type": "CLIPTextEncode",
            "pos": [
-              400,
-              230
+              429.99997828947767,
+              200.0000502647102
            ],
            "size": [
-              450,
-              650
+              409.9869791666667,
+              319.9869791666667
            ],
            "flags": {},
-            "order": 4,
+            "order": 7,
            "mode": 0,
            "inputs": [
              {
@ -640,9 +607,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "CLIPTextEncode",
              "cnr_id": "comfy-core",
              "ver": "0.3.73",
+              "Node name for S&R": "CLIPTextEncode",
              "enableTabs": false,
              "tabWidth": 65,
              "tabXOffset": 10,
@ -659,15 +626,15 @@
            "id": 13,
            "type": "EmptySD3LatentImage",
            "pos": [
-              40,
-              890
+              109.99997264844609,
+              629.9999791384399
            ],
            "size": [
-              260,
-              170
+              259.9869791666667,
+              106
            ],
            "flags": {},
-            "order": 3,
+            "order": 6,
            "mode": 0,
            "inputs": [
              {
@ -710,9 +677,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "EmptySD3LatentImage",
              "cnr_id": "comfy-core",
              "ver": "0.3.64",
+              "Node name for S&R": "EmptySD3LatentImage",
              "enableTabs": false,
              "tabWidth": 65,
              "tabXOffset": 10,
@ -727,77 +694,19 @@
              1
            ]
          },
-          {
-            "id": 11,
-            "type": "ModelSamplingAuraFlow",
-            "pos": [
-              950,
-              230
-            ],
-            "size": [
-              310,
-              110
-            ],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "model",
-                "name": "model",
-                "type": "MODEL",
-                "link": 26
-              },
-              {
-                "localized_name": "shift",
-                "name": "shift",
-                "type": "FLOAT",
-                "widget": {
-                  "name": "shift"
-                },
-                "link": null
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "MODEL",
-                "name": "MODEL",
-                "type": "MODEL",
-                "slot_index": 0,
-                "links": [
-                  13
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "ModelSamplingAuraFlow",
-              "cnr_id": "comfy-core",
-              "ver": "0.3.64",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65
-            },
-            "widgets_values": [
-              3
-            ]
-          },
          {
            "id": 3,
            "type": "KSampler",
            "pos": [
-              950,
-              400
+              879.9999615530063,
+              269.9999774911694
            ],
            "size": [
-              320,
-              350
+              314.9869791666667,
+              262
            ],
            "flags": {},
-            "order": 0,
+            "order": 4,
            "mode": 0,
            "inputs": [
              {
@ -831,7 +740,7 @@
                "widget": {
                  "name": "seed"
                },
-                "link": 71
+                "link": null
              },
              {
                "localized_name": "steps",
@ -840,7 +749,7 @@
                "widget": {
                  "name": "steps"
                },
-                "link": 72
+                "link": null
              },
              {
                "localized_name": "cfg",
@ -891,9 +800,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "KSampler",
              "cnr_id": "comfy-core",
              "ver": "0.3.64",
+              "Node name for S&R": "KSampler",
              "enableTabs": false,
              "tabWidth": 65,
              "tabXOffset": 10,
@ -905,23 +814,81 @@
            "widgets_values": [
              0,
              "randomize",
-              8,
+              4,
              1,
              "res_multistep",
              "simple",
              1
            ]
+          },
+          {
+            "id": 11,
+            "type": "ModelSamplingAuraFlow",
+            "pos": [
+              879.9999615530063,
+              160.00009184959066
+            ],
+            "size": [
+              309.9869791666667,
+              58
+            ],
+            "flags": {},
+            "order": 3,
+            "mode": 0,
+            "inputs": [
+              {
+                "localized_name": "model",
+                "name": "model",
+                "type": "MODEL",
+                "link": 26
+              },
+              {
+                "localized_name": "shift",
+                "name": "shift",
+                "type": "FLOAT",
+                "widget": {
+                  "name": "shift"
+                },
+                "link": null
+              }
+            ],
+            "outputs": [
+              {
+                "localized_name": "MODEL",
+                "name": "MODEL",
+                "type": "MODEL",
+                "slot_index": 0,
+                "links": [
+                  13
+                ]
+              }
+            ],
+            "properties": {
+              "cnr_id": "comfy-core",
+              "ver": "0.3.64",
+              "Node name for S&R": "ModelSamplingAuraFlow",
+              "enableTabs": false,
+              "tabWidth": 65,
+              "tabXOffset": 10,
+              "hasSecondTab": false,
+              "secondTabText": "Send Back",
+              "secondTabOffset": 80,
+              "secondTabWidth": 65
+            },
+            "widgets_values": [
+              3
+            ]
          }
        ],
        "groups": [
          {
            "id": 2,
-            "title": "Step2 - Image size",
+            "title": "Image size",
            "bounding": [
-              10,
-              820,
-              320,
-              280
+              100,
+              560,
+              290,
+              200
            ],
            "color": "#3f789e",
            "font_size": 24,
@ -929,12 +896,12 @@
          },
          {
            "id": 3,
-            "title": "Step3 - Prompt",
+            "title": "Prompt",
            "bounding": [
-              360,
+              410,
              130,
-              530,
-              970
+              450,
+              540
            ],
            "color": "#3f789e",
            "font_size": 24,
@ -942,12 +909,12 @@
          },
          {
            "id": 4,
-            "title": "Step1 - Load models",
+            "title": "Models",
            "bounding": [
-              0,
+              100,
              130,
-              330,
-              660
+              290,
+              413.6
            ],
            "color": "#3f789e",
            "font_size": 24,
@ -1060,41 +1027,25 @@
            "type": "INT"
          },
          {
-            "id": 71,
+            "id": 38,
            "origin_id": -10,
            "origin_slot": 3,
-            "target_id": 3,
-            "target_slot": 4,
-            "type": "INT"
-          },
-          {
-            "id": 72,
-            "origin_id": -10,
-            "origin_slot": 4,
-            "target_id": 3,
-            "target_slot": 5,
-            "type": "INT"
-          },
-          {
-            "id": 73,
-            "origin_id": -10,
-            "origin_slot": 5,
            "target_id": 28,
            "target_slot": 0,
            "type": "COMBO"
          },
          {
-            "id": 74,
+            "id": 39,
            "origin_id": -10,
-            "origin_slot": 6,
+            "origin_slot": 4,
            "target_id": 30,
            "target_slot": 0,
            "type": "COMBO"
          },
          {
-            "id": 75,
+            "id": 40,
            "origin_id": -10,
-            "origin_slot": 7,
+            "origin_slot": 5,
            "target_id": 29,
            "target_slot": 0,
            "type": "COMBO"
@ -1103,10 +1054,25 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image generation and editing/Text to image",
-        "description": "Generates images from text prompts using Z-Image-Turbo, Alibaba's distilled 6B DiT model."
+        "category": "Image generation and editing/Text to image"
      }
    ]
  },
-  "extra": {}
-}
+  "config": {},
+  "extra": {
+    "frontendVersion": "1.37.10",
+    "workflowRendererVersion": "LG",
+    "VHS_latentpreview": false,
+    "VHS_latentpreviewrate": 0,
+    "VHS_MetadataImage": true,
+    "VHS_KeepIntermediate": true,
+    "ds": {
+      "scale": 0.8401370345180755,
+      "offset": [
+        940.0587067393087,
+        -830.7121087564725
+      ]
+    }
+  },
+  "version": 0.4
+}
--- a/blueprints/Text
+++ b/blueprints/Text
--- a/blueprints/Text
+++ b/blueprints/Text
@ -4286,8 +4286,7 @@
        "extra": {
          "workflowRendererVersion": "Vue-corrected"
        },
-        "category": "Video generation and editing/Text to video",
-        "description": "Generates video from text prompts using LTX-2.3, Lightricks' video diffusion model."
+        "category": "Video generation and editing/Text to video"
      }
    ]
  },
--- a/blueprints/Text
+++ b/blueprints/Text
@ -1572,8 +1572,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Text to video",
-        "description": "Generates video from text prompts using Wan2.2, Alibaba's diffusion video model."
+        "category": "Video generation and editing/Text to video"
      }
    ]
  },
@ -1587,4 +1586,4 @@
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
-}
+}
--- a/blueprints/Unsharp
+++ b/blueprints/Unsharp
@ -434,9 +434,8 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Image Tools/Sharpen",
-        "description": "Enhances edge contrast via unsharp masking for a sharper image appearance."
+        "category": "Image Tools/Sharpen"
      }
    ]
  }
-}
+}
--- a/blueprints/Video
+++ b/blueprints/Video
@ -307,8 +307,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Text generation/Video Captioning",
-        "description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM."
+        "category": "Text generation/Video Captioning"
      }
    ]
  }
--- a/blueprints/Video
+++ b/blueprints/Video
@ -165,7 +165,7 @@
        },
        "revision": 0,
        "config": {},
-        "name": "Video Inpaint (Wan 2.1 VACE)",
+        "name": "local-Video Inpaint(Wan2.1 VACE)",
        "inputNode": {
          "id": -10,
          "bounding": [
@ -2368,8 +2368,7 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Inpaint video",
-        "description": "Inpaints masked regions in video frames using Wan 2.1 VACE."
+        "category": "Video generation and editing/Inpaint video"
      }
    ]
  },
--- a/blueprints/Video
+++ b/blueprints/Video
@ -1,827 +0,0 @@
-{
-  "revision": 0,
-  "last_node_id": 130,
-  "last_link_id": 0,
-  "nodes": [
-    {
-      "id": 130,
-      "type": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
-      "pos": [
-        -1210,
-        -2780
-      ],
-      "size": [
-        300,
-        370
-      ],
-      "flags": {},
-      "order": 3,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "video",
-          "type": "VIDEO",
-          "link": null
-        },
-        {
-          "name": "text",
-          "type": "STRING",
-          "widget": {
-            "name": "text"
-          },
-          "link": null
-        },
-        {
-          "name": "bboxes",
-          "type": "BOUNDING_BOX",
-          "link": null
-        },
-        {
-          "name": "positive_coords",
-          "type": "STRING",
-          "link": null
-        },
-        {
-          "name": "negative_coords",
-          "type": "STRING",
-          "link": null
-        },
-        {
-          "name": "threshold",
-          "type": "FLOAT",
-          "widget": {
-            "name": "threshold"
-          },
-          "link": null
-        },
-        {
-          "name": "refine_iterations",
-          "type": "INT",
-          "widget": {
-            "name": "refine_iterations"
-          },
-          "link": null
-        },
-        {
-          "name": "individual_masks",
-          "type": "BOOLEAN",
-          "widget": {
-            "name": "individual_masks"
-          },
-          "link": null
-        },
-        {
-          "name": "ckpt_name",
-          "type": "COMBO",
-          "widget": {
-            "name": "ckpt_name"
-          },
-          "link": null
-        }
-      ],
-      "outputs": [
-        {
-          "localized_name": "masks",
-          "name": "masks",
-          "type": "MASK",
-          "links": []
-        },
-        {
-          "localized_name": "bboxes",
-          "name": "bboxes",
-          "type": "BOUNDING_BOX",
-          "links": []
-        },
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "links": null
-        },
-        {
-          "name": "fps",
-          "type": "FLOAT",
-          "links": null
-        }
-      ],
-      "properties": {
-        "proxyWidgets": [
-          [
-            "125",
-            "text"
-          ],
-          [
-            "126",
-            "threshold"
-          ],
-          [
-            "126",
-            "refine_iterations"
-          ],
-          [
-            "126",
-            "individual_masks"
-          ],
-          [
-            "127",
-            "ckpt_name"
-          ]
-        ],
-        "cnr_id": "comfy-core",
-        "ver": "0.19.3",
-        "enableTabs": false,
-        "tabWidth": 65,
-        "tabXOffset": 10,
-        "hasSecondTab": false,
-        "secondTabText": "Send Back",
-        "secondTabOffset": 80,
-        "secondTabWidth": 65
-      },
-      "widgets_values": [],
-      "title": "Video Segmentation (SAM3)"
-    }
-  ],
-  "links": [],
-  "version": 0.4,
-  "definitions": {
-    "subgraphs": [
-      {
-        "id": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
-        "version": 1,
-        "state": {
-          "lastGroupId": 0,
-          "lastNodeId": 130,
-          "lastLinkId": 299,
-          "lastRerouteId": 0
-        },
-        "revision": 0,
-        "config": {},
-        "name": "Video Segmentation (SAM3)",
-        "inputNode": {
-          "id": -10,
-          "bounding": [
-            -2260,
-            -3450,
-            136.369140625,
-            220
-          ]
-        },
-        "outputNode": {
-          "id": -20,
-          "bounding": [
-            -1050,
-            -3510,
-            120,
-            120
-          ]
-        },
-        "inputs": [
-          {
-            "id": "680ffd88-32fe-48be-88d6-91ea44d5eaee",
-            "name": "video",
-            "type": "VIDEO",
-            "linkIds": [
-              252
-            ],
-            "pos": [
-              -2143.630859375,
-              -3430
-            ]
-          },
-          {
-            "id": "ceaf249c-32d7-4624-8bf6-e590e347ed90",
-            "name": "text",
-            "type": "STRING",
-            "linkIds": [
-              254
-            ],
-            "pos": [
-              -2143.630859375,
-              -3410
-            ]
-          },
-          {
-            "id": "1ffbff36-da0c-4854-8cb4-88ad31e64f99",
-            "name": "bboxes",
-            "type": "BOUNDING_BOX",
-            "linkIds": [
-              255
-            ],
-            "pos": [
-              -2143.630859375,
-              -3390
-            ]
-          },
-          {
-            "id": "67b7f4c7-cec0-4e00-b154-23cc1abf880e",
-            "name": "positive_coords",
-            "type": "STRING",
-            "linkIds": [
-              256
-            ],
-            "pos": [
-              -2143.630859375,
-              -3370
-            ]
-          },
-          {
-            "id": "b090a498-2bde-46b9-9554-18501401d687",
-            "name": "negative_coords",
-            "type": "STRING",
-            "linkIds": [
-              257
-            ],
-            "pos": [
-              -2143.630859375,
-              -3350
-            ]
-          },
-          {
-            "id": "1a76dfcf-ce95-46af-bba5-c42160c683dd",
-            "name": "threshold",
-            "type": "FLOAT",
-            "linkIds": [
-              261
-            ],
-            "pos": [
-              -2143.630859375,
-              -3330
-            ]
-          },
-          {
-            "id": "999523fa-c476-4c53-80c3-0a2f554d18ab",
-            "name": "refine_iterations",
-            "type": "INT",
-            "linkIds": [
-              262
-            ],
-            "pos": [
-              -2143.630859375,
-              -3310
-            ]
-          },
-          {
-            "id": "d2371011-7fe5-4a39-b0c1-df2e0bbd6ece",
-            "name": "individual_masks",
-            "type": "BOOLEAN",
-            "linkIds": [
-              263
-            ],
-            "pos": [
-              -2143.630859375,
-              -3290
-            ]
-          },
-          {
-            "id": "675a8b37-17db-48d1-853c-2fe5d6a74582",
-            "name": "ckpt_name",
-            "type": "COMBO",
-            "linkIds": [
-              273
-            ],
-            "pos": [
-              -2143.630859375,
-              -3270
-            ]
-          }
-        ],
-        "outputs": [
-          {
-            "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
-            "name": "masks",
-            "type": "MASK",
-            "linkIds": [
-              231
-            ],
-            "localized_name": "masks",
-            "pos": [
-              -1030,
-              -3490
-            ]
-          },
-          {
-            "id": "8f622e40-8528-4078-b7d3-147e9f872194",
-            "name": "bboxes",
-            "type": "BOUNDING_BOX",
-            "linkIds": [
-              232
-            ],
-            "localized_name": "bboxes",
-            "pos": [
-              -1030,
-              -3470
-            ]
-          },
-          {
-            "id": "6c9924ec-f0fa-4509-83ea-8f97f5889bcc",
-            "name": "audio",
-            "type": "AUDIO",
-            "linkIds": [
-              259
-            ],
-            "pos": [
-              -1030,
-              -3450
-            ]
-          },
-          {
-            "id": "82c1cddc-ab11-44eb-9e2f-1a5c7ea5645b",
-            "name": "fps",
-            "type": "FLOAT",
-            "linkIds": [
-              260
-            ],
-            "pos": [
-              -1030,
-              -3430
-            ]
-          }
-        ],
-        "widgets": [],
-        "nodes": [
-          {
-            "id": 125,
-            "type": "CLIPTextEncode",
-            "pos": [
-              -2010,
-              -3040
-            ],
-            "size": [
-              400,
-              200
-            ],
-            "flags": {},
-            "order": 1,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "clip",
-                "name": "clip",
-                "type": "CLIP",
-                "link": 240
-              },
-              {
-                "localized_name": "text",
-                "name": "text",
-                "type": "STRING",
-                "widget": {
-                  "name": "text"
-                },
-                "link": 254
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "CONDITIONING",
-                "name": "CONDITIONING",
-                "type": "CONDITIONING",
-                "links": [
-                  200
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "CLIPTextEncode",
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65
-            },
-            "widgets_values": [
-              ""
-            ]
-          },
-          {
-            "id": 126,
-            "type": "SAM3_Detect",
-            "pos": [
-              -1520,
-              -3520
-            ],
-            "size": [
-              270,
-              290
-            ],
-            "flags": {},
-            "order": 2,
-            "mode": 0,
-            "inputs": [
-              {
-                "label": "model",
-                "localized_name": "model",
-                "name": "model",
-                "type": "MODEL",
-                "link": 237
-              },
-              {
-                "label": "image",
-                "localized_name": "image",
-                "name": "image",
-                "type": "IMAGE",
-                "link": 253
-              },
-              {
-                "label": "conditioning",
-                "localized_name": "conditioning",
-                "name": "conditioning",
-                "shape": 7,
-                "type": "CONDITIONING",
-                "link": 200
-              },
-              {
-                "label": "bboxes",
-                "localized_name": "bboxes",
-                "name": "bboxes",
-                "shape": 7,
-                "type": "BOUNDING_BOX",
-                "link": 255
-              },
-              {
-                "label": "positive_coords",
-                "localized_name": "positive_coords",
-                "name": "positive_coords",
-                "shape": 7,
-                "type": "STRING",
-                "link": 256
-              },
-              {
-                "label": "negative_coords",
-                "localized_name": "negative_coords",
-                "name": "negative_coords",
-                "shape": 7,
-                "type": "STRING",
-                "link": 257
-              },
-              {
-                "localized_name": "threshold",
-                "name": "threshold",
-                "type": "FLOAT",
-                "widget": {
-                  "name": "threshold"
-                },
-                "link": 261
-              },
-              {
-                "localized_name": "refine_iterations",
-                "name": "refine_iterations",
-                "type": "INT",
-                "widget": {
-                  "name": "refine_iterations"
-                },
-                "link": 262
-              },
-              {
-                "localized_name": "individual_masks",
-                "name": "individual_masks",
-                "type": "BOOLEAN",
-                "widget": {
-                  "name": "individual_masks"
-                },
-                "link": 263
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "masks",
-                "name": "masks",
-                "type": "MASK",
-                "links": [
-                  231
-                ]
-              },
-              {
-                "localized_name": "bboxes",
-                "name": "bboxes",
-                "type": "BOUNDING_BOX",
-                "links": [
-                  232
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "SAM3_Detect",
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65
-            },
-            "widgets_values": [
-              0.5,
-              2,
-              false
-            ]
-          },
-          {
-            "id": 127,
-            "type": "CheckpointLoaderSimple",
-            "pos": [
-              -1970,
-              -3310
-            ],
-            "size": [
-              330,
-              160
-            ],
-            "flags": {},
-            "order": 3,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "ckpt_name",
-                "name": "ckpt_name",
-                "type": "COMBO",
-                "widget": {
-                  "name": "ckpt_name"
-                },
-                "link": 273
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "MODEL",
-                "name": "MODEL",
-                "type": "MODEL",
-                "links": [
-                  237
-                ]
-              },
-              {
-                "localized_name": "CLIP",
-                "name": "CLIP",
-                "type": "CLIP",
-                "links": [
-                  240
-                ]
-              },
-              {
-                "localized_name": "VAE",
-                "name": "VAE",
-                "type": "VAE",
-                "links": null
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "CheckpointLoaderSimple",
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65,
-              "models": [
-                {
-                  "name": "sam3.1_multiplex_fp16.safetensors",
-                  "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
-                  "directory": "checkpoints"
-                }
-              ]
-            },
-            "widgets_values": [
-              "sam3.1_multiplex_fp16.safetensors"
-            ]
-          },
-          {
-            "id": 128,
-            "type": "GetVideoComponents",
-            "pos": [
-              -1910,
-              -3540
-            ],
-            "size": [
-              230,
-              120
-            ],
-            "flags": {},
-            "order": 4,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "video",
-                "name": "video",
-                "type": "VIDEO",
-                "link": 252
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "images",
-                "name": "images",
-                "type": "IMAGE",
-                "links": [
-                  253
-                ]
-              },
-              {
-                "localized_name": "audio",
-                "name": "audio",
-                "type": "AUDIO",
-                "links": [
-                  259
-                ]
-              },
-              {
-                "localized_name": "fps",
-                "name": "fps",
-                "type": "FLOAT",
-                "links": [
-                  260
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "GetVideoComponents",
-              "cnr_id": "comfy-core",
-              "ver": "0.19.3",
-              "enableTabs": false,
-              "tabWidth": 65,
-              "tabXOffset": 10,
-              "hasSecondTab": false,
-              "secondTabText": "Send Back",
-              "secondTabOffset": 80,
-              "secondTabWidth": 65
-            }
-          },
-          {
-            "id": 129,
-            "type": "Note",
-            "pos": [
-              -1980,
-              -2790
-            ],
-            "size": [
-              370,
-              250
-            ],
-            "flags": {},
-            "order": 0,
-            "mode": 0,
-            "inputs": [],
-            "outputs": [],
-            "title": "Note: Prompt format",
-            "properties": {},
-            "widgets_values": [
-              "Max tokens for this model is only 32, to separately prompt multiple subjects you can separate prompts with comma, and set the max amount of objects detected for each prompt with :N\n\nFor example above test prompt finds 2 cakes, one apron, 4 window panels"
-            ],
-            "color": "#432",
-            "bgcolor": "#653"
-          }
-        ],
-        "groups": [],
-        "links": [
-          {
-            "id": 237,
-            "origin_id": 127,
-            "origin_slot": 0,
-            "target_id": 126,
-            "target_slot": 0,
-            "type": "MODEL"
-          },
-          {
-            "id": 200,
-            "origin_id": 125,
-            "origin_slot": 0,
-            "target_id": 126,
-            "target_slot": 2,
-            "type": "CONDITIONING"
-          },
-          {
-            "id": 240,
-            "origin_id": 127,
-            "origin_slot": 1,
-            "target_id": 125,
-            "target_slot": 0,
-            "type": "CLIP"
-          },
-          {
-            "id": 231,
-            "origin_id": 126,
-            "origin_slot": 0,
-            "target_id": -20,
-            "target_slot": 0,
-            "type": "MASK"
-          },
-          {
-            "id": 232,
-            "origin_id": 126,
-            "origin_slot": 1,
-            "target_id": -20,
-            "target_slot": 1,
-            "type": "BOUNDING_BOX"
-          },
-          {
-            "id": 252,
-            "origin_id": -10,
-            "origin_slot": 0,
-            "target_id": 128,
-            "target_slot": 0,
-            "type": "VIDEO"
-          },
-          {
-            "id": 253,
-            "origin_id": 128,
-            "origin_slot": 0,
-            "target_id": 126,
-            "target_slot": 1,
-            "type": "IMAGE"
-          },
-          {
-            "id": 254,
-            "origin_id": -10,
-            "origin_slot": 1,
-            "target_id": 125,
-            "target_slot": 1,
-            "type": "STRING"
-          },
-          {
-            "id": 255,
-            "origin_id": -10,
-            "origin_slot": 2,
-            "target_id": 126,
-            "target_slot": 3,
-            "type": "BOUNDING_BOX"
-          },
-          {
-            "id": 256,
-            "origin_id": -10,
-            "origin_slot": 3,
-            "target_id": 126,
-            "target_slot": 4,
-            "type": "STRING"
-          },
-          {
-            "id": 257,
-            "origin_id": -10,
-            "origin_slot": 4,
-            "target_id": 126,
-            "target_slot": 5,
-            "type": "STRING"
-          },
-          {
-            "id": 259,
-            "origin_id": 128,
-            "origin_slot": 1,
-            "target_id": -20,
-            "target_slot": 2,
-            "type": "AUDIO"
-          },
-          {
-            "id": 260,
-            "origin_id": 128,
-            "origin_slot": 2,
-            "target_id": -20,
-            "target_slot": 3,
-            "type": "FLOAT"
-          },
-          {
-            "id": 261,
-            "origin_id": -10,
-            "origin_slot": 5,
-            "target_id": 126,
-            "target_slot": 6,
-            "type": "FLOAT"
-          },
-          {
-            "id": 262,
-            "origin_id": -10,
-            "origin_slot": 6,
-            "target_id": 126,
-            "target_slot": 7,
-            "type": "INT"
-          },
-          {
-            "id": 263,
-            "origin_id": -10,
-            "origin_slot": 7,
-            "target_id": 126,
-            "target_slot": 8,
-            "type": "BOOLEAN"
-          },
-          {
-            "id": 273,
-            "origin_id": -10,
-            "origin_slot": 8,
-            "target_id": 127,
-            "target_slot": 0,
-            "type": "COMBO"
-          }
-        ],
-        "extra": {},
-        "category": "Video Tools",
-        "description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
-      }
-    ]
-  },
-  "extra": {}
-}
--- a/blueprints/Video
+++ b/blueprints/Video
@ -1,21 +1,21 @@
 {
  "revision": 0,
-  "last_node_id": 85,
+  "last_node_id": 84,
  "last_link_id": 0,
  "nodes": [
    {
-      "id": 85,
-      "type": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
+      "id": 84,
+      "type": "8e8aa94a-647e-436d-8440-8ee4691864de",
      "pos": [
-        -880,
-        -2260
+        -6100,
+        2620
      ],
      "size": [
        290,
        160
      ],
      "flags": {},
-      "order": 2,
+      "order": 0,
      "mode": 0,
      "inputs": [
        {
@ -76,26 +76,31 @@
      "properties": {
        "proxyWidgets": [
          [
-            "79",
+            "-1",
            "direction"
          ],
          [
-            "79",
+            "-1",
            "match_image_size"
          ],
          [
-            "79",
+            "-1",
            "spacing_width"
          ],
          [
-            "79",
+            "-1",
            "spacing_color"
          ]
        ],
        "cnr_id": "comfy-core",
        "ver": "0.13.0"
      },
-      "widgets_values": [],
+      "widgets_values": [
+        "right",
+        true,
+        0,
+        "white"
+      ],
      "title": "Video Stitch"
    }
  ],
@ -104,12 +109,12 @@
  "definitions": {
    "subgraphs": [
      {
-        "id": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
+        "id": "8e8aa94a-647e-436d-8440-8ee4691864de",
        "version": 1,
        "state": {
          "lastGroupId": 1,
-          "lastNodeId": 97,
-          "lastLinkId": 282,
+          "lastNodeId": 84,
+          "lastLinkId": 262,
          "lastRerouteId": 0
        },
        "revision": 0,
@ -118,8 +123,8 @@
        "inputNode": {
          "id": -10,
          "bounding": [
-            -6810,
-            2580,
+            -6580,
+            2649,
            143.55859375,
            160
          ]
@ -127,8 +132,8 @@
        "outputNode": {
          "id": -20,
          "bounding": [
-            -4770,
-            2600,
+            -5720,
+            2659,
            120,
            60
          ]
@ -144,8 +149,8 @@
            "localized_name": "video",
            "label": "Before Video",
            "pos": [
-              -6686.44140625,
-              2600
+              -6456.44140625,
+              2669
            ]
          },
          {
@ -158,8 +163,8 @@
            "localized_name": "video_1",
            "label": "After Video",
            "pos": [
-              -6686.44140625,
-              2620
+              -6456.44140625,
+              2689
            ]
          },
          {
@ -170,8 +175,8 @@
              259
            ],
            "pos": [
-              -6686.44140625,
-              2640
+              -6456.44140625,
+              2709
            ]
          },
          {
@ -182,8 +187,8 @@
              260
            ],
            "pos": [
-              -6686.44140625,
-              2660
+              -6456.44140625,
+              2729
            ]
          },
          {
@ -194,8 +199,8 @@
              261
            ],
            "pos": [
-              -6686.44140625,
-              2680
+              -6456.44140625,
+              2749
            ]
          },
          {
@ -206,8 +211,8 @@
              262
            ],
            "pos": [
-              -6686.44140625,
-              2700
+              -6456.44140625,
+              2769
            ]
          }
        ],
@ -221,8 +226,8 @@
            ],
            "localized_name": "VIDEO",
            "pos": [
-              -4750,
-              2620
+              -5700,
+              2679
            ]
          }
        ],
@ -233,11 +238,11 @@
            "type": "GetVideoComponents",
            "pos": [
              -6390,
-              2600
+              2560
            ],
            "size": [
-              230,
-              120
+              193.530859375,
+              66
            ],
            "flags": {},
            "order": 1,
@ -273,9 +278,9 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "GetVideoComponents",
              "cnr_id": "comfy-core",
-              "ver": "0.13.0"
+              "ver": "0.13.0",
+              "Node name for S&R": "GetVideoComponents"
            }
          },
          {
@ -286,8 +291,8 @@
              2420
            ],
            "size": [
-              230,
-              120
+              193.530859375,
+              66
            ],
            "flags": {},
            "order": 0,
@ -327,254 +332,21 @@
              }
            ],
            "properties": {
-              "Node name for S&R": "GetVideoComponents",
              "cnr_id": "comfy-core",
-              "ver": "0.13.0"
+              "ver": "0.13.0",
+              "Node name for S&R": "GetVideoComponents"
            }
          },
-          {
-            "id": 90,
-            "type": "GetImageSize",
-            "pos": [
-              -6390,
-              3030
-            ],
-            "size": [
-              230,
-              120
-            ],
-            "flags": {},
-            "order": 4,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "image",
-                "name": "image",
-                "type": "IMAGE",
-                "link": 266
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "width",
-                "name": "width",
-                "type": "INT",
-                "links": [
-                  274
-                ]
-              },
-              {
-                "localized_name": "height",
-                "name": "height",
-                "type": "INT",
-                "links": [
-                  276
-                ]
-              },
-              {
-                "localized_name": "batch_size",
-                "name": "batch_size",
-                "type": "INT",
-                "links": null
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "GetImageSize"
-            }
-          },
-          {
-            "id": 80,
-            "type": "CreateVideo",
-            "pos": [
-              -5190,
-              2420
-            ],
-            "size": [
-              270,
-              130
-            ],
-            "flags": {},
-            "order": 3,
-            "mode": 0,
-            "inputs": [
-              {
-                "localized_name": "images",
-                "name": "images",
-                "type": "IMAGE",
-                "link": 282
-              },
-              {
-                "localized_name": "audio",
-                "name": "audio",
-                "shape": 7,
-                "type": "AUDIO",
-                "link": 251
-              },
-              {
-                "localized_name": "fps",
-                "name": "fps",
-                "type": "FLOAT",
-                "widget": {
-                  "name": "fps"
-                },
-                "link": 252
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "VIDEO",
-                "name": "VIDEO",
-                "type": "VIDEO",
-                "links": [
-                  255
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "CreateVideo",
-              "cnr_id": "comfy-core",
-              "ver": "0.13.0"
-            },
-            "widgets_values": [
-              30
-            ]
-          },
-          {
-            "id": 95,
-            "type": "ComfyMathExpression",
-            "pos": [
-              -6040,
-              3020
-            ],
-            "size": [
-              400,
-              200
-            ],
-            "flags": {},
-            "order": 5,
-            "mode": 0,
-            "inputs": [
-              {
-                "label": "a",
-                "localized_name": "values.a",
-                "name": "values.a",
-                "type": "FLOAT,INT",
-                "link": 274
-              },
-              {
-                "label": "b",
-                "localized_name": "values.b",
-                "name": "values.b",
-                "shape": 7,
-                "type": "FLOAT,INT",
-                "link": null
-              },
-              {
-                "localized_name": "expression",
-                "name": "expression",
-                "type": "STRING",
-                "widget": {
-                  "name": "expression"
-                },
-                "link": null
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "FLOAT",
-                "name": "FLOAT",
-                "type": "FLOAT",
-                "links": null
-              },
-              {
-                "localized_name": "INT",
-                "name": "INT",
-                "type": "INT",
-                "links": [
-                  279
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "ComfyMathExpression"
-            },
-            "widgets_values": [
-              "a & ~1"
-            ]
-          },
-          {
-            "id": 96,
-            "type": "ComfyMathExpression",
-            "pos": [
-              -6040,
-              3290
-            ],
-            "size": [
-              400,
-              200
-            ],
-            "flags": {},
-            "order": 6,
-            "mode": 0,
-            "inputs": [
-              {
-                "label": "a",
-                "localized_name": "values.a",
-                "name": "values.a",
-                "type": "FLOAT,INT",
-                "link": 276
-              },
-              {
-                "label": "b",
-                "localized_name": "values.b",
-                "name": "values.b",
-                "shape": 7,
-                "type": "FLOAT,INT",
-                "link": null
-              },
-              {
-                "localized_name": "expression",
-                "name": "expression",
-                "type": "STRING",
-                "widget": {
-                  "name": "expression"
-                },
-                "link": null
-              }
-            ],
-            "outputs": [
-              {
-                "localized_name": "FLOAT",
-                "name": "FLOAT",
-                "type": "FLOAT",
-                "links": null
-              },
-              {
-                "localized_name": "INT",
-                "name": "INT",
-                "type": "INT",
-                "links": [
-                  280
-                ]
-              }
-            ],
-            "properties": {
-              "Node name for S&R": "ComfyMathExpression"
-            },
-            "widgets_values": [
-              "a & ~1"
-            ]
-          },
          {
            "id": 79,
            "type": "ImageStitch",
            "pos": [
              -6390,
-              2780
+              2700
            ],
            "size": [
              270,
-              160
+              150
            ],
            "flags": {},
            "order": 2,
@ -636,15 +408,14 @@
                "name": "IMAGE",
                "type": "IMAGE",
                "links": [
-                  266,
-                  281
+                  250
                ]
              }
            ],
            "properties": {
-              "Node name for S&R": "ImageStitch",
              "cnr_id": "comfy-core",
-              "ver": "0.13.0"
+              "ver": "0.13.0",
+              "Node name for S&R": "ImageStitch"
            },
            "widgets_values": [
              "right",
@ -654,91 +425,60 @@
            ]
          },
          {
-            "id": 97,
-            "type": "ResizeImageMaskNode",
+            "id": 80,
+            "type": "CreateVideo",
            "pos": [
-              -5560,
-              2790
+              -6040,
+              2610
            ],
            "size": [
              270,
-              160
+              78
            ],
            "flags": {},
-            "order": 7,
+            "order": 3,
            "mode": 0,
            "inputs": [
              {
-                "localized_name": "input",
-                "name": "input",
-                "type": "IMAGE,MASK",
-                "link": 281
+                "localized_name": "images",
+                "name": "images",
+                "type": "IMAGE",
+                "link": 250
              },
              {
-                "localized_name": "resize_type",
-                "name": "resize_type",
-                "type": "COMFY_DYNAMICCOMBO_V3",
-                "widget": {
-                  "name": "resize_type"
-                },
-                "link": null
+                "localized_name": "audio",
+                "name": "audio",
+                "shape": 7,
+                "type": "AUDIO",
+                "link": 251
              },
              {
-                "localized_name": "width",
-                "name": "resize_type.width",
-                "type": "INT",
+                "localized_name": "fps",
+                "name": "fps",
+                "type": "FLOAT",
                "widget": {
-                  "name": "resize_type.width"
+                  "name": "fps"
                },
-                "link": 279
-              },
-              {
-                "localized_name": "height",
-                "name": "resize_type.height",
-                "type": "INT",
-                "widget": {
-                  "name": "resize_type.height"
-                },
-                "link": 280
-              },
-              {
-                "localized_name": "crop",
-                "name": "resize_type.crop",
-                "type": "COMBO",
-                "widget": {
-                  "name": "resize_type.crop"
-                },
-                "link": null
-              },
-              {
-                "localized_name": "scale_method",
-                "name": "scale_method",
-                "type": "COMBO",
-                "widget": {
-                  "name": "scale_method"
-                },
-                "link": null
+                "link": 252
              }
            ],
            "outputs": [
              {
-                "localized_name": "resized",
-                "name": "resized",
-                "type": "*",
+                "localized_name": "VIDEO",
+                "name": "VIDEO",
+                "type": "VIDEO",
                "links": [
-                  282
+                  255
                ]
              }
            ],
            "properties": {
-              "Node name for S&R": "ResizeImageMaskNode"
+              "cnr_id": "comfy-core",
+              "ver": "0.13.0",
+              "Node name for S&R": "CreateVideo"
            },
            "widgets_values": [
-              "scale dimensions",
-              512,
-              512,
-              "center",
-              "area"
+              30
            ]
          }
        ],
@ -760,6 +500,14 @@
            "target_slot": 1,
            "type": "IMAGE"
          },
+          {
+            "id": 250,
+            "origin_id": 79,
+            "origin_slot": 0,
+            "target_id": 80,
+            "target_slot": 0,
+            "type": "IMAGE"
+          },
          {
            "id": 251,
            "origin_id": 77,
@ -831,71 +579,13 @@
            "target_id": 79,
            "target_slot": 5,
            "type": "COMBO"
-          },
-          {
-            "id": 266,
-            "origin_id": 79,
-            "origin_slot": 0,
-            "target_id": 90,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 274,
-            "origin_id": 90,
-            "origin_slot": 0,
-            "target_id": 95,
-            "target_slot": 0,
-            "type": "INT"
-          },
-          {
-            "id": 276,
-            "origin_id": 90,
-            "origin_slot": 1,
-            "target_id": 96,
-            "target_slot": 0,
-            "type": "INT"
-          },
-          {
-            "id": 279,
-            "origin_id": 95,
-            "origin_slot": 1,
-            "target_id": 97,
-            "target_slot": 2,
-            "type": "INT"
-          },
-          {
-            "id": 280,
-            "origin_id": 96,
-            "origin_slot": 1,
-            "target_id": 97,
-            "target_slot": 3,
-            "type": "INT"
-          },
-          {
-            "id": 281,
-            "origin_id": 79,
-            "origin_slot": 0,
-            "target_id": 97,
-            "target_slot": 0,
-            "type": "IMAGE"
-          },
-          {
-            "id": 282,
-            "origin_id": 97,
-            "origin_slot": 0,
-            "target_id": 80,
-            "target_slot": 0,
-            "type": "IMAGE"
          }
        ],
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video Tools/Stitch videos",
-        "description": "Stitches multiple video clips into a single sequential video file."
+        "category": "Video Tools/Stitch videos"
      }
    ]
-  },
-  "extra": {}
-}
+  }
+}
--- a/blueprints/Video
+++ b/blueprints/Video
@ -412,10 +412,9 @@
        "extra": {
          "workflowRendererVersion": "LG"
        },
-        "category": "Video generation and editing/Enhance video",
-        "description": "Upscales video to 4× resolution using a GAN-based upscaling model."
+        "category": "Video generation and editing/Enhance video"
      }
    ]
  },
  "extra": {}
-}
+}
--- a/comfy/background_removal/birefnet.json
+++ b/comfy/background_removal/birefnet.json
@ -1,7 +0,0 @@
-{
-    "model_type": "birefnet",
-    "image_std": [1.0, 1.0, 1.0],
-    "image_mean": [0.0, 0.0, 0.0],
-    "image_size": 1024,
-    "resize_to_original": true
-}
--- a/comfy/background_removal/birefnet.py
+++ b/comfy/background_removal/birefnet.py
@ -1,689 +0,0 @@
-import torch
-import comfy.ops
-import numpy as np
-import torch.nn as nn
-from functools import partial
-import torch.nn.functional as F
-from torchvision.ops import deform_conv2d
-from comfy.ldm.modules.attention import optimized_attention_for_device
-
-CXT = [3072, 1536, 768, 384][1:][::-1][-3:]
-
-class Attention(nn.Module):
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, device=None, dtype=None, operations=None):
-        super().__init__()
-
-        self.dim = dim
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        self.q = operations.Linear(dim, dim, bias=qkv_bias, device=device, dtype=dtype)
-        self.kv = operations.Linear(dim, dim * 2, bias=qkv_bias, device=device, dtype=dtype)
-        self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)
-
-    def forward(self, x):
-        B, N, C = x.shape
-        optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
-        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
-        kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        k, v = kv[0], kv[1]
-
-        x = optimized_attention(
-            q, k, v, heads=self.num_heads, skip_output_reshape=True, skip_reshape=True
-        ).transpose(1, 2).reshape(B, N, C)
-        x = self.proj(x)
-
-        return x
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None, out_features=None, device=None, dtype=None, operations=None):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = operations.Linear(in_features, hidden_features, device=device, dtype=dtype)
-        self.act = nn.GELU()
-        self.fc2 = operations.Linear(hidden_features, out_features, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.fc2(x)
-        return x
-
-
-def window_partition(x, window_size):
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-
-class WindowAttention(nn.Module):
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, device=None, dtype=None, operations=None):
-
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads, device=device, dtype=dtype))
-
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij'))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, device=device, dtype=dtype)
-        self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        B_, N, C = x.shape
-        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]
-
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-
-        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.long().view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        return x
-
-
-class SwinTransformerBlock(nn.Module):
-    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 norm_layer=nn.LayerNorm, device=None, dtype=None, operations=None):
-        super().__init__()
-        self.dim = dim
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-
-        self.norm1 = norm_layer(dim, device=device, dtype=dtype)
-        self.attn = WindowAttention(
-            dim, window_size=(self.window_size, self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, device=device, dtype=dtype, operations=operations)
-
-        self.norm2 = norm_layer(dim, device=device, dtype=dtype)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, device=device, dtype=dtype, operations=operations)
-
-        self.H = None
-        self.W = None
-
-    def forward(self, x, mask_matrix):
-        B, L, C = x.shape
-        H, W = self.H, self.W
-
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(B, H, W, C)
-
-        pad_l = pad_t = 0
-        pad_r = (self.window_size - W % self.window_size) % self.window_size
-        pad_b = (self.window_size - H % self.window_size) % self.window_size
-        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
-        _, Hp, Wp, _ = x.shape
-
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-            attn_mask = mask_matrix
-        else:
-            shifted_x = x
-            attn_mask = None
-
-        x_windows = window_partition(shifted_x, self.window_size)
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)
-
-        attn_windows = self.attn(x_windows, mask=attn_mask)
-
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
-
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-
-        if pad_r > 0 or pad_b > 0:
-            x = x[:, :H, :W, :].contiguous()
-
-        x = x.view(B, H * W, C)
-
-        x = shortcut + x
-        x = x + self.mlp(self.norm2(x))
-
-        return x
-
-
-class PatchMerging(nn.Module):
-    def __init__(self, dim, device=None, dtype=None, operations=None):
-        super().__init__()
-        self.dim = dim
-        self.reduction = operations.Linear(4 * dim, 2 * dim, bias=False, device=device, dtype=dtype)
-        self.norm = operations.LayerNorm(4 * dim, device=device, dtype=dtype)
-
-    def forward(self, x, H, W):
-        B, L, C = x.shape
-        x = x.view(B, H, W, C)
-
-        # padding
-        pad_input = (H % 2 == 1) or (W % 2 == 1)
-        if pad_input:
-            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.norm(x)
-        x = self.reduction(x)
-
-        return x
-
-
-class BasicLayer(nn.Module):
-    def __init__(self,
-                 dim,
-                 depth,
-                 num_heads,
-                 window_size=7,
-                 mlp_ratio=4.,
-                 qkv_bias=True,
-                 qk_scale=None,
-                 norm_layer=nn.LayerNorm,
-                 downsample=None,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-        self.window_size = window_size
-        self.shift_size = window_size // 2
-        self.depth = depth
-
-        # build blocks
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(
-                dim=dim,
-                num_heads=num_heads,
-                window_size=window_size,
-                shift_size=0 if (i % 2 == 0) else window_size // 2,
-                mlp_ratio=mlp_ratio,
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                norm_layer=norm_layer,
-                device=device, dtype=dtype, operations=operations)
-            for i in range(depth)])
-
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(dim=dim, device=device, dtype=dtype, operations=operations)
-        else:
-            self.downsample = None
-
-    def forward(self, x, H, W):
-        Hp = int(np.ceil(H / self.window_size)) * self.window_size
-        Wp = int(np.ceil(W / self.window_size)) * self.window_size
-        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        for blk in self.blocks:
-            blk.H, blk.W = H, W
-            x = blk(x, attn_mask)
-        if self.downsample is not None:
-            x_down = self.downsample(x, H, W)
-            Wh, Ww = (H + 1) // 2, (W + 1) // 2
-            return x, H, W, x_down, Wh, Ww
-        else:
-            return x, H, W, x, H, W
-
-
-class PatchEmbed(nn.Module):
-    def __init__(self, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None, device=None, dtype=None, operations=None):
-        super().__init__()
-        patch_size = (patch_size, patch_size)
-        self.patch_size = patch_size
-
-        self.in_channels = in_channels
-        self.embed_dim = embed_dim
-
-        self.proj = operations.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype)
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim, device=device, dtype=dtype)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        _, _, H, W = x.size()
-        if W % self.patch_size[1] != 0:
-            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
-        if H % self.patch_size[0] != 0:
-            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
-
-        x = self.proj(x)  # B C Wh Ww
-        if self.norm is not None:
-            Wh, Ww = x.size(2), x.size(3)
-            x = x.flatten(2).transpose(1, 2)
-            x = self.norm(x)
-            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
-
-        return x
-
-
-class SwinTransformer(nn.Module):
-    def __init__(self,
-                 pretrain_img_size=224,
-                 patch_size=4,
-                 in_channels=3,
-                 embed_dim=96,
-                 depths=[2, 2, 6, 2],
-                 num_heads=[3, 6, 12, 24],
-                 window_size=7,
-                 mlp_ratio=4.,
-                 qkv_bias=True,
-                 qk_scale=None,
-                 patch_norm=True,
-                 out_indices=(0, 1, 2, 3),
-                 frozen_stages=-1,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-
-        norm_layer = partial(operations.LayerNorm, device=device, dtype=dtype)
-        self.pretrain_img_size = pretrain_img_size
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.patch_norm = patch_norm
-        self.out_indices = out_indices
-        self.frozen_stages = frozen_stages
-
-        self.patch_embed = PatchEmbed(
-            patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim,
-            device=device, dtype=dtype, operations=operations,
-            norm_layer=norm_layer if self.patch_norm else None)
-
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = BasicLayer(
-                dim=int(embed_dim * 2 ** i_layer),
-                depth=depths[i_layer],
-                num_heads=num_heads[i_layer],
-                window_size=window_size,
-                mlp_ratio=mlp_ratio,
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                norm_layer=norm_layer,
-                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
-                device=device, dtype=dtype, operations=operations)
-            self.layers.append(layer)
-
-        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
-        self.num_features = num_features
-
-        for i_layer in out_indices:
-            layer = norm_layer(num_features[i_layer])
-            layer_name = f'norm{i_layer}'
-            self.add_module(layer_name, layer)
-
-
-    def forward(self, x):
-        x = self.patch_embed(x)
-
-        Wh, Ww = x.size(2), x.size(3)
-
-        outs = []
-        x = x.flatten(2).transpose(1, 2)
-        for i in range(self.num_layers):
-            layer = self.layers[i]
-            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
-
-            if i in self.out_indices:
-                norm_layer = getattr(self, f'norm{i}')
-                x_out = norm_layer(x_out)
-
-                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
-                outs.append(out)
-
-        return tuple(outs)
-
-class DeformableConv2d(nn.Module):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 stride=1,
-                 padding=1,
-                 bias=False, device=None, dtype=None, operations=None):
-
-        super(DeformableConv2d, self).__init__()
-
-        kernel_size = kernel_size if type(kernel_size) is tuple else (kernel_size, kernel_size)
-        self.stride = stride if type(stride) is tuple else (stride, stride)
-        self.padding = padding
-
-        self.offset_conv = operations.Conv2d(in_channels,
-                                     2 * kernel_size[0] * kernel_size[1],
-                                     kernel_size=kernel_size,
-                                     stride=stride,
-                                     padding=self.padding,
-                                     bias=True, device=device, dtype=dtype)
-
-        self.modulator_conv = operations.Conv2d(in_channels,
-                                     1 * kernel_size[0] * kernel_size[1],
-                                     kernel_size=kernel_size,
-                                     stride=stride,
-                                     padding=self.padding,
-                                     bias=True, device=device, dtype=dtype)
-
-        self.regular_conv = operations.Conv2d(in_channels,
-                                      out_channels=out_channels,
-                                      kernel_size=kernel_size,
-                                      stride=stride,
-                                      padding=self.padding,
-                                      bias=bias, device=device, dtype=dtype)
-
-    def forward(self, x):
-        offset = self.offset_conv(x)
-        modulator = 2. * torch.sigmoid(self.modulator_conv(x))
-        weight, bias, offload_info = comfy.ops.cast_bias_weight(self.regular_conv, x, offloadable=True)
-
-        x = deform_conv2d(
-            input=x,
-            offset=offset,
-            weight=weight,
-            bias=None,
-            padding=self.padding,
-            mask=modulator,
-            stride=self.stride,
-        )
-        comfy.ops.uncast_bias_weight(self.regular_conv, weight, bias, offload_info)
-        return x
-
-class BasicDecBlk(nn.Module):
-    def __init__(self, in_channels=64, out_channels=64, inter_channels=64, device=None, dtype=None, operations=None):
-        super(BasicDecBlk, self).__init__()
-        inter_channels = 64
-        self.conv_in = operations.Conv2d(in_channels, inter_channels, 3, 1, padding=1, device=device, dtype=dtype)
-        self.relu_in = nn.ReLU(inplace=True)
-        self.dec_att = ASPPDeformable(in_channels=inter_channels, device=device, dtype=dtype, operations=operations)
-        self.conv_out = operations.Conv2d(inter_channels, out_channels, 3, 1, padding=1, device=device, dtype=dtype)
-        self.bn_in = operations.BatchNorm2d(inter_channels, device=device, dtype=dtype)
-        self.bn_out = operations.BatchNorm2d(out_channels, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.conv_in(x)
-        x = self.bn_in(x)
-        x = self.relu_in(x)
-        x = self.dec_att(x)
-        x = self.conv_out(x)
-        x = self.bn_out(x)
-        return x
-
-
-class BasicLatBlk(nn.Module):
-    def __init__(self, in_channels=64, out_channels=64, device=None, dtype=None, operations=None):
-        super(BasicLatBlk, self).__init__()
-        self.conv = operations.Conv2d(in_channels, out_channels, 1, 1, 0, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.conv(x)
-        return x
-
-
-class _ASPPModuleDeformable(nn.Module):
-    def __init__(self, in_channels, planes, kernel_size, padding, device, dtype, operations):
-        super(_ASPPModuleDeformable, self).__init__()
-        self.atrous_conv = DeformableConv2d(in_channels, planes, kernel_size=kernel_size,
-                                            stride=1, padding=padding, bias=False, device=device, dtype=dtype, operations=operations)
-        self.bn = operations.BatchNorm2d(planes, device=device, dtype=dtype)
-        self.relu = nn.ReLU(inplace=True)
-
-    def forward(self, x):
-        x = self.atrous_conv(x)
-        x = self.bn(x)
-
-        return self.relu(x)
-
-
-class ASPPDeformable(nn.Module):
-    def __init__(self, in_channels, out_channels=None, parallel_block_sizes=[1, 3, 7], device=None, dtype=None, operations=None):
-        super(ASPPDeformable, self).__init__()
-        self.down_scale = 1
-        if out_channels is None:
-            out_channels = in_channels
-        self.in_channelster = 256 // self.down_scale
-
-        self.aspp1 = _ASPPModuleDeformable(in_channels, self.in_channelster, 1, padding=0, device=device, dtype=dtype, operations=operations)
-        self.aspp_deforms = nn.ModuleList([
-            _ASPPModuleDeformable(in_channels, self.in_channelster, conv_size, padding=int(conv_size//2), device=device, dtype=dtype, operations=operations)
-              for conv_size in parallel_block_sizes
-        ])
-
-        self.global_avg_pool = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
-                                             operations.Conv2d(in_channels, self.in_channelster, 1, stride=1, bias=False, device=device, dtype=dtype),
-                                             operations.BatchNorm2d(self.in_channelster, device=device, dtype=dtype),
-                                             nn.ReLU(inplace=True))
-        self.conv1 = operations.Conv2d(self.in_channelster * (2 + len(self.aspp_deforms)), out_channels, 1, bias=False, device=device, dtype=dtype)
-        self.bn1 = operations.BatchNorm2d(out_channels, device=device, dtype=dtype)
-        self.relu = nn.ReLU(inplace=True)
-
-    def forward(self, x):
-        x1 = self.aspp1(x)
-        x_aspp_deforms = [aspp_deform(x) for aspp_deform in self.aspp_deforms]
-        x5 = self.global_avg_pool(x)
-        x5 = F.interpolate(x5, size=x1.size()[2:], mode='bilinear', align_corners=True)
-        x = torch.cat((x1, *x_aspp_deforms, x5), dim=1)
-
-        x = self.conv1(x)
-        x = self.bn1(x)
-        x = self.relu(x)
-
-        return x
-
-class BiRefNet(nn.Module):
-    def __init__(self, config=None, dtype=None, device=None, operations=None):
-        super(BiRefNet, self).__init__()
-        self.bb = SwinTransformer(embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12, device=device, dtype=dtype, operations=operations)
-
-        channels = [1536, 768, 384, 192]
-        channels = [c * 2 for c in channels]
-        self.cxt = channels[1:][::-1][-3:]
-        self.squeeze_module = nn.Sequential(*[
-            BasicDecBlk(channels[0]+sum(self.cxt), channels[0], device=device, dtype=dtype, operations=operations)
-            for _ in range(1)
-        ])
-
-        self.decoder = Decoder(channels, device=device, dtype=dtype, operations=operations)
-
-    def forward_enc(self, x):
-        x1, x2, x3, x4 = self.bb(x)
-        B, C, H, W = x.shape
-        x1_, x2_, x3_, x4_ = self.bb(F.interpolate(x, size=(H//2, W//2), mode='bilinear', align_corners=True))
-        x1 = torch.cat([x1, F.interpolate(x1_, size=x1.shape[2:], mode='bilinear', align_corners=True)], dim=1)
-        x2 = torch.cat([x2, F.interpolate(x2_, size=x2.shape[2:], mode='bilinear', align_corners=True)], dim=1)
-        x3 = torch.cat([x3, F.interpolate(x3_, size=x3.shape[2:], mode='bilinear', align_corners=True)], dim=1)
-        x4 = torch.cat([x4, F.interpolate(x4_, size=x4.shape[2:], mode='bilinear', align_corners=True)], dim=1)
-        x4 = torch.cat(
-            (
-                *[
-                    F.interpolate(x1, size=x4.shape[2:], mode='bilinear', align_corners=True),
-                    F.interpolate(x2, size=x4.shape[2:], mode='bilinear', align_corners=True),
-                    F.interpolate(x3, size=x4.shape[2:], mode='bilinear', align_corners=True),
-                ][-len(CXT):],
-                x4
-            ),
-            dim=1
-        )
-        return (x1, x2, x3, x4)
-
-    def forward_ori(self, x):
-        (x1, x2, x3, x4) = self.forward_enc(x)
-        x4 = self.squeeze_module(x4)
-        features = [x, x1, x2, x3, x4]
-        scaled_preds = self.decoder(features)
-        return scaled_preds
-
-    def forward(self, pixel_values, intermediate_output=None):
-        scaled_preds = self.forward_ori(pixel_values)
-        return scaled_preds
-
-
-class Decoder(nn.Module):
-    def __init__(self, channels, device, dtype, operations):
-        super(Decoder, self).__init__()
-        # factory kwargs
-        fk = {"device":device, "dtype":dtype, "operations":operations}
-        DecoderBlock = partial(BasicDecBlk, **fk)
-        LateralBlock = partial(BasicLatBlk, **fk)
-        DBlock = partial(SimpleConvs, **fk)
-
-        self.split = True
-        N_dec_ipt = 64
-        ic = 64
-        ipt_cha_opt = 1
-        self.ipt_blk5 = DBlock(2**10*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
-        self.ipt_blk4 = DBlock(2**8*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
-        self.ipt_blk3 = DBlock(2**6*3 if self.split else 3, [N_dec_ipt, channels[1]//8][ipt_cha_opt], inter_channels=ic)
-        self.ipt_blk2 = DBlock(2**4*3 if self.split else 3, [N_dec_ipt, channels[2]//8][ipt_cha_opt], inter_channels=ic)
-        self.ipt_blk1 = DBlock(2**0*3 if self.split else 3, [N_dec_ipt, channels[3]//8][ipt_cha_opt], inter_channels=ic)
-
-        self.decoder_block4 = DecoderBlock(channels[0]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[1])
-        self.decoder_block3 = DecoderBlock(channels[1]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[2])
-        self.decoder_block2 = DecoderBlock(channels[2]+([N_dec_ipt, channels[1]//8][ipt_cha_opt]), channels[3])
-        self.decoder_block1 = DecoderBlock(channels[3]+([N_dec_ipt, channels[2]//8][ipt_cha_opt]), channels[3]//2)
-
-        fk = {"device":device, "dtype":dtype}
-
-        self.conv_out1 = nn.Sequential(operations.Conv2d(channels[3]//2+([N_dec_ipt, channels[3]//8][ipt_cha_opt]), 1, 1, 1, 0, **fk))
-
-        self.lateral_block4 = LateralBlock(channels[1], channels[1])
-        self.lateral_block3 = LateralBlock(channels[2], channels[2])
-        self.lateral_block2 = LateralBlock(channels[3], channels[3])
-
-        self.conv_ms_spvn_4 = operations.Conv2d(channels[1], 1, 1, 1, 0, **fk)
-        self.conv_ms_spvn_3 = operations.Conv2d(channels[2], 1, 1, 1, 0, **fk)
-        self.conv_ms_spvn_2 = operations.Conv2d(channels[3], 1, 1, 1, 0, **fk)
-
-        _N = 16
-
-        self.gdt_convs_4 = nn.Sequential(operations.Conv2d(channels[0] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
-        self.gdt_convs_3 = nn.Sequential(operations.Conv2d(channels[1] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
-        self.gdt_convs_2 = nn.Sequential(operations.Conv2d(channels[2] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
-
-        [setattr(self, f"gdt_convs_pred_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk))) for i in range(2, 5)]
-        [setattr(self, f"gdt_convs_attn_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk))) for i in range(2, 5)]
-
-    def get_patches_batch(self, x, p):
-        _size_h, _size_w = p.shape[2:]
-        patches_batch = []
-        for idx in range(x.shape[0]):
-            columns_x = torch.split(x[idx], split_size_or_sections=_size_w, dim=-1)
-            patches_x = []
-            for column_x in columns_x:
-                patches_x += [p.unsqueeze(0) for p in torch.split(column_x, split_size_or_sections=_size_h, dim=-2)]
-            patch_sample = torch.cat(patches_x, dim=1)
-            patches_batch.append(patch_sample)
-        return torch.cat(patches_batch, dim=0)
-
-    def forward(self, features):
-        x, x1, x2, x3, x4 = features
-
-        patches_batch = self.get_patches_batch(x, x4) if self.split else x
-        x4 = torch.cat((x4, self.ipt_blk5(F.interpolate(patches_batch, size=x4.shape[2:], mode='bilinear', align_corners=True))), 1)
-        p4 = self.decoder_block4(x4)
-        p4_gdt = self.gdt_convs_4(p4)
-        gdt_attn_4 = self.gdt_convs_attn_4(p4_gdt).sigmoid()
-        p4 = p4 * gdt_attn_4
-        _p4 = F.interpolate(p4, size=x3.shape[2:], mode='bilinear', align_corners=True)
-        _p3 = _p4 + self.lateral_block4(x3)
-
-        patches_batch = self.get_patches_batch(x, _p3) if self.split else x
-        _p3 = torch.cat((_p3, self.ipt_blk4(F.interpolate(patches_batch, size=x3.shape[2:], mode='bilinear', align_corners=True))), 1)
-        p3 = self.decoder_block3(_p3)
-
-        p3_gdt = self.gdt_convs_3(p3)
-        gdt_attn_3 = self.gdt_convs_attn_3(p3_gdt).sigmoid()
-        p3 = p3 * gdt_attn_3
-        _p3 = F.interpolate(p3, size=x2.shape[2:], mode='bilinear', align_corners=True)
-        _p2 = _p3 + self.lateral_block3(x2)
-
-        patches_batch = self.get_patches_batch(x, _p2) if self.split else x
-        _p2 = torch.cat((_p2, self.ipt_blk3(F.interpolate(patches_batch, size=x2.shape[2:], mode='bilinear', align_corners=True))), 1)
-        p2 = self.decoder_block2(_p2)
-
-        p2_gdt = self.gdt_convs_2(p2)
-        gdt_attn_2 = self.gdt_convs_attn_2(p2_gdt).sigmoid()
-        p2 = p2 * gdt_attn_2
-
-        _p2 = F.interpolate(p2, size=x1.shape[2:], mode='bilinear', align_corners=True)
-        _p1 = _p2 + self.lateral_block2(x1)
-
-        patches_batch = self.get_patches_batch(x, _p1) if self.split else x
-        _p1 = torch.cat((_p1, self.ipt_blk2(F.interpolate(patches_batch, size=x1.shape[2:], mode='bilinear', align_corners=True))), 1)
-        _p1 = self.decoder_block1(_p1)
-        _p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)
-
-        patches_batch = self.get_patches_batch(x, _p1) if self.split else x
-        _p1 = torch.cat((_p1, self.ipt_blk1(F.interpolate(patches_batch, size=x.shape[2:], mode='bilinear', align_corners=True))), 1)
-        p1_out = self.conv_out1(_p1)
-        return p1_out
-
-
-class SimpleConvs(nn.Module):
-    def __init__(
-        self, in_channels: int, out_channels: int, inter_channels=64, device=None, dtype=None, operations=None
-    ) -> None:
-        super().__init__()
-        self.conv1 = operations.Conv2d(in_channels, inter_channels, 3, 1, 1, device=device, dtype=dtype)
-        self.conv_out = operations.Conv2d(inter_channels, out_channels, 3, 1, 1, device=device, dtype=dtype)
-
-    def forward(self, x):
-        return self.conv_out(self.conv1(x))
--- a/comfy/bg_removal_model.py
+++ b/comfy/bg_removal_model.py
@ -1,85 +0,0 @@
-from .utils import load_torch_file
-import os
-import json
-import torch
-import logging
-
-import comfy.ops
-import comfy.model_patcher
-import comfy.model_management
-import comfy.clip_model
-import comfy.background_removal.birefnet
-
-BG_REMOVAL_MODELS = {
-    "birefnet": comfy.background_removal.birefnet.BiRefNet
-}
-
-class BackgroundRemovalModel():
-    def __init__(self, json_config):
-        with open(json_config) as f:
-            config = json.load(f)
-
-        self.image_size = config.get("image_size", 1024)
-        self.image_mean = config.get("image_mean", [0.0, 0.0, 0.0])
-        self.image_std = config.get("image_std", [1.0, 1.0, 1.0])
-        self.model_type = config.get("model_type", "birefnet")
-        self.config = config.copy()
-        model_class = BG_REMOVAL_MODELS.get(self.model_type)
-
-        self.load_device = comfy.model_management.text_encoder_device()
-        offload_device = comfy.model_management.text_encoder_offload_device()
-        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
-        self.model.eval()
-
-        self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
-
-    def load_sd(self, sd):
-        return self.model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())
-
-    def get_sd(self):
-        return self.model.state_dict()
-
-    def encode_image(self, image):
-        comfy.model_management.load_model_gpu(self.patcher)
-        H, W = image.shape[1], image.shape[2]
-        pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=False)
-
-        if pixel_values.shape[0] > 1:
-            out = torch.cat([
-                self.model(pixel_values=pixel_values[i:i+1])
-                for i in range(pixel_values.shape[0])
-            ], dim=0)
-        else:
-            out = self.model(pixel_values=pixel_values)
-        out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False)
-
-        mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
-        if mask.ndim == 3:
-            mask = mask.unsqueeze(0)
-        if mask.shape[1] != 1:
-            mask = mask.movedim(-1, 1)
-
-        return mask
-
-
-def load_background_removal_model(sd):
-    if "bb.layers.1.blocks.0.attn.relative_position_index" in sd:
-        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "background_removal"), "birefnet.json")
-    else:
-        return None
-
-    bg_model = BackgroundRemovalModel(json_config)
-    m, u = bg_model.load_sd(sd)
-    if len(m) > 0:
-        logging.warning("missing background removal: {}".format(m))
-    u = set(u)
-    keys = list(sd.keys())
-    for k in keys:
-        if k not in u:
-            sd.pop(k)
-    return bg_model
-
-def load(ckpt_path):
-    sd = load_torch_file(ckpt_path)
-    return load_background_removal_model(sd)
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -90,8 +90,8 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch
 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")

 parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
+parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
 parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")
-parser.add_argument("--enable-triton-backend", action="store_true", help="ComfyUI will enable the use of Triton backend in comfy-kitchen. Is disabled at launch by default.")

 class LatentPreviewMethod(enum.Enum):
    NoPreviews = "none"
@ -110,11 +110,13 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent

 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")

+CACHE_RAM_AUTO_GB = -1.0
+
 cache_group = parser.add_mutually_exclusive_group()
-cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
 cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
+cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")

 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@ -139,7 +141,8 @@ manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", he
 vram_group = parser.add_mutually_exclusive_group()
 vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
 vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
-vram_group.add_argument("--lowvram", action="store_true", help="Doesn't do anything if dynamic vram is enabled. If dynamic vram isn't being used this option makes the text encoders run on the CPU.")
+vram_group.add_argument("--normalvram", action="store_true", help="Used to force normal vram use if lowvram gets automatically enabled.")
+vram_group.add_argument("--lowvram", action="store_true", help="Split the unet in parts to use less vram.")
 vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
 vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")

@ -235,17 +238,12 @@ database_default_path = os.path.abspath(
 )
 parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
 parser.add_argument("--enable-assets", action="store_true", help="Enable the assets system (API routes, database synchronization, and background scanning).")
-parser.add_argument("--feature-flag", type=str, action='append', default=[], metavar="KEY[=VALUE]", help="Set a server feature flag. Use KEY=VALUE to set an explicit value, or bare KEY to set it to true. Can be specified multiple times. Boolean values (true/false) and numbers are auto-converted. Examples: --feature-flag show_signin_button=true  or  --feature-flag show_signin_button")
-parser.add_argument("--list-feature-flags", action="store_true", help="Print the registry of known CLI-settable feature flags as JSON and exit.")

 if comfy.options.args_parsing:
    args = parser.parse_args()
 else:
    args = parser.parse_args([])

-if args.cache_ram is not None and len(args.cache_ram) > 2:
-    parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
-
 if args.windows_standalone_build:
    args.auto_launch = True

--- a/comfy/context_windows.py
+++ b/comfy/context_windows.py
@ -63,11 +63,7 @@ class IndexListContextWindow(ContextWindowABC):
            dim = self.dim
        if dim == 0 and full.shape[dim] == 1:
            return full
-        indices = self.index_list
-        anchor_idx = getattr(self, 'causal_anchor_index', None)
-        if anchor_idx is not None and anchor_idx >= 0:
-            indices = [anchor_idx] + list(indices)
-        idx = tuple([slice(None)] * dim + [indices])
+        idx = tuple([slice(None)] * dim + [self.index_list])
        window = full[idx]
        if retain_index_list:
            idx = tuple([slice(None)] * dim + [retain_index_list])
@ -117,14 +113,7 @@ def slice_cond(cond_value, window: IndexListContextWindow, x_in: torch.Tensor, d

    # skip leading latent positions that have no corresponding conditioning (e.g. reference frames)
    if temporal_offset > 0:
-        anchor_idx = getattr(window, 'causal_anchor_index', None)
-        if anchor_idx is not None and anchor_idx >= 0:
-            # anchor occupies one of the no-cond positions, so skip one fewer from window.index_list
-            skip_count = temporal_offset - 1
-        else:
-            skip_count = temporal_offset
-
-        indices = [i - temporal_offset for i in window.index_list[skip_count:]]
+        indices = [i - temporal_offset for i in window.index_list[temporal_offset:]]
        indices = [i for i in indices if 0 <= i]
    else:
        indices = list(window.index_list)
@ -161,8 +150,7 @@ class ContextFuseMethod:
 ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
 class IndexListContextHandler(ContextHandlerABC):
    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1,
-                 closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False,
-                 causal_window_fix: bool=True):
+                 closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False):
        self.context_schedule = context_schedule
        self.fuse_method = fuse_method
        self.context_length = context_length
@ -174,7 +162,6 @@ class IndexListContextHandler(ContextHandlerABC):
        self.freenoise = freenoise
        self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
        self.split_conds_to_windows = split_conds_to_windows
-        self.causal_window_fix = causal_window_fix

        self.callbacks = {}

@ -331,14 +318,6 @@ class IndexListContextHandler(ContextHandlerABC):
            # allow processing to end between context window executions for faster Cancel
            comfy.model_management.throw_exception_if_processing_interrupted()

-            # causal_window_fix: prepend a pre-window frame that will be stripped post-forward
-            anchor_applied = False
-            if self.causal_window_fix:
-                anchor_idx = window.index_list[0] - 1
-                if 0 <= anchor_idx < x_in.size(self.dim):
-                    window.causal_anchor_index = anchor_idx
-                    anchor_applied = True
-
            for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks):
                callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device)

@ -353,12 +332,6 @@ class IndexListContextHandler(ContextHandlerABC):
            if device is not None:
                for i in range(len(sub_conds_out)):
                    sub_conds_out[i] = sub_conds_out[i].to(x_in.device)
-
-            # strip causal_window_fix anchor if applied
-            if anchor_applied:
-                for i in range(len(sub_conds_out)):
-                    sub_conds_out[i] = sub_conds_out[i].narrow(self.dim, 1, sub_conds_out[i].shape[self.dim] - 1)
-
            results.append(ContextResults(window_idx, sub_conds_out, sub_conds, window))
        return results

--- a/comfy/deploy_environment.py
+++ b/comfy/deploy_environment.py
@ -1,34 +0,0 @@
-import functools
-import logging
-import os
-
-logger = logging.getLogger(__name__)
-
-_DEFAULT_DEPLOY_ENV = "local-git"
-_ENV_FILENAME = ".comfy_environment"
-
-# Resolve the ComfyUI install directory (the parent of this `comfy/` package).
-# We deliberately avoid `folder_paths.base_path` here because that is overridden
-# by the `--base-directory` CLI arg to a user-supplied path, whereas the
-# `.comfy_environment` marker is written by launchers/installers next to the
-# ComfyUI install itself.
-_COMFY_INSTALL_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
-
-
-@functools.cache
-def get_deploy_environment() -> str:
-    env_file = os.path.join(_COMFY_INSTALL_DIR, _ENV_FILENAME)
-    try:
-        with open(env_file, encoding="utf-8") as f:
-            # Cap the read so a malformed or maliciously crafted file (e.g.
-            # a single huge line with no newline) can't blow up memory.
-            first_line = f.readline(128).strip()
-            value = "".join(c for c in first_line if 32 <= ord(c) < 127)
-            if value:
-                return value
-    except FileNotFoundError:
-        pass
-    except Exception as e:
-        logger.error("Failed to read %s: %s", env_file, e)
-
-    return _DEFAULT_DEPLOY_ENV
--- a/comfy/hooks.py
+++ b/comfy/hooks.py
@ -93,7 +93,7 @@ class Hook:
        self.hook_scope = hook_scope
        '''Scope of where this hook should apply in terms of the conds used in sampling run.'''
        self.custom_should_register = default_should_register
-        '''Can be overridden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
+        '''Can be overriden with a compatible function to decide if this hook should be registered without the need to override .should_register'''

    @property
    def strength(self):
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@ -106,7 +106,6 @@ class Dino2Encoder(torch.nn.Module):
 class Dino2PatchEmbeddings(torch.nn.Module):
    def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
        super().__init__()
-        self.patch_size = patch_size
        self.projection = operations.Conv2d(
            in_channels=num_channels,
            out_channels=dim,
@ -126,37 +125,17 @@ class Dino2Embeddings(torch.nn.Module):
        super().__init__()
        patch_size = 14
        image_size = 518
-        self.patch_size = patch_size

        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
-        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) # mask_token is a pre-training param, kept only so strict loading accepts the key.
+        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))

-    def interpolate_pos_encoding(self, x, h_pixels, w_pixels):
-        pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, torch.float32)
-
-        class_pos = pos_embed[:, 0:1]
-        patch_pos = pos_embed[:, 1:]
-        N = patch_pos.shape[1]
-        M = int(N ** 0.5)
-        h0 = h_pixels // self.patch_size
-        w0 = w_pixels // self.patch_size
-        scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M)  # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
-
-        patch_pos = patch_pos.reshape(1, M, M, -1).permute(0, 3, 1, 2)
-        patch_pos = torch.nn.functional.interpolate(patch_pos, scale_factor=scale_factor, mode="bicubic", antialias=False)
-        patch_pos = patch_pos.permute(0, 2, 3, 1).flatten(1, 2)
-        return torch.cat((class_pos, patch_pos), dim=1).to(x.dtype)
-
    def forward(self, pixel_values):
        x = self.patch_embeddings(pixel_values)
+        # TODO: mask_token?
        x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1)
-        if x.shape[1] - 1 == self.position_embeddings.shape[1] - 1:
-            x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
-        else:
-            h, w = pixel_values.shape[-2:]
-            x = x + self.interpolate_pos_encoding(x, h, w)
+        x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
        return x


@ -179,21 +158,3 @@ class Dinov2Model(torch.nn.Module):
        x = self.layernorm(x)
        pooled_output = x[:, 0, :]
        return x, i, pooled_output, None
-
-    def get_intermediate_layers(self, pixel_values, indices, apply_norm=True):
-        x = self.embeddings(pixel_values)
-        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
-        n_layers = len(self.encoder.layer)
-        resolved = [(i if i >= 0 else n_layers + i) for i in indices]
-        target = set(resolved)
-        max_idx = max(resolved)
-        n_skip = 1  # skip cls token
-        cache = {}
-        for i, layer in enumerate(self.encoder.layer):
-            x = layer(x, optimized_attention)
-            if i in target:
-                normed = self.layernorm(x) if apply_norm else x
-                cache[i] = (normed[:, n_skip:], normed[:, 0])
-            if i >= max_idx:
-                break
-        return [cache[i] for i in resolved]
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@ -242,7 +242,6 @@ def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -374,7 +373,6 @@ def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -688,7 +686,6 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
    lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
@ -750,7 +747,6 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -836,7 +832,6 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    old_denoised = None
    h, h_last = None, None
@ -894,7 +889,6 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
-    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    denoised_1, denoised_2 = None, None
    h, h_1, h_2 = None, None, None
@ -1012,39 +1006,23 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
    return generic_step_sampler(model, x, sigmas, extra_args, callback, disable, noise_sampler, DDPMSampler_step)

@torch.no_grad()
-def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, s_noise=1.0, s_noise_end=None, noise_clip_std=0.0):
-
-    # s_noise / s_noise_end: per-step noise multiplier, linearly interpolated across steps
-    # noise_clip_std: clamp injected noise to +/- N stddevs (0 disables).
-
+def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
-    n_steps = max(1, len(sigmas) - 1)
-    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-
-    s_start = float(s_noise)
-    s_end = s_start if s_noise_end is None else float(s_noise_end)
-    for i in trange(n_steps, disable=disable):
+    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})

        x = denoised
        if sigmas[i + 1] > 0:
-            noise = noise_sampler(sigmas[i], sigmas[i + 1])
-            if noise_clip_std > 0:
-                clip_val = noise_clip_std * noise.std()
-                noise = noise.clamp(min=-clip_val, max=clip_val)
-            t = (i / (n_steps - 1)) if n_steps > 1 else 0.0
-            s_noise_i = s_start + (s_end - s_start) * t
-            if s_noise_i != 1.0:
-                noise = noise * s_noise_i
-            x = model_sampling.noise_scaling(sigmas[i + 1], noise, x)
+            x = model.inner_model.inner_model.model_sampling.noise_scaling(sigmas[i + 1], noise_sampler(sigmas[i], sigmas[i + 1]), x)
    return x


+
@torch.no_grad()
 def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    # From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
@ -1271,7 +1249,6 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No

    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
-    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)

    uncond_denoised = None

@ -1319,7 +1296,6 @@ def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)

    temp = [0]
    def post_cfg_function(args):
@ -1395,7 +1371,6 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@ -1529,7 +1504,6 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
    s_in = x.new_ones([x.shape[0]])

    def default_er_sde_noise_scaler(x):
@ -1600,10 +1574,9 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
+    inject_noise = eta > 0 and s_noise > 0

    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
-    inject_noise = eta > 0 and s_noise > 0
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@ -1672,10 +1645,9 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non
    seed = extra_args.get("seed", None)
    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
+    inject_noise = eta > 0 and s_noise > 0

    model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
-    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
-    inject_noise = eta > 0 and s_noise > 0
    sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
    lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@ -1741,7 +1713,6 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
    s_in = x.new_ones([x.shape[0]])

    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
-    s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
    lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)

@ -1839,119 +1810,3 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
 def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False):
    """Stochastic Adams Solver with PECE (Predict–Evaluate–Correct–Evaluate) mode (NeurIPS 2023)."""
    return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2)
-
-
-@torch.no_grad()
-def sample_ar_video(model, x, sigmas, extra_args=None, callback=None, disable=None,
-                    num_frame_per_block=1):
-    """
-    Autoregressive video sampler: block-by-block denoising with KV cache
-    and flow-match re-noising for Causal Forcing / Self-Forcing models.
-
-    Requires a Causal-WAN compatible model (diffusion_model must expose
-    init_kv_caches / init_crossattn_caches) and 5-D latents [B,C,T,H,W].
-
-    All AR-loop parameters are passed via the SamplerARVideo node, not read
-    from the checkpoint or transformer_options.
-    """
-    extra_args = {} if extra_args is None else extra_args
-    model_options = extra_args.get("model_options", {})
-    transformer_options = model_options.get("transformer_options", {})
-
-    if x.ndim != 5:
-        raise ValueError(
-            f"ar_video sampler requires 5-D video latents [B,C,T,H,W], got {x.ndim}-D tensor with shape {x.shape}. "
-            "This sampler is only compatible with autoregressive video models (e.g. Causal-WAN)."
-        )
-
-    inner_model = model.inner_model.inner_model
-    causal_model = inner_model.diffusion_model
-
-    if not (hasattr(causal_model, "init_kv_caches") and hasattr(causal_model, "init_crossattn_caches")):
-        raise TypeError(
-            "ar_video sampler requires a Causal-WAN compatible model whose diffusion_model "
-            "exposes init_kv_caches() and init_crossattn_caches(). The loaded checkpoint "
-            "does not support this interface — choose a different sampler."
-        )
-
-    seed = extra_args.get("seed", 0)
-
-    bs, c, lat_t, lat_h, lat_w = x.shape
-    frame_seq_len = -(-lat_h // 2) * -(-lat_w // 2) # ceiling division
-    num_blocks = -(-lat_t // num_frame_per_block)   # ceiling division
-    device = x.device
-    model_dtype = inner_model.get_dtype()
-
-    kv_caches = causal_model.init_kv_caches(bs, lat_t * frame_seq_len, device, model_dtype)
-    crossattn_caches = causal_model.init_crossattn_caches(bs, device, model_dtype)
-
-    output = torch.zeros_like(x)
-    s_in = x.new_ones([x.shape[0]])
-    current_start_frame = 0
-
-    # I2V: seed KV cache with the initial image latent before the denoising loop
-    initial_latent = transformer_options.get("ar_config", {}).get("initial_latent", None)
-    if initial_latent is not None:
-        initial_latent = inner_model.process_latent_in(initial_latent).to(device=device, dtype=model_dtype)
-        n_init = initial_latent.shape[2]
-        output[:, :, :n_init] = initial_latent
-
-        ar_state = {"start_frame": 0, "kv_caches": kv_caches, "crossattn_caches": crossattn_caches}
-        transformer_options["ar_state"] = ar_state
-        zero_sigma = sigmas.new_zeros([1])
-        _ = model(initial_latent, zero_sigma * s_in, **extra_args)
-
-        current_start_frame = n_init
-        remaining = lat_t - n_init
-        num_blocks = -(-remaining // num_frame_per_block)
-
-    num_sigma_steps = len(sigmas) - 1
-    total_real_steps = num_blocks * num_sigma_steps
-    step_count = 0
-
-    try:
-        for block_idx in trange(num_blocks, disable=disable):
-            bf = min(num_frame_per_block, lat_t - current_start_frame)
-            fs, fe = current_start_frame, current_start_frame + bf
-            noisy_input = x[:, :, fs:fe]
-
-            ar_state = {
-                "start_frame": current_start_frame,
-                "kv_caches": kv_caches,
-                "crossattn_caches": crossattn_caches,
-            }
-            transformer_options["ar_state"] = ar_state
-
-            for i in range(num_sigma_steps):
-                denoised = model(noisy_input, sigmas[i] * s_in, **extra_args)
-
-                if callback is not None:
-                    scaled_i = step_count * num_sigma_steps // total_real_steps
-                    callback({"x": noisy_input, "i": scaled_i, "sigma": sigmas[i],
-                              "sigma_hat": sigmas[i], "denoised": denoised})
-
-                if sigmas[i + 1] == 0:
-                    noisy_input = denoised
-                else:
-                    sigma_next = sigmas[i + 1]
-                    torch.manual_seed(seed + block_idx * 1000 + i)
-                    fresh_noise = torch.randn_like(denoised)
-                    noisy_input = (1.0 - sigma_next) * denoised + sigma_next * fresh_noise
-
-                    for cache in kv_caches:
-                        cache["end"] -= bf * frame_seq_len
-
-                step_count += 1
-
-            output[:, :, fs:fe] = noisy_input
-
-            for cache in kv_caches:
-                cache["end"] -= bf * frame_seq_len
-            zero_sigma = sigmas.new_zeros([1])
-            _ = model(noisy_input, zero_sigma * s_in, **extra_args)
-
-            current_start_frame += bf
-    finally:
-        transformer_options.pop("ar_state", None)
-
-    return output
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -9,7 +9,6 @@ class LatentFormat:
    latent_rgb_factors_reshape = None
    taesd_decoder_name = None
    spacial_downscale_ratio = 8
-    temporal_downscale_ratio = 1

    def process_in(self, latent):
        return latent * self.scale_factor
@ -150,12 +149,6 @@ class SD3(LatentFormat):
 class StableAudio1(LatentFormat):
    latent_channels = 64
    latent_dimensions = 1
-    temporal_downscale_ratio = 2048
-
-class StableAudio3(LatentFormat):
-    latent_channels = 256
-    latent_dimensions = 1
-    temporal_downscale_ratio = 4096

 class Flux(SD3):
    latent_channels = 16
@ -242,7 +235,6 @@ class Flux2(LatentFormat):
 class Mochi(LatentFormat):
    latent_channels = 12
    latent_dimensions = 3
-    temporal_downscale_ratio = 6

    def __init__(self):
        self.scale_factor = 1.0
@ -286,7 +278,6 @@ class LTXV(LatentFormat):
    latent_channels = 128
    latent_dimensions = 3
    spacial_downscale_ratio = 32
-    temporal_downscale_ratio = 8

    def __init__(self):
        self.latent_rgb_factors = [
@ -430,7 +421,6 @@ class LTXAV(LTXV):
 class HunyuanVideo(LatentFormat):
    latent_channels = 16
    latent_dimensions = 3
-    temporal_downscale_ratio = 4
    scale_factor = 0.476986
    latent_rgb_factors = [
        [-0.0395, -0.0331,  0.0445],
@ -457,7 +447,6 @@ class HunyuanVideo(LatentFormat):
 class Cosmos1CV8x8x8(LatentFormat):
    latent_channels = 16
    latent_dimensions = 3
-    temporal_downscale_ratio = 8

    latent_rgb_factors = [
        [ 0.1817,  0.2284,  0.2423],
@ -483,7 +472,6 @@ class Cosmos1CV8x8x8(LatentFormat):
 class Wan21(LatentFormat):
    latent_channels = 16
    latent_dimensions = 3
-    temporal_downscale_ratio = 4

    latent_rgb_factors = [
            [-0.1299, -0.1692,  0.2932],
@ -746,7 +734,6 @@ class HunyuanVideo15(LatentFormat):
    latent_channels = 32
    latent_dimensions = 3
    spacial_downscale_ratio = 16
-    temporal_downscale_ratio = 4
    scale_factor = 1.03682
    taesd_decoder_name = "lighttaehy1_5"

@ -772,7 +759,6 @@ class ACEAudio(LatentFormat):
 class ACEAudio15(LatentFormat):
    latent_channels = 64
    latent_dimensions = 1
-    temporal_downscale_ratio = 1764

 class ChromaRadiance(LatentFormat):
    latent_channels = 3
@ -799,35 +785,9 @@ class ZImagePixelSpace(ChromaRadiance):
    """
    pass

-
-class HiDreamO1Pixel(ChromaRadiance):
-    """Pixel-space latent format for HiDream-O1.
-    No VAE — model patches/unpatches raw RGB internally with patch_size=32.
-    """
-    pass
-
 class CogVideoX(LatentFormat):
-    """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
-
-    scale_factor matches the vae/config.json scaling_factor for the 2b variant.
-    The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*)
-    use a different value; see CogVideoX1_5 below.
-    """
    latent_channels = 16
    latent_dimensions = 3
-    temporal_downscale_ratio = 4

    def __init__(self):
        self.scale_factor = 1.15258426
-
-
-class CogVideoX1_5(CogVideoX):
-    """Latent format for 5b-class CogVideoX checkpoints.
-
-    Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun
-    V1.5-5b family (including VOID inpainting). All of these have
-    scaling_factor=0.7 in their vae/config.json. Auto-selected in
-    supported_models.CogVideoX_T2V based on transformer hidden dim.
-    """
-    def __init__(self):
-        self.scale_factor = 0.7
--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@ -10,17 +10,6 @@ from torch import nn
 from torch.nn import functional as F
 import math
 import comfy.ops
-from .embedders import ExpoFourierFeatures
-
-
-def _left_pad_to_match(emb, target_len):
-    emb_len = emb.shape[-2]
-    if emb_len < target_len:
-        return F.pad(emb, (0, 0, target_len - emb_len, 0), value=0.)
-    elif emb_len > target_len:
-        return emb[:, -target_len:, :]
-    return emb
-

 class FourierFeatures(nn.Module):
    def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@ -33,7 +22,6 @@ class FourierFeatures(nn.Module):
        f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
        return torch.cat([f.cos(), f.sin()], dim=-1)

-
 # norms
 class LayerNorm(nn.Module):
    def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None):
@ -55,16 +43,6 @@ class LayerNorm(nn.Module):
            beta = comfy.ops.cast_to_input(beta, x)
        return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)

-
-class RMSNorm(nn.Module):
-    def __init__(self, dim, dtype=None, device=None):
-        super().__init__()
-        self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
-
-    def forward(self, x):
-        return F.rms_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x))
-
-
 class GLU(nn.Module):
    def __init__(
        self,
@ -258,6 +236,13 @@ class FeedForward(nn.Module):

        linear_out = operations.Linear(inner_dim, dim_out, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(inner_dim, dim_out, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device)

+        # # init last linear layer to 0
+        # if zero_init_output:
+        #     nn.init.zeros_(linear_out.weight)
+        #     if not no_bias:
+        #         nn.init.zeros_(linear_out.bias)
+
+
        self.ff = nn.Sequential(
            linear_in,
            rearrange('b d n -> b n d') if use_conv else nn.Identity(),
@ -276,10 +261,8 @@ class Attention(nn.Module):
        dim_context = None,
        causal = False,
        zero_init_output=True,
-        qk_norm = "none",
-        differential = False,
+        qk_norm = False,
        natten_kernel_size = None,
-        feat_scale = False,
        dtype=None,
        device=None,
        operations=None,
@ -288,7 +271,6 @@ class Attention(nn.Module):
        self.dim = dim
        self.dim_heads = dim_heads
        self.causal = causal
-        self.differential = differential

        dim_kv = dim_context if dim_context is not None else dim

@ -296,37 +278,18 @@ class Attention(nn.Module):
        self.kv_heads = dim_kv // dim_heads

        if dim_context is not None:
-            if differential:
-                self.to_q = operations.Linear(dim, dim * 2, bias=False, dtype=dtype, device=device)
-                self.to_kv = operations.Linear(dim_kv, dim_kv * 3, bias=False, dtype=dtype, device=device)
-            else:
-                self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
-                self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
+            self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
+            self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
        else:
-            if differential:
-                self.to_qkv = operations.Linear(dim, dim * 5, bias=False, dtype=dtype, device=device)
-            else:
-                self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
+            self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)

        self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)

-        # Accept bool for backward compat
-        if isinstance(qk_norm, bool):
-            qk_norm = "l2" if qk_norm else "none"
+        # if zero_init_output:
+        #     nn.init.zeros_(self.to_out.weight)
+
        self.qk_norm = qk_norm

-        if self.qk_norm == "ln":
-            self.q_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
-            self.k_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
-        elif self.qk_norm == "rms":
-            self.q_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
-            self.k_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
-
-        self.feat_scale = feat_scale
-
-        if self.feat_scale:
-            self.lambda_dc = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
-            self.lambda_hf = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))

    def forward(
        self,
@ -343,51 +306,22 @@ class Attention(nn.Module):
        kv_input = context if has_context else x

        if hasattr(self, 'to_q'):
-            if self.differential:
-                # cross-attention differential: to_q → (q, q_diff), to_kv → (k, k_diff, v)
-                q, q_diff = self.to_q(x).chunk(2, dim=-1)
-                q      = rearrange(q,      'b n (h d) -> b h n d', h=h)
-                q_diff = rearrange(q_diff, 'b n (h d) -> b h n d', h=h)
-                q = torch.stack([q, q_diff], dim=1)  # (B, 2, H, N, D)
-                k, k_diff, v = self.to_kv(kv_input).chunk(3, dim=-1)
-                k      = rearrange(k,      'b n (h d) -> b h n d', h=kv_h)
-                k_diff = rearrange(k_diff, 'b n (h d) -> b h n d', h=kv_h)
-                v      = rearrange(v,      'b n (h d) -> b h n d', h=kv_h)
-                k = torch.stack([k, k_diff], dim=1)  # (B, 2, H, M, D)
-            else:
-                # Use separate linear projections for q and k/v
-                q = self.to_q(x)
-                q = rearrange(q, 'b n (h d) -> b h n d', h = h)
+            # Use separate linear projections for q and k/v
+            q = self.to_q(x)
+            q = rearrange(q, 'b n (h d) -> b h n d', h = h)

-                k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+            k, v = self.to_kv(kv_input).chunk(2, dim=-1)

-                k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
+            k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
        else:
-            if self.differential:
-                # self-attention differential: to_qkv → (q, k, v, q_diff, k_diff)
-                q, k, v, q_diff, k_diff = self.to_qkv(x).chunk(5, dim=-1)
-                q, k, v, q_diff, k_diff = map(
-                    lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h),
-                    (q, k, v, q_diff, k_diff)
-                )
-                q = torch.stack([q, q_diff], dim=1)  # (B, 2, H, N, D)
-                k = torch.stack([k, k_diff], dim=1)
-            else:
-                # Use fused linear projection
-                q, k, v = self.to_qkv(x).chunk(3, dim=-1)
-                q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
+            # Use fused linear projection
+            q, k, v = self.to_qkv(x).chunk(3, dim=-1)
+            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))

        # Normalize q and k for cosine sim attention
-        if self.qk_norm == "l2":
+        if self.qk_norm:
            q = F.normalize(q, dim=-1)
            k = F.normalize(k, dim=-1)
-        elif self.qk_norm == "rms":
-            q_type, k_type = q.dtype, k.dtype
-            q = self.q_norm(q).to(q_type)
-            k = self.k_norm(k).to(k_type)
-        elif self.qk_norm != 'none':
-            q = self.q_norm(q)
-            k = self.k_norm(k)

        if rotary_pos_emb is not None and not has_context:
            freqs, _ = rotary_pos_emb
@ -430,24 +364,9 @@ class Attention(nn.Module):
            heads_per_kv_head = h // kv_h
            k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v))

-        if self.differential:
-            q, q_diff = q.unbind(dim=1)
-            k, k_diff = k.unbind(dim=1)
-            out      = optimized_attention(q,      k,      v, h, skip_reshape=True, transformer_options=transformer_options)
-            out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, transformer_options=transformer_options)
-            out = out - out_diff
-        else:
-            out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
-
+        out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
        out = self.to_out(out)

-        if self.feat_scale:
-            out_dc = out.mean(dim=-2, keepdim=True)
-            out_hf = out - out_dc
-
-            # Selectively modulate DC and high frequency components
-            out = out + comfy.ops.cast_to_input(self.lambda_dc, out) * out_dc + comfy.ops.cast_to_input(self.lambda_hf, out) * out_hf
-
        if mask is not None:
            mask = rearrange(mask, 'b n -> b n 1')
            out = out.masked_fill(~mask, 0.)
@ -498,14 +417,11 @@ class TransformerBlock(nn.Module):
            cross_attend = False,
            dim_context = None,
            global_cond_dim = None,
-            global_cond_shared_embed = False,
-            local_add_cond_dim = None,
            causal = False,
            zero_init_branch_outputs = True,
            conformer = False,
            layer_ix = -1,
            remove_norms = False,
-            norm_type = "layer_norm",
            attn_kwargs = {},
            ff_kwargs = {},
            norm_kwargs = {},
@ -520,20 +436,8 @@ class TransformerBlock(nn.Module):
        self.cross_attend = cross_attend
        self.dim_context = dim_context
        self.causal = causal
-        self.global_cond_shared_embed = global_cond_shared_embed

-        norm_layer_map = {
-            "layer_norm": LayerNorm,
-            "rms_norm": RMSNorm,
-        }
-        norm_cls = norm_layer_map.get(norm_type, LayerNorm)
-
-        def make_norm():
-            if remove_norms:
-                return nn.Identity()
-            return norm_cls(dim, dtype=dtype, device=device, **norm_kwargs)
-
-        self.pre_norm = make_norm()
+        self.pre_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()

        self.self_attn = Attention(
            dim,
@ -547,7 +451,7 @@ class TransformerBlock(nn.Module):
        )

        if cross_attend:
-            self.cross_attend_norm = make_norm()
+            self.cross_attend_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
            self.cross_attn = Attention(
                dim,
                dim_heads = dim_heads,
@ -560,56 +464,37 @@ class TransformerBlock(nn.Module):
                **attn_kwargs
            )

-        self.ff_norm = make_norm()
-        self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations, **ff_kwargs)
+        self.ff_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
+        self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations,**ff_kwargs)

        self.layer_ix = layer_ix

        self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None

-        # Global conditioning
-        self.has_global_cond = (global_cond_dim is not None) or global_cond_shared_embed
+        self.global_cond_dim = global_cond_dim

-        if global_cond_shared_embed:
-            # SA3 style: learnable per-block additive bias; global_cond is pre-projected to (B, dim*6)
-            self.to_scale_shift_gate = nn.Parameter(torch.empty(dim * 6, device=device, dtype=dtype))
-        elif global_cond_dim is not None:
-            # SA1 style: per-block MLP projects global_cond → (B, dim*6)
+        if global_cond_dim is not None:
            self.to_scale_shift_gate = nn.Sequential(
                nn.SiLU(),
-                operations.Linear(global_cond_dim, dim * 6, bias=False, device=device, dtype=dtype)
+                nn.Linear(global_cond_dim, dim * 6, bias=False)
            )

-        # Local additive conditioning (e.g. inpaint mask + masked latent)
-        self.local_add_cond_dim = local_add_cond_dim
-        if local_add_cond_dim is not None:
-            self.to_local_embed = nn.Sequential(
-                operations.Linear(local_add_cond_dim, dim, bias=True, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Linear(dim, dim, bias=True, dtype=dtype, device=device),
-            )
-        else:
-            self.to_local_embed = None
+            nn.init.zeros_(self.to_scale_shift_gate[1].weight)
+            #nn.init.zeros_(self.to_scale_shift_gate_self[1].bias)

    def forward(
        self,
        x,
        context = None,
        global_cond=None,
-        local_add_cond=None,
        mask = None,
        context_mask = None,
        rotary_pos_emb = None,
        transformer_options={}
    ):
-        if self.has_global_cond and global_cond is not None:
-            if self.global_cond_shared_embed:
-                # global_cond already has shape (B, dim*6)
-                ssg = (comfy.ops.cast_to_input(self.to_scale_shift_gate, global_cond) + global_cond).unsqueeze(1)
-            else:
-                ssg = self.to_scale_shift_gate(global_cond).unsqueeze(1)
+        if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:

-            scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = ssg.chunk(6, dim = -1)
+            scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(global_cond).unsqueeze(1).chunk(6, dim = -1)

            # self-attention with adaLN
            residual = x
@ -625,9 +510,6 @@ class TransformerBlock(nn.Module):
            if self.conformer is not None:
                x = x + self.conformer(x)

-            if local_add_cond is not None and self.to_local_embed is not None:
-                x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
-
            # feedforward with adaLN
            residual = x
            x = self.ff_norm(x)
@ -645,9 +527,6 @@ class TransformerBlock(nn.Module):
            if self.conformer is not None:
                x = x + self.conformer(x)

-            if local_add_cond is not None and self.to_local_embed is not None:
-                x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
-
            x = x + self.ff(self.ff_norm(x))

        return x
@ -664,8 +543,6 @@ class ContinuousTransformer(nn.Module):
        cross_attend=False,
        cond_token_dim=None,
        global_cond_dim=None,
-        global_cond_shared_embed=False,
-        local_add_cond_dim=None,
        causal=False,
        rotary_pos_emb=True,
        zero_init_branch_outputs=True,
@ -673,7 +550,6 @@ class ContinuousTransformer(nn.Module):
        use_sinusoidal_emb=False,
        use_abs_pos_emb=False,
        abs_pos_emb_max_length=10000,
-        num_memory_tokens=0,
        dtype=None,
        device=None,
        operations=None,
@ -686,8 +562,6 @@ class ContinuousTransformer(nn.Module):
        self.depth = depth
        self.causal = causal
        self.layers = nn.ModuleList([])
-        self.num_memory_tokens = num_memory_tokens
-        self.global_cond_shared_embed = global_cond_shared_embed

        self.project_in = operations.Linear(dim_in, dim, bias=False, dtype=dtype, device=device) if dim_in is not None else nn.Identity()
        self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
@ -703,22 +577,7 @@ class ContinuousTransformer(nn.Module):

        self.use_abs_pos_emb = use_abs_pos_emb
        if use_abs_pos_emb:
-            self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length + num_memory_tokens)
-
-        if num_memory_tokens > 0:
-            self.memory_tokens = nn.Parameter(torch.empty(num_memory_tokens, dim, device=device, dtype=dtype))
-
-        # Shared global-cond embedder (SA3 style): projects (B, global_cond_dim) → (B, dim*6)
-        self.global_cond_embedder = None
-        if global_cond_shared_embed and global_cond_dim is not None:
-            self.global_cond_embedder = nn.Sequential(
-                operations.Linear(global_cond_dim, dim, bias=True, dtype=dtype, device=device),
-                nn.SiLU(),
-                operations.Linear(dim, dim * 6, bias=True, dtype=dtype, device=device),
-            )
-
-        # When using shared embed, TransformerBlocks use per-block Parameter (not per-block MLP)
-        block_global_cond_dim = None if global_cond_shared_embed else global_cond_dim
+            self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length)

        for i in range(depth):
            self.layers.append(
@ -727,9 +586,7 @@ class ContinuousTransformer(nn.Module):
                    dim_heads = dim_heads,
                    cross_attend = cross_attend,
                    dim_context = cond_token_dim,
-                    global_cond_dim = block_global_cond_dim,
-                    global_cond_shared_embed = global_cond_shared_embed,
-                    local_add_cond_dim = local_add_cond_dim,
+                    global_cond_dim = global_cond_dim,
                    causal = causal,
                    zero_init_branch_outputs = zero_init_branch_outputs,
                    conformer=conformer,
@ -748,7 +605,6 @@ class ContinuousTransformer(nn.Module):
        prepend_embeds = None,
        prepend_mask = None,
        global_cond = None,
-        local_add_cond = None,
        return_info = False,
        **kwargs
    ):
@ -776,9 +632,7 @@ class ContinuousTransformer(nn.Module):

                mask = torch.cat((prepend_mask, mask), dim = -1)

-        if self.num_memory_tokens > 0:
-            memory_tokens = comfy.ops.cast_to_input(self.memory_tokens, x).expand(batch, -1, -1)
-            x = torch.cat((memory_tokens, x), dim=1)
+        # Attention layers

        if self.rotary_pos_emb is not None:
            rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=torch.float, device=x.device)
@ -788,10 +642,6 @@ class ContinuousTransformer(nn.Module):
        if self.use_sinusoidal_emb or self.use_abs_pos_emb:
            x = x + self.pos_emb(x)

-        # Project global_cond once (SA3 shared-embed path)
-        if global_cond is not None and self.global_cond_embedder is not None:
-            global_cond = self.global_cond_embedder(global_cond)
-
        blocks_replace = patches_replace.get("dit", {})
        # Iterate over the transformer layers
        for i, layer in enumerate(self.layers):
@ -804,17 +654,12 @@ class ContinuousTransformer(nn.Module):
                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
                x = out["img"]
            else:
-                x = layer(x, rotary_pos_emb=rotary_pos_emb, global_cond=global_cond,
-                          local_add_cond=local_add_cond, context=context,
-                          transformer_options=transformer_options)
+                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options)
+            # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)

            if return_info:
                info["hidden_states"].append(x)

-        # Strip memory tokens before projecting out
-        if self.num_memory_tokens > 0:
-            x = x[:, self.num_memory_tokens:, :]
-
        x = self.project_out(x)

        if return_info:
@ -837,7 +682,6 @@ class AudioDiffusionTransformer(nn.Module):
        num_heads=24,
        transformer_type: tp.Literal["continuous_transformer"] = "continuous_transformer",
        global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend",
-        timestep_features_type: str = "learned",
        audio_model="",
        dtype=None,
        device=None,
@ -852,10 +696,7 @@ class AudioDiffusionTransformer(nn.Module):
        # Timestep embeddings
        timestep_features_dim = 256

-        if timestep_features_type == "expo":
-            self.timestep_features = ExpoFourierFeatures(timestep_features_dim, 0.5, 10000.0)
-        else:
-            self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
+        self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)

        self.to_timestep_embed = nn.Sequential(
            operations.Linear(timestep_features_dim, embed_dim, bias=True, dtype=dtype, device=device),
@ -940,7 +781,6 @@ class AudioDiffusionTransformer(nn.Module):
        cross_attn_cond=None,
        cross_attn_cond_mask=None,
        input_concat_cond=None,
-        local_add_cond=None,
        global_embed=None,
        prepend_cond=None,
        prepend_cond_mask=None,
@ -962,13 +802,9 @@ class AudioDiffusionTransformer(nn.Module):
            prepend_cond = self.to_prepend_embed(prepend_cond)

            prepend_inputs = prepend_cond
-            prepend_length = prepend_cond.shape[1]
            if prepend_cond_mask is not None:
                prepend_mask = prepend_cond_mask

-        if local_add_cond is not None and local_add_cond.dim() == 3:
-            local_add_cond = local_add_cond.permute(0, 2, 1)
-
        if input_concat_cond is not None:

            # Interpolate input_concat_cond to the same length as x
@ -1014,7 +850,7 @@ class AudioDiffusionTransformer(nn.Module):
        if self.transformer_type == "x-transformers":
            output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, **extra_args, **kwargs)
        elif self.transformer_type == "continuous_transformer":
-            output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, local_add_cond=local_add_cond, **extra_args, **kwargs)
+            output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, **extra_args, **kwargs)

            if return_info:
                output, info = output
@ -1040,7 +876,6 @@ class AudioDiffusionTransformer(nn.Module):
        context=None,
        context_mask=None,
        input_concat_cond=None,
-        local_add_cond=None,
        global_embed=None,
        negative_global_embed=None,
        prepend_cond=None,
@ -1055,7 +890,6 @@ class AudioDiffusionTransformer(nn.Module):
                cross_attn_cond=context,
                cross_attn_cond_mask=context_mask,
                input_concat_cond=input_concat_cond,
-                local_add_cond=local_add_cond,
                global_embed=global_embed,
                prepend_cond=prepend_cond,
                prepend_cond_mask=prepend_cond_mask,
--- a/comfy/ldm/audio/embedders.py
+++ b/comfy/ldm/audio/embedders.py
@ -31,39 +31,15 @@ def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
    )


-class ExpoFourierFeatures(nn.Module):
-    """Exponentially-spaced Fourier features (no learnable parameters)."""
-    def __init__(self, dim, min_freq=0.5, max_freq=10000.0):
-        super().__init__()
-        self.dim = dim
-        self.min_freq = min_freq
-        self.max_freq = max_freq
-
-    def forward(self, t):
-        in_dtype = t.dtype
-        t = t.float()
-        if t.dim() == 1:
-            t = t.unsqueeze(-1)
-        half_dim = self.dim // 2
-        ramp = torch.linspace(0, 1, half_dim, device=t.device, dtype=torch.float32)
-        freqs = torch.exp(ramp * (math.log(self.max_freq) - math.log(self.min_freq)) + math.log(self.min_freq))
-        args = t * freqs * 2 * math.pi
-        return torch.cat([args.cos(), args.sin()], dim=-1).to(in_dtype)
-
-
 class NumberEmbedder(nn.Module):
    def __init__(
        self,
        features: int,
        dim: int = 256,
-        fourier_features_type="learned",
    ):
        super().__init__()
        self.features = features
-        if fourier_features_type == "expo":
-            self.embedding = nn.Sequential(ExpoFourierFeatures(dim=dim), comfy.ops.manual_cast.Linear(in_features=dim, out_features=features))
-        else:
-            self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
+        self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)

    def forward(self, x: Union[List[float], Tensor]) -> Tensor:
        if not torch.is_tensor(x):
@ -101,15 +77,14 @@ class NumberConditioner(Conditioner):
    def __init__(self,
                output_dim: int,
                min_val: float=0,
-                max_val: float=1,
-                fourier_features_type: str = "learned",
+                max_val: float=1
                ):
        super().__init__(output_dim, output_dim)

        self.min_val = min_val
        self.max_val = max_val

-        self.embedder = NumberEmbedder(features=output_dim, fourier_features_type=fourier_features_type)
+        self.embedder = NumberEmbedder(features=output_dim)

    def forward(self, floats, device=None):
            # Cast the inputs to floats
--- a/comfy/ldm/audio/vae_sa3.py
+++ b/comfy/ldm/audio/vae_sa3.py
@ -1,533 +0,0 @@
-import torch
-import torch.nn as nn
-
-import comfy.ops
-import comfy.model_management
-from comfy.ldm.modules.attention import optimized_attention
-from comfy.ldm.audio.autoencoder import WNConv1d
-
-ops = comfy.ops.disable_weight_init
-
-class Transpose(nn.Module):
-    def forward(self, x, **kwargs):
-        return x.transpose(-2, -1)
-
-
-def _zero_pad_modulo_sequence(x, size, dim=-2):
-    input_len = x.shape[dim]
-    pad_len = (size - input_len % size) % size
-    if pad_len > 0:
-        pad_shape = list(x.shape)
-        pad_shape[dim] = pad_len
-        x = torch.cat([x, torch.zeros(pad_shape, device=x.device, dtype=x.dtype)], dim=dim)
-    return x
-
-
-def _sliding_window_mask(seq_len, window, device, dtype):
-    """Additive attention mask enforcing a ±window local window (matches flash_attn window_size)."""
-    i = torch.arange(seq_len, device=device).unsqueeze(1)
-    j = torch.arange(seq_len, device=device).unsqueeze(0)
-    out_of_window = (j - i).abs() > window
-    return torch.where(
-        out_of_window,
-        torch.full((1,), torch.finfo(dtype).min / 4, device=device, dtype=dtype),
-        torch.zeros(1, device=device, dtype=dtype),
-    )
-
-
-class DynamicTanh(nn.Module):
-    def __init__(self, dim, init_alpha=4.0, dtype=None, device=None, **kwargs):
-        super().__init__()
-        self.alpha = nn.Parameter(torch.empty(1, dtype=dtype, device=device))
-        self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
-        self.beta = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
-
-    def forward(self, x):
-        alpha = comfy.ops.cast_to_input(self.alpha, x)
-        gamma = comfy.ops.cast_to_input(self.gamma, x)
-        beta = comfy.ops.cast_to_input(self.beta, x)
-        return gamma * torch.tanh(alpha * x) + beta
-
-
-class RotaryEmbedding(nn.Module):
-    def __init__(self, dim, base=10000, base_rescale_factor=1., dtype=None, device=None):
-        super().__init__()
-        base = base * base_rescale_factor ** (dim / (dim - 2))
-        self.register_buffer("inv_freq", torch.empty(dim // 2, dtype=dtype, device=device))
-
-    def forward_from_seq_len(self, seq_len, device, dtype=None):
-        t = torch.arange(seq_len, device=device, dtype=torch.float32)
-        return self.forward(t)
-
-    def forward(self, t):
-        freqs = torch.outer(t.float(), comfy.model_management.cast_to(self.inv_freq, dtype=torch.float32, device=t.device))
-        freqs = torch.cat((freqs, freqs), dim=-1)
-        return freqs, 1.
-
-
-def _rotate_half(x):
-    d = x.shape[-1] // 2
-    return torch.cat((-x[..., d:], x[..., :d]), dim=-1)
-
-
-def _apply_rotary_pos_emb(t, freqs):
-    out_dtype = t.dtype
-    rot_dim = freqs.shape[-1]
-    seq_len = t.shape[-2]
-    freqs = freqs[-seq_len:]
-    t_rot, t_pass = t[..., :rot_dim], t[..., rot_dim:]
-    t_rot = t_rot * freqs.cos() + _rotate_half(t_rot) * freqs.sin()
-    return torch.cat((t_rot.to(out_dtype), t_pass.to(out_dtype)), dim=-1)
-
-
-class Attention(nn.Module):
-    def __init__(self, dim, dim_heads=64, qk_norm="none", qk_norm_eps=1e-6,
-                 differential=False, zero_init_output=True,
-                 dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        self.num_heads = dim // dim_heads
-        self.differential = differential
-        self.qk_norm = qk_norm
-
-        self.to_qkv = operations.Linear(
-            dim, dim * (5 if differential else 3), bias=False, dtype=dtype, device=device)
-        self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
-
-        if qk_norm == "dyt":
-            self.q_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
-            self.k_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
-        elif qk_norm == "rms":
-            self.q_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)
-            self.k_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)
-
-    def forward(self, x, rotary_pos_emb=None, mask=None, **kwargs):
-        B, N, _ = x.shape
-        h = self.num_heads
-
-        qkv = self.to_qkv(x)
-        if self.differential:
-            q, k, v, q_diff, k_diff = qkv.chunk(5, dim=-1)
-            del qkv
-            q = q.view(B, N, h, -1).transpose(1, 2)
-            k = k.view(B, N, h, -1).transpose(1, 2)
-            v = v.view(B, N, h, -1).transpose(1, 2)
-            q_diff = q_diff.view(B, N, h, -1).transpose(1, 2)
-            k_diff = k_diff.view(B, N, h, -1).transpose(1, 2)
-        else:
-            q, k, v = qkv.chunk(3, dim=-1)
-            del qkv
-            q = q.view(B, N, h, -1).transpose(1, 2)
-            k = k.view(B, N, h, -1).transpose(1, 2)
-            v = v.view(B, N, h, -1).transpose(1, 2)
-
-        if self.qk_norm != "none":
-            q_dtype, k_dtype = q.dtype, k.dtype
-            q = self.q_norm(q).to(q_dtype)
-            k = self.k_norm(k).to(k_dtype)
-            if self.differential:
-                q_diff = self.q_norm(q_diff).to(q_dtype)
-                k_diff = self.k_norm(k_diff).to(k_dtype)
-
-        if rotary_pos_emb is not None:
-            freqs, _ = rotary_pos_emb
-            q_dtype, k_dtype = q.dtype, k.dtype
-            q = _apply_rotary_pos_emb(q.float(), freqs).to(q_dtype)
-            k = _apply_rotary_pos_emb(k.float(), freqs).to(k_dtype)
-            if self.differential:
-                q_diff = _apply_rotary_pos_emb(q_diff.float(), freqs).to(q_dtype)
-                k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype)
-
-        if self.differential:
-            out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
-                   - optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True))
-            del q, k, v, q_diff, k_diff
-        else:
-            out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
-            del q, k, v
-
-        return self.to_out(out)
-
-
-class _Sin(nn.Module):
-    def forward(self, x):
-        return torch.sin(3.14159265359 * x)
-
-
-class _GLU(nn.Module):
-    def __init__(self, dim_in, dim_out, activation, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.act = activation
-        self.proj = operations.Linear(dim_in, dim_out * 2, dtype=dtype, device=device)
-
-    def forward(self, x):
-        x = self.proj(x)
-        x, gate = x.chunk(2, dim=-1)
-        return x * self.act(gate)
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim, mult=4, no_bias=False, zero_init_output=True,
-                 sinusoidal=False, dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        act = _Sin() if sinusoidal else nn.SiLU()
-        self.ff = nn.Sequential(
-            _GLU(dim, inner_dim, act, dtype=dtype, device=device, operations=operations),
-            nn.Identity(),
-            operations.Linear(inner_dim, dim, bias=not no_bias, dtype=dtype, device=device),
-            nn.Identity(),
-        )
-
-    def forward(self, x, **kwargs):
-        return self.ff(x)
-
-
-class TransformerBlock(nn.Module):
-    def __init__(self, dim, dim_heads=64, causal=False, zero_init_branch_outputs=True,
-                 norm_type="dyt", add_rope=False, attn_kwargs=None, ff_kwargs=None,
-                 norm_kwargs=None, dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        if attn_kwargs is None:
-            attn_kwargs = {}
-        if ff_kwargs is None:
-            ff_kwargs = {}
-        if norm_kwargs is None:
-            norm_kwargs = {}
-        dim_heads = min(dim_heads, dim)
-
-        Norm = DynamicTanh if norm_type == "dyt" else operations.RMSNorm
-        norm_kw = {**norm_kwargs, "dtype": dtype, "device": device}
-
-        self.pre_norm = Norm(dim, **norm_kw)
-        self.self_attn = Attention(dim, dim_heads=dim_heads,
-                                   zero_init_output=zero_init_branch_outputs,
-                                   dtype=dtype, device=device, operations=operations,
-                                   **attn_kwargs)
-        self.ff_norm = Norm(dim, **norm_kw)
-        self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs,
-                              dtype=dtype, device=device, operations=operations, **ff_kwargs)
-        self.rope = RotaryEmbedding(dim_heads // 2, dtype=dtype, device=device) if add_rope else None
-
-    def forward(self, x, mask=None, **kwargs):
-        rope = self.rope.forward_from_seq_len(x.shape[-2], device=x.device) \
-               if self.rope is not None else None
-        x = x + self.self_attn(self.pre_norm(x), rotary_pos_emb=rope, mask=mask)
-        x = x + self.ff(self.ff_norm(x))
-        return x
-
-
-class TransformerResamplingBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, stride, type="encoder",
-                 transformer_depth=3, dim_heads=128, differential=True,
-                 sliding_window=None, chunk_size=128, chunk_midpoint_shift=False,
-                 dyt=True, ff_mult=3, mapping_bias=True, variable_stride=False,
-                 sinusoidal_blocks=0, conv_mapping=False, dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        if type not in ("encoder", "decoder"):
-            raise ValueError(f"type must be 'encoder' or 'decoder', got {type!r}")
-
-        self.type = type
-        self.stride = stride
-        self.chunk_size = chunk_size
-        self.chunk_midpoint_shift = chunk_midpoint_shift
-        self.variable_stride = variable_stride
-        self.transformer_depth = transformer_depth
-
-        transformer_dim = out_channels if type == "encoder" else in_channels
-
-        self.mapping = (WNConv1d(in_channels, out_channels, 3 if conv_mapping else 1, padding="same", bias=mapping_bias)
-                        if in_channels != out_channels else nn.Identity())
-
-        self.sliding_window_latents = sliding_window
-        self.sliding_window_seq = self._get_sliding_window_size(sliding_window, stride)
-        self.input_seg_size, self.output_seg_size, self.sub_chunk_size = self._get_seg_sizes(stride)
-
-        token_seq = 1 if variable_stride else self.output_seg_size
-        self.new_tokens = nn.Parameter(torch.empty(1, token_seq, transformer_dim, dtype=dtype, device=device))
-
-        norm_type = "dyt" if dyt else "rms_norm"
-        attn_kwargs = {"qk_norm": "dyt" if dyt else "rms", "qk_norm_eps": 1e-3,
-                       "differential": differential}
-        norm_kwargs = {"eps": 1e-3}
-        transformers = []
-        for i in range(transformer_depth):
-            sinusoidal = (transformer_depth - i) < sinusoidal_blocks
-            transformers.append(TransformerBlock(
-                transformer_dim,
-                dim_heads=dim_heads,
-                causal=False,
-                zero_init_branch_outputs=True,
-                norm_type=norm_type,
-                add_rope=True,
-                attn_kwargs=attn_kwargs,
-                ff_kwargs={"mult": ff_mult, "no_bias": False, "sinusoidal": sinusoidal},
-                norm_kwargs=norm_kwargs,
-                dtype=dtype, device=device, operations=operations,
-            ))
-        self.transformers = nn.ModuleList(transformers)
-
-    def _get_sliding_window_size(self, window, stride, prepend_cond_length=0):
-        if window is None:
-            return None
-        return [w * (stride + 1 + prepend_cond_length) for w in window]
-
-    def _get_seg_sizes(self, stride, prepend_cond_length=0):
-        sub_chunk_size = stride + 1 + prepend_cond_length
-        input_seg_size = stride if self.type == "encoder" else 1
-        output_seg_size = 1 if self.type == "encoder" else stride
-        return input_seg_size, output_seg_size, sub_chunk_size
-
-    def forward(self, x, stride=None, **kwargs):
-        B = x.shape[0]
-
-        if stride is None:
-            input_seg = self.input_seg_size
-            output_seg = self.output_seg_size
-            sub_chunk = self.sub_chunk_size
-            sliding_window = self.sliding_window_seq
-        else:
-            input_seg, output_seg, sub_chunk = self._get_seg_sizes(stride)
-            sliding_window = self._get_sliding_window_size(self.sliding_window_latents, stride)
-
-        if self.type == "encoder":
-            if self.transformer_depth > 0:
-                pad_mod = self.chunk_size if sliding_window is None else input_seg
-                x = _zero_pad_modulo_sequence(x, pad_mod, dim=-1)
-            x = self.mapping(x)
-
-        if self.transformer_depth > 0:
-            x = x.permute(0, 2, 1)
-
-            if self.type != "encoder":
-                pad_mod = 1 if sliding_window is not None else (
-                    self.chunk_size // (stride if stride is not None else self.stride))
-                x = _zero_pad_modulo_sequence(x, pad_mod)
-
-            C = x.shape[2]
-            x = x.reshape(-1, input_seg, C)
-
-            new_tokens = self.new_tokens.expand(x.shape[0], output_seg, -1)
-            x = torch.cat([x, comfy.ops.cast_to_input(new_tokens, x)], dim=-2)
-            del new_tokens
-
-            x = x.reshape(B, -1, C)
-
-            if sliding_window is None:
-                eff_chunk = self.chunk_size + self.chunk_size // (stride if stride is not None else self.stride)
-
-            if sliding_window is None and self.chunk_midpoint_shift:
-                split = self.transformer_depth // 2
-                shift = eff_chunk // 2
-
-                x = x.reshape(-1, eff_chunk, C)
-                for layer in self.transformers[:split]:
-                    x = layer(x)
-                x = x.reshape(B, -1, C)
-
-                shifted = torch.cat([x[:, :shift, :], x, x[:, -shift:, :]], dim=1)
-                del x
-                x = shifted.reshape(-1, eff_chunk, C)
-                del shifted
-                for layer in self.transformers[split:]:
-                    x = layer(x)
-                x = x.reshape(B, -1, C)
-                x = x[:, shift:-shift, :]
-            elif sliding_window is None:
-                x = x.reshape(-1, eff_chunk, C)
-                for layer in self.transformers:
-                    x = layer(x)
-                x = x.reshape(B, -1, C)
-            else:
-                attn_mask = _sliding_window_mask(x.shape[1], sliding_window[0], x.device, x.dtype)
-                for layer in self.transformers:
-                    x = layer(x, mask=attn_mask)
-
-            x = x.reshape(-1, sub_chunk, C)
-            x = x[:, -output_seg:, :]
-            x = x.reshape(B, -1, C).transpose(1, 2)
-
-        if self.type == "decoder":
-            x = self.mapping(x)
-
-        return x
-
-
-class SAMEEncoder(nn.Module):
-    def __init__(self, in_channels=2, channels=128, latent_dim=32,
-                 c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
-                 transformer_depths=(3, 3, 3, 3),
-                 dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        channel_dims = [in_channels] + [channels * c for c in c_mults]
-        layers = []
-        for i in range(len(c_mults)):
-            layers.append(TransformerResamplingBlock(
-                in_channels=channel_dims[i], out_channels=channel_dims[i + 1],
-                stride=strides[i], type="encoder",
-                transformer_depth=transformer_depths[i],
-                dtype=dtype, device=device, operations=operations, **kwargs))
-        layers += [
-            Transpose(),
-            operations.Linear(channel_dims[-1], latent_dim, dtype=dtype, device=device),
-            Transpose(),
-        ]
-        self.layers = nn.ModuleList(layers)
-
-    def forward(self, x, **kwargs):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-
-
-class SAMEDecoder(nn.Module):
-    def __init__(self, out_channels=2, channels=128, latent_dim=32,
-                 c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
-                 transformer_depths=(3, 3, 3, 3), sinusoidal_blocks=None,
-                 dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        if sinusoidal_blocks is None:
-            sinusoidal_blocks = [0] * len(c_mults)
-        channel_dims = [out_channels] + [channels * c for c in c_mults]
-        layers = [
-            Transpose(),
-            operations.Linear(latent_dim, channel_dims[-1], dtype=dtype, device=device),
-            Transpose(),
-        ]
-        for i in range(len(c_mults) - 1, -1, -1):
-            layers.append(TransformerResamplingBlock(
-                in_channels=channel_dims[i + 1], out_channels=channel_dims[i],
-                stride=strides[i], type="decoder",
-                transformer_depth=transformer_depths[i],
-                sinusoidal_blocks=sinusoidal_blocks[i],
-                dtype=dtype, device=device, operations=operations, **kwargs))
-        self.layers = nn.ModuleList(layers)
-
-    def forward(self, x, **kwargs):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-
-
-class SoftNormBottleneck(nn.Module):
-    def __init__(self, dim=32, noise_augment_dim=0, noise_regularize=False,
-                 auto_scale=False, freeze=False, dtype=None, device=None, **kwargs):
-        super().__init__()
-        self.noise_augment_dim = noise_augment_dim
-        self.noise_regularize = noise_regularize
-        self.scaling_factor = nn.Parameter(torch.empty(1, dim, 1, dtype=dtype, device=device))
-        self.bias = nn.Parameter(torch.empty(1, dim, 1, dtype=dtype, device=device))
-        self.noise_scaling_factor = nn.Parameter(torch.empty(1, noise_augment_dim, 1, dtype=dtype, device=device))
-        if auto_scale:
-            self.register_parameter("running_std", nn.Parameter(
-                torch.empty(1, dtype=dtype, device=device), requires_grad=False))
-        if freeze:
-            for p in self.parameters():
-                p.requires_grad = False
-
-    def encode(self, x, return_info=False, **kwargs):
-        x = x * comfy.ops.cast_to_input(self.scaling_factor, x) \
-              + comfy.ops.cast_to_input(self.bias, x)
-        if hasattr(self, "running_std"):
-            x = x / comfy.ops.cast_to_input(self.running_std, x)
-        if return_info:
-            return x, {}
-        return x
-
-    def decode(self, x, **kwargs):
-        if hasattr(self, "running_std"):
-            x = x * comfy.ops.cast_to_input(self.running_std, x)
-        if self.noise_regularize:
-            scaling = self.running_std if hasattr(self, "running_std") \
-                      else x.std(dim=-1, keepdim=True)
-            noise = torch.randn_like(x) * comfy.ops.cast_to_input(scaling, x) * 1e-3
-            x = x + noise
-        if self.noise_augment_dim > 0:
-            noise = comfy.ops.cast_to_input(self.noise_scaling_factor, x) * torch.randn(
-                x.shape[0], self.noise_augment_dim, x.shape[-1], device=x.device, dtype=x.dtype)
-            x = torch.cat([x, noise], dim=1)
-        return x
-
-
-class PatchedPretransform(nn.Module):
-    def __init__(self, channels, patch_size, **kwargs):
-        super().__init__()
-        self.channels = channels
-        self.patch_size = patch_size
-        self.enable_grad = False
-
-    def _pad(self, x):
-        pad_len = (self.patch_size - x.shape[-1] % self.patch_size) % self.patch_size
-        if pad_len > 0:
-            x = torch.cat([x, torch.zeros_like(x[:, :, :pad_len])], dim=-1)
-        return x
-
-    def encode(self, x):
-        x = self._pad(x)
-        B, C, T = x.shape
-        h = self.patch_size
-        L = T // h
-        # b c (l h) -> b (c h) l
-        return x.reshape(B, C, L, h).permute(0, 1, 3, 2).reshape(B, C * h, L)
-
-    def decode(self, x):
-        B, Ch, L = x.shape
-        h = self.patch_size
-        C = Ch // h
-        # b (c h) l -> b c (l h)
-        return x.reshape(B, C, h, L).permute(0, 1, 3, 2).reshape(B, C, L * h)
-
-
-class SA3AudioVAE(nn.Module):
-    """SA3 VAE. State dict keys match checkpoint after stripping 'pretransform.model.'"""
-
-    def __init__(self, channels=256, transformer_depths=12, sinusoidal_blocks=8,
-                 sliding_window=None, decoder_conv_mapping=False,
-                 chunk_size=128, chunk_midpoint_shift=False,
-                 dtype=None, device=None, operations=None):
-        super().__init__()
-        if operations is None:
-            operations = ops
-
-        self.pretransform = PatchedPretransform(channels=2, patch_size=256)
-
-        common_kwargs = dict(
-            differential=True, dyt=True, dim_heads=64,
-            sliding_window=sliding_window, variable_stride=True,
-            chunk_size=chunk_size, chunk_midpoint_shift=chunk_midpoint_shift,
-            dtype=dtype, device=device, operations=operations,
-        )
-        self.encoder = SAMEEncoder(
-            in_channels=512, channels=channels, c_mults=[6], strides=[16],
-            latent_dim=256, transformer_depths=[transformer_depths],
-            conv_mapping=False, **common_kwargs,
-        )
-        self.decoder = SAMEDecoder(
-            out_channels=512, channels=channels, c_mults=[6], strides=[16],
-            latent_dim=256, transformer_depths=[transformer_depths], sinusoidal_blocks=[sinusoidal_blocks],
-            conv_mapping=decoder_conv_mapping, **common_kwargs,
-        )
-        self.bottleneck = SoftNormBottleneck(
-            dim=256, noise_augment_dim=0, noise_regularize=True,
-            auto_scale=True, freeze=True,
-            dtype=dtype, device=device,
-        )
-
-    @torch.no_grad()
-    def _pretransform_encode(self, x):
-        return self.pretransform.encode(x)
-
-    @torch.no_grad()
-    def _pretransform_decode(self, x):
-        return self.pretransform.decode(x)
-
-    def encode(self, x):
-        x = self._pretransform_encode(x)
-        x = self.encoder(x)
-        x = self.bottleneck.encode(x)
-        return x
-
-    def decode(self, x):
-        x = self.bottleneck.decode(x)
-        x = self.decoder(x)
-        x = self._pretransform_decode(x)
-        return x
--- a/comfy/ldm/hidream_o1/attention.py
+++ b/comfy/ldm/hidream_o1/attention.py
@ -1,41 +0,0 @@
-"""HiDream-O1 two-pass attention: tokens [0, ar_len) are causal, [ar_len, T)
-attend full K/V. Splitting Q at the boundary avoids the (B, 1, T, T) additive
-mask the general-purpose path would build (~500 MB at T~16K) and lets the
-gen half hit the user's preferred backend via optimized_attention.
-"""
-
-import torch
-
-import comfy.ops
-from comfy.ldm.modules.attention import optimized_attention
-
-
-def make_two_pass_attention(ar_len: int, transformer_options=None):
-    """Build a two-pass attention callable. AR pass uses SDPA-causal directly, gen pass routes through optimized_attention.
-    The AR pass goes through SDPA directand bypasses wrappers, it is only ~1% of T at typical edit sizes.
-    """
-
-    def two_pass_attention(q, k, v, heads, **kwargs):
-        B, H, T, D = q.shape
-
-        if T < k.shape[2]: # KV-cache hot path: Q is shorter than K/V (cached AR prefix is in K/V only), all fresh Q positions are in the gen region, single full-attention call
-            out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
-        elif ar_len >= T:
-            out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
-        elif ar_len <= 0:
-            out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
-        else:
-            out_ar = comfy.ops.scaled_dot_product_attention(
-                q[:, :, :ar_len], k[:, :, :ar_len], v[:, :, :ar_len],
-                attn_mask=None, dropout_p=0.0, is_causal=True,
-            )
-            out_gen = optimized_attention(
-                q[:, :, ar_len:], k, v, heads,
-                mask=None, skip_reshape=True, skip_output_reshape=True,
-                transformer_options=transformer_options,
-            )
-            out = torch.cat([out_ar, out_gen], dim=2)
-
-        return out.transpose(1, 2).reshape(B, T, H * D)
-
-    return two_pass_attention
--- a/comfy/ldm/hidream_o1/conditioning.py
+++ b/comfy/ldm/hidream_o1/conditioning.py
@ -1,230 +0,0 @@
-"""HiDream-O1 conditioning prep — ref-image dual path + extra_conds assembly.
-
-Each ref image goes through two paths: a 32x32 patchified stream concatenated
-to the noised target, and a Qwen3-VL ViT path producing tokens that scatter
-into input_ids at <|image_pad|> positions.
-"""
-
-from typing import List
-
-import torch
-
-import comfy.utils
-from comfy.text_encoders.qwen_vl import process_qwen2vl_images
-
-from .utils import (PATCH_SIZE, calculate_dimensions, cond_image_size, ref_max_size, resize_tensor)
-
-# Qwen3-VL ViT preprocessing constants (preprocessor_config.json).
-VIT_PATCH = 16
-VIT_MERGE = 2
-VIT_IMAGE_MEAN = [0.5, 0.5, 0.5]
-VIT_IMAGE_STD = [0.5, 0.5, 0.5]
-
-
-def prepare_ref_images(
-    ref_images: List[torch.Tensor],
-    target_h: int,
-    target_w: int,
-    device: torch.device,
-    dtype: torch.dtype,
-):
-    """Build the dual-path tensors for K reference images at (target_h, target_w).
-
-    Returns None for K=0, else a dict with ref_patches, ref_pixel_values,
-    ref_image_grid_thw, per_ref_vit_tokens, per_ref_patch_grids.
-    """
-    K = len(ref_images)
-    if K == 0:
-        return None
-    max_size = ref_max_size(max(target_h, target_w), K)
-    cis = cond_image_size(K)
-
-    refs_t = [img[0].clamp(0, 1).permute(2, 0, 1).unsqueeze(0).contiguous().float() for img in ref_images]
-    refs_t = [resize_tensor(t, max_size, PATCH_SIZE) for t in refs_t]
-
-    # 32-patch path.
-    ref_patches_per = []
-    per_ref_patch_grids = []
-    for t in refs_t:
-        t_norm = (t.squeeze(0) - 0.5) / 0.5  # (3, H, W) in [-1, 1]
-        h_p, w_p = t_norm.shape[-2] // PATCH_SIZE, t_norm.shape[-1] // PATCH_SIZE
-        per_ref_patch_grids.append((h_p, w_p))
-        patches = (
-            t_norm.reshape(3, h_p, PATCH_SIZE, w_p, PATCH_SIZE)
-            .permute(1, 3, 0, 2, 4)
-            .reshape(h_p * w_p, 3 * PATCH_SIZE * PATCH_SIZE)
-        )
-        ref_patches_per.append(patches)
-    ref_patches = torch.cat(ref_patches_per, dim=0).unsqueeze(0).to(device=device, dtype=dtype)
-
-    # ViT path.
-    refs_vlm_t = []
-    for t in refs_t:
-        _, _, h, w = t.shape
-        cond_w, cond_h = calculate_dimensions(cis, w / h)
-        cond_w = max(cond_w, VIT_PATCH * VIT_MERGE)
-        cond_h = max(cond_h, VIT_PATCH * VIT_MERGE)
-        refs_vlm_t.append(comfy.utils.common_upscale(t, cond_w, cond_h, "lanczos", "disabled"))
-
-    pv_list, grid_list, per_ref_vit_tokens = [], [], []
-    for t_v in refs_vlm_t:
-        pv, grid_thw = process_qwen2vl_images(
-            t_v.permute(0, 2, 3, 1),
-            min_pixels=0, max_pixels=10**12,
-            patch_size=VIT_PATCH, merge_size=VIT_MERGE,
-            image_mean=VIT_IMAGE_MEAN, image_std=VIT_IMAGE_STD,
-        )
-        grid_thw = grid_thw[0]
-        pv_list.append(pv.to(device=device, dtype=dtype))
-        grid_list.append(grid_thw.to(device=device))
-        # Post-merge token count = number of <|image_pad|> tokens this image expands to in input_ids.
-        gh, gw = int(grid_thw[1].item()), int(grid_thw[2].item())
-        per_ref_vit_tokens.append((gh // VIT_MERGE) * (gw // VIT_MERGE))
-
-    return {
-        "ref_patches": ref_patches,
-        "ref_pixel_values": torch.cat(pv_list, dim=0),
-        "ref_image_grid_thw": torch.stack(grid_list, dim=0),
-        "per_ref_vit_tokens": per_ref_vit_tokens,
-        "per_ref_patch_grids": per_ref_patch_grids,
-    }
-
-
-def build_ref_input_ids(
-    text_input_ids: torch.Tensor,
-    per_ref_vit_tokens: List[int],
-    image_token_id: int,
-    vision_start_id: int,
-    vision_end_id: int,
-):
-    """Splice [vision_start, image_pad*N, vision_end] blocks into input_ids
-    after the [im_start, user, \\n] prefix (matches original chat template).
-    """
-    ids = text_input_ids[0].tolist()
-    inserted = []
-    for n_pad in per_ref_vit_tokens:
-        inserted.extend([vision_start_id] + [image_token_id] * n_pad + [vision_end_id])
-    new_ids = ids[:3] + inserted + ids[3:]  # 3 = len([im_start, user, \n])
-    return torch.tensor([new_ids], dtype=text_input_ids.dtype, device=text_input_ids.device)
-
-
-def build_extra_conds(
-    text_input_ids: torch.Tensor,
-    noise: torch.Tensor,
-    ref_images: List[torch.Tensor] = None,
-    target_patch_size: int = 32,
-):
-    """Assemble all conditioning tensors for HiDreamO1Transformer.forward:
-    input_ids (with ref-vision tokens spliced in for the edit/IP path),
-    position_ids (MRoPE), token_types, vinput_mask, plus the ref
-    dual-path tensors when refs are provided.
-    """
-    from .utils import get_rope_index_fix_point
-    from comfy.text_encoders.hidream_o1 import (
-        IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
-    )
-
-    if text_input_ids.dim() == 1:
-        text_input_ids = text_input_ids.unsqueeze(0)
-    text_input_ids = text_input_ids.long().to(noise.device)
-    B = noise.shape[0]
-    if text_input_ids.shape[0] == 1 and B > 1:
-        text_input_ids = text_input_ids.expand(B, -1)
-
-    H, W = noise.shape[-2], noise.shape[-1]
-    h_p, w_p = H // target_patch_size, W // target_patch_size
-    image_len = h_p * w_p
-    image_grid_thw_tgt = torch.tensor(
-        [[1, h_p, w_p]], dtype=torch.long, device=text_input_ids.device,
-    )
-
-    out = {}
-    if ref_images:
-        ref = prepare_ref_images(ref_images, H, W, device=noise.device, dtype=noise.dtype)
-        text_input_ids = build_ref_input_ids(
-            text_input_ids, ref["per_ref_vit_tokens"],
-            IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
-        )
-        new_txt_len = text_input_ids.shape[1]
-
-        # Each ref's patchified stream gets a [vision_start, image_pad*N-1]
-        # block in the position-id stream after the noised target.
-        ref_grid_lengths = [hp * wp for (hp, wp) in ref["per_ref_patch_grids"]]
-        tgt_vision = torch.full((1, image_len), IMAGE_TOKEN_ID,
-                                dtype=text_input_ids.dtype, device=text_input_ids.device)
-        tgt_vision[:, 0] = VISION_START_ID
-        ref_vision_blocks = []
-        for rl in ref_grid_lengths:
-            blk = torch.full((1, rl), IMAGE_TOKEN_ID,
-                             dtype=text_input_ids.dtype, device=text_input_ids.device)
-            blk[:, 0] = VISION_START_ID
-            ref_vision_blocks.append(blk)
-        ref_vision_cat = torch.cat([tgt_vision] + ref_vision_blocks, dim=1)
-        input_ids_pad = torch.cat([text_input_ids, ref_vision_cat], dim=-1)
-        total_ref_patches_len = sum(ref_grid_lengths)
-        total_len = new_txt_len + image_len + total_ref_patches_len
-
-        # K (ViT, post-merge) + 1 (target) + K (ref-patches) image grids.
-        K = len(ref_images)
-        igthw_cond = ref["ref_image_grid_thw"].clone()
-        igthw_cond[:, 1] //= 2
-        igthw_cond[:, 2] //= 2
-        image_grid_thw_ref = torch.tensor(
-            [[1, hp, wp] for (hp, wp) in ref["per_ref_patch_grids"]],
-            dtype=torch.long, device=text_input_ids.device,
-        )
-        igthw_all = torch.cat([
-            igthw_cond.to(text_input_ids.device),
-            image_grid_thw_tgt,
-            image_grid_thw_ref,
-        ], dim=0)
-        position_ids, _ = get_rope_index_fix_point(
-            spatial_merge_size=1,
-            image_token_id=IMAGE_TOKEN_ID,
-            vision_start_token_id=VISION_START_ID,
-            input_ids=input_ids_pad, image_grid_thw=igthw_all,
-            attention_mask=None,
-            skip_vision_start_token=[0] * K + [1] + [1] * K,
-            fix_point=4096,
-        )
-
-        # tms + target_image + ref_patches are all gen.
-        tms_pos = new_txt_len - 1
-        ar_len = tms_pos
-        token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
-        token_types[:, tms_pos:] = 1
-        vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
-        vinput_mask[:, new_txt_len:] = True
-
-        # Leading batch dim sidesteps CONDRegular.process_cond's repeat_to_batch_size truncation
-        out["ref_pixel_values"] = ref["ref_pixel_values"].unsqueeze(0)
-        out["ref_image_grid_thw"] = ref["ref_image_grid_thw"].unsqueeze(0)
-        out["ref_patches"] = ref["ref_patches"]
-    else:
-        # T2I: text + noised target only, vision_start replaces the first image token
-        txt_len = text_input_ids.shape[1]
-        total_len = txt_len + image_len
-        vision_tokens = torch.full((B, image_len), IMAGE_TOKEN_ID,
-                                   dtype=text_input_ids.dtype, device=text_input_ids.device)
-        vision_tokens[:, 0] = VISION_START_ID
-        input_ids_pad = torch.cat([text_input_ids, vision_tokens], dim=-1)
-        position_ids, _ = get_rope_index_fix_point(
-            spatial_merge_size=1,
-            image_token_id=IMAGE_TOKEN_ID,
-            vision_start_token_id=VISION_START_ID,
-            input_ids=input_ids_pad, image_grid_thw=image_grid_thw_tgt,
-            attention_mask=None,
-            skip_vision_start_token=[1],
-        )
-        ar_len = txt_len - 1
-        token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
-        token_types[:, ar_len:] = 1
-        vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
-        vinput_mask[:, txt_len:] = True
-
-    out["input_ids"] = text_input_ids
-    out["position_ids"] = position_ids[:, 0].unsqueeze(0) # Collapse position_ids batch and add a leading dim so CONDRegular's batch-resize doesn't truncate the 3-axis MRoPE dim
-    out["token_types"] = token_types
-    out["vinput_mask"] = vinput_mask
-    out["ar_len"] = ar_len
-    return out
--- a/comfy/ldm/hidream_o1/model.py
+++ b/comfy/ldm/hidream_o1/model.py
@ -1,306 +0,0 @@
-"""HiDream-O1-Image transformer.
-
-Pixel-space DiT built on Qwen3-VL: the vision tower (Qwen35VisionModel)
-encodes ref images, the Qwen3-VL-8B decoder (Llama2_ with interleaved MRoPE)
-processes a unified text+image sequence, and 32x32 patch embed/unembed
-shims map raw RGB in and out of LLM hidden space. The Qwen3-VL deepstack
-mergers go unused — their weights are dropped at load.
-"""
-
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-import einops
-import torch
-import torch.nn as nn
-
-import comfy.patcher_extension
-from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
-from comfy.text_encoders.llama import Llama2_
-from comfy.text_encoders.qwen35 import Qwen35VisionModel
-
-from .attention import make_two_pass_attention
-
-
-IMAGE_TOKEN_ID = 151655   # Qwen3-VL <|image_pad|>
-TMS_TOKEN_ID = 151673     # HiDream-O1 <|tms_token|>
-PATCH_SIZE = 32
-
-
-@dataclass
-class HiDreamO1TextConfig:
-    """Qwen3-VL-8B text-decoder dims (matches public Qwen3-VL-8B-Instruct)."""
-    vocab_size: int = 151936
-    hidden_size: int = 4096
-    intermediate_size: int = 12288
-    num_hidden_layers: int = 36
-    num_attention_heads: int = 32
-    num_key_value_heads: int = 8
-    head_dim: int = 128
-    max_position_embeddings: int = 128000
-    rms_norm_eps: float = 1e-6
-    rope_theta: float = 5000000.0
-    rope_scale: Optional[float] = None
-    rope_dims: List[int] = field(default_factory=lambda: [24, 20, 20])
-    interleaved_mrope: bool = True
-    transformer_type: str = "llama"
-    rms_norm_add: bool = False
-    mlp_activation: str = "silu"
-    qkv_bias: bool = False
-    q_norm: str = "gemma3"
-    k_norm: str = "gemma3"
-    final_norm: bool = True
-    lm_head: bool = False
-    stop_tokens: List[int] = field(default_factory=lambda: [151643, 151645])
-
-
-QWEN3VL_VISION_DEFAULTS = dict(
-    hidden_size=1152,
-    num_heads=16,
-    intermediate_size=4304,
-    depth=27,
-    patch_size=16,
-    temporal_patch_size=2,
-    in_channels=3,
-    spatial_merge_size=2,
-    num_position_embeddings=2304,
-    deepstack_visual_indexes=(8, 16, 24),
-    out_hidden_size=4096,  # final merger projects directly into LLM hidden
-)
-
-
-class BottleneckPatchEmbed(nn.Module):
-    # 3072 -> 1024 -> 4096 (raw 32x32 RGB patch -> bottleneck -> LLM hidden).
-    def __init__(self, patch_size=32, in_chans=3, pca_dim=1024, embed_dim=4096, bias=True, device=None, dtype=None, ops=None):
-        super().__init__()
-        self.proj1 = ops.Linear(patch_size * patch_size * in_chans, pca_dim, bias=False, device=device, dtype=dtype)
-        self.proj2 = ops.Linear(pca_dim, embed_dim, bias=bias, device=device, dtype=dtype)
-
-    def forward(self, x):
-        return self.proj2(self.proj1(x))
-
-
-class FinalLayer(nn.Module):
-    # 4096 -> 3072 (LLM hidden -> flat pixel patch).
-    def __init__(self, hidden_size, patch_size=32, out_channels=3, device=None, dtype=None, ops=None):
-        super().__init__()
-        self.linear = ops.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        return self.linear(x)
-
-
-class HiDreamO1Transformer(nn.Module):
-    """HiDream-O1 unified pixel-level transformer."""
-
-    def __init__(self, image_model=None, dtype=None, device=None, operations=None,
-                 text_config_overrides=None, vision_config_overrides=None, **kwargs):
-        super().__init__()
-        self.dtype = dtype
-
-        text_cfg = HiDreamO1TextConfig(**(text_config_overrides or {}))
-        vision_cfg = dict(QWEN3VL_VISION_DEFAULTS)
-        if vision_config_overrides:
-            vision_cfg.update(vision_config_overrides)
-        vision_cfg["out_hidden_size"] = text_cfg.hidden_size
-
-        self.text_config = text_cfg
-        self.vision_config = vision_cfg
-        self.hidden_size = text_cfg.hidden_size
-        self.patch_size = PATCH_SIZE
-        self.in_channels = 3
-        self.tms_token_id = TMS_TOKEN_ID
-
-        self.visual = Qwen35VisionModel(vision_cfg, device=device, dtype=dtype, ops=operations)
-        self.language_model = Llama2_(text_cfg, device=device, dtype=dtype, ops=operations)
-        self.t_embedder1 = TimestepEmbedder(
-            text_cfg.hidden_size, device=device, dtype=dtype, operations=operations,
-        )
-        self.x_embedder = BottleneckPatchEmbed(
-            patch_size=self.patch_size, in_chans=self.in_channels,
-            pca_dim=text_cfg.hidden_size // 4, embed_dim=text_cfg.hidden_size,
-            bias=True, device=device, dtype=dtype, ops=operations,
-        )
-        self.final_layer2 = FinalLayer(
-            text_cfg.hidden_size, patch_size=self.patch_size,
-            out_channels=self.in_channels, device=device, dtype=dtype, ops=operations,
-        )
-
-        self._visual_cache = None
-        self._kv_cache_entries = []
-
-    def clear_kv_cache(self):
-        self._kv_cache_entries = []
-        self._visual_cache = None
-
-    def forward(self, x, timesteps, context=None, transformer_options={}, **kwargs):
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timesteps, context, transformer_options, **kwargs)
-
-    def _forward(self, x, timesteps, context=None, transformer_options={}, input_ids=None, attention_mask=None, position_ids=None,
-                 vinput_mask=None, ar_len=None, ref_pixel_values=None, ref_image_grid_thw=None, ref_patches=None, **kwargs):
-        """Returns flow-match velocity (x - x_pred) / sigma"""
-
-        if input_ids is None or position_ids is None:
-            raise ValueError("HiDreamO1Transformer requires input_ids and position_ids in conditioning")
-
-        B, _, H, W = x.shape
-        h_p, w_p = H // self.patch_size, W // self.patch_size
-        tgt_image_len = h_p * w_p
-
-        z = einops.rearrange(
-            x, 'B C (H p1) (W p2) -> B (H W) (C p1 p2)',
-            p1=self.patch_size, p2=self.patch_size,
-        )
-        vinputs = torch.cat([z, ref_patches.to(z.dtype)], dim=1) if ref_patches is not None else z
-
-        inputs_embeds = self.language_model.embed_tokens(input_ids).to(x.dtype)
-
-        if ref_pixel_values is not None and ref_image_grid_thw is not None:
-            # ViT output is constant across sampling steps within a generation
-            # identity-key by the input tensor so refs don't recompute every step.
-            cached = self._visual_cache
-            if cached is not None and cached[0] is ref_pixel_values:
-                image_embeds = cached[1]
-            else:
-                ref_pv = ref_pixel_values.to(inputs_embeds.device)
-                ref_grid = ref_image_grid_thw.to(inputs_embeds.device).long()
-                # extra_conds wraps with a leading batch dim; refs are model-level so [0] always recovers them.
-                if ref_pv.dim() == 3:
-                    ref_pv = ref_pv[0]
-                if ref_grid.dim() == 3:
-                    ref_grid = ref_grid[0]
-                image_embeds = self.visual(ref_pv, ref_grid).to(inputs_embeds.dtype)
-                self._visual_cache = (ref_pixel_values, image_embeds)
-            # image_pad positions identical across batch (input_ids shared cond/uncond).
-            image_idx = (input_ids[0] == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0]
-            if image_idx.shape[0] != image_embeds.shape[0]:
-                raise ValueError(
-                    f"Image-token count {image_idx.shape[0]} != ViT output count "
-                    f"{image_embeds.shape[0]}; check tokenizer/processor alignment."
-                )
-            inputs_embeds[:, image_idx] = image_embeds.unsqueeze(0).expand(B, -1, -1)
-
-        sigma = timesteps.float() / 1000.0
-        t_pixeldit = 1.0 - sigma
-        t_emb = self.t_embedder1(t_pixeldit * 1000, inputs_embeds.dtype)
-        tms_mask_3d = (input_ids == self.tms_token_id).unsqueeze(-1).expand_as(inputs_embeds)
-        inputs_embeds = torch.where(tms_mask_3d, t_emb.unsqueeze(1).expand_as(inputs_embeds), inputs_embeds)
-
-        vinputs_embedded = self.x_embedder(vinputs.to(inputs_embeds.dtype))
-        inputs_embeds = torch.cat([inputs_embeds, vinputs_embedded], dim=1)
-
-        # extra_conds stores position_ids as (1, 3, T); process_cond repeats dim 0 to B. Take row 0.
-        freqs_cis = self.language_model.compute_freqs_cis(position_ids[0].to(x.device), x.device)
-        freqs_cis = tuple(t.to(x.dtype) for t in freqs_cis)
-
-        two_pass_attn = make_two_pass_attention(ar_len, transformer_options=transformer_options)
-        patches_replace = transformer_options.get("patches_replace", {})
-        blocks_replace = patches_replace.get("dit", {})
-        transformer_options["total_blocks"] = len(self.language_model.layers)
-        transformer_options["block_type"] = "double"
-
-        # Cache prefix K/V across steps. Key includes input_ids (prompt), ref_id
-        # (refs scatter into inputs_embeds), and position_ids (RoPE baked into cached K).
-        can_cache = not blocks_replace and ar_len > 0
-        cache_len = ar_len if can_cache else 0
-        ref_id = id(ref_pixel_values) if ref_pixel_values is not None else None
-        pos_ids_key = position_ids[..., :cache_len] if can_cache else position_ids
-        cache_entries = self._kv_cache_entries
-        # Drop stale entries from a previous device (model was unloaded and reloaded).
-        if cache_entries and cache_entries[0]["input_ids"].device != input_ids.device:
-            cache_entries = []
-            self._kv_cache_entries = []
-        kv_cache = None
-        if can_cache:
-            for entry in cache_entries:
-                ck = entry["input_ids"]
-                ep = entry["position_ids"]
-                if (entry["cache_len"] == cache_len
-                        and ck.shape == input_ids.shape and torch.equal(ck, input_ids)
-                        and entry["ref_id"] == ref_id
-                        and ep.shape == pos_ids_key.shape and torch.equal(ep, pos_ids_key)):
-                    kv_cache = entry
-                    break
-
-        if kv_cache is not None:
-            # Hot path: project Q/K/V only for fresh positions; past_key_value prepends cached AR K/V.
-            hidden_states = inputs_embeds[:, cache_len:]
-            sliced_freqs = tuple(t[..., cache_len:, :] for t in freqs_cis)
-            for i, layer in enumerate(self.language_model.layers):
-                transformer_options["block_index"] = i
-                K_i, V_i = kv_cache["kv"][i]
-                hidden_states, _ = layer(
-                    x=hidden_states, attention_mask=None, freqs_cis=sliced_freqs, optimized_attention=two_pass_attn,
-                    past_key_value=(K_i, V_i, cache_len),
-                )
-        else:
-            # Cold path: run full sequence; if cacheable, snapshot K/V at AR positions.
-            snapshots = [] if can_cache else None
-            past_kv_cold = () if can_cache else None
-            hidden_states = inputs_embeds
-            for i, layer in enumerate(self.language_model.layers):
-                transformer_options["block_index"] = i
-                if ("double_block", i) in blocks_replace:
-                    def block_wrap(args, _layer=layer):
-                        out = {}
-                        out["x"], _ = _layer(
-                            x=args["x"], attention_mask=args.get("attention_mask"),
-                            freqs_cis=args["freqs_cis"], optimized_attention=args["optimized_attention"],
-                            past_key_value=None,
-                        )
-                        return out
-                    out = blocks_replace[("double_block", i)](
-                        {"x": hidden_states, "attention_mask": None,
-                         "freqs_cis": freqs_cis, "optimized_attention": two_pass_attn,
-                         "transformer_options": transformer_options},
-                        {"original_block": block_wrap},
-                    )
-                    hidden_states = out["x"]
-                else:
-                    hidden_states, present_kv = layer(
-                        x=hidden_states, attention_mask=None,
-                        freqs_cis=freqs_cis, optimized_attention=two_pass_attn,
-                        past_key_value=past_kv_cold,
-                    )
-                    if snapshots is not None:
-                        K, V, _ = present_kv
-                        snapshots.append((K[:, :, :cache_len].contiguous(),
-                                          V[:, :, :cache_len].contiguous()))
-            if snapshots is not None:
-                # Cap at 2 entries (cond + uncond). Multi-cond workflows LRU-evict.
-                new_entry = {
-                    "input_ids": input_ids.clone(),
-                    "cache_len": cache_len,
-                    "kv": snapshots,
-                    "ref_id": ref_id,
-                    "position_ids": pos_ids_key.clone(),
-                }
-                self._kv_cache_entries = (cache_entries + [new_entry])[-2:]
-
-        if self.language_model.norm is not None:
-            hidden_states = self.language_model.norm(hidden_states)
-
-        # Slice target-image positions before the final projection so the Linear only runs on tgt_image_len tokens.
-        # In the hot path hidden_states starts at original position cache_len, so masks/indices shift by cache_len.
-        sliced_offset = cache_len if kv_cache is not None else 0
-        if vinput_mask is not None:
-            vmask = vinput_mask.to(x.device).bool()
-            if sliced_offset > 0:
-                vmask = vmask[:, sliced_offset:]
-            target_hidden = hidden_states[vmask].view(B, -1, hidden_states.shape[-1])[:, :tgt_image_len]
-        else:
-            txt_seq_len = input_ids.shape[1]
-            start = txt_seq_len - sliced_offset
-            target_hidden = hidden_states[:, start:start + tgt_image_len]
-        x_pred_tgt = self.final_layer2(target_hidden)
-
-        # fp32 final subtraction, bf16 here noticeably degrades samples.
-        x_pred_img = einops.rearrange(
-            x_pred_tgt, 'B (H W) (C p1 p2) -> B C (H p1) (W p2)',
-            H=h_p, W=w_p, p1=self.patch_size, p2=self.patch_size,
-        )
-        return (x.float() - x_pred_img.float()) / sigma.view(B, 1, 1, 1).clamp_min(1e-3)
--- a/comfy/ldm/hidream_o1/utils.py
+++ b/comfy/ldm/hidream_o1/utils.py
@ -1,173 +0,0 @@
-"""HiDream-O1 input-prep helpers: image/resolution math and unified-sequence
-RoPE position-id assembly. The fix_point offset in get_rope_index_fix_point
-lets the target image and patchified ref images share spatial RoPE positions
-despite living at different sequence indices — same 2D image plane.
-"""
-
-import math
-from typing import Optional
-
-import torch
-
-
-PATCH_SIZE = 32
-CONDITION_IMAGE_SIZE = 384  # ViT-side base size for ref images
-
-
-def resize_tensor(img_t, image_size, patch_size=16):
-    """img_t: (1, 3, H, W) float [0, 1]. Fit to image_size**2 area, patch-aligned, center-cropped."""
-
-    while min(img_t.shape[-2], img_t.shape[-1]) >= 2 * image_size: # Pre-halves with 2x2 box averaging while the image is still very large
-        img_t = torch.nn.functional.avg_pool2d(img_t, kernel_size=2, stride=2)
-
-    _, _, height, width = img_t.shape
-    m = patch_size
-    s_max = image_size * image_size
-    scale = math.sqrt(s_max / (width * height))
-
-    candidates = [
-        (round(width * scale) // m * m, round(height * scale) // m * m),
-        (round(width * scale) // m * m, math.floor(height * scale) // m * m),
-        (math.floor(width * scale) // m * m, round(height * scale) // m * m),
-        (math.floor(width * scale) // m * m, math.floor(height * scale) // m * m),
-    ]
-    candidates = sorted(candidates, key=lambda x: x[0] * x[1], reverse=True)
-    new_size = candidates[-1]
-    for c in candidates:
-        if c[0] * c[1] <= s_max:
-            new_size = c
-            break
-
-    new_w, new_h = new_size
-    s1 = width / new_w
-    s2 = height / new_h
-    if s1 < s2:
-        resize_w, resize_h = new_w, round(height / s1)
-    else:
-        resize_w, resize_h = round(width / s2), new_h
-    img_t = torch.nn.functional.interpolate(img_t, size=(resize_h, resize_w), mode="bicubic")
-    top = (resize_h - new_h) // 2
-    left = (resize_w - new_w) // 2
-    return img_t[..., top:top + new_h, left:left + new_w]
-
-
-def calculate_dimensions(max_size, ratio):
-    """(W, H) for an aspect ratio fitting in max_size**2 area, 32-aligned."""
-    width = math.sqrt(max_size * max_size * ratio)
-    height = width / ratio
-    width = int(width / 32) * 32
-    height = int(height / 32) * 32
-    return width, height
-
-
-def ref_max_size(target_max_dim, k):
-    """K-dependent ref-image max dim before patchifying."""
-    if k == 1:
-        return target_max_dim
-    if k == 2:
-        return target_max_dim * 48 // 64
-    if k <= 4:
-        return target_max_dim // 2
-    if k <= 8:
-        return target_max_dim * 24 // 64
-    return target_max_dim // 4
-
-
-def cond_image_size(k):
-    """K-dependent ViT-side image size."""
-    if k <= 4:
-        return CONDITION_IMAGE_SIZE
-    if k <= 8:
-        return CONDITION_IMAGE_SIZE * 48 // 64
-    return CONDITION_IMAGE_SIZE // 2
-
-
-def get_rope_index_fix_point(
-    spatial_merge_size: int,
-    image_token_id: int,
-    vision_start_token_id: int,
-    input_ids: Optional[torch.LongTensor] = None,
-    image_grid_thw: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    skip_vision_start_token=None,
-    fix_point: int = 4096,
-):
-    mrope_position_deltas = []
-    if input_ids is not None and image_grid_thw is not None:
-        total_input_ids = input_ids
-        if attention_mask is None:
-            attention_mask = torch.ones_like(total_input_ids)
-        position_ids = torch.ones(
-            3, input_ids.shape[0], input_ids.shape[1],
-            dtype=input_ids.dtype, device=input_ids.device,
-        )
-        attention_mask = attention_mask.to(total_input_ids.device)
-        for i, input_ids_b in enumerate(total_input_ids):
-            fp = fix_point
-            image_index = 0
-            input_ids_b = input_ids_b[attention_mask[i] == 1]
-            vision_start_indices = torch.argwhere(input_ids_b == vision_start_token_id).squeeze(1)
-            vision_tokens = input_ids_b[vision_start_indices + 1]
-            image_nums = (vision_tokens == image_token_id).sum()
-            input_tokens = input_ids_b.tolist()
-            llm_pos_ids_list = []
-            st = 0
-            remain_images = image_nums
-            for _ in range(image_nums):
-                if image_token_id in input_tokens and remain_images > 0:
-                    ed = input_tokens.index(image_token_id, st)
-                else:
-                    ed = len(input_tokens) + 1
-                t = image_grid_thw[image_index][0]
-                h = image_grid_thw[image_index][1]
-                w = image_grid_thw[image_index][2]
-                image_index += 1
-                remain_images -= 1
-                llm_grid_t = t.item()
-                llm_grid_h = h.item() // spatial_merge_size
-                llm_grid_w = w.item() // spatial_merge_size
-                text_len = ed - st
-                text_len -= skip_vision_start_token[image_index - 1]
-                text_len = max(0, text_len)
-                st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-                llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-                t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
-                h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
-                w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
-
-                if skip_vision_start_token[image_index - 1]:
-                    if fp > 0:
-                        fp = fp - st_idx
-                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + fp + st_idx)
-                    fp = 0
-                else:
-                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
-                st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-
-            if st < len(input_tokens):
-                st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-                text_len = len(input_tokens) - st
-                llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
-
-            llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-            position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
-            mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
-        mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
-        return position_ids, mrope_position_deltas
-
-    if attention_mask is not None:
-        position_ids = attention_mask.long().cumsum(-1) - 1
-        position_ids.masked_fill_(attention_mask == 0, 1)
-        position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
-        max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
-        mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
-    else:
-        position_ids = (
-            torch.arange(input_ids.shape[1], device=input_ids.device)
-            .view(1, 1, -1).expand(3, input_ids.shape[0], -1)
-        )
-        mrope_position_deltas = torch.zeros(
-            [input_ids.shape[0], 1], device=input_ids.device, dtype=input_ids.dtype,
-        )
-    return position_ids, mrope_position_deltas
--- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
+++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
@ -328,7 +328,7 @@ class CrossAttention(nn.Module):
        kv = torch.cat((k, v), dim=-1)
        split_size = kv.shape[-1] // self.num_heads // 2

-        kv = kv.view(b, -1, self.num_heads, split_size * 2)
+        kv = kv.view(1, -1, self.num_heads, split_size * 2)
        k, v = torch.split(kv, split_size, dim=-1)

        q = q.view(b, s1, self.num_heads, self.head_dim)
@ -398,7 +398,7 @@ class Attention(nn.Module):
        qkv_combined = torch.cat((query, key, value), dim=-1)
        split_size = qkv_combined.shape[-1] // self.num_heads // 3

-        qkv = qkv_combined.view(B, -1, self.num_heads, split_size * 3)
+        qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3)
        query, key, value = torch.split(qkv, split_size, dim=-1)

        query = query.reshape(B, N, self.num_heads, self.head_dim)
@ -607,9 +607,9 @@ class HunYuanDiTPlain(nn.Module):
    def forward(self, x, t, context, transformer_options = {}, **kwargs):

        x = x.movedim(-1, -2)
-        if context.shape[0] >= 2:
-            uncond_emb, cond_emb = context.chunk(2, dim = 0)
-            context = torch.cat([cond_emb, uncond_emb], dim = 0)
+        uncond_emb, cond_emb = context.chunk(2, dim = 0)
+
+        context = torch.cat([cond_emb, uncond_emb], dim = 0)
        main_condition = context

        t = 1.0 - t
@ -657,8 +657,5 @@ class HunYuanDiTPlain(nn.Module):
        output = self.final_layer(combined)
        output =  output.movedim(-2, -1) * (-1.0)

-        if output.shape[0] >= 2:
-            cond_emb, uncond_emb = output.chunk(2, dim = 0)
-            return torch.cat([uncond_emb, cond_emb])
-        else:
-            return output
+        cond_emb, uncond_emb = output.chunk(2, dim = 0)
+        return torch.cat([uncond_emb, cond_emb])
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@ -16,31 +16,31 @@ from comfy.ldm.lightricks.model import (
 from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
 from comfy.ldm.lightricks.embeddings_connector import Embeddings1DConnector
 import comfy.ldm.common_dit
-import comfy.model_prefetch

 class CompressedTimestep:
    """Store video timestep embeddings in compressed form using per-frame indexing."""
    __slots__ = ('data', 'batch_size', 'num_frames', 'patches_per_frame', 'feature_dim')

-    def __init__(self, tensor: torch.Tensor, patches_per_frame: int, per_frame: bool = False):
+    def __init__(self, tensor: torch.Tensor, patches_per_frame: int):
        """
-        tensor: [batch, num_tokens, feature_dim] (per-token, default) or
-                [batch, num_frames, feature_dim] (per_frame=True, already compressed).
-        patches_per_frame: spatial patches per frame; pass None to disable compression.
+        tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame
+        patches_per_frame: Number of spatial patches per frame (height * width in latent space), or None to disable compression
        """
-        self.batch_size, n, self.feature_dim = tensor.shape
-        if per_frame:
+        self.batch_size, num_tokens, self.feature_dim = tensor.shape
+
+        # Check if compression is valid (num_tokens must be divisible by patches_per_frame)
+        if patches_per_frame is not None and num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
            self.patches_per_frame = patches_per_frame
-            self.num_frames = n
-            self.data = tensor
-        elif patches_per_frame is not None and n >= patches_per_frame and n % patches_per_frame == 0:
-            self.patches_per_frame = patches_per_frame
-            self.num_frames = n // patches_per_frame
-            # All patches in a frame are identical — keep only the first.
-            self.data = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)[:, :, 0, :].contiguous()
+            self.num_frames = num_tokens // patches_per_frame
+
+            # Reshape to [batch, frames, patches_per_frame, feature_dim] and store one value per frame
+            # All patches in a frame are identical, so we only keep the first one
+            reshaped = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)
+            self.data = reshaped[:, :, 0, :].contiguous()  # [batch, frames, feature_dim]
        else:
+            # Not divisible or too small - store directly without compression
            self.patches_per_frame = 1
-            self.num_frames = n
+            self.num_frames = num_tokens
            self.data = tensor

    def expand(self):
@ -715,35 +715,32 @@ class LTXAVModel(LTXVModel):

    def _prepare_timestep(self, timestep, batch_size, hidden_dtype, **kwargs):
        """Prepare timestep embeddings."""
+        # TODO: some code reuse is needed here.
        grid_mask = kwargs.get("grid_mask", None)
-        orig_shape = kwargs.get("orig_shape")
-        has_spatial_mask = kwargs.get("has_spatial_mask", None)
-        v_patches_per_frame = None
-        if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
-            v_patches_per_frame = orig_shape[3] * orig_shape[4]
+        if grid_mask is not None:
+            timestep = timestep[:, grid_mask]

-        # Used by compute_prompt_timestep and the audio cross-attention paths.
-        timestep_scaled = (timestep[:, grid_mask] if grid_mask is not None else timestep) * self.timestep_scale_multiplier
-
-        # When patches in a frame share a timestep (no spatial mask), project one row per frame instead of one per token
-        per_frame_path = v_patches_per_frame is not None and (timestep.numel() // batch_size) % v_patches_per_frame == 0
-        if per_frame_path:
-            per_frame = timestep.reshape(batch_size, -1, v_patches_per_frame)[:, :, 0]
-            if grid_mask is not None:
-                # All-or-nothing per frame when has_spatial_mask=False.
-                per_frame = per_frame[:, grid_mask[::v_patches_per_frame]]
-            ts_input = per_frame * self.timestep_scale_multiplier
-        else:
-            ts_input = timestep_scaled
+        timestep_scaled = timestep * self.timestep_scale_multiplier

        v_timestep, v_embedded_timestep = self.adaln_single(
-            ts_input.flatten(),
+            timestep_scaled.flatten(),
            {"resolution": None, "aspect_ratio": None},
            batch_size=batch_size,
            hidden_dtype=hidden_dtype,
        )
-        v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame, per_frame=per_frame_path)
-        v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame, per_frame=per_frame_path)
+
+        # Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width]
+        # Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width
+        orig_shape = kwargs.get("orig_shape")
+        has_spatial_mask = kwargs.get("has_spatial_mask", None)
+        v_patches_per_frame = None
+        if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
+            # orig_shape[3] = height, orig_shape[4] = width (in latent space)
+            v_patches_per_frame = orig_shape[3] * orig_shape[4]
+
+        # Reshape to [batch_size, num_tokens, dim] and compress for storage
+        v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame)
+        v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame)

        v_prompt_timestep = compute_prompt_timestep(
            self.prompt_adaln_single, timestep_scaled, batch_size, hidden_dtype
@ -910,11 +907,9 @@ class LTXAVModel(LTXVModel):
        """Process transformer blocks for LTXAV."""
        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
-        prefetch_queue = comfy.model_prefetch.make_prefetch_queue(list(self.transformer_blocks), vx.device, transformer_options)

        # Process transformer blocks
        for i, block in enumerate(self.transformer_blocks):
-            comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, block)
            if ("double_block", i) in blocks_replace:

                def block_wrap(args):
@ -987,8 +982,6 @@ class LTXAVModel(LTXVModel):
                    a_prompt_timestep=a_prompt_timestep,
                )

-        comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, None)
-
        return [vx, ax]

    def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@ -358,61 +358,6 @@ def apply_split_rotary_emb(input_tensor, cos, sin):
    return output.swapaxes(1, 2).reshape(B, T, -1) if needs_reshape else output


-class GuideAttentionMask:
-    """Holds the two per-group masks for LTXV guide self-attention.
-    _attention_with_guide_mask splits queries into noisy and tracked-guide
-    groups, so the largest mask is (1, 1, tracked_count, T).
-    """
-    __slots__ = ("guide_start", "tracked_count", "noisy_mask", "tracked_mask")
-
-    def __init__(self, total_tokens, guide_start, tracked_count, tracked_weights):
-        device = tracked_weights.device
-        dtype = tracked_weights.dtype
-        finfo = torch.finfo(dtype)
-
-        pos = tracked_weights > 0
-        log_w = torch.full_like(tracked_weights, finfo.min)
-        log_w[pos] = torch.log(tracked_weights[pos].clamp(min=finfo.tiny))
-
-        self.guide_start = guide_start
-        self.tracked_count = tracked_count
-
-        self.noisy_mask = torch.zeros((1, 1, 1, total_tokens), device=device, dtype=dtype)
-        self.noisy_mask[:, :, :, guide_start:guide_start + tracked_count] = log_w.view(1, 1, 1, -1)
-
-        self.tracked_mask = torch.zeros((1, 1, tracked_count, total_tokens), device=device, dtype=dtype)
-        self.tracked_mask[:, :, :, :guide_start] = log_w.view(1, 1, -1, 1)
-
-
-def _attention_with_guide_mask(q, k, v, heads, guide_mask, attn_precision, transformer_options):
-    """Apply the guide mask by partitioning Q into noisy and tracked-guide
-    groups, so each group needs only its own sub-mask. Avoids materializing
-    the (1,1,T,T) dense mask.
-    """
-    guide_start = guide_mask.guide_start
-    tracked_end = guide_start + guide_mask.tracked_count
-
-    out = torch.empty_like(q)
-
-    if guide_start > 0: # In practice currently guides are always after noise, guard for safety if this changes.
-        out[:, :guide_start, :] = comfy.ldm.modules.attention.optimized_attention(
-            q[:, :guide_start, :], k, v, heads, mask=guide_mask.noisy_mask,
-            attn_precision=attn_precision, transformer_options=transformer_options,
-            low_precision_attention=False, # sageattn mask support is unreliable
-        )
-    out[:, guide_start:tracked_end, :] = comfy.ldm.modules.attention.optimized_attention(
-        q[:, guide_start:tracked_end, :], k, v, heads, mask=guide_mask.tracked_mask,
-        attn_precision=attn_precision, transformer_options=transformer_options,
-        low_precision_attention=False,
-    )
-    if tracked_end < q.shape[1]: # Every guide token is tracked, and nothing comes after them, guard for safety if this changes.
-        out[:, tracked_end:, :] = comfy.ldm.modules.attention.optimized_attention(
-            q[:, tracked_end:, :], k, v, heads,
-            attn_precision=attn_precision, transformer_options=transformer_options,
-        )
-    return out
-
-
 class CrossAttention(nn.Module):
    def __init__(
        self,
@ -467,10 +412,8 @@ class CrossAttention(nn.Module):

        if mask is None:
            out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
-        elif isinstance(mask, GuideAttentionMask):
-            out = _attention_with_guide_mask(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
        else:
-            out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, mask=mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
+            out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)

        # Apply per-head gating if enabled
        if self.to_gate_logits is not None:
@ -1120,9 +1063,7 @@ class LTXVModel(LTXBaseModel):
                additional_args["resolved_guide_entries"] = resolved_entries

            keyframe_idxs = keyframe_idxs[..., kf_grid_mask, :]
-
-            if keyframe_idxs.shape[2] > 0: # Guard for the case of no keyframes surviving
-                pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs
+            pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs

            # Total surviving guide tokens (all guides)
            additional_args["num_guide_tokens"] = keyframe_idxs.shape[2]
@ -1158,12 +1099,12 @@ class LTXVModel(LTXBaseModel):
        if not resolved_entries:
            return None

-        # strength != 1.0 means we want to either attenuate (< 1) or amplify (> 1) guide attention.
-        needs_mask = any(
-            e["strength"] != 1.0 or e.get("pixel_mask") is not None
+        # Check if any attenuation is actually needed
+        needs_attenuation = any(
+            e["strength"] < 1.0 or e.get("pixel_mask") is not None
            for e in resolved_entries
        )
-        if not needs_mask:
+        if not needs_attenuation:
            return None

        # Build per-guide-token weights for all tracked guide tokens.
@ -1218,11 +1159,16 @@ class LTXVModel(LTXBaseModel):
        # Concatenate per-token weights for all tracked guides
        tracked_weights = torch.cat(all_weights, dim=1)  # (1, total_tracked)

-        # Skip when every weight is exactly 1.0 (additive bias would be 0).
-        if (tracked_weights == 1.0).all():
+        # Check if any weight is actually < 1.0 (otherwise no attenuation needed)
+        if (tracked_weights >= 1.0).all():
            return None

-        return GuideAttentionMask(total_tokens, guide_start, total_tracked, tracked_weights)
+        # Build the mask: guide tokens are at the end of the sequence.
+        # Tracked guides come first (in order), untracked follow.
+        return self._build_self_attention_mask(
+            total_tokens, num_guide_tokens, total_tracked,
+            tracked_weights, guide_start, device, dtype,
+        )

    @staticmethod
    def _downsample_mask_to_latent(mask, f_lat, h_lat, w_lat):
@ -1288,6 +1234,45 @@ class LTXVModel(LTXBaseModel):

        return rearrange(latent_mask, "b 1 f h w -> b (f h w)")

+    @staticmethod
+    def _build_self_attention_mask(total_tokens, num_guide_tokens, tracked_count,
+                                    tracked_weights, guide_start, device, dtype):
+        """Build a log-space additive self-attention bias mask.
+
+        Attenuates attention between noisy tokens and tracked guide tokens.
+        Untracked guide tokens (at the end of the guide portion) keep full attention.
+
+        Args:
+            total_tokens: Total sequence length.
+            num_guide_tokens: Total guide tokens (all guides) at end of sequence.
+            tracked_count: Number of tracked guide tokens (first in the guide portion).
+            tracked_weights: (1, tracked_count) tensor, values in [0, 1].
+            guide_start: Index where guide tokens begin in the sequence.
+            device: Target device.
+            dtype: Target dtype.
+
+        Returns:
+            (1, 1, total_tokens, total_tokens) additive bias mask.
+            0.0 = full attention, negative = attenuated, finfo.min = effectively fully masked.
+        """
+        finfo = torch.finfo(dtype)
+        mask = torch.zeros((1, 1, total_tokens, total_tokens), device=device, dtype=dtype)
+        tracked_end = guide_start + tracked_count
+
+        # Convert weights to log-space bias
+        w = tracked_weights.to(device=device, dtype=dtype)  # (1, tracked_count)
+        log_w = torch.full_like(w, finfo.min)
+        positive_mask = w > 0
+        if positive_mask.any():
+            log_w[positive_mask] = torch.log(w[positive_mask].clamp(min=finfo.tiny))
+
+        # noisy → tracked guides: each noisy row gets the same per-guide weight
+        mask[:, :, :guide_start, guide_start:tracked_end] = log_w.view(1, 1, 1, -1)
+        # tracked guides → noisy: each guide row broadcasts its weight across noisy cols
+        mask[:, :, guide_start:tracked_end, :guide_start] = log_w.view(1, 1, -1, 1)
+
+        return mask
+
    def _process_transformer_blocks(self, x, context, attention_mask, timestep, pe, transformer_options={}, self_attention_mask=None, **kwargs):
        """Process transformer blocks for LTXV."""
        patches_replace = transformer_options.get("patches_replace", {})
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@ -14,8 +14,6 @@ from .sub_quadratic_attention import efficient_dot_product_attention

 from comfy import model_management

-TORCH_HAS_GQA = model_management.torch_version_numeric >= (2, 5)
-
 if model_management.xformers_enabled():
    import xformers
    import xformers.ops
@ -152,12 +150,7 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
        b, _, dim_head = q.shape
        dim_head //= heads

-    if kwargs.get("enable_gqa", False) and q.shape[-3] != k.shape[-3]:
-        n_rep = q.shape[-3] // k.shape[-3]
-        k = k.repeat_interleave(n_rep, dim=-3)
-        v = v.repeat_interleave(n_rep, dim=-3)
-
-    scale = kwargs.get("scale", dim_head ** -0.5)
+    scale = dim_head ** -0.5

    h = heads
    if skip_reshape:
@ -226,10 +219,6 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
        b, _, dim_head = query.shape
        dim_head //= heads

-    if "scale" in kwargs:
-        # Pre-scale query to match requested scale (cancels internal 1/sqrt(dim_head))
-        query = query * (kwargs["scale"] * dim_head ** 0.5)
-
    if skip_reshape:
        query = query.reshape(b * heads, -1, dim_head)
        value = value.reshape(b * heads, -1, dim_head)
@ -301,7 +290,7 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
        b, _, dim_head = q.shape
        dim_head //= heads

-    scale = kwargs.get("scale", dim_head ** -0.5)
+    scale = dim_head ** -0.5

    if skip_reshape:
         q, k, v = map(
@ -511,13 +500,8 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
        if mask.ndim == 3:
            mask = mask.unsqueeze(1)

-    # Pass through extra SDPA kwargs (scale, enable_gqa) if provided
-    # enable_gqa requires PyTorch 2.5+; older versions use manual KV expansion above
-    sdpa_keys = ("scale", "enable_gqa") if TORCH_HAS_GQA else ("scale",)
-    sdpa_extra = {k: v for k, v in kwargs.items() if k in sdpa_keys}
-
    if SDP_BATCH_LIMIT >= b:
-        out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False, **sdpa_extra)
+        out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
        if not skip_output_reshape:
            out = (
                out.transpose(1, 2).reshape(b, -1, heads * dim_head)
@ -535,7 +519,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
                k[i : i + SDP_BATCH_LIMIT],
                v[i : i + SDP_BATCH_LIMIT],
                attn_mask=m,
-                dropout_p=0.0, is_causal=False, **sdpa_extra
+                dropout_p=0.0, is_causal=False
            ).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head)
    return out

--- a/comfy/ldm/modules/diffusionmodules/util.py
+++ b/comfy/ldm/modules/diffusionmodules/util.py
@ -140,7 +140,7 @@ def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    alphas = alphacums[ddim_timesteps]
    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())

-    # according to the formula provided in https://arxiv.org/abs/2010.02502
+    # according the the formula provided in https://arxiv.org/abs/2010.02502
    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
    if verbose:
        logging.info(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
--- a/comfy/ldm/moge/geometry.py
+++ b/comfy/ldm/moge/geometry.py
@ -1,189 +0,0 @@
-"""Pure-torch + scipy geometry helpers for MoGe inference and mesh export."""
-
-from __future__ import annotations
-
-from typing import Optional, Tuple
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-
-from scipy.optimize import least_squares
-
-def normalized_view_plane_uv(width: int, height: int, aspect_ratio: Optional[float] = None,
-                             dtype: Optional[torch.dtype] = None, device: Optional[torch.device] = None) -> torch.Tensor:
-    """Normalized view-plane UV coordinates with corners at +/-(W, H)/diagonal."""
-    if aspect_ratio is None:
-        aspect_ratio = width / height
-    span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
-    span_y = 1.0 / (1 + aspect_ratio ** 2) ** 0.5
-    u = torch.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype, device=device)
-    v = torch.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype, device=device)
-    u, v = torch.meshgrid(u, v, indexing="xy")
-    return torch.stack([u, v], dim=-1)
-
-
-def intrinsics_from_focal_center(fx: torch.Tensor, fy: torch.Tensor, cx: torch.Tensor, cy: torch.Tensor) -> torch.Tensor:
-    """Assemble (..., 3, 3) intrinsics from broadcastable fx, fy, cx, cy."""
-    fx, fy, cx, cy = [torch.as_tensor(v) for v in (fx, fy, cx, cy)]
-    fx, fy, cx, cy = torch.broadcast_tensors(fx, fy, cx, cy)
-    zero = torch.zeros_like(fx)
-    one = torch.ones_like(fx)
-    return torch.stack([
-        torch.stack([fx,   zero, cx], dim=-1),
-        torch.stack([zero, fy,   cy], dim=-1),
-        torch.stack([zero, zero, one], dim=-1),
-    ], dim=-2)
-
-
-def depth_map_to_point_map(depth: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tensor:
-    """Back-project a (..., H, W) depth map through K^-1 to (..., H, W, 3) camera-space points.
-
-    Intrinsics use normalized image coords (x in [0, 1] left->right, y in [0, 1] top->bottom).
-    """
-    H, W = depth.shape[-2:]
-    device, dtype = depth.device, depth.dtype
-    u = (torch.arange(W, dtype=dtype, device=device) + 0.5) / W
-    v = (torch.arange(H, dtype=dtype, device=device) + 0.5) / H
-    grid_v, grid_u = torch.meshgrid(v, u, indexing="ij")
-    pix = torch.stack([grid_u, grid_v, torch.ones_like(grid_u)], dim=-1)
-    K_inv = torch.linalg.inv(intrinsics)
-    rays = torch.einsum("...ij,hwj->...hwi", K_inv, pix)
-    return rays * depth.unsqueeze(-1)
-
-
-def _solve_optimal_shift(uv: np.ndarray, xyz: np.ndarray,
-                         focal: Optional[float] = None) -> Tuple[float, float]:
-    """LM-solve for z-shift; when focal is None, also recovers the optimal focal."""
-    uv = uv.reshape(-1, 2)
-    xy = xyz[..., :2].reshape(-1, 2)
-    z = xyz[..., 2].reshape(-1)
-
-    def fn(shift):
-        xy_proj = xy / (z + shift)[:, None]
-        f = focal if focal is not None else (xy_proj * uv).sum() / np.square(xy_proj).sum()
-        return (f * xy_proj - uv).ravel()
-
-    sol = least_squares(fn, x0=0.0, ftol=1e-3, method="lm")
-    shift = float(np.asarray(sol["x"]).squeeze())
-    if focal is None:
-        xy_proj = xy / (z + shift)[:, None]
-        focal = float((xy_proj * uv).sum() / np.square(xy_proj).sum())
-    return shift, focal
-
-
-def recover_focal_shift(points: torch.Tensor, mask: Optional[torch.Tensor] = None,
-                        focal: Optional[torch.Tensor] = None, downsample_size: Tuple[int, int] = (64, 64)
-                        ) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Recover the focal length and z-shift that turn points into a metric point map.
-
-    Optical center is at the image center; returned focal is relative to half the image diagonal.
-    Returns (focal, shift) on the same device/dtype as points.
-    """
-    shape = points.shape
-    H, W = shape[-3], shape[-2]
-    points_b = points.reshape(-1, H, W, 3)
-    mask_b = None if mask is None else mask.reshape(-1, H, W)
-    focal_b = None if focal is None else focal.reshape(-1)
-
-    uv = normalized_view_plane_uv(W, H, dtype=points.dtype, device=points.device)
-
-    points_lr = F.interpolate(points_b.permute(0, 3, 1, 2), downsample_size, mode="nearest").permute(0, 2, 3, 1)
-    uv_lr = F.interpolate(uv.unsqueeze(0).permute(0, 3, 1, 2), downsample_size, mode="nearest").squeeze(0).permute(1, 2, 0)
-    mask_lr = None
-    if mask_b is not None:
-        mask_lr = F.interpolate(mask_b.to(torch.float32).unsqueeze(1), downsample_size, mode="nearest").squeeze(1) > 0
-
-    uv_np = uv_lr.detach().cpu().numpy()
-    points_np = points_lr.detach().cpu().numpy()
-    mask_np = None if mask_lr is None else mask_lr.detach().cpu().numpy()
-    focal_np = None if focal_b is None else focal_b.detach().cpu().numpy()
-
-    out_focal: list = []
-    out_shift: list = []
-    for i in range(points_b.shape[0]):
-        if mask_np is None:
-            xyz_i = points_np[i].reshape(-1, 3)
-            uv_i = uv_np.reshape(-1, 2)
-        else:
-            sel = mask_np[i]
-            if sel.sum() < 2:
-                out_focal.append(1.0)
-                out_shift.append(0.0)
-                continue
-            xyz_i = points_np[i][sel]
-            uv_i = uv_np[sel]
-        if focal_np is None:
-            shift_i, focal_i = _solve_optimal_shift(uv_i, xyz_i)
-            out_focal.append(focal_i)
-        else:
-            shift_i, _ = _solve_optimal_shift(uv_i, xyz_i, focal=float(focal_np[i]))
-        out_shift.append(shift_i)
-
-    shift_t = torch.tensor(out_shift, device=points.device, dtype=points.dtype).reshape(shape[:-3])
-    if focal is None:
-        focal_t = torch.tensor(out_focal, device=points.device, dtype=points.dtype).reshape(shape[:-3])
-    else:
-        focal_t = focal.reshape(shape[:-3])
-    return focal_t, shift_t
-
-
-def depth_map_edge(depth: torch.Tensor, atol: Optional[float] = None, rtol: Optional[float] = None, kernel_size: int = 3) -> torch.Tensor:
-    """Per-pixel boolean: True where the local depth window's max-min span exceeds atol or rtol*depth."""
-    shape = depth.shape
-    d = depth.reshape(-1, 1, *shape[-2:])
-    pad = kernel_size // 2
-    diff = F.max_pool2d(d, kernel_size, stride=1, padding=pad) + F.max_pool2d(-d, kernel_size, stride=1, padding=pad)
-    edge = torch.zeros_like(d, dtype=torch.bool)
-    if atol is not None:
-        edge |= diff > atol
-    if rtol is not None:
-        edge |= (diff / d.clamp_min(1e-6)).nan_to_num_() > rtol
-    return edge.reshape(*shape)
-
-
-def triangulate_grid_mesh(points: torch.Tensor, mask: Optional[torch.Tensor] = None, decimation: int = 1, discontinuity_threshold: float = 0.04,
-                          depth: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Triangulate a (H, W, 3) point map into (vertices, faces, uvs) on CPU.
-
-    Vertices: pixels with finite coords (passing optional mask).  Quads with four valid corners
-    become two triangles.  depth overrides the scalar used for the rtol edge check; pass radial
-    depth for panoramas (the default points[..., 2] goes negative below the equator).
-    """
-    points = points.detach().cpu()
-    finite = torch.isfinite(points).all(dim=-1)
-    if mask is None:
-        mask = finite
-    else:
-        mask = mask.detach().cpu().to(torch.bool) & finite
-
-    if discontinuity_threshold > 0:
-        d = depth.detach().cpu() if depth is not None else points[..., 2]
-        # Replace inf with 0 so max-pool doesn't poison neighbourhoods (mask above already excludes those pixels).
-        d_finite = torch.where(finite, d, torch.zeros_like(d))
-        edge = depth_map_edge(d_finite, rtol=discontinuity_threshold)
-        mask = mask & ~edge
-
-    if decimation > 1:
-        points = points[::decimation, ::decimation].contiguous()
-        mask = mask[::decimation, ::decimation].contiguous()
-    H, W = points.shape[:2]
-
-    flat_mask = mask.reshape(-1)
-    idx = torch.full((H * W,), -1, dtype=torch.long)
-    n_valid = int(flat_mask.sum().item())
-    idx[flat_mask] = torch.arange(n_valid, dtype=torch.long)
-    idx = idx.reshape(H, W)
-
-    vertices = points.reshape(-1, 3)[flat_mask].contiguous()
-
-    yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
-    u = xx.float() / max(W - 1, 1)
-    v = yy.float() / max(H - 1, 1)
-    uvs = torch.stack([u, v], dim=-1).reshape(-1, 2)[flat_mask].contiguous()
-
-    a, b, c, d = idx[:-1, :-1], idx[:-1, 1:], idx[1:, 1:], idx[1:, :-1]
-    quad_ok = (a >= 0) & (b >= 0) & (c >= 0) & (d >= 0)
-    a, b, c, d = a[quad_ok], b[quad_ok], c[quad_ok], d[quad_ok]
-    faces = torch.cat([torch.stack([a, b, c], dim=-1), torch.stack([a, c, d], dim=-1)], dim=0).contiguous()
-    return vertices, faces, uvs
--- a/comfy/ldm/moge/model.py
+++ b/comfy/ldm/moge/model.py
@ -1,347 +0,0 @@
-"""MoGe v1 / v2 inference modules and a state-dict-driven builder.
-
-V1: DINOv2 backbone + multi-output head (points, mask).
-V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP).
-"""
-
-from __future__ import annotations
-
-from numbers import Number
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-import comfy.ops
-import comfy.model_management
-import comfy.model_patcher
-
-from comfy.image_encoders.dino2 import Dinov2Model
-
-from .geometry import depth_map_to_point_map, intrinsics_from_focal_center, recover_focal_shift
-from .modules import ConvStack, DINOv2Encoder, HeadV1, MLP, _view_plane_uv_grid
-
-
-def _remap_points(points: torch.Tensor) -> torch.Tensor:
-    """Apply the exp remap: z -> exp(z), xy stays linear and gets scaled by the new z."""
-    xy, z = points.split([2, 1], dim=-1)
-    z = torch.exp(z)
-    return torch.cat([xy * z, z], dim=-1)
-
-
-def _detect_dinov2(sd: dict, prefix: str) -> Dict[str, Any]:
-    # All shipped MoGe checkpoints use plain DINOv2
-    hidden = sd[prefix + "embeddings.cls_token"].shape[-1]
-    layer_prefix = prefix + "encoder.layer."
-    depth = 1 + max(int(k[len(layer_prefix):].split(".")[0]) for k in sd if k.startswith(layer_prefix))
-    return {
-        "hidden_size": hidden,
-        "num_attention_heads": hidden // 64,
-        "num_hidden_layers": depth,
-        "layer_norm_eps": 1e-6,
-        "use_swiglu_ffn": False,
-    }
-
-
-class MoGeModelV1(nn.Module):
-    """MoGe v1: DINOv2 backbone + HeadV1 (points, mask)."""
-
-    image_mean: torch.Tensor
-    image_std: torch.Tensor
-
-    intermediate_layers = 4
-    num_tokens_range: Tuple[Number, Number] = (1200, 2500)
-    mask_threshold = 0.5
-
-    def __init__(self, backbone: Dict[str, Any], dim_upsample: List[int] = (256, 128, 128),
-                 num_res_blocks: int = 1, dim_times_res_block_hidden: int = 1,
-                 dtype=None, device=None, operations=comfy.ops.manual_cast):
-        super().__init__()
-        self.backbone = Dinov2Model(backbone, dtype, device, operations)
-        self.head = HeadV1(dim_in=backbone["hidden_size"], dim_upsample=list(dim_upsample),
-                           num_res_blocks=num_res_blocks, dim_times_res_block_hidden=dim_times_res_block_hidden,
-                           dtype=dtype, device=device, operations=operations)
-        self.register_buffer("image_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
-        self.register_buffer("image_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))
-
-    def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]:
-        H, W = image.shape[-2:]
-        resize = ((num_tokens * 14 ** 2) / (H * W)) ** 0.5
-        rh, rw = int(H * resize), int(W * resize)
-        x = F.interpolate(image, (rh, rw), mode="bicubic", align_corners=False, antialias=True)
-        x = (x - self.image_mean) / self.image_std
-        x14 = F.interpolate(x, (rh // 14 * 14, rw // 14 * 14), mode="bilinear", align_corners=False, antialias=True)
-
-        n_layers = len(self.backbone.encoder.layer)
-        indices = list(range(n_layers - self.intermediate_layers, n_layers))
-        feats = self.backbone.get_intermediate_layers(x14, indices, apply_norm=True)
-
-        points, mask = self.head(feats, x)
-        points = F.interpolate(points.float(), (H, W), mode="bilinear", align_corners=False)
-        points = _remap_points(points.permute(0, 2, 3, 1))
-
-        mask = F.interpolate(mask.float(), (H, W), mode="bilinear", align_corners=False).squeeze(1)
-
-        return {"points": points, "mask": mask}
-
-    @classmethod
-    def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast):
-        """Detect the v1 head config from sd, build a model, and load weights."""
-        n_up = 1 + max(int(k.split(".")[2]) for k in sd if k.startswith("head.upsample_blocks."))
-        dim_upsample = [sd[f"head.upsample_blocks.{i}.0.0.weight"].shape[1] for i in range(n_up)]
-        # Each upsample stage is Sequential[upsampler, *res_blocks]; count res blocks at level 0.
-        num_res_blocks = max({int(k.split(".")[3]) for k in sd if k.startswith("head.upsample_blocks.0.")})
-        hidden_out = sd["head.upsample_blocks.0.1.layers.2.weight"].shape[0]
-        dim_times = max(hidden_out // dim_upsample[0], 1)
-        model = cls(backbone=_detect_dinov2(sd, prefix="backbone."),
-                    dim_upsample=dim_upsample, num_res_blocks=num_res_blocks, dim_times_res_block_hidden=dim_times,
-                    dtype=dtype, device=device, operations=operations)
-        model.load_state_dict(sd, strict=True)
-        return model
-
-
-class MoGeModelV2(nn.Module):
-    """MoGe v2: DINOv2 encoder + neck + per-output heads (points/mask/normal/metric-scale)."""
-
-    intermediate_layers = 4
-    num_tokens_range: Tuple[Number, Number] = (1200, 3600)
-
-    def __init__(self,
-                 encoder: Dict[str, Any],
-                 neck: Dict[str, Any],
-                 points_head: Dict[str, Any],
-                 mask_head: Dict[str, Any],
-                 scale_head: Dict[str, Any],
-                 normal_head: Optional[Dict[str, Any]] = None,
-                 dtype=None, device=None, operations=comfy.ops.manual_cast):
-        super().__init__()
-        self.encoder = DINOv2Encoder(**encoder, dtype=dtype, device=device, operations=operations)
-        self.neck = ConvStack(**neck, dtype=dtype, device=device, operations=operations)
-        self.points_head = ConvStack(**points_head, dtype=dtype, device=device, operations=operations)
-        self.mask_head = ConvStack(**mask_head, dtype=dtype, device=device, operations=operations)
-        self.scale_head = MLP(**scale_head, dtype=dtype, device=device, operations=operations)
-        if normal_head is not None:
-            self.normal_head = ConvStack(**normal_head, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]:
-        B, _, H, W = image.shape
-        device, dtype = image.device, image.dtype
-        aspect_ratio = W / H
-        base_h = round((num_tokens / aspect_ratio) ** 0.5)
-        base_w = round((num_tokens * aspect_ratio) ** 0.5)
-
-        feat_top, cls_token = self.encoder(image, base_h, base_w, return_class_token=True)
-
-        # 5-level pyramid: feat at level 0 concatenated with UV, other levels UV-only.
-        levels = [_view_plane_uv_grid(B, base_h * (2 ** L), base_w * (2 ** L), aspect_ratio, dtype, device)
-                                    for L in range(5)]
-        levels[0] = torch.cat([feat_top, levels[0]], dim=1)
-
-        feats = self.neck(levels)
-
-        def _resize(v):
-            return F.interpolate(v, (H, W), mode="bilinear", align_corners=False)
-
-        points = _remap_points(_resize(self.points_head(feats)[-1]).permute(0, 2, 3, 1))
-        mask = _resize(self.mask_head(feats)[-1]).squeeze(1).sigmoid()
-        metric_scale = self.scale_head(cls_token).squeeze(1).exp()
-
-        result = {"points": points, "mask": mask, "metric_scale": metric_scale}
-        if hasattr(self, "normal_head"):
-            normal = _resize(self.normal_head(feats)[-1])
-            result["normal"] = F.normalize(normal.permute(0, 2, 3, 1), dim=-1)
-        return result
-
-    @classmethod
-    def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast):
-        """Detect the v2 encoder/neck/heads config from sd, build a model, and load weights."""
-        backbone = _detect_dinov2(sd, prefix="encoder.backbone.")
-        depth = backbone["num_hidden_layers"]
-        n = cls.intermediate_layers
-        encoder = {
-            "backbone": backbone,
-            "intermediate_layers": [(depth // n) * (i + 1) - 1 for i in range(n)],
-            "dim_out": sd["encoder.output_projections.0.weight"].shape[0],
-        }
-        # scale_head is an MLP: Sequential of [Linear, ReLU, ..., Linear]; Linear weight is (out, in).
-        scale_idxs = sorted({int(k.split(".")[1]) for k in sd if k.startswith("scale_head.")})
-        scale_first = sd[f"scale_head.{scale_idxs[0]}.weight"]
-        cfg: Dict[str, Any] = {
-            "encoder": encoder,
-            "neck": cls._detect_convstack(sd, "neck."),
-            "points_head": cls._detect_convstack(sd, "points_head."),
-            "mask_head": cls._detect_convstack(sd, "mask_head."),
-            "scale_head": {"dims": [scale_first.shape[1]] + [sd[f"scale_head.{i}.weight"].shape[0] for i in scale_idxs]},
-        }
-        if any(k.startswith("normal_head.") for k in sd):
-            cfg["normal_head"] = cls._detect_convstack(sd, "normal_head.")
-        model = cls(**cfg, dtype=dtype, device=device, operations=operations)
-        model.load_state_dict(sd, strict=True)
-        return model
-
-    @staticmethod
-    def _detect_convstack(sd: dict, prefix: str) -> Dict[str, Any]:
-        """Reconstruct a ConvStack config from the keys under prefix"""
-        in_keys = [k for k in sd if k.startswith(f"{prefix}input_blocks.") and k.endswith(".weight")]
-        n = 1 + max(int(k[len(f"{prefix}input_blocks."):].split(".")[0]) for k in in_keys)
-
-        in_shapes = [sd[f"{prefix}input_blocks.{i}.weight"].shape for i in range(n)]
-        has_out = lambda i: f"{prefix}output_blocks.{i}.weight" in sd
-        has_norm = f"{prefix}res_blocks.0.0.layers.0.weight" in sd
-
-        def num_res_at(i):
-            rb_prefix = f"{prefix}res_blocks.{i}."
-            return len({int(k[len(rb_prefix):].split(".")[0]) for k in sd if k.startswith(rb_prefix)})
-
-        return {
-            "dim_in": [s[1] for s in in_shapes],
-            "dim_res_blocks": [s[0] for s in in_shapes],
-            "dim_out": [sd[f"{prefix}output_blocks.{i}.weight"].shape[0] if has_out(i) else None for i in range(n)],
-            "num_res_blocks": [num_res_at(i) for i in range(n)],
-            "resamplers": ["conv_transpose" if f"{prefix}resamplers.{i}.0.weight" in sd else "bilinear"
-                           for i in range(n - 1)],
-            "res_block_in_norm": "layer_norm" if has_norm else "none",
-            "res_block_hidden_norm": "group_norm" if has_norm else "none",
-        }
-
-
-# Translate the Meta-style DINOv2 keys MoGe ships to the naming ComfyUI DINOv2 port expects,
-# and split each fused qkv tensor into Q/K/V.
-_DINOV2_TOPLEVEL_RENAMES = {
-    "patch_embed.proj.weight": "embeddings.patch_embeddings.projection.weight",
-    "patch_embed.proj.bias":   "embeddings.patch_embeddings.projection.bias",
-    "cls_token":               "embeddings.cls_token",
-    "pos_embed":               "embeddings.position_embeddings",
-    "register_tokens":         "embeddings.register_tokens",
-    "mask_token":              "embeddings.mask_token",
-    "norm.weight":             "layernorm.weight",
-    "norm.bias":               "layernorm.bias",
-}
-_DINOV2_BLOCK_RENAMES = [
-    ("ls1.gamma",  "layer_scale1.lambda1"),
-    ("ls2.gamma",  "layer_scale2.lambda1"),
-    ("attn.proj.", "attention.output.dense."),
-    ("mlp.w12.",   "mlp.weights_in."),
-    ("mlp.w3.",    "mlp.weights_out."),
-]
-
-
-def _remap_state_dict(sd: dict) -> dict:
-    if "model" in sd and "model_config" in sd:
-        sd = sd["model"]
-    prefix = "encoder.backbone." if any(k.startswith("encoder.backbone.") for k in sd) else "backbone."
-    out: dict = {}
-    for k, v in sd.items():
-        if not k.startswith(prefix):
-            out[k] = v
-            continue
-        rel = k[len(prefix):]
-        if rel in _DINOV2_TOPLEVEL_RENAMES:
-            out[prefix + _DINOV2_TOPLEVEL_RENAMES[rel]] = v
-            continue
-        if not rel.startswith("blocks."):
-            out[k] = v
-            continue
-        _, idx, sub = rel.split(".", 2)
-        if sub in ("attn.qkv.weight", "attn.qkv.bias"):
-            tail = sub.rsplit(".", 1)[1]
-            q, kw, vw = v.chunk(3, dim=0)
-            base = f"{prefix}encoder.layer.{idx}.attention.attention"
-            out[f"{base}.query.{tail}"] = q
-            out[f"{base}.key.{tail}"] = kw
-            out[f"{base}.value.{tail}"] = vw
-            continue
-        for old, new in _DINOV2_BLOCK_RENAMES:
-            sub = sub.replace(old, new)
-        out[f"{prefix}encoder.layer.{idx}.{sub}"] = v
-    return out
-
-
-def build_from_state_dict(sd: dict, dtype=None, device=None, operations=comfy.ops.manual_cast) -> nn.Module:
-    """Dispatch to v1 or v2 based on the DINOv2 backbone prefix."""
-    sd = _remap_state_dict(sd)
-    cls = MoGeModelV2 if any(k.startswith("encoder.backbone.") for k in sd) else MoGeModelV1
-    return cls.from_state_dict(sd, dtype=dtype, device=device, operations=operations)
-
-
-class MoGeModel:
-    """Loaded MoGe model + ComfyUI memory management."""
-
-    def __init__(self, state_dict: dict):
-        # text encoder dtype closest match
-        self.load_device = comfy.model_management.text_encoder_device()
-        offload_device = comfy.model_management.text_encoder_offload_device()
-        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-
-        self.model = build_from_state_dict(state_dict, dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast).eval()
-        self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
-        self.version = "v2" if hasattr(self.model, "encoder") else "v1"
-        self.mask_threshold = float(getattr(self.model, "mask_threshold", 0.5))
-        nt = getattr(self.model, "num_tokens_range", (1200, 2500 if self.version == "v1" else 3600))
-        self.num_tokens_range = (int(nt[0]), int(nt[1]))
-
-    def infer(self, image: torch.Tensor, num_tokens: Optional[int] = None,
-              resolution_level: int = 9, fov_x: Optional[Union[Number, torch.Tensor]] = None,
-              force_projection: bool = True, apply_mask: bool = True,
-              apply_metric_scale: bool = True
-              ) -> Dict[str, torch.Tensor]:
-        """Run a single MoGe forward + post-process pass. image is (B, 3, H, W) in [0, 1]."""
-        comfy.model_management.load_model_gpu(self.patcher)
-        image = image.to(device=self.load_device, dtype=self.dtype)
-        H, W = image.shape[-2:]
-        aspect_ratio = W / H
-
-        if num_tokens is None:
-            lo, hi = self.num_tokens_range
-            num_tokens = int(lo + (resolution_level / 9) * (hi - lo))
-
-        out = self.model.forward(image, num_tokens=num_tokens)
-        points = out["points"].float()  # recover_focal_shift goes through scipy on CPU; needs fp32.
-        mask_binary = out["mask"] > self.mask_threshold
-        normal = out.get("normal")
-        metric_scale = out.get("metric_scale")
-
-        diag = (1 + aspect_ratio ** 2) ** 0.5
-
-        def focal_from_fov_deg(deg):
-            fov = torch.as_tensor(deg, device=points.device, dtype=points.dtype)
-            return aspect_ratio / diag / torch.tan(torch.deg2rad(fov / 2))
-
-        if fov_x is None:
-            focal, shift = recover_focal_shift(points, mask_binary)
-            # Fall back to 60 deg FoV when the least-squares solver flips the focal sign.
-            bad = ~torch.isfinite(focal) | (focal <= 0)
-            if bool(bad.any()):
-                focal = torch.where(bad, focal_from_fov_deg(60.0), focal)
-                _, shift = recover_focal_shift(points, mask_binary, focal=focal)
-        else:
-            focal = focal_from_fov_deg(fov_x).expand(points.shape[0])
-            _, shift = recover_focal_shift(points, mask_binary, focal=focal)
-
-        f_diag = focal / 2 * diag
-        half = torch.tensor(0.5, device=points.device, dtype=points.dtype)
-        intrinsics = intrinsics_from_focal_center(f_diag / aspect_ratio, f_diag, half, half)
-        points[..., 2] = points[..., 2] + shift[..., None, None]
-        # v2 only: filter mask by depth>0 to drop metric-scale negative-depth artifacts.
-        if self.version == "v2":
-            mask_binary = mask_binary & (points[..., 2] > 0)
-        depth = points[..., 2].clone()
-
-        if force_projection:
-            points = depth_map_to_point_map(depth, intrinsics=intrinsics)
-
-        if apply_metric_scale and metric_scale is not None:
-            points = points * metric_scale[:, None, None, None]
-            depth = depth * metric_scale[:, None, None]
-
-        if apply_mask:
-            points = torch.where(mask_binary[..., None], points, torch.full_like(points, float("inf")))
-            depth = torch.where(mask_binary, depth, torch.full_like(depth, float("inf")))
-            if normal is not None:
-                normal = torch.where(mask_binary[..., None], normal, torch.zeros_like(normal))
-
-        result = {"points": points, "depth": depth, "intrinsics": intrinsics, "mask": mask_binary}
-        if normal is not None:
-            result["normal"] = normal
-        return result
--- a/Show More
+++ b/Show More